In [1]:
import warnings

from dotenv import load_dotenv

# Load settings from a local .env file (python-dotenv) before anything else
# in the notebook needs them.
load_dotenv()

# Keep ResourceWarning messages out of the cell outputs.
warnings.simplefilter("ignore", category=ResourceWarning)

# Vectrix Demo 👨🏻‍💻
This notebook demonstrates how to import data from various sources, load it into a VectorStore, and then use it to answer questions with a Retrieval Augmented Reasoning 🦜🔗 LangGraph.

## Creating a new project



In [4]:
from vectrix import DB

# Single DB helper instance; the cells below reuse it as `db`.
db = DB()

# Show which projects exist before the new one is added.
existing_projects = db.list_projects()
print(existing_projects)

# Create the demo project; the call's return value is the cell output.
db.create_project("Test", description="This is a test project")

[32m2024-08-28 00:27:27,496 - DB - INFO - Added new project: Test[0m


[]


9

In [3]:
# Remove the "Test" project from the database.
# NOTE(review): the execution counts (In [3] here vs. In [4] for the cell
# above) show this cell was run *before* the project was created. On a
# top-to-bottom run it would instead remove the project right after it is
# created, while the following cells still refer to project "Test" --
# confirm the intended cell order.
db.remove_project("Test")

[32m2024-08-28 00:27:26,068 - DB - INFO - Removed project: Test[0m


True

## Importing Data
### 1. From a URL 🔗

**Web Crawling and Data Extraction Example**


In [5]:
from vectrix.importers import WebScraper

# Scraper bound to the demo project; reused by the download cell further down.
scraper = WebScraper(project_name="Test")

# Start link discovery from the site root.
site_root = "https://vectrix.ai/"
scraper.get_all_links(site_root)

[32m2024-08-28 00:27:30,302 - DB - INFO - Adding links to confirm for base URL: https://vectrix.ai/[0m


Added links to confirm for base URL: https://vectrix.ai/


ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://vectrix.ai/robots.txt
[32m2024-08-28 00:27:54,835 - DB - INFO - Updated indexed link status: https://vectrix.ai/[0m
INFO:DB:Updated indexed link status: https://vectrix.ai/
[32m2024-08-28 00:27:54,836 - DB - INFO - Adding links to confirm for base URL: https://vectrix.ai/[0m
INFO:DB:Adding links to confirm for base URL: https://vectrix.ai/


Added links to confirm for base URL: https://vectrix.ai/


In [16]:
# Show the links of project "Test" that are still waiting for confirmation.
pending_links = db.get_links_to_confirm("Test")
print(pending_links)

[32m2024-08-28 00:31:19,243 - DB - INFO - Getting links to confirm for project: Test[0m
INFO:DB:Getting links to confirm for project: Test


[]


In [11]:
# Register a single page manually: the same URL is used both as the link to
# index and as its base URL.
graphs_reference_url = "https://langchain-ai.github.io/langgraph/reference/graphs/"
db.add_links_to_index(
    links=[graphs_reference_url],
    base_url=graphs_reference_url,
    project_name="Test",
    status="Confirm page import",
)

[32m2024-08-28 00:30:29,205 - DB - INFO - Adding links to confirm for base URL: https://langchain-ai.github.io/langgraph/reference/graphs/[0m
INFO:DB:Adding links to confirm for base URL: https://langchain-ai.github.io/langgraph/reference/graphs/


Added links to confirm for base URL: https://langchain-ai.github.io/langgraph/reference/graphs/


In [15]:
# Print the indexing status records stored for project "Test".
indexing_status = db.get_indexing_status("Test")
print(indexing_status)

[32m2024-08-28 00:31:06,510 - DB - INFO - Getting scrape status for project: Test[0m
INFO:DB:Getting scrape status for project: Test


[{'project_name': 'Test', 'status': 'Indexed', 'base_url': 'https://vectrix.ai/', 'last_update': datetime.datetime(2024, 8, 27, 22, 28, 23, 228971, tzinfo=datetime.timezone.utc)}, {'project_name': 'Test', 'status': 'Indexed', 'base_url': 'https://langchain-ai.github.io/langgraph/reference/graphs/', 'last_update': datetime.datetime(2024, 8, 27, 22, 30, 55, 733013, tzinfo=datetime.timezone.utc)}]


In [14]:
# Fetch the links awaiting confirmation and keep only their URLs.
links_awaiting_confirmation = db.get_links_to_confirm("Test")
confirmed_links = [entry["url"] for entry in links_awaiting_confirmation]

# Hand the URLs to the scraper so the pages are downloaded into the project.
scraper.download_pages(confirmed_links, project_name="Test")

[32m2024-08-28 00:30:51,472 - DB - INFO - Getting links to confirm for project: Test[0m
INFO:DB:Getting links to confirm for project: Test


Already downloaded 21
To download ['https://langchain-ai.github.io/langgraph/reference/graphs/']


[32m2024-08-28 00:30:55,733 - DB - INFO - Updated indexed link status: https://langchain-ai.github.io/langgraph/reference/graphs/[0m
INFO:DB:Updated indexed link status: https://langchain-ai.github.io/langgraph/reference/graphs/


### 2. Upload files ⬆️
You can also upload files and add them to the vector store. Vectrix will automatically detect the file type, extract the text, and chunk the content into blocks.

In [None]:
from vectrix.importers import Files

# File importer used to push local documents into a project.
files = Files()

# Upload the sample PDF into the "Test" project.
files.upload_file(document_paths=["./files/pdf_with_scannedtext.pdf"], project_name="Test")

In [None]:
# List the files stored for project "Test".
# Use a dedicated name rather than `files`: the upload cell binds `files` to
# the `Files()` importer, and rebinding it to this listing would make a later
# `files.upload_file(...)` call fail.
project_files = db.list_files("Test")
print(project_files)

In [None]:
# Remove the uploaded sample PDF from the "Test" project.
db.remove_file(
    file_name="pdf_with_scannedtext.pdf",
    project_name="Test",
)