In [2]:
# Notebook setup: imports first (stdlib, then third-party), then configuration.
import warnings

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Keep ResourceWarning messages out of the cell outputs.
warnings.simplefilter("ignore", ResourceWarning)

# SearchFlow Demo 👨🏻‍💻
This notebook demonstrates how to import data from various sources, load it into a vector store,
and then use it to answer questions with a Retrieval Augmented Reasoning 🦜🔗 LangGraph workflow.

## Creating a new project



In [4]:
from searchflow.db import DB

# Shared database handle; the remaining cells call its project, link and file helpers.
db = DB()

# Print the projects that exist before the new one is added.
projects_before = db.list_projects()
print(projects_before)

# Create the demo project. This call is the cell's last expression, so its
# return value is displayed as the cell output.
db.create_project("Vectrix", description="This is a test project")

['Test']


'Vectrix'

In [5]:
# List the projects again to check that "Vectrix" was added.
print(db.list_projects())

['Test', 'Vectrix']


In [None]:
# Delete the "Test" project.
# NOTE(review): other cells in this notebook still pass "Test" as the project
# name — confirm whether this cleanup should run before them.
db.remove_project("Test")

## Importing Data
### 1. From a URL 🔗

**Web Crawling and Data Extraction Example**


In [6]:
from searchflow.importers import WebScraper

# Bind a scraper to the "Vectrix" project, reusing the shared `db` handle.
scraper = WebScraper(db=db, project_name="Vectrix")

# Discover the links of the site; presumably these are the entries that
# `db.get_links_to_confirm("Vectrix")` reports afterwards — TODO confirm.
scraper.get_all_links("https://vectrix.ai")

  prompt = loads(json.dumps(prompt_object.manifest))
ERROR:trafilatura.downloads:not a 200 response: 404 for URL https://vectrix.ai/robots.txt


In [None]:
from searchflow.importers import WebScraper

# Alternative to the link-discovery cell above: call `full_import` on a site
# with `max_pages=100` (presumably an upper bound on crawled pages — TODO confirm).
# NOTE(review): this targets a different site (dataframe.be) while the scraper
# is bound to the same "Vectrix" project — confirm that is intended.
scraper = WebScraper(project_name='Vectrix', db=db)
scraper.full_import("https://dataframe.be", max_pages=100)

In [7]:
# Print the entries returned by `get_links_to_confirm` for "Vectrix"
# (dicts with 'url' and 'base_url' keys, as the recorded output shows).
print(db.get_links_to_confirm("Vectrix"))

[{'url': 'https://vectrix.ai/', 'base_url': 'https://vectrix.ai'}, {'url': 'https://vectrix.ai/career', 'base_url': 'https://vectrix.ai'}, {'url': 'https://vectrix.ai/offerings/chat-ui', 'base_url': 'https://vectrix.ai'}, {'url': 'https://vectrix.ai/blog-post/understanding-large-and-small-language-models-key-differences-and-applications', 'base_url': 'https://vectrix.ai'}, {'url': 'https://vectrix.ai/blog-post/google-deepminds-searchless-chess-engine---part-1', 'base_url': 'https://vectrix.ai'}, {'url': 'https://vectrix.ai/blog-post/are-llm-benchmarks-and-leaderboards-just-marketing-tools', 'base_url': 'https://vectrix.ai'}, {'url': 'https://vectrix.ai/about-us', 'base_url': 'https://vectrix.ai'}, {'url': 'https://vectrix.ai/blog-post/your-ai-might-be-misleading-you-understanding-the-dual-nature-of-llm-outputs', 'base_url': 'https://vectrix.ai'}, {'url': 'https://vectrix.ai/offerings/advice', 'base_url': 'https://vectrix.ai'}, {'url': 'https://vectrix.ai/blog', 'base_url': 'https://vec

In [None]:
# Manually register a single page for the "Test" project with the status
# "Confirm page import".
# NOTE(review): the surrounding cells work on the "Vectrix" project — confirm
# that "Test" is the intended target here.
db.add_links_to_index(
    links=["https://langchain-ai.github.io/langgraph/reference/graphs/"],
    base_url="https://langchain-ai.github.io/langgraph/reference/graphs/",
    project_name="Test",
    status="Confirm page import",
)

In [10]:
# Print the indexing status entries for the "Vectrix" project.
print(db.get_indexing_status("Vectrix"))

[{'project_name': 'Vectrix', 'status': 'Indexed', 'base_url': 'https://vectrix.ai', 'last_update': datetime.datetime(2024, 9, 12, 12, 45, 44, 656497, tzinfo=datetime.timezone.utc)}]


In [9]:
from searchflow.importers import WebScraper

# Bind the scraper to the same project that the links are read from and
# downloaded for ("Vectrix"), so this cell does not depend on the "Test"
# project, which the cleanup cell earlier in the notebook removes.
scraper = WebScraper(project_name="Vectrix", db=db)

# Each entry returned by `get_links_to_confirm` is a dict; keep only its URL.
confirmed_links = [link["url"] for link in db.get_links_to_confirm("Vectrix")]

# Download the confirmed pages for the "Vectrix" project.
scraper.download_pages(confirmed_links, project_name="Vectrix")

Already downloaded 0
To download ['https://vectrix.ai/', 'https://vectrix.ai/career', 'https://vectrix.ai/offerings/chat-ui', 'https://vectrix.ai/blog-post/understanding-large-and-small-language-models-key-differences-and-applications', 'https://vectrix.ai/blog-post/google-deepminds-searchless-chess-engine---part-1', 'https://vectrix.ai/blog-post/are-llm-benchmarks-and-leaderboards-just-marketing-tools', 'https://vectrix.ai/about-us', 'https://vectrix.ai/blog-post/your-ai-might-be-misleading-you-understanding-the-dual-nature-of-llm-outputs', 'https://vectrix.ai/offerings/advice', 'https://vectrix.ai/blog', 'https://vectrix.ai/contact-us', 'https://vectrix.ai/privacy-policy', 'https://vectrix.ai/offerings/projects', 'https://vectrix.ai/offerings/products', 'https://vectrix.ai/offerings', 'https://vectrix.ai/job-list/open-application---create-your-own-dream-job', 'https://vectrix.ai/job-list/junior-ai-researcher', 'https://vectrix.ai/job-list/internship', 'https://vectrix.ai/job-list/sof

### 2. Upload files ⬆️
You can also upload files and add them to the vector store. Vectrix will automatically detect the file type, extract the text, and chunk the content into blocks.

In [None]:
from searchflow.importers import Files

# Path of the sample PDF that will be uploaded.
file = "./files/pdf_with_scannedtext.pdf"

# Read the whole document into memory as raw bytes.
with open(file, mode="rb") as pdf_handle:
    bytes_data = pdf_handle.read()

# Hand the bytes, paired with the name "test.pdf", to the uploader for the
# "Test" project using the "local" inference type.
files = Files()
files.upload_file(
    project_name="Test",
    inference_type="local",
    document_data=[(bytes_data, "test.pdf")],
)

In [None]:
# List the files of the "Vectrix" project.
# NOTE(review): the upload cell above targets project "Test" — confirm which
# project should be listed here.
db.list_files("Vectrix")

In [None]:
# Remove the file "test.pdf" from the "Test" project (the name and project
# used by the upload cell above).
db.remove_file("Test", "test.pdf")

### 3. Chrome Plugin 🦊

In [None]:
# Launch the FastAPI server: uncomment the shell command below to run it
# from this notebook.
#!python src/searchflow/api.py

