In [1]:
import warnings

from dotenv import load_dotenv

# Load environment variables from a .env file (python-dotenv).
load_dotenv()

# Install a global filter so ResourceWarning messages are not shown.
warnings.simplefilter("ignore", ResourceWarning)

# SearchFlow Demo 👨🏻‍💻
This notebook demonstrates how to import data from various sources,
load it into a VectorStore, and then use it to answer questions with a Retrieval Augmented Reasoning 🦜🔗 LangGraph.

## Creating a new project



In [5]:
from searchflow.db import DB

# One database handle for the whole notebook; later cells reuse `db`.
db = DB()

# Show the current project listing before creating a new project.
existing_projects = db.list_projects()
print(existing_projects)

# Kept as the last expression so the call's return value is shown as the cell output.
db.create_project("Test", description="This is a test project")

[]


'Test'

In [4]:
# Calls remove_project for the "Test" project on the shared `db` handle.
# NOTE(review): every cell below targets project "Test", and the execution counts
# show this cell was run *before* the create cell above. On a top-to-bottom run it
# presumably removes the project right after it is created - confirm, and skip this
# cell (or move it to the end of the notebook) if so.
db.remove_project("Test")

True

## Importing Data
### 1. From a URL 🔗

**Web Crawling and Data Extraction Example**


In [None]:
from searchflow.importers import WebScraper

# Scraper for the "Test" project, using the notebook's shared `db` handle.
scraper = WebScraper(db=db, project_name="Test")

# Last expression: the value returned by get_all_links is displayed as the cell output.
scraper.get_all_links("https://vectrix.ai")

In [None]:
from searchflow.importers import WebScraper

# Scraper for the "Test" project, using the notebook's shared `db` handle.
scraper = WebScraper(db=db, project_name="Test")

# Run the full import starting at this URL, with max_pages set to 100.
scraper.full_import("https://dataframe.be", max_pages=100)

In [None]:
# Fetch the links-to-confirm listing for project "Test" and print it.
links_to_confirm = db.get_links_to_confirm("Test")
print(links_to_confirm)

In [None]:
# The same page is passed both as the single link to add and as the base_url.
langgraph_graphs_url = "https://langchain-ai.github.io/langgraph/reference/graphs/"

db.add_links_to_index(
    links=[langgraph_graphs_url],
    base_url=langgraph_graphs_url,
    project_name="Test",
    status="Confirm page import",
)

In [None]:
# Fetch the indexing status for project "Test" and print it.
indexing_status = db.get_indexing_status("Test")
print(indexing_status)

In [None]:
from searchflow.importers import WebScraper

# Scraper for the "Test" project, using the notebook's shared `db` handle.
scraper = WebScraper(db=db, project_name="Test")

# Take the "url" field of every entry returned by get_links_to_confirm.
pending_entries = db.get_links_to_confirm("Test")
confirmed_links = [entry["url"] for entry in pending_entries]

# Hand the collected URLs to the scraper for download.
scraper.download_pages(confirmed_links, project_name="Test")

### 2. Upload files ⬆️
You can also upload files and add them to the vector store. Vectrix will automatically detect the file type, extract the text, and chunk the content into blocks.

In [8]:
from searchflow.importers import Files

file = "./files/pdf_with_scannedtext.pdf"

# Read the whole PDF as raw bytes; the handle is closed when the `with` block exits.
with open(file, mode="rb") as pdf_handle:
    bytes_data = pdf_handle.read()

# Each entry pairs the raw bytes with a file name ("test.pdf" here, not the on-disk name).
documents = [(bytes_data, "test.pdf")]

files = Files()
files.upload_file(
    project_name="Test",
    document_data=documents,
    inference_type="local",
)

[32m2024-09-03 17:52:34,360 - Files - INFO - Processing files locally[0m
[32m2024-09-03 17:52:34,909 - Files - INFO - Processing file 1 of 1[0m
[32m2024-09-03 17:52:34,960 - Files - INFO - Uploaded test.pdf to object storage[0m
[32m2024-09-03 17:53:00,509 - Files - INFO - Chunked the document into 1 parts[0m


In [None]:
# Remove the file named "test.pdf" (the name used in the upload cell above)
# from the "Test" project.
db.remove_file("Test", "test.pdf")

### 3. Chrome Plugin 🦊

In [None]:
# Launch the FastAPI server
#!python src/searchflow/api.py

