In [7]:
import os
import sys
from dotenv import load_dotenv


from loaders import HuggingFaceBlogLoader, HuggingFaceDocsLoader, ArxivLoader
from data.utils import load_dataframe_from_sheet, get_docs

load_dotenv()

sys.path.append(os.path.abspath(".."))
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
GOOGLE_SHEETS_ID = os.getenv("GOOGLE_SHEETS_ID")
GOOGLE_SHEETS_NAME = os.getenv("GOOGLE_SHEETS_NAME")

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
# Load the sheet identified by GOOGLE_SHEETS_ID / GOOGLE_SHEETS_NAME into `df`;
# the loader cells below filter it with get_docs().
df = load_dataframe_from_sheet(
    GOOGLE_SHEETS_ID,
    GOOGLE_SHEETS_NAME,
)

In [24]:
# Hugging Face blog posts: select the URLs from `df`, fetch them with the
# Firecrawl-backed loader, then run the loader's own clean step on the result.
hf_blogs_urls = get_docs(
    df,
    source="Hugging Face",
    doc_type="blog",
    urls_only=True,
)
hf_blogs_loader = HuggingFaceBlogLoader(
    firecrawl_api_key=FIRECRAWL_API_KEY,
    urls=hf_blogs_urls,
)
hf_blogs_docs = hf_blogs_loader.load()  # raw documents
hf_blogs_docs = hf_blogs_loader.clean(hf_blogs_docs)  # rebound to the cleaned documents


Loading Hugging Face blogs: 100%|██████████| 5/5 [00:07<00:00,  1.44s/it]
Cleaning Hugging Face blogs: 100%|██████████| 5/5 [00:00<00:00, 4081.65it/s]


In [25]:
# Hugging Face documentation pages: same select -> load -> clean sequence as
# the blog cell, using the docs-specific loader.
hf_docs_urls = get_docs(
    df,
    source="Hugging Face",
    doc_type="docs",
    urls_only=True,
)
hf_docs_loader = HuggingFaceDocsLoader(
    firecrawl_api_key=FIRECRAWL_API_KEY,
    urls=hf_docs_urls,
)
hf_docs_docs = hf_docs_loader.load()  # raw documents
hf_docs_docs = hf_docs_loader.clean(hf_docs_docs)  # rebound to the cleaned documents

Loading Hugging Face docs: 100%|██████████| 23/23 [00:59<00:00,  2.59s/it]
Cleaning Hugging Face docs: 100%|██████████| 23/23 [00:00<00:00, 3062.99it/s]


In [31]:
# arXiv papers: select the paper URLs from `df`, load them, then clean.
# Unlike the Hugging Face loaders, ArxivLoader is built from the URLs alone.
arxiv_urls = get_docs(
    df,
    source="arxiv",
    doc_type="paper",
    urls_only=True,
)
arxiv_loader = ArxivLoader(urls=arxiv_urls)
arxiv_docs = arxiv_loader.load()  # raw documents
arxiv_docs = arxiv_loader.clean(arxiv_docs)  # rebound to the cleaned documents

Loading Arxiv papers: 100%|██████████| 3/3 [00:10<00:00,  3.56s/it]
Cleaning Arxiv papers: 100%|██████████| 3/3 [00:00<00:00, 648.64it/s]


In [57]:
from langchain_chroma import Chroma

In [61]:
from endpoints.tei import LocalTEIEmbeddings

In [None]:
# Smoke test for the embeddings client: embed two short strings through the
# endpoint at 127.0.0.1:8080 and print how many vectors come back.
texts = [
    "Hello, world!",
    "How are you?",
]
hf_embeddings = LocalTEIEmbeddings("http://127.0.0.1:8080")
vectors = hf_embeddings.embed_documents(texts)
print(len(vectors))

2


In [56]:
# Chroma collection "trainwise_data", persisted under ./chroma_db and using
# the `hf_embeddings` client as its embedding function.
vector_store = Chroma(
    collection_name="trainwise_data",
    embedding_function=hf_embeddings,
    persist_directory="./chroma_db",
)
# Backward-compatible alias: the original (misspelled) name is kept so any
# cell that still refers to `vectore_store` keeps working.
vectore_store = vector_store