In [1]:
import os
import sys
from dotenv import load_dotenv
from uuid import uuid4

from loaders import HuggingFaceBlogLoader, HuggingFaceDocsLoader, ArxivLoader
from data.utils import load_dataframe_from_sheet, get_docs

load_dotenv()

sys.path.append(os.path.abspath(".."))
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
GOOGLE_SHEETS_ID = os.getenv("GOOGLE_SHEETS_ID")
GOOGLE_SHEETS_NAME = os.getenv("GOOGLE_SHEETS_NAME")

%load_ext autoreload
%autoreload 2

In [2]:
df = load_dataframe_from_sheet(GOOGLE_SHEETS_ID, GOOGLE_SHEETS_NAME)

In [3]:
hf_blogs_urls = get_docs(df, source="Hugging Face", doc_type="blog", urls_only=True)
hf_blogs_loader = HuggingFaceBlogLoader(firecrawl_api_key=FIRECRAWL_API_KEY, urls=hf_blogs_urls)
hf_blogs_docs = hf_blogs_loader.load()
hf_blogs_docs = hf_blogs_loader.clean(hf_blogs_docs)


Loading Hugging Face blogs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:06<00:00,  1.21s/it]
Cleaning Hugging Face blogs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00, 10412.87it/s]


In [4]:
hf_docs_urls = get_docs(df, source="Hugging Face", doc_type="docs", urls_only=True)
hf_docs_loader = HuggingFaceDocsLoader(firecrawl_api_key=FIRECRAWL_API_KEY, urls=hf_docs_urls)
hf_docs_docs = hf_docs_loader.load()
hf_docs_docs = hf_docs_loader.clean(hf_docs_docs)

Loading Hugging Face docs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23/23 [00:28<00:00,  1.26s/it]
Cleaning Hugging Face docs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23/23 [00:00<00:00, 4419.71it/s]


In [5]:
arxiv_urls = get_docs(df, source="arxiv", doc_type="paper", urls_only=True)
arxiv_loader = ArxivLoader(urls=arxiv_urls)
arxiv_docs = arxiv_loader.load()
arxiv_docs = arxiv_loader.clean(arxiv_docs)

Loading Arxiv papers: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [01:30<00:00, 30.00s/it]
Cleaning Arxiv papers: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00, 743.67it/s]


In [7]:
all_docs = hf_blogs_docs + hf_docs_docs + arxiv_docs
uuids = [str(uuid4()) for _ in range(len(all_docs))]

print(f"Total documents loaded: {len(all_docs)}")

Total documents loaded: 31


In [31]:
from transformers import AutoTokenizer
from langchain_text_splitters import SentenceTransformersTokenTextSplitter
tokenizer = AutoTokenizer.from_pretrained("nomic-ai/modernbert-embed-base")


text_splitter = SentenceTransformersTokenTextSplitter.from_huggingface_tokenizer(
    tokenizer=tokenizer,
    tokens_per_chunk=1500,
    chunk_overlap=200,
)

chunks = text_splitter.split_documents(all_docs)
print(f"Total chunks created: {len(chunks)}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

In [30]:
len(tokenizer.encode(chunks[113].page_content))
print(chunks[113].page_content)

‚ÄîLarge models represent a groundbreaking advance-
ment in multiple application fields, enabling remarkable achieve-
ments across various tasks. However, their unprecedented scale
comes with significant computational costs. These models, often
consisting of billions of parameters, require vast amounts of
computational resources for execution. Especially, the expansive
scale and computational demands pose considerable challenges
when customizing them for particular downstream tasks, particu-
larly over the hardware platforms constrained by computational
capabilities.
Parameter Efficient Fine-Tuning (PEFT) provides a practical
solution by efficiently adjusting the large models over the various
downstream tasks. In particular, PEFT refers to the process of
adjusting the parameters of a pre-trained large model to adapt
it to a specific task or domain while minimizing the number
of additional parameters introduced or computational resources
required. This approach is particularly important

In [24]:
lengths = [len(tokenizer.encode(chunk.page_content)) for chunk in chunks]
idx = [str(i) for i in range(len(lengths))]
lengths_map = dict(zip(lengths, idx))
print(f"Average chunk length (in tokens): {sum(lengths) / len(lengths)}")
print(f"Max chunk length (in tokens): {max(lengths)}")
print(f"Min chunk length (in tokens): {min(lengths)}")
print(f"Chunk with max length: {lengths_map[max(lengths)]}")

Average chunk length (in tokens): 2466.4210526315787
Max chunk length (in tokens): 77516
Min chunk length (in tokens): 111
Chunk with max length: 111


In [13]:
from langchain_chroma import Chroma
from endpoints.tei import LocalTEIEmbeddings

hf_embeddings = LocalTEIEmbeddings("http://127.0.0.1:8080")
vector_store = Chroma(
    collection_name="trainwise_data",
    embedding_function=hf_embeddings,
    persist_directory="./chroma_db",
)


In [14]:
vector_store.add_documents(documents=all_docs, ids=uuids)

KeyboardInterrupt: 