# Finetuning embeddings model

In [1]:
import nest_asyncio

nest_asyncio.apply()

In [2]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyMuPDFLoader

path = "data/"
text_loader = DirectoryLoader(path, glob="*.pdf", loader_cls=PyMuPDFLoader)

# Load documents
training_documents = text_loader.load()
print(f"Successfully loaded {len(training_documents)} documents")

# Get paths of loaded documents
document_paths = [doc.metadata['source'] for doc in training_documents]

# Print paths
for path in document_paths:
    print(f"Loaded: {path}")

Successfully loaded 86 documents
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stripe API Reference.pdf
Loaded: data/Stri

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 750,
    chunk_overlap  = 20,
    length_function = len
)

In [4]:
training_documents = text_splitter.split_documents(text_loader.load())

In [5]:
len(training_documents)

186

In [6]:
import uuid

id_set = set()

for document in training_documents:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["id"] = id

In [7]:
training_split_documents = training_documents[:len(training_documents) - 24]
val_split_documents = training_documents[len(training_documents) - 24:102-12]
test_split_documents = training_documents[102-12:]

In [8]:
from langchain_openai import ChatOpenAI

qa_chat_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [9]:
from langchain_core.prompts import ChatPromptTemplate

qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

In [10]:
question_generation_chain = qa_prompt_template | qa_chat_model

In [11]:
import tqdm

async def create_questions(documents, n_questions):
    questions = {}  # Will store question_id -> question text
    relevant_docs = {}  # Will store question_id -> list of relevant doc ids
    
    # Create a progress bar
    for document in tqdm.tqdm(documents, desc="Processing documents"):
        # Generate questions for this document
        response = await question_generation_chain.ainvoke({
            "context": document.page_content,
            "n_questions": n_questions
        })
        
        # Parse the numbered questions from response
        response_lines = response.content.strip().split('\n')
        parsed_questions = [line.split('. ', 1)[1] for line in response_lines 
                          if line and line[0].isdigit()]
        
        # For each generated question
        for question in parsed_questions:
            # Generate unique ID for this question
            question_id = str(uuid.uuid4())
            
            # Store the question
            questions[question_id] = question
            
            # Store the document ID this question was generated from
            relevant_docs[question_id] = [document.metadata["id"]]
    
    return questions, relevant_docs

In [12]:
training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)

Processing documents:   0%|          | 0/162 [00:00<?, ?it/s]

Processing documents: 100%|██████████| 162/162 [02:25<00:00,  1.11it/s]


In [13]:
val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)

Processing documents: 0it [00:00, ?it/s]


In [14]:
test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)

Processing documents: 100%|██████████| 96/96 [01:31<00:00,  1.05it/s]


In [15]:
import json

training_corpus = {train_item.metadata["id"] : train_item.page_content for train_item in training_split_documents}

train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

with open("training_dataset.jsonl", "w") as f:
  json.dump(train_dataset, f)

In [16]:
val_corpus = {val_item.metadata["id"] : val_item.page_content for val_item in val_split_documents}

val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

with open("val_dataset.jsonl", "w") as f:
  json.dump(val_dataset, f)

In [17]:
train_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : train_corpus
}

with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

In [18]:
from sentence_transformers import SentenceTransformer

model_id = "Snowflake/snowflake-arctic-embed-l"
model = SentenceTransformer(model_id)

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [20]:
BATCH_SIZE = 10

In [21]:
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

In [22]:
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

In [23]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [24]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [25]:
EPOCHS = uv 

In [26]:
import wandb
wandb.init(mode="disabled")

In [27]:
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic_ft',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50
)



Step,Training Loss,Validation Loss


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/walidkoleilat/aie5-midterm/.venv/lib/python3.13/site-packages/IPython/core/interactiveshell.py", line 3579, in run_code
  File "/var/folders/xp/fw_gm3jn2xq5ngzrpc74m4lm0000gn/T/ipykernel_33451/189950450.py", line 3, in <module>
    model.fit(
    ~~~~~~~~~^
        train_objectives=[(loader, train_loss)],
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
        evaluation_steps=50
        ^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/Users/walidkoleilat/aie5-midterm/.venv/lib/python3.13/site-packages/sentence_transformers/fit_mixin.py", line 385, in fit
    trainer.train()
    ~~~~~~~~~~~~~^^
  File "/Users/walidkoleilat/aie5-midterm/.venv/lib/python3.13/site-packages/transformers/trainer.py", line 2241, in train
    return inner_training_loop(
        args=args,
    ...<2 lines>...
        ignore_keys_for_eval=ignore_keys_for_eval,
    )
  File "/Users/walidkoleilat/aie5-midterm/.venv/lib/python3.13/site-packages/transfo

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
hf_username = "wkoleilat-happytitan"

In [None]:
model.push_to_hub(f"{hf_username}/build-or-buy-v1")

In [None]:
import pandas as pd

from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document

In [None]:
def evaluate_openai(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
  corpus = dataset['corpus']
  questions = dataset['questions']
  relevant_docs = dataset['relevant_contexts']
  documents = [Document(page_content=content, metadata={"id": doc_id}) for doc_id, content in corpus.items()]
  vectorstore = FAISS.from_documents(documents, embed_model)

  retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

  eval_results = []
  for id, question in tqdm(questions.items()):
    retrieved_nodes = retriever.invoke(question)
    retrieved_ids = [node.metadata["id"] for node in retrieved_nodes]
    expected_id = relevant_docs[id][0]
    is_hit = expected_id in retrieved_ids
    eval_results.append({"id": id, "question": question, "expected_id": expected_id, "is_hit": is_hit})

  return eval_results

In [None]:
te3_openai = OpenAIEmbeddings(model="text-embedding-3-small")
te3_results = evaluate_openai(test_dataset, te3_openai)

In [None]:
te3_results_df = pd.DataFrame(te3_results)

In [None]:
te3_hit_rate = te3_results_df["is_hit"].mean()
te3_hit_rate

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

huggingface_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l")
arctic_embed_m_results = evaluate_openai(test_dataset, huggingface_embeddings)

In [None]:
arctic_embed_m_results_df = pd.DataFrame(arctic_embed_m_results)

In [None]:
arctic_embed_m_hit_rate = arctic_embed_m_results_df["is_hit"].mean()
arctic_embed_m_hit_rate

In [None]:
finetune_embeddings = HuggingFaceEmbeddings(model_name="finetuned_arctic_ft")
finetune_results = evaluate_openai(test_dataset, finetune_embeddings)

In [None]:
finetune_results_df = pd.DataFrame(finetune_results)

In [None]:
finetune_hit_rate = finetune_results_df["is_hit"].mean()
finetune_hit_rate

In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 600,
    chunk_overlap  = 50,
    length_function = len
)

training_documents = text_splitter.split_documents(text_loader.load())

In [None]:
from langchain_community.vectorstores import FAISS

base_vectorstore = FAISS.from_documents(training_documents, huggingface_embeddings)
base_retriever = base_vectorstore.as_retriever(search_kwargs={"k": 6})

In [None]:
from langchain_core.prompts import ChatPromptTemplate

RAG_PROMPT = """\
Given a provided context and a question, you must answer the question. If you do not know the answer, you must state that you do not know.

Context:
{context}

Question:
{question}

Answer:
"""

rag_prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT)

In [None]:
rag_llm =  ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [None]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

base_rag_chain = (
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [None]:
base_rag_chain.invoke({"question" : "What is an agent?"})["response"]

In [None]:
finetune_vectorstore = FAISS.from_documents(training_documents, finetune_embeddings)
finetune_retriever = finetune_vectorstore.as_retriever(search_kwargs={"k": 6})

In [None]:
finetune_rag_chain = (
    {"context": itemgetter("question") | finetune_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [None]:
finetune_rag_chain.invoke({"question" : "What is an Agent?"})["response"]