In [1]:
from dotenv import load_dotenv
load_dotenv()

from src.data_loaders.document_processor import DocumentProcessor
document_processor = DocumentProcessor('au-blog-rag')

Index 'au-blog-rag' loaded.
Total vectors: 1041


In [2]:
# documents = [document['content'] for document in document_processor.get_all_documents()]
documents = document_processor.get_all_documents()

Found 1041 documents.


In [3]:
print(documents[0]['metadata']['source'])

https://tech.appunite.com/blog/how-to-become-junior-elixir-developer


In [4]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Document
import random


def load_corpus_from_string(custom_docs: list[dict], verbose=False):
    # Create a Document objects from the content and metadata
    docs = []
    for custom_doc in custom_docs:
        docs.append(Document(text=custom_doc['content'], metadata=custom_doc['metadata']))

    # Use SentenceSplitter to split the content into nodes
    parser = SentenceSplitter()
    nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)

    if verbose:
        print(f"Parsed {len(nodes)} nodes")

    return nodes


def split_documents(docs: list[dict], val_ratio=0.2):
    """Split a list of documents into training and validation sets."""
    random.shuffle(docs)
    split_index = int(len(docs) * (1 - val_ratio))
    train_docs = docs[:split_index]
    val_docs = docs[split_index:]
    return train_docs, val_docs

documents_train, documents_val = split_documents(documents, val_ratio=0.2)

train_nodes = load_corpus_from_string(documents_train)
val_nodes = load_corpus_from_string(documents_val)
print(len(train_nodes), len(val_nodes))

832 209


In [5]:
# from llama_index.finetuning import generate_qa_embedding_pairs
# from llama_index.llms.openai import OpenAI
# from dotenv import load_dotenv
# load_dotenv() 
# 
# train_dataset = generate_qa_embedding_pairs(
#     llm=OpenAI(model="gpt-3.5-turbo"),
#     nodes=train_nodes,
#     output_path="train_dataset.json",
# )
# val_dataset = generate_qa_embedding_pairs(
#     llm=OpenAI(model="gpt-3.5-turbo"),
#     nodes=val_nodes,
#     output_path="val_dataset.json",
# )


from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

In [6]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="sentence-transformers/all-MiniLM-L6-v2",
    model_output_path="fine-tuned-model",
    val_dataset=val_dataset,
    epochs=10,
)

INFO:datasets:PyTorch version 2.5.1 available.
PyTorch version 2.5.1 available.
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [7]:
finetune_engine.finetune()

Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
50,No log,No log,0.791866,0.906699,0.940191,0.978469,0.791866,0.302233,0.188038,0.097847,0.791866,0.906699,0.940191,0.978469,0.885835,0.856108,0.857098


INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Information Retrieval Evaluation of the model on the  dataset:
Information Retrieval Evaluation of the model on the  dataset:
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Queries: 418
Queries: 418
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Corpus: 209

Corpus: 209

INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Score-Function: cosine
Score-Function: cosine
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@1: 79.19%
Accuracy@1: 79.19%
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@3: 90.67%
Accuracy@3: 90.67%
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@5: 94.02%
Accuracy@5: 94.02%
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@10: 97.85%
Accuracy@10: 97.85%
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Precision@

Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
50,No log,No log,0.791866,0.906699,0.940191,0.978469,0.791866,0.302233,0.188038,0.097847,0.791866,0.906699,0.940191,0.978469,0.885835,0.856108,0.857098
100,No log,No log,0.839713,0.937799,0.961722,0.985646,0.839713,0.3126,0.192344,0.098565,0.839713,0.937799,0.961722,0.985646,0.914705,0.891648,0.892356
150,No log,No log,0.832536,0.944976,0.971292,0.992823,0.832536,0.314992,0.194258,0.099282,0.832536,0.944976,0.971292,0.992823,0.916319,0.891314,0.891637
167,No log,No log,0.844498,0.947368,0.9689,0.988038,0.844498,0.315789,0.19378,0.098804,0.844498,0.947368,0.9689,0.988038,0.920891,0.89873,0.899468
200,No log,No log,0.84689,0.961722,0.976077,0.992823,0.84689,0.320574,0.195215,0.099282,0.84689,0.961722,0.976077,0.992823,0.926239,0.904176,0.904535
250,No log,No log,0.84689,0.95933,0.983254,0.995215,0.84689,0.319777,0.196651,0.099522,0.84689,0.95933,0.983254,0.995215,0.926597,0.903853,0.904117
300,No log,No log,0.84689,0.961722,0.976077,0.992823,0.84689,0.320574,0.195215,0.099282,0.84689,0.961722,0.976077,0.992823,0.926249,0.904139,0.904595
334,No log,No log,0.861244,0.964115,0.980861,0.995215,0.861244,0.321372,0.196172,0.099522,0.861244,0.964115,0.980861,0.995215,0.93334,0.912873,0.913121
350,No log,No log,0.861244,0.961722,0.980861,0.995215,0.861244,0.320574,0.196172,0.099522,0.861244,0.961722,0.980861,0.995215,0.933291,0.912808,0.913034
400,No log,No log,0.854067,0.961722,0.985646,0.995215,0.854067,0.320574,0.197129,0.099522,0.854067,0.961722,0.985646,0.995215,0.930217,0.908638,0.908889


INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Information Retrieval Evaluation of the model on the  dataset:
Information Retrieval Evaluation of the model on the  dataset:
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Queries: 418
Queries: 418
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Corpus: 209

Corpus: 209

INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Score-Function: cosine
Score-Function: cosine
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@1: 83.97%
Accuracy@1: 83.97%
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@3: 93.78%
Accuracy@3: 93.78%
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@5: 96.17%
Accuracy@5: 96.17%
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Accuracy@10: 98.56%
Accuracy@10: 98.56%
INFO:sentence_transformers.evaluation.InformationRetrievalEvaluator:Precision@

In [8]:
embed_model = finetune_engine.get_finetuned_model()
embed_model

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: fine-tuned-model
Load pretrained SentenceTransformer: fine-tuned-model
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']


HuggingFaceEmbedding(model_name='fine-tuned-model', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7f6e2ced1650>, num_workers=None, max_length=256, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None)

In [9]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd


def evaluate(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs
    
    nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
    index = VectorStoreIndex(nodes, embed_model=embed_model, show_progress=True)
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []
    for query_id, query in tqdm(queries.items()):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [node.node.node_id for node in retrieved_nodes]
        expected_id = relevant_docs[query_id][0]
        is_hit = expected_id in retrieved_ids  # assume 1 relevant doc
    
        eval_result = {
            "is_hit": is_hit,
            "retrieved": retrieved_ids,
            "expected": expected_id,
            "query": query_id,
        }
        eval_results.append(eval_result)
    return eval_results

In [None]:
embedding_openai = OpenAIEmbedding(model="text-embedding-3-small")
openai_res = evaluate(val_dataset, embedding_openai)

In [None]:
finetuned = "local:fine-tuned-model"
fine_tuned_res = evaluate(val_dataset, finetuned)

In [15]:
from langchain.embeddings import HuggingFaceEmbeddings

original = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
original_res = evaluate(val_dataset, original)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


Generating embeddings:   0%|          | 0/209 [00:00<?, ?it/s]

  0%|          | 0/418 [00:00<?, ?it/s]

In [22]:
df_openai = pd.DataFrame(openai_res)
hit_rate_openai = df_openai["is_hit"].mean()

df_ft = pd.DataFrame(fine_tuned_res)
hit_rate_ft = df_ft["is_hit"].mean()

df_og = pd.DataFrame(original_res)
hit_rate_og = df_og["is_hit"].mean()

print(f"text-embedding-3-small:\t\t{hit_rate_openai}\n"
      f"all-MiniLM-L6-v2:\t\t\t{hit_rate_og}\n"
      f"all-MiniLM-L6-v2-fine-tuned\t{hit_rate_ft}")

text-embedding-3-small:		0.930622009569378
all-MiniLM-L6-v2:			0.8947368421052632
all-MiniLM-L6-v2-fine-tuned	0.9593301435406698


In [1]:
from huggingface_hub import HfApi, HfRepository

repo_name = "wylupek/au-blog-rag-embedder"
repo = HfRepository(local_dir="./src/fine-tuned-model")
repo.push_to_hub(repo_name)

ImportError: cannot import name 'HfRepository' from 'huggingface_hub' (/home/filip/Documents/github/au-blog-rag/.venv/lib/python3.11/site-packages/huggingface_hub/__init__.py)