### Load data from the vector store and create a QA dataset

In [None]:
from src.data_loaders.document_processor import DocumentProcessor
from dotenv import load_dotenv
# Read environment variables from a local .env file before any client is created.
load_dotenv()

# Project helper used below to fetch every stored document.
# NOTE(review): 'au-blog-rag' is presumably the index/collection name and 1536
# the dimension of the stored embedding vectors — confirm against DocumentProcessor.
document_processor = DocumentProcessor('au-blog-rag', dimension=1536)

In [None]:
# Keep the complete records rather than only their 'content' strings
# (i.e. not `[document['content'] for document in ...]`): load_corpus_from_string
# reads both 'content' and 'metadata' from every record.
documents = document_processor.get_all_documents()

In [None]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Document
import random


def load_corpus_from_string(custom_docs: list[dict], verbose=False):
    """Turn raw document records into sentence-split nodes.

    Args:
        custom_docs: Records providing a ``'content'`` entry (used as the
            document text) and a ``'metadata'`` entry (attached to the
            document as its metadata).
        verbose: When true, show the splitter's progress bar and print the
            number of nodes produced.

    Returns:
        The nodes that ``SentenceSplitter`` produced for all records.
    """
    # Wrap every record in a Document so the splitter can consume it.
    wrapped = [
        Document(text=record['content'], metadata=record['metadata'])
        for record in custom_docs
    ]

    # Chunk with the splitter's default settings.
    nodes = SentenceSplitter().get_nodes_from_documents(wrapped, show_progress=verbose)

    if verbose:
        print(f"Parsed {len(nodes)} nodes")
    return nodes


def split_documents(docs: list[dict], val_ratio=0.2, *, seed=None):
    """Randomly partition documents into training and validation sets.

    The input is never modified: a shallow copy is shuffled and split, so the
    caller's list keeps its original order. (Shuffling ``docs`` in place would
    silently reorder the list the caller still holds.)

    Args:
        docs: Documents to split.
        val_ratio: Fraction of the documents assigned to the validation set;
            must lie in ``[0, 1]``.
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` the module-level ``random`` state is
            used.

    Returns:
        A ``(train_docs, val_docs)`` tuple of lists.

    Raises:
        ValueError: If ``val_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= val_ratio <= 1:
        raise ValueError(f"val_ratio must be between 0 and 1, got {val_ratio!r}")

    # A dedicated generator keeps a seeded split independent of global state.
    rng = random if seed is None else random.Random(seed)

    shuffled = list(docs)
    rng.shuffle(shuffled)

    split_index = int(len(shuffled) * (1 - val_ratio))
    return shuffled[:split_index], shuffled[split_index:]

# Split at document level first, so the chunks of any one source document all
# land in the same set.
documents_train, documents_val = split_documents(documents, val_ratio=0.2)

# Chunk each split separately (training first, then validation).
train_nodes, val_nodes = (
    load_corpus_from_string(doc_split)
    for doc_split in (documents_train, documents_val)
)
print(len(train_nodes), len(val_nodes))

In [None]:
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.llms.openai import OpenAI
from dotenv import load_dotenv
load_dotenv() 

# LLM used to write the questions for both splits.
QA_LLM_MODEL = "gpt-3.5-turbo"

# Training pairs; the result is also written to train_dataset.json.
train_dataset = generate_qa_embedding_pairs(
    nodes=train_nodes,
    llm=OpenAI(model=QA_LLM_MODEL),
    output_path="train_dataset.json",
)

# Validation pairs; the result is also written to val_dataset.json.
val_dataset = generate_qa_embedding_pairs(
    nodes=val_nodes,
    llm=OpenAI(model=QA_LLM_MODEL),
    output_path="val_dataset.json",
)

### Load QA data from .json and fine-tune all-MiniLM-L6-v2

In [1]:
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

# Reload the QA datasets from their JSON files (training first, then validation).
train_dataset, val_dataset = (
    EmbeddingQAFinetuneDataset.from_json(json_path)
    for json_path in ("train_dataset.json", "val_dataset.json")
)

In [2]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

# Configure fine-tuning of the all-MiniLM-L6-v2 sentence-transformer on the
# training QA pairs. Constructing the engine already loads the base model.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="sentence-transformers/all-MiniLM-L6-v2",  # base model to start from
    model_output_path="fine-tuned-model",  # where the tuned model is later loaded from
    val_dataset=val_dataset,  # NOTE(review): presumably used for evaluation during training — confirm
    epochs=1,
)

INFO:datasets:PyTorch version 2.5.1 available.
PyTorch version 2.5.1 available.
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [None]:
# Run the fine-tuning job on the engine.
# NOTE(review): presumably writes the result to the engine's model_output_path — confirm.
finetune_engine.finetune()

In [4]:
# Get the fine-tuned model back from the engine as an embedding model object.
embed_model = finetune_engine.get_finetuned_model()
# Bare expression: the notebook displays the model's repr as the cell output.
embed_model

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: fine-tuned-model
Load pretrained SentenceTransformer: fine-tuned-model
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']


HuggingFaceEmbedding(model_name='fine-tuned-model', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7f4c202dc390>, num_workers=None, max_length=256, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None)

### Evaluate models

In [1]:
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

# Load the saved QA datasets so the evaluation cells can run on their own
# (training first, then validation).
train_dataset, val_dataset = (
    EmbeddingQAFinetuneDataset.from_json(json_path)
    for json_path in ("train_dataset.json", "val_dataset.json")
)

In [2]:
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd


def evaluate(
    dataset,
    embed_model,
    top_k=5,
):
    """Measure top-k retrieval hits of ``embed_model`` on a QA dataset.

    The dataset's corpus is indexed with ``embed_model``; each query is then
    retrieved against that index and counts as a hit when the first relevant
    document listed for it appears among the ``top_k`` retrieved nodes.

    Args:
        dataset: Object exposing ``corpus`` (id -> text), ``queries``
            (id -> query) and ``relevant_docs`` (query id -> sequence of ids).
        embed_model: Embedding model passed to ``VectorStoreIndex``.
        top_k: Number of nodes retrieved per query.

    Returns:
        A list with one dict per query, holding ``is_hit``, ``retrieved``,
        ``expected`` and ``query`` (the query *id*, not its text).
    """
    corpus, queries, relevant_docs = dataset.corpus, dataset.queries, dataset.relevant_docs

    # Index the whole corpus once; every query is answered from this index.
    index = VectorStoreIndex(
        [TextNode(id_=doc_id, text=doc_text) for doc_id, doc_text in corpus.items()],
        embed_model=embed_model,
        show_progress=True,
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    def _score(query_id, query_text):
        # Only the first relevant document of a query is treated as the target.
        retrieved_ids = [hit.node.node_id for hit in retriever.retrieve(query_text)]
        expected_id = relevant_docs[query_id][0]
        return {
            "is_hit": expected_id in retrieved_ids,
            "retrieved": retrieved_ids,
            "expected": expected_id,
            "query": query_id,
        }

    return [_score(query_id, query_text) for query_id, query_text in tqdm(queries.items())]

In [3]:
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()

# Baseline: OpenAI's text-embedding-3-small, scored on the validation dataset.
embedding_openai = OpenAIEmbeddings(model="text-embedding-3-small")
openai_res = evaluate(val_dataset, embedding_openai)

Generating embeddings:   0%|          | 0/209 [00:00<?, ?it/s]

  0%|          | 0/418 [00:00<?, ?it/s]

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings

# Fine-tuned model, loaded by the name "wylupek/au-blog-rag-embedder" and scored
# on the same validation dataset.
# Alternative kept for reference (NOTE(review): presumably points at the local
# "fine-tuned-model" output instead — confirm):
# fine_tuned = "local:fine-tuned-model"
fine_tuned = HuggingFaceEmbeddings(model_name="wylupek/au-blog-rag-embedder")
fine_tuned_res = evaluate(val_dataset, fine_tuned)

Generating embeddings:   0%|          | 0/209 [00:00<?, ?it/s]

  0%|          | 0/418 [00:00<?, ?it/s]

In [5]:
# Baseline: the base all-MiniLM-L6-v2 model, scored on the same validation
# dataset for comparison.
original = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
original_res = evaluate(val_dataset, original)

Generating embeddings:   0%|          | 0/209 [00:00<?, ?it/s]

  0%|          | 0/418 [00:00<?, ?it/s]

In [6]:
# One results table per model, in the order: OpenAI, fine-tuned, original.
df_openai, df_ft, df_og = (
    pd.DataFrame(results)
    for results in (openai_res, fine_tuned_res, original_res)
)

# Hit rate = mean of the boolean "is_hit" column.
hit_rate_openai, hit_rate_ft, hit_rate_og = (
    frame["is_hit"].mean() for frame in (df_openai, df_ft, df_og)
)

print(
    f"text-embedding-3-small:\t\t{hit_rate_openai}\n"
    f"all-MiniLM-L6-v2:\t\t\t{hit_rate_og}\n"
    f"all-MiniLM-L6-v2-fine-tuned:\t{hit_rate_ft}"
)

text-embedding-3-small:		0.9282296650717703
all-MiniLM-L6-v2:			0.8947368421052632
all-MiniLM-L6-v2-fine-tuned:	0.9593301435406698
