# Install Packages and Setup Variables


In [None]:
!pip install -q llama-index==0.12.21 llama-index-finetuning==0.3.1 llama-index-embeddings-adapter==0.3.0 openai==1.59.8 tiktoken==0.8.0 chromadb==0.6.0 llama-index-vector-stores-chroma==0.4.1 cohere==5.6.2 llama-index-llms-gemini==0.4.1 html2text==2024.2.26 llama-index-llms-openai==0.3.13 llama-index-embeddings-huggingface==0.5.0 llama-index-embeddings-openai==0.3.0 llama-index-llms-azure-openai==0.3.1

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m455.6/455.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
from getpass import getpass

# The OpenAI key is used later by the OpenAI embedding model.
# Ask for it only when it is not already set: an existing key is never replaced
# by a placeholder, and the secret is never stored in the notebook itself.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# On Google Colab the key can be read from the notebook secrets instead:
# from google.colab import userdata
# os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY1')

# Download the Dataset


In [None]:
from huggingface_hub import hf_hub_download
# Fetch the knowledge-base JSONL file from the Hugging Face Hub; the returned
# value is the path that the next cell opens.
file_path = hf_hub_download(
    repo_id="jaiganesan/ai_tutor_knowledge",
    filename="ai_tutor_knowledge.jsonl",
    repo_type="dataset",
    local_dir="/content",
)

ai_tutor_knowledge.jsonl:   0%|          | 0.00/6.96M [00:00<?, ?B/s]

In [None]:
import json
# Parse the JSON Lines file: one JSON object per line.
# - encoding is explicit so the result does not depend on the platform default;
# - blank lines (e.g. a trailing newline at the end of the file) are skipped
#   instead of crashing json.loads.
with open(file_path, "r", encoding="utf-8") as file:
    ai_tutor_knowledge = [json.loads(line) for line in file if line.strip()]

len(ai_tutor_knowledge)

762

## LlamaIndex Document

In [None]:
from typing import List
from llama_index.core import Document

def create_docs_from_list(data_list: List[dict]) -> List[Document]:
    """Build one ``Document`` per record of ``data_list``.

    Each record must provide the keys ``doc_id``, ``content``, ``url``, ``name``,
    ``tokens`` and ``source``. ``content`` becomes the document text, ``doc_id``
    its id, and the remaining fields are stored as metadata (``name`` under the
    ``title`` key).
    """
    # Metadata keys passed as excluded_llm_metadata_keys / excluded_embed_metadata_keys.
    hidden_from_llm = ["title", "tokens", "source"]
    hidden_from_embedding = ["url", "tokens", "source"]

    return [
        Document(
            doc_id=record["doc_id"],
            text=record["content"],
            metadata={  # type: ignore
                "url": record["url"],
                "title": record["name"],
                "tokens": record["tokens"],
                "source": record["source"],
            },
            # Fresh list per document, so documents never share one list object.
            excluded_llm_metadata_keys=list(hidden_from_llm),
            excluded_embed_metadata_keys=list(hidden_from_embedding),
        )
        for record in data_list
    ]


doc = create_docs_from_list(ai_tutor_knowledge)

### Splitting Dataset


In [None]:
import random

# Fixed seed so the train/validation split is identical on every run.
# A copy is shuffled (not `doc` itself), so re-running this cell always
# produces the same split instead of re-shuffling an already shuffled list.
SPLIT_SEED = 42
shuffled_docs = list(doc)
random.Random(SPLIT_SEED).shuffle(shuffled_docs)
split_index = int(len(shuffled_docs) * 0.9)

# First 90% of the shuffled documents for training, the remainder for validation.
TRAIN_DOCs = shuffled_docs[:split_index]
VALIDATION_DOCs = shuffled_docs[split_index:]

# Chunking


In [None]:
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import Document

# One parser (chunk_size=768, chunk_overlap=64) is applied to both splits,
# training documents first, then validation documents.
parser = SimpleNodeParser.from_defaults(chunk_overlap=64, chunk_size=768)
TRAIN_NODEs, VALIDATION_NODEs = (
    parser.get_nodes_from_documents(split_docs)
    for split_docs in (TRAIN_DOCs, VALIDATION_DOCs)
)

print(len(TRAIN_NODEs), len(VALIDATION_NODEs))

2838 276


In [None]:
# Optional smoke-test mode.
#
# Processing the full dataset can be costly depending on its size, so set
# `testing = True` to keep only a handful of nodes from each split.
# NOTE: Checkpoints are provided in the lesson, so there is no need to run the
# code on the full dataset.
testing = False

if testing:
    TRAIN_NODEs, VALIDATION_NODEs = TRAIN_NODEs[:10], VALIDATION_NODEs[:5]

# Generate Question


We use a Large Language Model (LLM) to generate questions for each chunk of the dataset. The resulting question–chunk pairs are then used to train the adapter, so that the embeddings better match the kinds of questions users are likely to ask.


In [None]:
# This block generates the questions for the dataset with an LLM (this incurs API call charges!).
# Leave it commented out to use the pre-generated datasets loaded in the next coding block; if you uncomment it, comment out all the contents of the next coding block instead.


# from llama_index.finetuning import generate_qa_embedding_pairs
# from llama_index.llms.openai import OpenAI

# llm = OpenAI(model="gpt-4o-mini", temperature=1, max_tokens=512)

# Generate questions for each chunk.

# TRAIN_DATASET = generate_qa_embedding_pairs(TRAIN_NODEs, llm=llm, output_path="./train_dataset.json")

# VALIDATION_DATASET = generate_qa_embedding_pairs(VALIDATION_NODEs, llm=llm, output_path="./val_dataset.json")

In [None]:
from pathlib import Path

from huggingface_hub import snapshot_download
from llama_index.finetuning import EmbeddingQAFinetuneDataset

# snapshot_download returns the folder that holds the downloaded files. Building
# the JSON paths from it (instead of "./") makes the load independent of the
# current working directory.
dataset_dir = Path(
    snapshot_download(
        repo_id="jaiganesan/Embedding-model-fine-tuning-dataset",
        repo_type="dataset",
        local_dir="/content/",
    )
)

# Load the pre-generated questions json files.
TRAIN_DATASET = EmbeddingQAFinetuneDataset.from_json(str(dataset_dir / "train_dataset.json"))
VALIDATION_DATASET = EmbeddingQAFinetuneDataset.from_json(str(dataset_dir / "val_dataset.json"))

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

val_dataset.json:   0%|          | 0.00/789k [00:00<?, ?B/s]

train_dataset.json:   0%|          | 0.00/8.23M [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

# Load an Embedding Model


In [None]:
from llama_index.core.embeddings import resolve_embed_model

# Resolve the base embedding model (BAAI/bge-small-en-v1.5). No adapter is attached
# here; the adapter layer is trained on top of this model by the fine-tuning engine.
base_embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
import torch

# Adapter fine-tuning engine: trains on TRAIN_DATASET on top of the base BGE
# model and writes its output to "model_output_test".
finetune_engine = EmbeddingAdapterFinetuneEngine(
    TRAIN_DATASET,
    base_embed_model,
    epochs=2,
    bias=True,
    verbose=True,
    model_output_path="model_output_test",
)

In [13]:
# Run the fine-tuning loop with the engine settings chosen above (epochs=2, verbose=True).
finetune_engine.finetune()

[1;3;34m> Prepared optimizer, scheduler, and loss model.
[0m

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/545 [00:00<?, ?it/s]

[1;3;34m> [Epoch 0] Current loss: 1.556119680404663
[0m[1;3;34m> [Epoch 0] Current loss: 1.5454347133636475
[0m[1;3;34m> [Epoch 0] Current loss: 1.3019955158233643
[0m[1;3;34m> [Epoch 0] Current loss: 1.210197925567627
[0m[1;3;34m> [Epoch 0] Current loss: 1.039259672164917
[0m[1;3;34m> [Epoch 0] Current loss: 1.0869500637054443
[0m[1;3;34m> [Epoch 0] Current loss: 1.6084486246109009
[0m[1;3;34m> [Epoch 0] Current loss: 1.3948438167572021
[0m[1;3;34m> [Epoch 0] Current loss: 1.6246013641357422
[0m[1;3;34m> [Epoch 0] Current loss: 1.833229660987854
[0m[1;3;34m> [Epoch 0] Current loss: 2.35227108001709
[0m[1;3;34m> [Epoch 0] Current loss: 2.3650951385498047
[0m[1;3;34m> [Epoch 0] Current loss: 2.2668750286102295
[0m[1;3;34m> [Epoch 0] Current loss: 1.8664634227752686
[0m[1;3;34m> [Epoch 0] Current loss: 2.3703010082244873
[0m[1;3;34m> [Epoch 0] Current loss: 2.346954345703125
[0m[1;3;34m> [Epoch 0] Current loss: 2.422718048095703
[0m[1;3;34m> [Epoch 0] C

Iteration:   0%|          | 0/545 [00:00<?, ?it/s]

[1;3;34m> [Epoch 1] Current loss: 1.5034737586975098
[0m[1;3;34m> [Epoch 1] Current loss: 1.4798121452331543
[0m[1;3;34m> [Epoch 1] Current loss: 1.2930065393447876
[0m[1;3;34m> [Epoch 1] Current loss: 1.163027048110962
[0m[1;3;34m> [Epoch 1] Current loss: 1.029118537902832
[0m[1;3;34m> [Epoch 1] Current loss: 1.0037192106246948
[0m[1;3;34m> [Epoch 1] Current loss: 1.5205395221710205
[0m[1;3;34m> [Epoch 1] Current loss: 1.3807651996612549
[0m[1;3;34m> [Epoch 1] Current loss: 1.6184269189834595
[0m[1;3;34m> [Epoch 1] Current loss: 1.8021923303604126
[0m[1;3;34m> [Epoch 1] Current loss: 2.352696418762207
[0m[1;3;34m> [Epoch 1] Current loss: 2.3439202308654785
[0m[1;3;34m> [Epoch 1] Current loss: 2.264328718185425
[0m[1;3;34m> [Epoch 1] Current loss: 1.8568460941314697
[0m[1;3;34m> [Epoch 1] Current loss: 2.3480916023254395
[0m[1;3;34m> [Epoch 1] Current loss: 2.338205575942993
[0m[1;3;34m> [Epoch 1] Current loss: 2.3973591327667236
[0m[1;3;34m> [Epoch 1

In [None]:
# Retrieve the fine-tuned embedding model from the engine.
embed_model = finetune_engine.get_finetuned_model()

# Or, reload the adapter from its output directory whenever required.
# from llama_index.embeddings.adapter import AdapterEmbeddingModel
# embed_model = AdapterEmbeddingModel(base_embed_model, "model_output_test")

In [29]:
# Display the model's repr to confirm which adapter-wrapped model is in use.
embed_model

AdapterEmbeddingModel(model_name='Adapter for BAAI/bge-small-en-v1.5', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7f05dcc6b890>, num_workers=None)

## Fine tuning OpenAI Embedding Model using Adapter method

In [16]:
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.embeddings.openai import OpenAIEmbedding

# Same adapter fine-tuning setup as for the BGE model, but on top of OpenAI's
# text-embedding-3-small; output is written to "model_output_test_openai".
openai_finetune_engine = EmbeddingAdapterFinetuneEngine(
    TRAIN_DATASET,
    OpenAIEmbedding(model="text-embedding-3-small"),
    epochs=2,
    bias=True,
    verbose=True,
    model_output_path="model_output_test_openai",
)

In [None]:
# Train the adapter on top of the OpenAI embedding model.
openai_finetune_engine.finetune()

# Retrieve the resulting fine-tuned embedding model from the engine.
openai_embed_model = openai_finetune_engine.get_finetuned_model()

[1;3;34m> Prepared optimizer, scheduler, and loss model.
[0m

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/545 [00:00<?, ?it/s]

[1;3;34m> [Epoch 0] Current loss: 1.198973298072815
[0m[1;3;34m> [Epoch 0] Current loss: 1.2279342412948608
[0m[1;3;34m> [Epoch 0] Current loss: 0.9637897610664368
[0m[1;3;34m> [Epoch 0] Current loss: 0.9325153231620789
[0m[1;3;34m> [Epoch 0] Current loss: 1.1188467741012573
[0m[1;3;34m> [Epoch 0] Current loss: 0.9120743870735168
[0m[1;3;34m> [Epoch 0] Current loss: 0.847815990447998
[0m[1;3;34m> [Epoch 0] Current loss: 1.1642076969146729
[0m[1;3;34m> [Epoch 0] Current loss: 1.1622381210327148
[0m[1;3;34m> [Epoch 0] Current loss: 1.7474429607391357
[0m[1;3;34m> [Epoch 0] Current loss: 2.2916181087493896
[0m[1;3;34m> [Epoch 0] Current loss: 2.366750717163086
[0m[1;3;34m> [Epoch 0] Current loss: 2.324557304382324
[0m[1;3;34m> [Epoch 0] Current loss: 1.8044850826263428
[0m[1;3;34m> [Epoch 0] Current loss: 2.359330415725708
[0m[1;3;34m> [Epoch 0] Current loss: 2.3149125576019287
[0m[1;3;34m> [Epoch 0] Current loss: 2.3385021686553955
[0m[1;3;34m> [Epoch 0

Iteration:   0%|          | 0/545 [00:00<?, ?it/s]

[1;3;34m> [Epoch 1] Current loss: 1.14235520362854
[0m[1;3;34m> [Epoch 1] Current loss: 1.1969153881072998
[0m[1;3;34m> [Epoch 1] Current loss: 0.9483484029769897
[0m[1;3;34m> [Epoch 1] Current loss: 0.9086048007011414
[0m[1;3;34m> [Epoch 1] Current loss: 1.1212831735610962
[0m[1;3;34m> [Epoch 1] Current loss: 0.9033600091934204
[0m[1;3;34m> [Epoch 1] Current loss: 0.8295100331306458
[0m[1;3;34m> [Epoch 1] Current loss: 1.1796770095825195
[0m[1;3;34m> [Epoch 1] Current loss: 1.1545650959014893
[0m[1;3;34m> [Epoch 1] Current loss: 1.684569001197815
[0m[1;3;34m> [Epoch 1] Current loss: 2.2842726707458496
[0m[1;3;34m> [Epoch 1] Current loss: 2.3601670265197754
[0m[1;3;34m> [Epoch 1] Current loss: 2.3164868354797363
[0m[1;3;34m> [Epoch 1] Current loss: 1.7936798334121704
[0m[1;3;34m> [Epoch 1] Current loss: 2.347806453704834
[0m[1;3;34m> [Epoch 1] Current loss: 2.3034379482269287
[0m[1;3;34m> [Epoch 1] Current loss: 2.335414171218872
[0m[1;3;34m> [Epoch 1

In [28]:
# Display the model's repr to confirm which adapter-wrapped model is in use.
openai_embed_model

AdapterEmbeddingModel(model_name='Adapter for text-embedding-3-small', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7f05dcc6b890>, num_workers=None)

# Evaluate


## Define the Evaluation Functions


In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from llama_index.core import Settings
from tqdm import tqdm
import pandas as pd

def evaluate(dataset, embedding_model, top_k=5, verbose=False):
    """Measure how well ``embedding_model`` retrieves the expected chunk of each query.

    Every corpus entry is indexed as a single node (no re-chunking). Each query
    is then run through a top-``top_k`` retriever and the position of its
    expected chunk among the retrieved nodes is recorded.

    Args:
        dataset: Object exposing ``corpus`` (id -> text), ``queries``
            (query id -> query text) and ``relevant_docs`` (query id -> list of
            relevant corpus ids), e.g. an ``EmbeddingQAFinetuneDataset``.
        embedding_model: Embedding model handed to the vector index.
        top_k: Number of nodes retrieved per query.
        verbose: When True, print the overall hit rate and MRR once all
            queries have been processed.

    Returns:
        A list with one dict per query, holding ``is_hit``, ``retrieved``
        (retrieved node ids, in retrieval order), ``expected``, ``query`` (the
        query id), ``rank`` (1-based, ``None`` on a miss) and
        ``reciprocal_rank`` (``0`` on a miss).
    """
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    # Hand the model to the index directly instead of assigning
    # Settings.embed_model: the global would stay set after this call and
    # silently change the model used by any index built later in the notebook.
    nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
    index = VectorStoreIndex(nodes, embed_model=embedding_model, show_progress=True)

    # Define a retriever to answer the questions
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []

    # For each query, check whether (and at which position) the expected chunk is retrieved.
    for query_id, query in tqdm(queries.items()):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [node.node.node_id for node in retrieved_nodes]
        # Only the first relevant id of each query is scored.
        expected_id = relevant_docs[query_id][0]

        try:
            rank = retrieved_ids.index(expected_id) + 1
            reciprocal_rank = 1 / rank
        except ValueError:
            rank = None
            reciprocal_rank = 0

        eval_results.append(
            {
                # A hit is exactly the case where a rank was found.
                "is_hit": rank is not None,
                "retrieved": retrieved_ids,
                "expected": expected_id,
                "query": query_id,
                "rank": rank,
                "reciprocal_rank": reciprocal_rank,
            }
        )

    if verbose and eval_results:
        total = len(eval_results)
        hit_rate = sum(result["is_hit"] for result in eval_results) / total
        mrr = sum(result["reciprocal_rank"] for result in eval_results) / total
        print(f"Hit rate@{top_k}: {hit_rate:.4f} | MRR: {mrr:.4f}")

    return eval_results

## OpenAI Embedding Model Evaluation


In [20]:
# Load the stock OpenAI text-embedding-3-small model (no adapter) and evaluate it as the baseline.
openai_text_embedding_small = OpenAIEmbedding(model="text-embedding-3-small")
openai_embedding_val_results = evaluate(VALIDATION_DATASET, embedding_model=openai_text_embedding_small)

Generating embeddings:   0%|          | 0/248 [00:00<?, ?it/s]

100%|██████████| 496/496 [02:57<00:00,  2.79it/s]


In [21]:
# Keep only the dict entries of the evaluation output and tabulate them.
openai_embedding_val_results = list(
    filter(lambda entry: isinstance(entry, dict), openai_embedding_val_results)
)
df_openai = pd.DataFrame(openai_embedding_val_results)

# Column means: share of queries with a hit, and mean reciprocal rank.
hit_rate_openai, mrr_openai = (
    df_openai[column].mean() for column in ("is_hit", "reciprocal_rank")
)

print(f"Hit rate: {hit_rate_openai}")
print(f"MRR: {mrr_openai}")

Hit rate: 0.9354838709677419
MRR: 0.788877688172043


### OpenAI Embedding Model with Fine Tuned Adapter Model Evaluation

In [22]:
from llama_index.embeddings.adapter import AdapterEmbeddingModel

# Rebuild the fine-tuned OpenAI model: the adapter saved in "model_output_test_openai"
# is loaded on top of the stock text-embedding-3-small model.
openai_embed_model = AdapterEmbeddingModel(openai_text_embedding_small, "model_output_test_openai")

# Same validation set and default top_k as the baseline run, so the metrics are comparable.
val_results_ft_openai = evaluate(VALIDATION_DATASET, embedding_model = openai_embed_model)

Generating embeddings:   0%|          | 0/248 [00:00<?, ?it/s]

100%|██████████| 496/496 [03:16<00:00,  2.52it/s]


In [23]:
# Keep only the dict entries of the evaluation output and tabulate them.
val_results_ft_openai = list(
    filter(lambda entry: isinstance(entry, dict), val_results_ft_openai)
)
df_openai_ft = pd.DataFrame(val_results_ft_openai)

# Column means: share of queries with a hit, and mean reciprocal rank.
hit_rate_openai_ft, mrr_openai_ft = (
    df_openai_ft[column].mean() for column in ("is_hit", "reciprocal_rank")
)

print(f"Hit rate: {hit_rate_openai_ft}")
print(f"MRR: {mrr_openai_ft}")

Hit rate: 0.9637096774193549
MRR: 0.8220766129032258


## Open Source BAAI Model Evaluation


In [24]:
# Load the base BGE model without fine-tuning (same model name as the one resolved
# before fine-tuning) and evaluate it as the open-source baseline.
base_embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")
bge_val_results = evaluate(VALIDATION_DATASET, embedding_model=base_embed_model)

Generating embeddings:   0%|          | 0/248 [00:00<?, ?it/s]

100%|██████████| 496/496 [00:40<00:00, 12.19it/s]


In [25]:
# Keep only the dict entries of the evaluation output and tabulate them.
bge_val_results = list(
    filter(lambda entry: isinstance(entry, dict), bge_val_results)
)
df_bge = pd.DataFrame(bge_val_results)

# Column means: share of queries with a hit, and mean reciprocal rank.
hit_rate_bge, mrr_bge = (
    df_bge[column].mean() for column in ("is_hit", "reciprocal_rank")
)

print(f"Hit rate: {hit_rate_bge}")
print(f"MRR: {mrr_bge}")

Hit rate: 0.8387096774193549
MRR: 0.7207325268817205


## FineTuned BAAI Adapter Embedding Model Evaluation


In [26]:
from llama_index.embeddings.adapter import AdapterEmbeddingModel

# Load the fine-tuned model: the adapter saved in "model_output_test" applied on top
# of the base BGE model.
embed_model = AdapterEmbeddingModel(base_embed_model, "model_output_test")

# Same validation set and default top_k as the base-model run, so the metrics are comparable.
val_results_finetuned = evaluate(VALIDATION_DATASET, embedding_model=embed_model)

Generating embeddings:   0%|          | 0/248 [00:00<?, ?it/s]

100%|██████████| 496/496 [00:40<00:00, 12.37it/s]


In [27]:
# Keep only the dict entries of the evaluation output and tabulate them.
val_results_finetuned = list(
    filter(lambda entry: isinstance(entry, dict), val_results_finetuned)
)
df_finetuned = pd.DataFrame(val_results_finetuned)

# Column means: share of queries with a hit, and mean reciprocal rank.
hit_rate_finetuned, mrr_finetuned = (
    df_finetuned[column].mean() for column in ("is_hit", "reciprocal_rank")
)

print(f"Hit rate: {hit_rate_finetuned}")
print(f"MRR: {mrr_finetuned}")

Hit rate: 0.8629032258064516
MRR: 0.7499663978494624
