In [1]:
%pip install llama-index-llms-openai
%pip install llama-index-embeddings-openai
%pip install llama-index-finetuning

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode

In [8]:
import os

# Directory that holds the source PDFs. Set the FINETUNE_DATA_DIR environment
# variable to run the notebook on another machine; the default keeps the
# original location so existing runs are unaffected.
DATA_DIR = os.environ.get("FINETUNE_DATA_DIR", "/teamspace/studios/this_studio")

# PDFs used to build the training and validation corpora.
TRAIN_FILES = [f"{DATA_DIR}/pp.pdf"]
VAL_FILES = [f"{DATA_DIR}/val.pdf"]

In [3]:
def load_corpus(files, verbose=False):
    """Read ``files`` and split the loaded documents into nodes.

    Args:
        files: Paths handed to ``SimpleDirectoryReader`` as ``input_files``.
        verbose: When true, print progress messages and enable the
            splitter's progress bar.

    Returns:
        The nodes returned by ``SentenceSplitter.get_nodes_from_documents``.
    """
    if verbose:
        print(f"Loading files {files}")

    documents = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f"Loaded {len(documents)} docs")

    # Default splitter settings; the progress bar follows the verbose flag.
    splitter = SentenceSplitter()
    parsed_nodes = splitter.get_nodes_from_documents(
        documents, show_progress=verbose
    )
    if verbose:
        print(f"Parsed {len(parsed_nodes)} nodes")

    return parsed_nodes

In [5]:
%pip install llama-index
%pip install llama-index-readers-file


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [9]:
# Parse both PDF sets into nodes, printing progress for each.
train_nodes = load_corpus(files=TRAIN_FILES, verbose=True)
val_nodes = load_corpus(files=VAL_FILES, verbose=True)

Loading files ['/teamspace/studios/this_studio/pp.pdf']
Loaded 44 docs


Parsing nodes:   0%|          | 0/44 [00:00<?, ?it/s]

Parsed 44 nodes
Loading files ['/teamspace/studios/this_studio/val.pdf']
Loaded 25 docs


Parsing nodes:   0%|          | 0/25 [00:00<?, ?it/s]

Parsed 25 nodes


In [5]:
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

In [6]:
import os
import warnings

# Take the key from the environment rather than hard-coding it in the
# notebook: the secret never lands in the saved file, and a key that is
# already exported is no longer overwritten with an empty placeholder.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

if not OPENAI_API_KEY:
    warnings.warn(
        "OPENAI_API_KEY is not set; export it before starting the kernel, "
        "otherwise the OpenAI calls in this notebook will fail."
    )

In [5]:
%pip install google-generativeai

Collecting google-generativeai
  Downloading google_generativeai-0.6.0-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.4 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.4-py3-none-any.whl.metadata (5.6 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.19.0-py3-none-any.whl.metadata (2.7 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.133.0-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.4->google-generativeai)
  Downloading proto_plus-1.23.0-py3-none-any.whl.metadata (2.2 kB)
Collecting googleapis-common-protos<2.0.dev0,>=1.56.2 (from google-api-core->google-generativeai)
  Downloading googleapis_common_protos-1.63.1-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting httplib2<1.dev0,>=0.19.0 (from google-api-python-client->google-generativeai)
  Downloading http

In [10]:
from llama_index.llms.openai import OpenAI


# Both generations use the same model, so a single client instance is shared.
qa_llm = OpenAI(model="gpt-3.5-turbo")

# Generate QA pairs from the training and validation nodes.
train_dataset = generate_qa_embedding_pairs(llm=qa_llm, nodes=train_nodes)
val_dataset = generate_qa_embedding_pairs(llm=qa_llm, nodes=val_nodes)

# Persist both datasets. The training file was previously written as
# "train_dataset1.json", which did not match the "train_dataset.json" name
# that the reload cell reads back.
train_dataset.save_json("train_dataset.json")
val_dataset.save_json("val_dataset.json")

100%|██████████| 44/44 [00:52<00:00,  1.19s/it]


In [11]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

In [12]:
# [Optional] Load
# Rebuild both QA datasets from their JSON files on disk, replacing whatever
# `train_dataset` / `val_dataset` currently hold in the kernel.
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

In [13]:
# Configure fine-tuning of the BAAI/bge-small-en base model on the training
# QA pairs; the result is written to the "test1_model" output path and the
# validation pairs are supplied alongside.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    val_dataset=val_dataset,
    model_output_path="test1_model",
    model_id="BAAI/bge-small-en",
)



In [14]:
# Run the fine-tuning job on the engine configured in the previous cell.
finetune_engine.finetune()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9 [00:00<?, ?it/s]

In [15]:
# Fetch the fine-tuned model object from the engine.
embed_model = finetune_engine.get_finetuned_model()

In [1]:
from sentence_transformers import SentenceTransformer

In [17]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

In [2]:
# Load the saved model directly with sentence-transformers.
# NOTE(review): this rebinds `embed_model`, which an earlier cell set from
# `finetune_engine.get_finetuned_model()`, and it uses an absolute path while
# other cells refer to the model as "test1_model" — confirm both point at the
# same directory.
embed_model = SentenceTransformer("/teamspace/studios/this_studio/test1_model")

In [3]:
# Print the loaded model's representation to inspect its layers.
print(embed_model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


In [18]:
def evaluate(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
    """Compute per-query retrieval hits for ``embed_model`` on ``dataset``.

    Every corpus entry is wrapped in a ``TextNode`` and indexed in a
    ``VectorStoreIndex`` built with ``embed_model``; each query then
    retrieves its ``top_k`` most similar nodes.

    Args:
        dataset: Object exposing ``corpus`` (id -> text), ``queries``
            (id -> query text) and ``relevant_docs`` (query id -> sequence
            of relevant corpus ids).
        embed_model: Forwarded unchanged to ``VectorStoreIndex``.
        top_k: Number of nodes retrieved per query.
        verbose: Currently unused; kept so existing callers keep working.

    Returns:
        A list with one dict per query containing ``is_hit``, ``retrieved``
        (ids in retrieval order), ``expected`` (the first relevant id, or
        ``None`` when the query has none) and ``query`` (the query id).
    """
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
    index = VectorStoreIndex(
        nodes, embed_model=embed_model, show_progress=True
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []
    for query_id, query in tqdm(queries.items()):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [node.node.node_id for node in retrieved_nodes]
        expected_ids = relevant_docs[query_id]
        # A query is a hit when ANY of its relevant documents is retrieved.
        # Checking only the first id under-counted queries that have several
        # relevant documents and raised IndexError when the list was empty.
        is_hit = any(doc_id in retrieved_ids for doc_id in expected_ids)

        eval_results.append(
            {
                "is_hit": is_hit,
                "retrieved": retrieved_ids,
                "expected": expected_ids[0] if expected_ids else None,
                "query": query_id,
            }
        )
    return eval_results

In [19]:
# Evaluate retrieval on the validation set using the "local:test1_model"
# embedding identifier; top_k stays at the function default.
finetuned = "local:test1_model"
val_results_finetuned = evaluate(dataset=val_dataset, embed_model=finetuned)

Generating embeddings:   0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [20]:
# One row per evaluated query, built from the dicts returned by `evaluate`.
df_finetuned = pd.DataFrame(val_results_finetuned)

In [21]:
# Hit rate: the mean of the per-query `is_hit` flags.
hit_rate_finetuned = df_finetuned["is_hit"].mean()
hit_rate_finetuned

0.94

In [22]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path


def evaluate_st(
    dataset,
    model_id,
    name,
    output_path="results/",
):
    """Run an ``InformationRetrievalEvaluator`` against a model.

    Args:
        dataset: Object exposing ``queries``, ``corpus`` and
            ``relevant_docs``; they are forwarded to the evaluator in that
            order.
        model_id: Identifier or path passed to ``SentenceTransformer``.
        name: Forwarded to the evaluator as ``name=``.
        output_path: Directory handed to the evaluator call; created
            (including parents) when missing. Defaults to ``"results/"``,
            the location that used to be hard-coded here.

    Returns:
        Whatever the evaluator call returns.
    """
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    evaluator = InformationRetrievalEvaluator(
        queries, corpus, relevant_docs, name=name
    )
    model = SentenceTransformer(model_id)
    # Make sure the target directory exists before it is passed on.
    Path(output_path).mkdir(exist_ok=True, parents=True)
    return evaluator(model, output_path=output_path)

In [23]:
# Score the fine-tuned model on the validation set; the returned value is
# displayed as the cell output.
evaluate_st(dataset=val_dataset, model_id="test1_model", name="finetuned")

0.789219298245614

In [24]:
# Load the evaluation results CSV from the "results/" directory.
# NOTE(review): presumably written by the `evaluate_st` run with
# name="finetuned" — confirm the file name matches the evaluator's output.
df_st_finetuned = pd.read_csv(
    "results/Information-Retrieval_evaluation_finetuned_results.csv"
)

In [25]:
# Label every row with the model it belongs to (adds/overwrites the "model"
# column in place).
df_st_finetuned["model"] = "fine_tuned"


In [26]:
# Show the label column to confirm the assignment.
print(df_st_finetuned["model"])

0    fine_tuned
Name: model, dtype: object


In [3]:
!python --version

Python 3.10.10
