# Finetuning With LlamaIndex and GPT

## Imports And Config

In [2]:
%pip install llama-index --upgrade
%pip install llama-index-llms-gradient
%pip install llama-index-finetuning





[notice] A new release of pip is available: 23.0 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





In [None]:
import json
import logging
import os
import sys

import chromadb
import chromadb.utils.embedding_functions as embedding_functions
import nest_asyncio
import openai
import pandas as pd
from dotenv import load_dotenv
from IPython.display import Markdown, display

# import
from llama_index import (
    KnowledgeGraphIndex,
    ServiceContext,
    SimpleDirectoryReader,
    SimpleKeywordTableIndex,
    StorageContext,
    SummaryIndex,
    VectorStoreIndex,
    get_response_synthesizer,
    load_index_from_storage,
    set_global_service_context,
)
from llama_index.agent import ReActAgent
from llama_index.callbacks import CallbackManager
from llama_index.composability import ComposableGraph
from llama_index.embeddings import (
    AdapterEmbeddingModel,
    HuggingFaceEmbedding,
    OpenAIEmbedding,
    resolve_embed_model,
)
from llama_index.embeddings.adapter_utils import TwoLayerNN
from llama_index.evaluation import (
    DatasetGenerator,
    EmbeddingQAFinetuneDataset,
    QueryResponseDataset,
    generate_question_context_pairs,
)
from llama_index.extractors import (
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.finetuning.cross_encoders.cross_encoder import (
    CrossEncoderFinetuneEngine,
)
from llama_index.finetuning.cross_encoders.dataset_gen import (
    generate_ce_fine_tuning_dataset,
    generate_synthetic_queries_over_documents,
)
from llama_index.ingestion import IngestionPipeline
from llama_index.llama_dataset import (
    CreatedBy,
    CreatedByType,
    LabelledRagDataExample,
    LabelledRagDataset,
)
from llama_index.llama_dataset.generator import RagDatasetGenerator
from llama_index.llama_pack import download_llama_pack
from llama_index.llms import OpenAI
from llama_index.node_parser import MarkdownNodeParser, SentenceSplitter
from llama_index.postprocessor import SimilarityPostprocessor
from llama_index.prompts import PromptTemplate
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.readers import (
    NotionPageReader,
    SimpleDirectoryReader,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.storage.storage_context import StorageContext
from llama_index.text_splitter import SentenceSplitter, TokenTextSplitter
from llama_index.tools import QueryEngineTool
from llama_index.tools.query_engine import QueryEngineTool
from llama_index.vector_stores import ChromaVectorStore
# Import the raw OpenAI SDK client under an alias. Importing it as plain
# ``OpenAI`` would rebind the ``OpenAI`` name imported from
# ``llama_index.llms`` above, and the later ``OpenAI(model=..., temperature=...)``
# calls in this notebook are written for the LlamaIndex LLM wrapper.
from openai import OpenAI as OpenAIClient
from tqdm.notebook import tqdm

# SDK client; used further down to list fine-tuning jobs.
client = OpenAIClient()



In [None]:
# Load a local .env file (if any) before reading the variables below, so keys
# that are only defined there are visible to os.getenv().
load_dotenv()

# Cohere key, read from the environment.
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
if COHERE_API_KEY is not None:
    # os.environ only accepts strings; assigning None would raise TypeError.
    os.environ["COHERE_API_KEY"] = COHERE_API_KEY

# The Notion key is stored as NOTION_API_KEY and re-exported under
# NOTION_INTEGRATION_TOKEN as well.
integration_token = os.getenv("NOTION_API_KEY")
if integration_token is not None:
    os.environ["NOTION_INTEGRATION_TOKEN"] = integration_token

necromunda_db_id = os.getenv("NECROMUNDA_DB")

### Environment Configuration

In [None]:
# Load .env first: every os.getenv() below depends on it when the keys are
# kept in a local .env file rather than exported in the shell.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

# Send INFO-and-above log records to stdout. force=True replaces any handlers
# already attached to the root logger, so re-running this cell does not stack
# duplicate handlers (which would print every record several times).
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

nest_asyncio.apply()

# LLM handle that later cells pass explicitly (llm=llm).
llm = OpenAI(temperature=0, model="gpt-3.5-turbo-0613")
# llm = OpenAI(temperature=0, model="gpt-4-0613")

# Shared (empty) callback manager for the per-model service contexts.
callback_manager = CallbackManager([])

gpt_35_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo-0613", temperature=0.3),
    callback_manager=callback_manager,
)
gpt_4_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-4-0613", temperature=0.3),
    callback_manager=callback_manager,
)
# Older spelling of the same name, kept so existing references keep working.
gpt4_context = gpt_4_context

# Load the source documents from the local docs folder.
reader = SimpleDirectoryReader("../necrovox_docs/")
documents = reader.load_data()

embed_model = OpenAIEmbedding(embed_batch_size=10)

# NOTE(review): `llm` is not passed here, so this context uses whatever LLM
# ServiceContext.from_defaults() selects on its own - confirm that is intended.
service_context = ServiceContext.from_defaults(embed_model=embed_model)

# optionally set a global service context
set_global_service_context(service_context)

# Embedding function handed to the Chroma collection further down.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY, model_name="text-embedding-ada-002"
)




## Load Documents

### Load from Notion

In [None]:
notion_reader = NotionPageReader(integration_token=integration_token)


In [None]:
notion_reader.integration_token = integration_token


In [None]:
import requests
import json

# Notion search endpoint: look up everything matching "Necromunda",
# most recently edited first.
url = "https://api.notion.com/v1/search"

payload = json.dumps(
    {
        "query": "Necromunda",
        "sort": {
            "direction": "descending",
            "timestamp": "last_edited_time",
        },
    }
)
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {integration_token}",
    # The Notion API expects an explicit API version on every request.
    "Notion-Version": "2022-06-28",
    # The hard-coded Cloudflare "__cf_bm" cookie that used to be sent here was
    # a pasted session artefact, not part of authentication, and is dropped.
}

# Bounded timeout so a stalled connection cannot hang the notebook.
response = requests.post(url, headers=headers, data=payload, timeout=30)
# Fail here with the HTTP error instead of a KeyError on 'results' later.
response.raise_for_status()




In [None]:
results = response.json()['results']

In [None]:
page_ids = [result['id'] for result in results]

In [None]:
documents = notion_reader.load_data(page_ids=page_ids)

In [None]:
reader = SimpleDirectoryReader("../necrovox_docs/")
documents = reader.load_data()

### Check Sample Document

In [None]:
documents[0]

### Sentence Tokenize with SentenceSplitter

In [None]:
len(documents)

In [None]:
nodes = SentenceSplitter().get_nodes_from_documents(documents)



### Tokenize Documents with Markdown Parser

In [None]:


# If you want to try the MarkdownNodeParser, you can use the following code:
md_parser = MarkdownNodeParser()
nodes = md_parser.get_nodes_from_documents(documents)

In [None]:
nodes

## Load Vectorstore and Embeddings with ChromaDB

In [None]:
# Use the on-disk Chroma client for the collection. Previously the collection
# was created on an in-memory client while the persistent client was opened
# but never used, so nothing was actually loaded from (or saved to) disk.
db = chromadb.PersistentClient(path="../necromunda_db/")
chroma_client = db  # earlier name for the client, kept for later cells
collection = db.get_or_create_collection(
    name="necromunda_notion", embedding_function=openai_ef
)
vector_store = ChromaVectorStore(chroma_collection=collection)

embed_model = OpenAIEmbedding(embed_batch_size=10)
service_context = ServiceContext.from_defaults(embed_model=embed_model)

# Index view over whatever the collection already contains.
index = VectorStoreIndex.from_vector_store(
    vector_store,
    service_context=service_context,
)

In [None]:
necromunda_index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [None]:
# Add every loaded document to `index`. A plain loop is used because
# insert() is called only for its side effect; a comprehension would just
# build a throwaway list of return values.
for document in documents:
    index.insert(document)

### Set up the Query Engine

In [None]:
query_engine = index.as_query_engine()

### Using A Retriever

In [None]:
# Retriever over `index`, asking for the top 10 matches per query.
retriever = VectorIndexRetriever(index=index, similarity_top_k=10)

# Response synthesizer with the library defaults.
response_synthesizer = get_response_synthesizer()

# Post-processing step configured with a similarity cutoff of 0.7.
similarity_filter = SimilarityPostprocessor(similarity_cutoff=0.7)

# Query engine = retriever + synthesizer + the post-processor above.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[similarity_filter],
)

In [None]:
query = query_engine.query("What is a Goliath?")

In [None]:
print(query)

### Using Query Engine as Tool

In [None]:
query_tool = QueryEngineTool.from_defaults(
    query_engine=query_engine,
    name="necromunda_query_engine",
    description=(
        "Provides information about the Necromunda rules"
    )
)
query_engine_tools = [query_tool]


In [None]:
# node_parser = SentenceSplitter()
# nodes = md_parser.get_nodes_from_documents(documents)
extractor = TitleExtractor()

In [None]:


# node_parser = SentenceSplitter(chunk_size=512)


# use transforms directly
# nodes = node_parser(documents)

In [None]:


# Ingestion steps, built once and shared: token-based chunking followed by
# the title and questions-answered extractors.
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=128)
title_extractor = TitleExtractor(nodes=5)
qa_extractor = QuestionsAnsweredExtractor(questions=3)
transformations = [text_splitter, title_extractor, qa_extractor]

service_context = ServiceContext.from_defaults(transformations=transformations)

# Rebuild `index` from the documents using the transformations above
# (this rebinds the `index` name defined by earlier cells).
index = VectorStoreIndex.from_documents(
    documents, service_context=service_context
)

In [None]:
# index.storage_context.persist("./necrovox_index")
index.storage_context.persist("./necrovox_index")

In [None]:


# rebuild storage context
# NOTE(review): the previous cell persists to "./necrovox_index", but this one
# reads from "./necrovox_index2" - confirm that directory is the intended source.
storage_context = StorageContext.from_defaults(persist_dir="./necrovox_index2")

# load index (rebinds `index` to the version read from disk)
index = load_index_from_storage(storage_context)
# Write the loaded index back to the same directory it was just read from.
index.storage_context.persist(persist_dir="./necrovox_index2")

In [None]:
documents[0].dict()

In [None]:
# get 'queries' from train_dataset.json

with open("./train_dataset.json", "r") as f:
    train_dataset = json.load(f)
    # queries = [q["query"] for q in train_dataset]

In [None]:
train_dataset

In [None]:
# Collect the values of train_dataset['queries'] into a list
queries = list(train_dataset['queries'].values())

In [None]:


qa_prompt_tmpl_str = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)
qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)


In [None]:
queries[:500]

In [None]:



def augment_data_with_retrieval(dataset, retriever, separate_context=False):
    """Prefix each query in ``dataset.qr_pairs`` with retrieved context.

    For every ``(query, response)`` pair, the retriever is asked for nodes
    matching the query and their contents are substituted into the
    module-level ``qa_prompt_tmpl``.

    Args:
        dataset: Object exposing ``qr_pairs``, an iterable of
            ``(query_str, response)`` tuples.
        retriever: Object whose ``retrieve(query_str)`` returns nodes that
            provide ``get_content()``.
        separate_context: If true, emit one prompt per retrieved node;
            otherwise join all node texts with a blank line and emit a
            single prompt per query.

    Returns:
        A list of ``(formatted_prompt, response)`` tuples.
    """
    augmented = []
    for question, answer in tqdm(dataset.qr_pairs):
        contexts = [hit.get_content() for hit in retriever.retrieve(question)]
        if not separate_context:
            # Single prompt carrying all retrieved chunks at once.
            contexts = ["\n\n".join(contexts)]
        augmented.extend(
            (qa_prompt_tmpl.format(query_str=question, context_str=ctx), answer)
            for ctx in contexts
        )
    return augmented

## Benchmarking RAG Pipelines With A LabelledRagDataset
The LabelledRagDataset is meant to be used for evaluating any given RAG pipeline, for which there could be several configurations (e.g. choosing the LLM, values for the similarity_top_k, chunk_size, and others). We’ve likened this abstraction to traditional machine learning datasets, where X features are meant to predict a ground-truth label y. In this case, we use the query as well as the retrieved contexts as the “features” and the answer to the query, called reference_answer, as the ground-truth label.

And of course, such datasets are composed of observations or examples. In the case of LabelledRagDataset, these are made up of a set of LabelledRagDataExample objects.

In this notebook, we will show how one can construct a LabelledRagDataset from scratch. Please note that the alternative to this would be to simply download a community supplied LabelledRagDataset from llama-hub in order to evaluate/benchmark your own RAG pipeline on it.

In [None]:
dataset_generator = RagDatasetGenerator.from_documents(
    documents=documents,
    service_context=service_context,
    num_questions_per_chunk=1,  # set the number of questions per nodes
    show_progress=True
)

In [None]:
eval_dataset = dataset_generator.generate_dataset_from_nodes()

In [None]:
eval_dataset.save_json("./eval_dataset.json")

In [None]:
rag_dataset = dataset_generator.generate_dataset_from_nodes()

In [None]:

df  = rag_dataset.to_pandas()
rag_dataset.save_json("rag_dataset2.json")

In [None]:
retriever = index.as_retriever(similarity_top_k=2)

In [None]:
retrieved_nodes = retriever.retrieve("How do I use a blast template?")

In [None]:
from llama_index.response.notebook_utils import display_source_node

for node in retrieved_nodes:
    display_source_node(node, source_length=1000)

### Create Context Pairs

In [None]:
rag_dataset.make_predictions_with_retriever(retriever)

In [None]:
len(nodes)

In [None]:
qa_dataset = generate_question_context_pairs(
    nodes, llm=llm, num_questions_per_chunk=1,
)

In [None]:
qa_dataset.save_json("./qa_dataset2.json")

In [None]:
queries = qa_dataset.queries.values()
print(list(queries)[2])

# Finetuning

## Two-Layer NN Adapter

In [None]:
base_embed_model = resolve_embed_model("local:BAAI/bge-small-en")
adapter_model = TwoLayerNN(
    384,  # input dimension
    1024,  # hidden dimension
    384,  # output dimension
    bias=True,
    add_residual=True,
)

finetune_engine = EmbeddingAdapterFinetuneEngine(
    qa_dataset,
    base_embed_model,
    model_output_path="model5_output_test",
    model_checkpoint_path="model5_ck",
    adapter_model=adapter_model,
    epochs=25,
    verbose=True,
)

In [None]:
finetune_engine.finetune()

### Second Layer

In [None]:
embed_model_2layer = finetune_engine.get_finetuned_model(
    adapter_cls=TwoLayerNN
)

In [None]:
embed_model_2layer.json

In [None]:
# Build an AdapterEmbeddingModel on top of base_embed_model from the
# "model5_output_test" directory, i.e. the model_output_path given to the
# finetune engine (not the "model5_ck" checkpoint path).
embed_model_2layer_adapter = AdapterEmbeddingModel(
    base_embed_model,
    "model5_output_test",
    TwoLayerNN,
)

In [None]:
%pip install eval_utils

In [None]:
%pip install tensorflow

In [None]:
# save model from embed_model_2layer
embed_model_2layer.to_json()


### Evaluate

In [None]:
# Rebind embed_model_2layer to an AdapterEmbeddingModel built from the
# "model5_output_test" directory, i.e. the model_output_path given to the
# finetune engine (not the "model5_ck" checkpoint path).
embed_model_2layer = AdapterEmbeddingModel(
    base_embed_model,
    "model5_output_test",
    TwoLayerNN,
)

In [None]:
from eval_utils import evaluate, display_results

In [None]:
ft_val_results_2layer = evaluate(qa_dataset, embed_model_2layer)

In [None]:
# [optional] save
qa_dataset.save_json("pg_eval_dataset1.json")

In [None]:
from llama_index.evaluation import EmbeddingQAFinetuneDataset

# from_json() takes the *path* of the saved dataset and opens the file itself.
# Passing the file's contents (f.read()) makes it treat that JSON text as a
# filename and fail, so hand it the path written by save_json() instead.
qa_dataset = EmbeddingQAFinetuneDataset.from_json("./pg_eval_dataset1.json")

In [None]:
# Serialise the dataset once and copy out its three mappings, instead of
# calling .dict() separately for each of them.
qa_dataset_dict = qa_dataset.dict()
queries = dict(qa_dataset_dict['queries'])
corpus = dict(qa_dataset_dict['corpus'])
relevant_docs = dict(qa_dataset_dict['relevant_docs'])

In [None]:
query_engine = index.as_query_engine()

In [None]:
res = str(query_engine.query('How do I use a blast template?'))

In [None]:
# Answer every dataset question with the query engine and store the pairs,
# one JSON object per line.
results = []
with open("./qa_eval_dataset1.json", "w") as f:
    for k, v in queries.items():
        print(f"ID:{k}\n\nQuestion: {v}\n\n")
        relevant = relevant_docs[k]
        print(f"Relevant Documents: {relevant}\n\n")
        # Only the first relevant document is supplied as context.
        doc = corpus[relevant[0]]

        question = f"Relevant Documentation{doc}\n\nQuestion: {v}\n\n"
        response = str(query_engine.query(question))
        print(f"Response: {response}\n\n")
        # Stored under "query" because both readers of this file
        # (filter_data and the OpenAI-format conversion) look up
        # qa_pair["query"]; the former "question" key made them fail.
        out_dict = {"query": v, "response": response}
        results.append(out_dict)
        # Terminate each record with a newline so the file is valid JSON
        # Lines; without it all objects end up concatenated on one line.
        f.write(json.dumps(out_dict) + "\n")
    

In [None]:
from llama_index.evaluation import RelevancyEvaluator, FaithfulnessEvaluator
from llama_index import PromptTemplate
query_eval_tmpl = PromptTemplate(
    "Your task is to evaluate the following: If the response for the query"
    " isn't able to answer the question provided.\nIf query isn't able to"
    " answer the question, answer NO.\nOtherwise answer YES.\nTo elaborate,"
    " you might get an answer like the following: 'The context does not"
    " contain the answer to this question.'Please return NO in that case. You"
    " be given the query and response. Return YES or NO as the answer.\nQuery:"
    " \n {query_str}\nResponse: \n {response_str}\nAnswer: "
)

eval_llm = OpenAI(model="gpt-4-0613")

In [None]:
def filter_data(path: str, out_path: str):
    """Copy the QA pairs that the evaluator LLM accepts to ``out_path``.

    Each non-blank line of ``path`` is parsed as a JSON object holding a
    ``"response"`` and a ``"query"`` (or, for files written with the older
    key name, a ``"question"``). The pair is formatted into the module-level
    ``query_eval_tmpl`` and sent to ``eval_llm``; a pair is dropped when the
    text of the completion contains ``"NO"``.

    Args:
        path: JSON Lines file to read.
        out_path: File that receives the retained lines.

    Returns:
        The list of retained lines, each terminated by a newline.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"response"`` or both query keys.
    """
    kept_lines = []
    with open(path, "r") as fp, open(out_path, "w") as out_fp:
        for idx, line in enumerate(fp):
            if not line.strip():
                # Blank lines carry no record; json.loads would reject them.
                continue
            qa_pair = json.loads(line)
            # Accept both key spellings used for the question text.
            query_text = (
                qa_pair["query"] if "query" in qa_pair else qa_pair["question"]
            )
            # Named `verdict` rather than `eval` to avoid shadowing the builtin.
            verdict = eval_llm.complete(
                query_eval_tmpl.format(
                    query_str=query_text, response_str=qa_pair["response"]
                )
            )

            print(f"[{idx}] QA Pair: {qa_pair} \n Eval: {verdict}")
            if "NO" in str(verdict):
                continue
            # Make sure a final line without a trailing newline still ends
            # its record, so later line-based processing cannot merge records.
            kept_lines.append(line if line.endswith("\n") else line + "\n")
        out_fp.writelines(kept_lines)
    return kept_lines


In [None]:
import re

# Matches a closing brace immediately followed by an opening brace, i.e. two
# JSON objects written with nothing between them.
# NOTE(review): the same two characters inside a string value would match
# too - confirm the data cannot contain "}{".
adjacent_objects = re.compile(r"}{")
qa_eval_path = "./qa_eval_dataset1.json"

# Read the whole file and put each object on its own line.
with open(qa_eval_path, "r") as f:
    data = adjacent_objects.sub("}\n{", f.read())

# Overwrite the file with the re-formatted text.
with open(qa_eval_path, "w") as f:
    f.write(data)


In [None]:
filter_data("./qa_eval_dataset1.json", "./filtered_qa_eval_dataset1.jsonl")

In [None]:
from copy import deepcopy
import random


def split_train_val(
    path: str, out_train_path: str, out_val_path: str, train_split=0.7
):
    """Randomly split a line-oriented file into train and validation files.

    Blank lines are ignored. Every remaining line is written to exactly one
    of the two outputs, always terminated by a newline.

    Args:
        path: Input file with one record per line.
        out_train_path: Receives the first ``int(train_split * n)`` shuffled
            records.
        out_val_path: Receives the remaining records.
        train_split: Fraction of records assigned to the training file.
    """
    with open(path, "r") as fp:
        # Normalise line endings up front: if the last input line has no
        # trailing newline, shuffling would otherwise glue it onto whichever
        # record ends up after it in the output.
        records = [line.rstrip("\n") + "\n" for line in fp if line.strip()]

    # shuffle the lines to make sure that the "train questions" cover most of
    # the context
    random.shuffle(records)

    split_idx = int(train_split * len(records))
    with open(out_train_path, "w") as out_fp:
        out_fp.writelines(records[:split_idx])

    with open(out_val_path, "w") as out_fp:
        out_fp.writelines(records[split_idx:])

In [None]:
split_train_val(
    "./filtered_qa_eval_dataset1.jsonl",
    "./qa_pairs_train1.jsonl",
    "./qa_pairs_val1.jsonl",
)

In [None]:
# TODO: try with different system prompts
system_prompt = {
    "role": "system",
    "content": (
        "You are a helpful assistant helping to answer questions about the"
        " Necromunda rules."
    ),
}

# Convert each training QA pair into the chat "messages" record format.
# Both files are opened with a context manager so the output is flushed and
# closed before "./qa_pairs_openai1.jsonl" is passed on to the fine-tuning
# engine; previously neither handle was ever closed.
with open("./qa_pairs_train1.jsonl", "r") as fp, open(
    "./qa_pairs_openai1.jsonl", "w"
) as out_fp:
    for line in fp:
        try:
            qa_pair = json.loads(line)
            # Accept both key spellings used for the question text.
            query_text = (
                qa_pair["query"] if "query" in qa_pair else qa_pair["question"]
            )
            user_prompt = {"role": "user", "content": query_text}
            assistant_prompt = {
                "role": "assistant",
                "content": qa_pair["response"],
            }
        except (json.JSONDecodeError, KeyError, TypeError) as e:
            # Malformed records are reported and skipped; anything else
            # (e.g. a failing write) is allowed to propagate.
            print(f"Error processing line: {line}")
            print(f"Error message: {str(e)}")
            continue
        out_dict = {
            "messages": [system_prompt, user_prompt, assistant_prompt],
        }
        out_fp.write(json.dumps(out_dict) + "\n")


In [None]:
from llama_index.finetuning import OpenAIFinetuneEngine
finetune_engine = OpenAIFinetuneEngine(
    "gpt-3.5-turbo",
    "./qa_pairs_openai1.jsonl",
    # start_job_id="<start-job-id>"  # if you have an existing job, can specify id here
)

In [None]:
finetune_engine.finetune()

In [None]:
client.fine_tuning.jobs.list(limit=10)

In [None]:
ft_model = finetune_engine.get_finetuned_model()

In [None]:
ft_json = ft_model.to_json()

In [None]:
ft_json

In [None]:
from llama_index import ServiceContext

ft_context = ServiceContext.from_defaults(
    llm=ft_model,
    callback_manager=callback_manager,

)
# baseline RAG system
ft_index = VectorStoreIndex(nodes, service_context=ft_context)
ft_query_engine = ft_index.as_query_engine()

## Reranking

In [None]:
include_cohere_rerank = True

if include_cohere_rerank:
    %pip install cohere -q

In [None]:
from llama_index.evaluation import RetrieverEvaluator

metrics = ["mrr", "hit_rate"]

if include_cohere_rerank:
    metrics.append(
        "cohere_rerank_relevancy"  # requires COHERE_API_KEY environment variable to be set
    )

retriever_evaluator = RetrieverEvaluator.from_metric_names(
    metrics, retriever=retriever
)

In [None]:
sample_id, sample_query = list(qa_dataset.queries.items())[20]
sample_expected = qa_dataset.relevant_docs[sample_id]

eval_result = retriever_evaluator.evaluate(sample_query, sample_expected)
print(eval_result)

In [None]:
# try it out on an entire dataset
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [None]:
def display_results(name, eval_results):
    """Summarise retrieval evaluation results as a one-row DataFrame.

    Args:
        name: Label stored in the ``retrievers`` column.
        eval_results: Iterable of result objects, each exposing a
            ``metric_vals_dict`` mapping of metric name to value.

    Returns:
        A ``pandas.DataFrame`` with one row holding ``name`` and the mean
        ``hit_rate`` and ``mrr``; when the module-level
        ``include_cohere_rerank`` flag is truthy, the mean
        ``cohere_rerank_relevancy`` is included as well.
    """
    # One row per evaluated query, one column per metric.
    per_query = pd.DataFrame([item.metric_vals_dict for item in eval_results])

    summary = {
        "retrievers": [name],
        "hit_rate": [per_query["hit_rate"].mean()],
        "mrr": [per_query["mrr"].mean()],
    }
    if include_cohere_rerank:
        summary["cohere_rerank_relevancy"] = [
            per_query["cohere_rerank_relevancy"].mean()
        ]

    return pd.DataFrame(summary)

In [None]:
display_results("top-2 eval", eval_results)

In [None]:


RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack")


In [None]:
rag_evaluator = RagEvaluatorPack(
    query_engine=query_engine,  # built with the same source Documents as the rag_dataset
    rag_dataset=eval_dataset,
)

In [None]:
eval_dataset = QueryResponseDataset.from_json("./eval_dataset.json")

In [None]:
# Build a cross-encoder fine-tuning dataset from the documents and the first
# 500 entries of `queries`.
# NOTE(review): `queries` is rebound several times earlier in this notebook
# (a list, a dict `.values()` view, then a dict); the `[:500]` slice only works
# on the list form - confirm the cell execution order.
# NOTE(review): `ce_datadet` looks like a typo for `ce_dataset`; it is not
# referenced anywhere else in this notebook.
ce_datadet = generate_ce_fine_tuning_dataset(documents=documents, questions_list=queries[:500], max_chunk_length=1000, llm=llm, qa_doc_relevance_prompt=qa_prompt_tmpl, top_k=5,)

In [None]:

# `Document` is needed below but is not imported anywhere else in the notebook.
from llama_index import Document

# Build one cross-encoder fine-tuning dataset per source record and
# concatenate them.
# NOTE(review): the loop subscripts each item with "questions" and "text",
# i.e. it assumes dict-like records - confirm that `documents` holds such
# records when this cell runs.
final_finetuning_data_list = []
for doc in documents:
    questions_list = doc["questions"]
    # Separate name for the single-document list: assigning it to `documents`
    # would rebind the global collection this loop is iterating over.
    single_doc_list = [Document(text=doc['text'])]
    local_finetuning_dataset = generate_ce_fine_tuning_dataset(
        documents=single_doc_list,
        questions_list=questions_list,
        max_chunk_length=256,
        top_k=5,
    )
    final_finetuning_data_list.extend(local_finetuning_dataset)

## Context with GPT-4

In [None]:

eval_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-4", temperature=0), callback_manager=callback_manager
)

## Context With GPT-3.5

In [None]:

eval_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0), callback_manager=callback_manager
)

In [None]:
dataset_generator = DatasetGenerator(
    nodes[:39],
    service_context=eval_context,
    show_progress=True,
    num_questions_per_chunk=20,
)