# Set Configuration

In [28]:
import nest_asyncio
# Presumably required so async code used later in the notebook can run inside
# Jupyter's already-running event loop — TODO confirm which cell needs it.
nest_asyncio.apply()

In [21]:
import os
import getpass

# SECURITY: never store the API key as a literal in the notebook. Any key that
# has ever been saved or committed in a notebook must be revoked and rotated.
#
# getpass.getpass() takes the *prompt text* as its argument and returns what the
# user types without echoing it, so the argument must be a label, not the key.
# An already-exported OPENAI_API_KEY is reused so "Restart & Run All" does not
# block on input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [18]:
def generate_text(system, user, model_name="gpt-3.5-turbo", max_tokens=20, temperature=1, logprobs=False, top_logprobs=None):
    """Send one system/user message pair to the OpenAI chat completions API.

    Args:
        system: Content of the ``system`` message.
        user: Content of the ``user`` message.
        model_name: Forwarded as ``model``.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        logprobs: Forwarded as ``logprobs``.
        top_logprobs: Forwarded as ``top_logprobs``.

    Returns:
        The object returned by ``openai.chat.completions.create``, unmodified.
    """
    # Imported locally: no cell in this notebook imports `openai`, so relying on
    # a global name raises NameError on a fresh kernel.
    import openai

    response = openai.chat.completions.create(
        model=model_name,
        messages=[
            {"content": system, "role": "system"},
            {"role": "user", "content": user}
        ],
        max_tokens=max_tokens,
        temperature=temperature,
        logprobs=logprobs,
        top_logprobs=top_logprobs
    )
    return response


In [22]:
# Smoke-test the helper with a trivial prompt and display the raw response.
system, user = 'You are an assistant.', '1 + 1 = ?'
response = generate_text(system=system, user=user)
response

ChatCompletion(id='chatcmpl-93TcLDIpwdmi62U6F7UdUS8zHPSny', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='1 + 1 = 2', role='assistant', function_call=None, tool_calls=None))], created=1710615861, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_4f2ebda25a', usage=CompletionUsage(completion_tokens=7, prompt_tokens=22, total_tokens=29))

## Loading Data

In [8]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter, SimpleNodeParser
from llama_index.core.schema import MetadataMode

def load_corpus(directory, verbose=False):
    """Load documents from ``directory`` and parse them into nodes.

    Documents are read with ``SimpleDirectoryReader`` and split with a
    default-configured ``SimpleNodeParser``. When ``verbose`` is true, progress
    messages are printed and the parser's progress display is enabled.

    Returns the nodes produced by the parser.
    """
    if verbose:
        print(f"Loading files in {directory}")

    documents = SimpleDirectoryReader(directory).load_data()
    if verbose:
        print(f"Loaded {len(documents)} docs")

    nodes = SimpleNodeParser.from_defaults().get_nodes_from_documents(
        documents, show_progress=verbose
    )
    if verbose:
        print(f"Parsed {len(nodes)} nodes")

    return nodes

In [9]:
# Directories holding the training and validation papers.
TRAIN_FILES, VAL_FILES = "Camel Papers Train", "Camel Papers Test"

# Parse each directory into nodes (train first, then validation).
train_nodes, val_nodes = (
    load_corpus(corpus_dir, verbose=True) for corpus_dir in (TRAIN_FILES, VAL_FILES)
)

Loading files in Camel Papers Train
Loaded 91 docs


Parsing nodes:   0%|          | 0/91 [00:00<?, ?it/s]

Parsed 156 nodes
Loading files in Camel Papers Test
Loaded 9 docs


Parsing nodes:   0%|          | 0/9 [00:00<?, ?it/s]

Parsed 17 nodes


In [23]:
# Peek at the text of the first validation node as a sanity check on parsing.
print(val_nodes[0].text)  

78 CVVO 5JUL 2011Case Report  Rapport de cas
Acute respiratory distress syndrome in an alpaca cria
KatharineMSimpson,RobertNStreeter,SuzanneGGenova
Abstract —  A 7-hour-old alpaca was presented for lethargy and depression. The cria responded favorably to initial 
treatment but developed acute-onset dyspnea 48 hours later. Acute respiratory distress syndrome was diagnosed 
by thoracic imaging and blood gas analysis. The cria was successfully treated with corticosteroids and discharged 
from the hospital.
Résumé —  Syndrome de détresse respiratoire aiguë chez un cria alpaga.  Un alpaga âgé de 7 heures a été présenté 
en raison d’abattement et de dépression. Le cria a réagi favorablement au traitement initial mais a développé une 
dyspnée d’apparition aiguë 48 heures plus tard. Le syndrome de détresse respiratoire aiguë a été diagnostiqué par imagerie thoracique et gazométrie sanguine. Le cria a été traité avec succès à l’aide de corticostéroïdes et a reçu son 
congé.
(Traduit par Isabell

## Constructing a Fine-tuning Dataset

We use the parsed nodes to construct the fine-tuning dataset, generating the questions with OpenAI's `gpt-3.5-turbo`.

The basic idea:

1. We look at a node
2. We generate a question that could be answered by that node

In [3]:
from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)

In [29]:
from llama_index.llms.openai import OpenAI

# LLM used to write the synthetic questions; temperature 0.0 is requested.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.0)

# Build question/context pairs for each split.
train_dataset = generate_question_context_pairs(nodes=train_nodes, llm=llm)
val_dataset = generate_question_context_pairs(nodes=val_nodes, llm=llm)

# Persist both datasets as JSON files in the working directory.
train_dataset.save_json("train_dataset.json")
val_dataset.save_json("val_dataset.json")

100%|██████████| 156/156 [03:52<00:00,  1.49s/it]
100%|██████████| 17/17 [00:23<00:00,  1.40s/it]


In [4]:
# Read the datasets back from their JSON files.
train_dataset, val_dataset = (
    EmbeddingQAFinetuneDataset.from_json(json_path)
    for json_path in ("train_dataset.json", "val_dataset.json")
)

## Fine-tuning `BAAI/bge-small-en-v1.5`

We use [`bge-small-en-v1.5`](https://huggingface.co/BAAI/bge-small-en-v1.5) as the backbone model.

In [5]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

# Configure fine-tuning: start from the BGE small checkpoint, train on the
# generated training pairs for 2 epochs, pass the validation pairs along, and
# write the result to "my_model_v1".
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    val_dataset=val_dataset,
    model_id="BAAI/bge-small-en-v1.5",
    model_output_path="my_model_v1",
    epochs=2,
)

In [6]:
# Run the fine-tuning job configured on `finetune_engine`.
finetune_engine.finetune()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

In [39]:
# Fetch the fine-tuned model from the engine as an embedding-model object.
finetuned_embedding_model = finetune_engine.get_finetuned_model()

In [40]:
# Display the embedding model's configuration serialized as a JSON string.
finetuned_embedding_model.to_json()

'{"model_name": "my_model_v1", "embed_batch_size": 10, "tokenizer_name": "my_model_v1", "max_length": 512, "pooling": "cls", "normalize": true, "query_instruction": null, "text_instruction": null, "cache_folder": null, "class_name": "HuggingFaceEmbedding"}'

In [43]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Build an embedding model from "my_model_v1" — presumably the directory the
# fine-tuning run wrote to (it matches its `model_output_path`); verify it exists.
embed_model = HuggingFaceEmbedding(model_name="my_model_v1")

## Evaluating the Embedding Model

In [45]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path

def evaluate_st(
    dataset,
    model_id,
    name,
    output_path="results/",
):
    """Evaluate a sentence-transformers model on a retrieval dataset.

    Args:
        dataset: Object exposing ``queries``, ``corpus`` and ``relevant_docs``
            attributes, which are handed to ``InformationRetrievalEvaluator``.
        model_id: Identifier or path passed to ``SentenceTransformer``.
        name: Name given to the evaluator.
        output_path: Directory passed to the evaluator as ``output_path``. It is
            created, including parents, if it does not exist. Defaults to
            ``"results/"``.

    Returns:
        The value returned by calling the evaluator on the loaded model.
    """
    evaluator = InformationRetrievalEvaluator(
        dataset.queries, dataset.corpus, dataset.relevant_docs, name=name
    )
    model = SentenceTransformer(model_id)

    # The output directory is created up front so a fresh checkout works.
    Path(output_path).mkdir(exist_ok=True, parents=True)
    return evaluator(model, output_path=output_path)

In [46]:
# Baseline: score the original BAAI/bge-small-en-v1.5 checkpoint on the validation set.
evaluate_st(val_dataset, "BAAI/bge-small-en-v1.5", name="bge")

0.6446428571428571

In [47]:
# Score the model stored under "my_model_v1" on the same validation set.
evaluate_st(val_dataset, "my_model_v1", name="finetuned")

0.7118814192343603