[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/GPT_4o_mini_Fine_Tuning.ipynb)

In [1]:
!pip install -q llama-index==0.10.57 openai==1.59.8 chromadb==0.5.5 pydantic==2.10.5 llama-index-vector-stores-chroma==0.1.10 jsonlines==4.0.0 tensorflow==2.17.1

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m455.6/455.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m584.3/584.3 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m601.3/601.3 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m68.1 MB/s[0m eta [36m0

In [2]:
import os

# Set the following API Keys in the Python environment. Will be used later.
os.environ["OPENAI_API_KEY"] = "<YOUR_OPENAI_API_KEY>"

# from google.colab import userdata
# os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY1')

In [3]:
# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.
import nest_asyncio

nest_asyncio.apply()

## Create Vector Store

In [4]:
# Downloading Vector store from Hugging face hub
from huggingface_hub import hf_hub_download
vectorstore = hf_hub_download(repo_id="jaiganesan/ai_tutor_knowledge", filename="vectorstore.zip",repo_type="dataset",local_dir="/content")

vectorstore.zip:   0%|          | 0.00/97.2M [00:00<?, ?B/s]

In [5]:
!unzip vectorstore.zip

Archive:  vectorstore.zip
   creating: ai_tutor_knowledge/
   creating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/length.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/index_metadata.pickle  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/link_lists.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/header.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/data_level0.bin  
  inflating: ai_tutor_knowledge/chroma.sqlite3  


In [6]:
# Setup an Embedding Model

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Default Embedding model
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

In [7]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex

# Create your index
db = chromadb.PersistentClient(path="./ai_tutor_knowledge")
chroma_collection = db.get_or_create_collection("ai_tutor_knowledge")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Create your index

vector_index = VectorStoreIndex.from_vector_store(vector_store)

### GPT-4o

In [8]:
from llama_index.llms.openai import OpenAI

llm_gpt_4o = OpenAI(temperature=1, model="gpt-4o")

In [9]:
# Query Engine
query_engine_0 = vector_index.as_query_engine(llm= llm_gpt_4o , similarity_top_k=5)

response_gpt_4o = query_engine_0.query("Compare the knowledge retention abilities of a RAG model versus a BERT-based model that has been extensively fine-tuned using PEFT techniques. How do their outputs differ when the knowledge source is removed?")

response_gpt_4o.response

'The knowledge retention abilities of a RAG model and a BERT-based model fine-tuned with PEFT techniques differ primarily in their reliance on external versus internalized knowledge. A RAG model is designed to combine retrieval and generation, using a dense vector index for accessing knowledge dynamically. This model excels in generating outputs that are specific and diverse, particularly in knowledge-intensive tasks, because it can access real-time information from the index, effectively "augmenting" its responses with external data.\n\nOn the other hand, a BERT-based model fine-tuned using PEFT techniques relies more on the knowledge embedded in its parameters. When fine-tuned extensively, it can perform well on specific tasks using the knowledge it has learned during training. However, its ability to adapt to new or unseen data without external information is limited compared to the RAG model. \n\nIn scenarios where the knowledge source is removed, a RAG model would likely struggle 

In [10]:
for src in response_gpt_4o.source_nodes:
    print("Node ID\t", src.node_id)
    print("Title\t", src.metadata["title"])
    print("Text\t", src.text)
    print("Score\t", src.score)
    print("Metadata\t", src.metadata)
    print("-_" * 20)

Node ID	 727ac73b-1bfd-4782-8523-fa38b48cec50
Title	 Adaptive Retrieval-Augmented Generation for Conversational Systems:Model Training and Evaluation Setups
Text	 We evaluate the performance of introducing RAGate according to its binary classification performance and the effectiveness of the resulting response generation. Specifically, we use the KE-TOD dataset (Chen et al., 2022), which has fully annotated 5,324 dialogues and 52,063 turns of conversations. In particular, it is associated with 33,761 knowledge snippets to be retrieved and augmented. In addition, KETOD was developed with human labels on turns of conversations (around 12.1% of turns) about the need for augmenting with retrieved knowledge snippets for a natural and informative system response. Hence, we use these human labels as natural ground truths when evaluating RAGate. It is worth indicating that many current knowledge-augmented conversational datasets often ground their conversations on the knowledge snippet, such a

### GPT-4o-mini

In [11]:
from llama_index.llms.openai import OpenAI

llm_gpt_4o_mini = OpenAI(temperature=1, model="gpt-4o-mini")

In [12]:
# Query Engine
query_engine_1 = vector_index.as_query_engine(llm= llm_gpt_4o_mini , similarity_top_k=5)

response_gpt_4o_mini = query_engine_1.query("Compare the knowledge retention abilities of a RAG model versus a BERT-based model that has been extensively fine-tuned using PEFT techniques. How do their outputs differ when the knowledge source is removed?")

response_gpt_4o_mini.response

"The knowledge retention abilities of a RAG model and a BERT-based model fine-tuned with parameter-efficient fine-tuning (PEFT) techniques differ significantly, particularly when the knowledge source is removed. \n\nRAG models integrate a retrieval mechanism that utilizes non-parametric memory, such as a dense vector index of external knowledge (e.g., Wikipedia), to enhance their outputs. When the knowledge source is available, RAG models retrieve relevant information to generate responses that are more specific, diverse, and factually accurate, which is particularly beneficial for knowledge-intensive tasks.\n\nIn contrast, a BERT-based model, even one that has been well fine-tuned with PEFT techniques, primarily relies on its internal parameters for knowledge. Without an external knowledge source, it may struggle to provide accurate or contextually rich responses, as it doesn't have the same direct access to external information that RAG models utilize for generation. The outputs from

In [13]:
for src in response_gpt_4o_mini.source_nodes:
    print("Node ID\t", src.node_id)
    print("Title\t", src.metadata["title"])
    print("Text\t", src.text)
    print("Score\t", src.score)
    print("Metadata\t", src.metadata)
    print("-_" * 20)

Node ID	 727ac73b-1bfd-4782-8523-fa38b48cec50
Title	 Adaptive Retrieval-Augmented Generation for Conversational Systems:Model Training and Evaluation Setups
Text	 We evaluate the performance of introducing RAGate according to its binary classification performance and the effectiveness of the resulting response generation. Specifically, we use the KE-TOD dataset (Chen et al., 2022), which has fully annotated 5,324 dialogues and 52,063 turns of conversations. In particular, it is associated with 33,761 knowledge snippets to be retrieved and augmented. In addition, KETOD was developed with human labels on turns of conversations (around 12.1% of turns) about the need for augmenting with retrieved knowledge snippets for a natural and informative system response. Hence, we use these human labels as natural ground truths when evaluating RAGate. It is worth indicating that many current knowledge-augmented conversational datasets often ground their conversations on the knowledge snippet, such a

### Dataset Format validation & Number of tokens in training data

In [14]:
# Format error checks

# https://cookbook.openai.com/examples/chat_finetuning_data_prep

from collections import defaultdict
format_errors = defaultdict(int)

def validate_dataset(output_data):

  for ex in output_data:
      if not isinstance(ex, dict):
          format_errors["data_type"] += 1
          continue

      messages = ex.get("messages", None)
      if not messages:
          format_errors["missing_messages_list"] += 1
          continue

      for message in messages:
          if "role" not in message or "content" not in message:
              format_errors["message_missing_key"] += 1

          if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
              format_errors["message_unrecognized_key"] += 1

          if message.get("role", None) not in ("system", "user", "assistant", "function"):
              format_errors["unrecognized_role"] += 1

          content = message.get("content", None)
          function_call = message.get("function_call", None)

          if (not content and not function_call) or not isinstance(content, str):
              format_errors["missing_content"] += 1

      if not any(message.get("role", None) == "assistant" for message in messages):
          format_errors["example_missing_assistant_message"] += 1

  if format_errors:
      print("Found errors:")
      for k, v in format_errors.items():
          print(f"{k}: {v}")
  else:
      print("\nNo errors found in the Formatted dataset \n")

In [15]:
import tiktoken

def counting_no_tokens(output_data):

  tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")

  total_tokens = sum(len(tokenizer.encode(" ".join(message['content'] for message in entry['messages']))) for entry in output_data)

  print(f"Total number of tokens in the Dataset: {total_tokens} \n")

In [16]:
from huggingface_hub import hf_hub_download
import json
import jsonlines
from pprint import pprint

def dataset_preparation(file_name):
    file_path = hf_hub_download(
        repo_id="jaiganesan/GPT_4o_mini_Fine_tune",
        filename=file_name,
        repo_type="dataset",
        local_dir="/content"
    )

    with open(file_path, "r") as file:
        data = [json.loads(line) for line in file]

    print("Total entries in the dataset:", len(data))
    print("-_"*30)
    print(data[4])

    output_data = []

    for entry in data:
        formatted_entry = {
            "messages": [
                {"role": "system", "content": "As AI Tutor, answer questions related to AI topics in an in-depth and factual manner."},
                {"role": "user", "content": entry['question']},
                {"role": "assistant", "content": entry['answer']}
            ]
        }
        output_data.append(formatted_entry)

    # Validate and analyze the output data
    validate_dataset(output_data)
    counting_no_tokens(output_data)

    print("-_"*30)
    print(output_data[4])

    base_file_name = os.path.splitext(file_name)[0]
    output_file_path = f'formatted_{base_file_name}.jsonl'

    with jsonlines.open(output_file_path, mode='w') as writer:
        writer.write_all(output_data)

    print(f"\nFormatted dataset has been saved to {output_file_path}.")


In [17]:
# Training Dataset
dataset_preparation("question_answers_data_100.jsonl")

question_answers_data_100.jsonl:   0%|          | 0.00/276k [00:00<?, ?B/s]

Total entries in the dataset: 100
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
{'source': 'tai_blog', 'question': 'What are the key advantages of using BiFPN in object detection compared to conventional methods?', 'answer': "BiFPN, or Bi-directional Feature Pyramid Network, offers several advantages in object detection when compared to conventional methods. It's part of the EfficientDet family of object detectors developed by Google Research and is designed to enhance the efficiency and scalability of object detection models.\n\n### Key Advantages of BiFPN:\n\n1. **Weighted Feature Fusion:**\n   Unlike conventional methods that simply sum up input features during feature fusion, BiFPN introduces learnable weights to adjust the importance of different input features. This means that during multi-scale fusion, input features are not merely combined indiscriminately but are weighted according to their relevance, which enhances the accuracy of the fusion process.\n\n2. **Bi

In [18]:
# Evaluation Dataset
dataset_preparation("question_answers_data_30.jsonl")

question_answers_data_30.jsonl:   0%|          | 0.00/81.6k [00:00<?, ?B/s]

Total entries in the dataset: 30
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
{'source': 'openai_cookbooks', 'question': 'How can creating high-quality evaluations for large language models like GPT-4 improve the stability and reliability of AI applications?', 'answer': "Creating high-quality evaluations for large language models (LLMs), like GPT-4, significantly enhances the stability and reliability of AI applications. Evaluations serve as a robust mechanism to monitor and assess how well these models perform across various scenarios, ultimately leading to improvements in model robustness and reliability.\n\nFirstly, high-quality evaluations can help identify and address areas where models may be underperforming. For instance, systematic evaluations can uncover issues such as drifting performance or deteriorating accuracy over time. By regularly evaluating LLMs against a comprehensive set of benchmarks, developers can detect and correct potential degradation in model 

**This Formatted Training and Evaluation Datasets are being used in the OpenAI Models Fine tuning**

**Up until now, we have explored response generation in RAG System using GPT-4o and GPT-4o-mini and Formatting Training data. Moving forward, we will focus on response generation using a newly fine-tuned model. In our lesson, we explored the process of fine-tuning through the OpenAI UI. However, if you want to learn about Fine tuning OpenAI Models using Code, You can explore the code sections in the notebook.**

## Response Generation Using New Fine Tuned Model

In [22]:
# Fine Tuned Model

from llama_index.llms.openai import OpenAI

llm_gpt_fine_tuned_model = OpenAI(temperature=1, model="ft:gpt-4o-mini-2024-07-18:towards-ai:ai-tutor:AIc6MRSB")

In [23]:
# Query Engine
query_engine_2 = vector_index.as_query_engine(llm= llm_gpt_fine_tuned_model , similarity_top_k=5)

response_fine_tuned_model = query_engine_2.query("Compare the knowledge retention abilities of a RAG model versus a BERT-based model that has been extensively fine-tuned using PEFT techniques. How do their outputs differ when the knowledge source is removed?")

response_fine_tuned_model.response

'The performance of a RAG model in terms of knowledge retention abilities shows a significant improvement over a BERT-based model that is extensively fine-tuned using PEFT techniques, especially in knowledge-intensive tasks. RAG models are designed to integrate both parametric and non-parametric memory components, utilizing a dense vector index of external knowledge (like Wikipedia) and enhancing their outputs with this retrieved information. This feature allows RAG models to accurately recall and generate specific, diverse, and factual content, resulting in superior performance on tasks requiring up-to-date knowledge.\n\nBERT-based models, even when fine-tuned with PEFT techniques, tend to retain knowledge primarily through their model parameters but cannot dynamically access external knowledge bases. This lack of an external knowledge link means that BERT models face limitations in accessing real-time or extensive domain knowledge, leading to outdated or generalized generative output

In [21]:
for src in response_fine_tuned_model.source_nodes:
    print("Node ID\t", src.node_id)
    print("Title\t", src.metadata["title"])
    print("Text\t", src.text)
    print("Score\t", src.score)
    print("Metadata\t", src.metadata)
    print("-_" * 20)

Node ID	 727ac73b-1bfd-4782-8523-fa38b48cec50
Title	 Adaptive Retrieval-Augmented Generation for Conversational Systems:Model Training and Evaluation Setups
Text	 We evaluate the performance of introducing RAGate according to its binary classification performance and the effectiveness of the resulting response generation. Specifically, we use the KE-TOD dataset (Chen et al., 2022), which has fully annotated 5,324 dialogues and 52,063 turns of conversations. In particular, it is associated with 33,761 knowledge snippets to be retrieved and augmented. In addition, KETOD was developed with human labels on turns of conversations (around 12.1% of turns) about the need for augmenting with retrieved knowledge snippets for a natural and informative system response. Hence, we use these human labels as natural ground truths when evaluating RAGate. It is worth indicating that many current knowledge-augmented conversational datasets often ground their conversations on the knowledge snippet, such a

# Fine Tuning OpenAI Models Using OpenAI API (Code)

### Upload file

In [None]:
from openai import OpenAI
client = OpenAI()

fine_tune_file = client.files.create(
    file=open("/content/formatted_training_data.json", "rb"),
    purpose="fine-tune"
)

In [None]:
pprint(fine_tune_file)

FileObject(id='file-yjrbFrpGxfte1fIrnwgAX12A', bytes=291689, created_at=1728892283, filename='formatted_training_data.json', object='file', purpose='fine-tune', status='processed', status_details=None)


In [None]:
param_training_file_name = fine_tune_file.id
pprint(param_training_file_name)

'file-yjrbFrpGxfte1fIrnwgAX12A'


### Create Fine tune model

In [None]:
result_job = client.fine_tuning.jobs.create(
    training_file=param_training_file_name,
    model="gpt-4o-mini-2024-07-18",
    hyperparameters={ "n_epochs":2 }
)

pprint(result_job)

In [None]:
param_file_tune_job_id = result_job.id
pprint(param_file_tune_job_id)

### Model Fine tuning

In [None]:
# Retrieve the state of a fine-tune
client.fine_tuning.jobs.retrieve(param_file_tune_job_id)

In [None]:
# Retrieve the status of a fine-tune
client.fine_tuning.jobs.retrieve(param_file_tune_job_id).status

In [None]:
import time
from datetime import datetime

while True:
  time.sleep(20)
  try:
      job_status = client.fine_tuning.jobs.retrieve(param_file_tune_job_id)
      print(f"------------ Job Status: {job_status.status} --------------")

      if job_status.status in ["failed", "succeeded", "cancelled"]:
          print("Job Completed. Detailed Events list:")
          events = client.fine_tuning.jobs.list_events(fine_tuning_job_id=param_file_tune_job_id)
          for event in events:
              print(f'{datetime.fromtimestamp(event.created_at)} {event.message}')

          print("######## Fine-tuned model ###########")
          print(f"{job_status.fine_tuned_model}")
          print("#####################################")
          break
  except Exception as e:
      print(f"Error monitoring job: {e}")
      break

In [None]:
# # Retrieve the state of a fine-tune

# while client.fine_tuning.jobs.retrieve(param_file_tune_job_id).status != 'succeeded':
#   sleep(10)

In [None]:
# Retrieve the state of a fine-tune
client.fine_tuning.jobs.retrieve(param_file_tune_job_id).status

In [None]:
param_file_tune_model = client.fine_tuning.jobs.retrieve(param_file_tune_job_id).fine_tuned_model
pprint(param_file_tune_model)

### We can delete the fine tuned model

In [None]:
# Delete a fine-tuned model (must be an owner of the org the model was created in)
# result_delete = client.models.delete(param_file_tune_model)
# pprint(result_delete)