In [None]:
# Capstone Project: Building an Interactive AI Rainbow on a Budget Chatbot

In [None]:
# Import libraries, API and set filepath

In [1]:
#!pip install llama_index==0.8.64
#!pip install openai==1.19.0
!pip install spacy
!pip install llama-index==0.8.64 pypdf sentence-transformers ragas openai



In [2]:
import os

from llama_index import Document, GPTVectorStoreIndex, ServiceContext, VectorStoreIndex
from llama_index.readers import BeautifulSoupWebReader, SimpleDirectoryReader
from llama_index.llms import OpenAI
from llama_index.evaluation import DatasetGenerator

import openai

from pathlib import Path
from llama_index import download_loader

from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness
from llama_index.response.notebook_utils import display_response
from llama_index.callbacks import OpenAIFineTuningHandler
from llama_index.callbacks import CallbackManager

import random
import nest_asyncio

In [3]:
# Set the path of the data directory, resolved against the current working directory.
current_dir = os.getcwd()
# Join the bare folder name: a "./" prefix would leave a redundant "/./" segment in the path.
data_dir = os.path.join(current_dir, "data")

In [4]:
#Load the data

In [5]:
from pathlib import Path
from llama_index import download_loader

# Fetch the PagedCSVReader loader class by name.
PagedCSVReader = download_loader("PagedCSVReader")

# Read the products CSV (decoded as UTF-8) into `docs`, the document list that
# the indexing and question-generation cells below work from.
loader = PagedCSVReader(encoding="utf-8")
docs = loader.load_data(
    file=Path('./data/products.csv'),
)

## Build Index

#With all the data loaded, we can construct the index for the chatbot. There are 4 types of indexing: Summary index, VectorStore Index, Tree Index and Keyword Table Index. Here we are using VectorStore Index, which is also one of the most common types of indexing.

In [7]:
# Take the OpenAI key from the environment (a missing variable raises KeyError).
openai.api_key = os.environ['OPENAI_API_KEY']

# temperature=0 asks the model for the least random output.
service_context = ServiceContext.from_defaults(
    llm=OpenAI(
        model="gpt-3.5-turbo",
        temperature=0,
    ),
)

# Build the vector index from the loaded documents.
index = GPTVectorStoreIndex.from_documents(
    documents=docs,
    service_context=service_context,
)

In [8]:
# Persist the index to ./data/index.vecstore so that it can be loaded from storage
# later instead of re-running the embedding model over the documents.
# NOTE(review): no later cell visible here reloads this directory; the index is
# rebuilt with from_documents each time — confirm whether the saved copy is used.

index.storage_context.persist(persist_dir="./data/index.vecstore")  


In [9]:
##Train Generation

In [11]:
# Shuffle the documents
import random

# Seed first so the shuffle gives the same order whenever it starts from the same list.
# Gotcha: the shuffle is in place, so re-running this cell without reloading `docs`
# shuffles an already shuffled list and yields a different order.
random.seed(42)
random.shuffle(docs)

# Service context used by the question generators below.
gpt_context = ServiceContext.from_defaults(
    llm=OpenAI(
        model="gpt-3.5-turbo",
        temperature=0,
    ),
)

In [16]:
# Prompt for the question generator: it describes the customer questions to craft
# and tells the model to use only the provided dataset.
question_gen_query=(
    "Craft a series of questions that customers might ask about what fruits and vegetables to add to their diet to achieve the optimal nutrition. The fruits and vegetables in their cart should include at least 1 item from the Red column, Yellow & Orange column, White, Tan & Brown column, Green column and Blue & Purple column of the same row of the data file. Check if the user input has at least one item from each of these columns and then provide the full row of values that are closest to the user input which must include the full recipe in the corresponding Recipes column. You must only use the data from the dataset provided"
)
# find out more about question generation from 
# https://gpt-index.readthedocs.io/en/latest/examples/evaluation/QuestionGeneration.html

# Generator for the training questions, built over docs[:24], i.e. the first 24
# documents (indices 0 through 23) of the shuffled list.
dataset_generator = DatasetGenerator.from_documents(
    docs[:24],
    question_gen_query=question_gen_query,
    service_context=gpt_context,
)

In [17]:
# nest_asyncio.apply() patches asyncio so that an event loop can be re-entered while
# it is already running. Stock asyncio does NOT allow this by default, and Jupyter
# already runs an event loop of its own.
# Presumably required by the question-generation call in the next cell — confirm.
nest_asyncio.apply()

In [18]:
# Generate the training questions from the documents given to the generator.
# num=25 presumably caps the number of questions; the recorded run produced 25.
questions = dataset_generator.generate_questions_from_nodes(num=25)   
print("Generated ", len(questions), " questions")

Generated  25  questions


In [23]:
# Save the generated training questions to train_questions.txt, one per line,
# echoing each question to the cell output as it is written.
with open("train_questions.txt", "w") as f:
    for generated_question in questions:
        f.write(generated_question + "\n")
        print(generated_question)

What fruits and vegetables should I add to my diet to achieve optimal nutrition?
Can you suggest a combination of fruits and vegetables from the Red, Yellow & Orange, White, Tan & Brown, Green, and Blue & Purple columns for a balanced diet?
How can I incorporate items from the Red, Yellow & Orange, White, Tan & Brown, Green, and Blue & Purple columns into my meals for optimal nutrition?
What recipe can I make using ingredients from the Red, Yellow & Orange, White, Tan & Brown, Green, and Blue & Purple columns for a nutritious meal?
Are there any specific fruits and vegetables I should focus on from the Red, Yellow & Orange, White, Tan & Brown, Green, and Blue & Purple columns for a well-rounded diet?
Can you provide a list of fruits and vegetables from the Red, Yellow & Orange, White, Tan & Brown, Green, and Blue & Purple columns that are recommended for optimal nutrition?
How can I ensure I am getting a variety of nutrients by including items from the Red, Yellow & Orange, White, Tan 

In [None]:
# Source and destination paths for cleaning the training questions with
# `postprocess` (defined right below).
# NOTE(review): no postprocess(...) call using these two paths is visible in this
# notebook, so modified_train_questions.txt does not appear to be written — confirm.
input_file_path = 'train_questions.txt'
output_file_path = 'modified_train_questions.txt'

def postprocess(input_file_path, output_file_path):
    """Copy a question file, removing "Question:" markers and trimming each line.

    Args:
        input_file_path: Path of the text file to read, one question per line.
        output_file_path: Path of the file to create or overwrite with the result.

    Every occurrence of "Question:" in a line is removed (not only a leading one),
    surrounding whitespace is stripped, and each line, including lines that end up
    empty, is written back followed by a newline. The input is read completely
    before the output file is opened.
    """
    with open(input_file_path, 'r') as source:
        raw_lines = source.readlines()

    cleaned_lines = []
    for raw_line in raw_lines:
        without_marker = raw_line.replace("Question:", "")
        cleaned_lines.append(without_marker.strip())

    with open(output_file_path, 'w') as target:
        target.writelines(cleaned + '\n' for cleaned in cleaned_lines)

In [84]:
##Eval Generation

In [26]:
# Generator for the evaluation questions, built over docs[25:49], i.e. indices 25
# through 48 (the end index of a Python slice is exclusive). The training generator
# used docs[:24], so the two sets of source documents do not overlap.
# NOTE(review): with the 50 documents reported by this notebook, docs[24] and
# docs[49] fall in neither slice — confirm that this gap is intended.
dataset_generator = DatasetGenerator.from_documents(
    docs[25:49],
    question_gen_query=question_gen_query,
    service_context=gpt_context,
)

In [27]:
# Generate the evaluation questions from the documents given to the generator.
# num=25 presumably caps the number of questions; the recorded run produced 25.
questions = dataset_generator.generate_questions_from_nodes(num=25)
print("Generated ", len(questions), " questions")

Generated  25  questions


In [28]:
# Save the generated evaluation questions to eval_questions.txt, one per line.
with open("eval_questions.txt", "w") as f:
    for generated_question in questions:
        f.write(generated_question + "\n")

In [34]:
# Clean the evaluation questions: read eval_questions.txt, strip the "Question:"
# markers via `postprocess`, and write the result to modified_eval_questions.txt.
# NOTE(review): the recorded run of this cell ended in
#   NameError: name 'postprocess' is not defined
# apparently because the cell that defines `postprocess` had not been executed;
# run that cell before this one.
input_file_path = 'eval_questions.txt'
output_file_path = 'modified_eval_questions.txt'

postprocess(input_file_path, output_file_path)


NameError: name 'postprocess' is not defined

In [32]:
# Sanity check: report how many documents `docs` holds (the recorded run shows 50).
print("Total number of documents:", len(docs))

Total number of documents: 50


## Initial Evaluation

For this evaluation with GPT-3.5 Query Engine, we will be using the [`ragas` evaluation library](https://github.com/explodinggradients/ragas).

For this notebook, we will be using the following two metrics:

- `answer_relevancy` - This measures how relevant the generated answer is to the prompt. If the generated answer is incomplete or contains redundant information, the score will be low. This is quantified by working out the chance of an LLM generating the given question using the generated answer. Values range (0,1), higher the better.  
- `faithfulness` - This measures the factual consistency of the generated answer against the given context. This is done using a multi step paradigm that includes creation of statements from the generated answer followed by verifying each of these statements against the context. The answer is scaled to (0,1) range. Higher the better.

In [37]:
# Load the evaluation questions, one per line, with surrounding whitespace removed.
# This reads the raw eval_questions.txt, not the post-processed
# modified_eval_questions.txt.
questions = []
with open("eval_questions.txt", "r") as f:
    questions.extend(raw_line.strip() for raw_line in f)

In [39]:
from llama_index import VectorStoreIndex

# Baseline engine for the initial evaluation: the stock (non-fine-tuned)
# gpt-3.5-turbo model. Using the fine-tuned model id here would make this run
# identical to the fine-tuned evaluation later in the notebook and leave nothing
# to compare against.
# limit the context window to 2048 tokens so that refine is used
gpt_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0), context_window=2048
)

index = VectorStoreIndex.from_documents(docs, service_context=gpt_context)

# For each query, pass the 2 most similar documents to the LLM.
query_engine = index.as_query_engine(similarity_top_k=2)

In [40]:
# For every question, keep the answer text and the text of the source nodes
# it was based on; both lists stay aligned with `questions`.
contexts = []
answers = []

for question in questions:
    response = query_engine.query(question)
    contexts.append(
        [source.node.get_content() for source in response.source_nodes]
    )
    answers.append(str(response))

In [41]:
# Display the first 25 questions currently held in `questions`.
questions[:25]

['What fruits and vegetables should I add to my diet to achieve optimal nutrition?',
 'Can you suggest a combination of fruits and vegetables from the Red, Yellow & Orange, White, Tan & Brown, Green, and Blue & Purple columns for a balanced diet?',
 'How can I incorporate strawberries, yellow watermelon, garlic, celery, and Concord grapes into my meals for optimal nutrition?',
 'What recipe can I make using strawberries, yellow watermelon, garlic, celery, and Concord grapes for a nutritious meal?',
 'Are there any specific dishes or recipes that include a variety of fruits and vegetables like strawberries, yellow watermelon, garlic, celery, and Concord grapes for a well-rounded diet?',
 'Can you provide a meal plan that includes strawberries, yellow watermelon, garlic, celery, and Concord grapes for a healthy lifestyle?',
 'What are the health benefits of consuming a combination of strawberries, yellow watermelon, garlic, celery, and Concord grapes regularly?',
 'How can I ensure I am 

## Create Fine Tuned Engine



In [44]:
from llama_index import ServiceContext
from llama_index.llms import OpenAI
from llama_index.callbacks import OpenAIFineTuningHandler
from llama_index.callbacks import CallbackManager

# The handler is attached through the callback manager; its recorded events are
# written out later with save_finetuning_events.
finetuning_handler = OpenAIFineTuningHandler()
callback_manager = CallbackManager([finetuning_handler])

# NOTE(review): the model used to produce the training examples is itself an
# "ft:" (fine-tuned) model although the variable is named gpt_35_context —
# confirm that generating fine-tuning data from this model is intended.
gpt_35_context = ServiceContext.from_defaults(
    llm=OpenAI(
        model="ft:gpt-3.5-turbo-1106:personal:capstone-exp-3:9vgnLOrh",
        temperature=0,
    ),
    # limit the context window artificially to test the refine process
    context_window=2048,
    callback_manager=callback_manager,
)

In [45]:
from llama_index import VectorStoreIndex

# Index the documents with the callback-enabled service context, then expose the
# index as a query engine that retrieves the 2 most similar documents per query.
index = VectorStoreIndex.from_documents(
    docs,
    service_context=gpt_35_context,
)

query_engine = index.as_query_engine(similarity_top_k=2)

In [46]:
# Load the training questions, one per line, with surrounding whitespace removed.
# This reads the raw train_questions.txt, not modified_train_questions.txt.
questions = []
with open("train_questions.txt", "r") as f:
    questions.extend(raw_line.strip() for raw_line in f)

In [47]:
# Run every training question through the query engine. The response is only
# bound to `response` and then overwritten by the next one; the queries are run
# for the fine-tuning handler attached to the service context, whose events are
# saved in the next cell.
for question in questions:
    response = query_engine.query(question)


In [48]:
# Write the events collected by the fine-tuning handler to finetune.jsonl
# (the recorded run reports 25 examples written).
finetuning_handler.save_finetuning_events("finetune.jsonl")

Wrote 25 examples to finetune.jsonl


## Evaluating Fine Tuned Engine

After some time, your model will be done training!

The next step is running our fine-tuned model on our eval dataset again to measure any performance increase.

In [50]:
# Load the evaluation questions from eval_questions.txt, one per line, with
# leading and trailing whitespace (including the newline) removed.
questions = []
with open("eval_questions.txt", "r") as f:
    questions.extend(raw_line.strip() for raw_line in f)

In [51]:
from llama_index import VectorStoreIndex

# Query engine backed by the fine-tuned GPT-3.5 model ("ft:" model id), with the
# context window limited to 2048 tokens.
ft_context = ServiceContext.from_defaults(
    llm=OpenAI(
        model="ft:gpt-3.5-turbo-1106:personal:capstone-exp-3:9vgnLOrh",
        temperature=0,
        openai_api_key=openai.api_key,
    ),
    context_window=2048,
)
index = VectorStoreIndex.from_documents(docs, service_context=ft_context)

# Retrieve the 2 most similar documents per query.
query_engine = index.as_query_engine(similarity_top_k=2)

In [52]:
# For every question, keep the answer text and the text of the source nodes
# it was based on; both lists stay aligned with `questions`.
contexts = []
answers = []

for question in questions:
    response = query_engine.query(question)
    contexts.append(
        [source.node.get_content() for source in response.source_nodes]
    )
    answers.append(str(response))

In [53]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness

# Assemble the question / answer / contexts columns that are handed to ragas.
ds = Dataset.from_dict(
    {"question": questions, "answer": answers, "contexts": contexts}
)

# Score the answers on answer relevancy and faithfulness, then print both scores.
result = evaluate(ds, metrics=[answer_relevancy, faithfulness])
print(result)

Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

{'answer_relevancy': 0.9128, 'faithfulness': 0.5017}


Second evaluation - Baseline Model - GPT 4.0 (non-finetuned)

In [54]:
# Load the evaluation questions, one per line, with surrounding whitespace removed.
# This reads the raw eval_questions.txt, not modified_eval_questions.txt.
questions = []
with open("eval_questions.txt", "r") as f:
    questions.extend(raw_line.strip() for raw_line in f)

In [55]:
from llama_index import VectorStoreIndex

# Query engine backed by the stock gpt-4 model, with the context window limited to
# 2048 tokens. The name `ft_context` is kept from the fine-tuned cell even though
# this model id carries no "ft:" prefix.
ft_context = ServiceContext.from_defaults(
    llm=OpenAI(
        model="gpt-4",
        temperature=0,
        openai_api_key=openai.api_key,
    ),
    context_window=2048,
)
index = VectorStoreIndex.from_documents(docs, service_context=ft_context)

# Retrieve the 2 most similar documents per query.
query_engine = index.as_query_engine(similarity_top_k=2)

In [56]:
# For every question, keep the answer text and the text of the source nodes
# it was based on; both lists stay aligned with `questions`.
contexts = []
answers = []

for question in questions:
    response = query_engine.query(question)
    contexts.append(
        [source.node.get_content() for source in response.source_nodes]
    )
    answers.append(str(response))

In [57]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness

# Assemble the question / answer / contexts columns that are handed to ragas.
ds = Dataset.from_dict(
    {"question": questions, "answer": answers, "contexts": contexts}
)

# Score the answers on answer relevancy and faithfulness, then print both scores.
result = evaluate(ds, metrics=[answer_relevancy, faithfulness])
print(result)

Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

{'answer_relevancy': 0.6685, 'faithfulness': 0.7370}


In [None]:
## Third evaluation - Baseline Model - GPT 4o mini (non-finetuned)

In [63]:
# Load the evaluation questions, one per line, with surrounding whitespace removed.
# This reads the raw eval_questions.txt, not modified_eval_questions.txt.
questions = []
with open("eval_questions.txt", "r") as f:
    questions.extend(raw_line.strip() for raw_line in f)

In [64]:
from llama_index import VectorStoreIndex

# NOTE(review): the recorded run of this cell raised
#   ValueError: Unknown model 'gpt-4o-mini-2024-07-18'
# so the installed llama-index does not accept this base model and no new
# `query_engine` was created here. The "ft:" prefix also marks a fine-tuned model,
# whereas the section title calls it a non-finetuned baseline — confirm which one
# is intended before relying on results from this section.
ft_context = ServiceContext.from_defaults(
    llm=OpenAI(
        model="ft:gpt-4o-mini-2024-07-18:personal::9yERXbv7",
        temperature=0,
        openai_api_key=openai.api_key,
    ),
    context_window=2048,
)
index = VectorStoreIndex.from_documents(docs, service_context=ft_context)

# Retrieve the 2 most similar documents per query.
query_engine = index.as_query_engine(similarity_top_k=2)

ValueError: Unknown model 'gpt-4o-mini-2024-07-18'. Please provide a valid OpenAI model name in: gpt-4, gpt-4-32k, gpt-4-1106-preview, gpt-4-vision-preview, gpt-4-0613, gpt-4-32k-0613, gpt-4-0314, gpt-4-32k-0314, gpt-3.5-turbo, gpt-3.5-turbo-16k, gpt-3.5-turbo-1106, gpt-3.5-turbo-0613, gpt-3.5-turbo-16k-0613, gpt-3.5-turbo-0301, text-davinci-003, text-davinci-002, gpt-3.5-turbo-instruct, text-ada-001, text-babbage-001, text-curie-001, ada, babbage, curie, davinci, gpt-35-turbo-16k, gpt-35-turbo

In [None]:
# For every question, keep the answer text and the text of the source nodes
# it was based on; both lists stay aligned with `questions`.
# NOTE(review): this uses whatever `query_engine` is currently bound; if the
# engine-building cell above fails, that is the engine from an earlier section.
contexts = []
answers = []

for question in questions:
    response = query_engine.query(question)
    contexts.append(
        [source.node.get_content() for source in response.source_nodes]
    )
    answers.append(str(response))

In [None]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness

# Assemble the question / answer / contexts columns that are handed to ragas.
ds = Dataset.from_dict(
    {"question": questions, "answer": answers, "contexts": contexts}
)

# Score the answers on answer relevancy and faithfulness, then print both scores.
result = evaluate(ds, metrics=[answer_relevancy, faithfulness])
print(result)

## [Tentative] Baseline Model - GPT 3.5 Turbo (Non-finetuned)

In [39]:
# NOTE(review): this cell builds no query engine of its own. It scores whatever
# `questions`, `answers` and `contexts` currently hold, so the numbers depend on
# which collection cell was run last.
ds = Dataset.from_dict(
    {"question": questions, "answer": answers, "contexts": contexts}
)

# Score the answers on answer relevancy and faithfulness, then print both scores.
result = evaluate(ds, metrics=[answer_relevancy, faithfulness])
print(result)

Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

{'answer_relevancy': 0.9391, 'faithfulness': 0.4543}
