## Library

In [None]:
pip install langchain openai google-search-results chromadb pypdf sentence_transformers

In [None]:
from langchain.agents import load_tools
from langchain.agents import initialize_agent
from langchain.agents import AgentType
from langchain.llms import OpenAI as l_OpenAI

## Enter Keys

In [None]:
import os

# Read the API keys from the environment when they are set there, so real
# credentials do not have to be typed into (and saved with) the notebook.
# The original placeholder strings remain the fallback values.
SERPAPI_API_KEY = os.environ.get("SERPAPI_API_KEY", "xxx")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "sk-xxx")

In [None]:
import openai

In [None]:
from typing import List, Dict, Any

In [None]:
openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)

## Approach 1

This approach makes an API call to query a fine-tuned model.

In [None]:
def model_finetune(query: str, model: str = "ft:davinci-002:personal::8JEsV0S6") -> str:
    """
    Generates a completion for a query using a fine-tuned completion model.

    Args:
        query (str): The prompt text sent to the model.
        model (str, optional): Identifier of the completion model to call.
            Defaults to the fine-tuned model trained using 12 csvs.

    Returns:
        str: The text of the first completion choice.
    """
    # NOTE(review): no `max_tokens` is passed, so the API's default output
    # length applies — confirm the answers are not being cut short.
    completion = openai_client.completions.create(
        model=model,
        prompt=query,
    )

    return completion.choices[0].text

## Approach 2

This approach makes an API call to ask `chatgpt` directly.

In [None]:
def call_chatgpt(query: str, model: str = "gpt-3.5-turbo") -> str:
    """
    Asks a chat model a single question and returns its reply.

    Args:
        query (str): The question to send to the model.
        model (str, optional): Name of the chat model. Defaults to "gpt-3.5-turbo".

    Returns:
        str: Message content of the first choice in the model's response.
    """

    # The conversation is a fixed system prompt followed by the user's question.
    system_message: Dict[str, str] = {"role": "system", "content": "You are a helpful assistant."}
    user_message: Dict[str, str] = {"role": "user", "content": f"Question: {query}."}

    reply: Any = openai_client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
    )

    # Only the first choice is used.
    return reply.choices[0].message.content


## Approach 3

This approach uses `langchain` to give the model internet access.

In [None]:
def call_langchain(prompt: str) -> str:
    """
    Runs a prompt through a LangChain agent equipped with the `serpapi` and `llm-math` tools.

    Args:
        prompt (str): The question or instruction given to the agent.

    Returns:
        str: The agent's output for the prompt.
    """
    # A fresh LLM, tool set and agent are built on every call.
    language_model = l_OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
    agent_tools = load_tools(
        ["serpapi", "llm-math"],
        llm=language_model,
        serpapi_api_key=SERPAPI_API_KEY,
    )
    react_agent = initialize_agent(
        agent_tools,
        language_model,
        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        verbose=True,
    )

    return react_agent.run(prompt)

## Approach 4

Here we use RAG.

PDF -> Chroma DB -> Vector DB -> A list of numbers (embeddings) and document IDs

User asks: query

`chroma_collection.query`: This function returns the text most relevant to the query, based on the Vector DB. Results: a list of relevant documents and their numerical form (embeddings).

In [None]:
pdf_path = "/path/to/file/file_name.pdf"

In [None]:
import os

In [None]:
# Switch the working directory to the project folder.
# NOTE(review): presumably this is what lets the `helper_utils` import in the
# next cell resolve — confirm that module lives in this folder.
os.chdir("/content/drive/MyDrive/Colab Notebooks/AI Research/Students/xxx/lectures/2024")

In [None]:
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from helper_utils import load_chroma, word_wrap, project_embeddings

In [None]:
pdf_path.split('/')[-1].split('.')[0]

In [None]:
%%time
from pathlib import Path

embedding_function = SentenceTransformerEmbeddingFunction()

# Name the collection after the PDF file (file name without its extension).
# `Path.stem` strips only the final suffix, so a name such as "notes.v2.pdf"
# becomes "notes.v2" instead of being cut at the first dot.
nom = Path(pdf_path).stem
chroma_collection = load_chroma(filename=pdf_path, collection_name=nom, embedding_function=embedding_function)

# Last expression of the cell: displays how many entries the collection holds.
chroma_collection.count()

In [None]:
def rag(query: str, n_results: int = 10) -> str:
    """
    Answers a query using passages retrieved from the Chroma collection.

    Args:
        query (str): The user's question.
        n_results (int, optional): Number of passages to retrieve. Defaults to 10.

    Returns:
        str: The chat model's answer, generated from the query plus the retrieved passages.
    """
    # Only the document texts are used below, so embeddings are not requested.
    results = chroma_collection.query(query_texts=[query], n_results=n_results, include=['documents'])

    # `results['documents']` holds one list of passages per query text; a
    # single query was sent, so take the first list.
    retrieved_passages = results['documents'][0]

    # Join the passages into plain text rather than interpolating the Python
    # list repr (brackets, quotes, escaped newlines) into the prompt.
    retrieved_doc = "\n\n".join(retrieved_passages)

    updated_query = f"""
        Answer the question: {query}
        Based on the document provided: {retrieved_doc}
    """
    return call_chatgpt(updated_query)

## Get Data

This assumes we have one `.csv` file per topic.

In [None]:
print(nom)

In [None]:
path_of_csv = f"file/path/{nom}.csv"

In [None]:
import pandas as pd

In [None]:
current_data = pd.read_csv(path_of_csv)

In [None]:
current_data.head(2)

## Test

In [None]:
query = current_data.questions[0]
true_ans = current_data.answers[0]

In [None]:
ans_finetune = model_finetune(query)
ans_finetune

In [None]:
ans_langchain = call_langchain(query)

In [None]:
ans_langchain

In [None]:
ans_chatgpt = call_chatgpt(query)
ans_chatgpt

In [None]:
ans_rag = rag(query)
ans_rag

## Measure it

In [None]:
import numpy as np
from scipy.spatial.distance import cosine

In [None]:
def get_embedding(text, model="text-embedding-3-small"):
    """
    Returns the OpenAI embedding for a piece of text.

    Args:
        text: The text to embed; newlines are replaced with spaces before the request.
        model: Name of the embedding model. Defaults to "text-embedding-3-small".

    Returns:
        The embedding of the first (and only) input in the API response.
    """
    single_line_text = text.replace("\n", " ")
    response = openai_client.embeddings.create(input=[single_line_text], model=model)
    return response.data[0].embedding

In [None]:
def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
    """
    Scores the semantic similarity of two sentences using OpenAI embeddings.

    Args:
        sentence1 (str): First sentence (e.g. a predicted answer).
        sentence2 (str): Second sentence (e.g. the reference answer).

    Returns:
        float: Cosine similarity of the two embeddings, or 0.0 when either
            input is not a non-blank string (for example the "" placeholder
            stored when an approach failed, or a missing value).
    """
    # Blank or missing text cannot be scored meaningfully, so treat it as
    # "no similarity" instead of sending it to the embeddings API.
    for sentence in (sentence1, sentence2):
        if not isinstance(sentence, str) or not sentence.strip():
            return 0.0

    embedding1 = np.asarray(get_embedding(sentence1))
    embedding2 = np.asarray(get_embedding(sentence2))

    # scipy's `cosine` is a distance, so the similarity is one minus it.
    return float(1 - cosine(embedding1, embedding2))

In [None]:
# Similarity of each approach's answer to the reference answer, printed in
# the order: fine-tuned model, langchain, chatgpt, rag.
for candidate_answer in (ans_finetune, ans_langchain, ans_chatgpt, ans_rag):
    print(calculate_sts_openai_score(candidate_answer, true_ans))

## Test on Entire `.csv`

In [None]:
from tqdm import tqdm

In [None]:
# Approach #1: model_finetune
# Iterate over the question column directly instead of indexing by position.
current_ans = [model_finetune(query) for query in tqdm(current_data["questions"])]

current_data['approach_1'] = current_ans

In [None]:
current_ans = []

# call_langchain
# NOTE: these results are stored under 'approach_2' even though the section
# headings earlier in the notebook call langchain "Approach 3"; the column
# name is kept because the scoring cell reads 'approach_2'.
for query in tqdm(current_data["questions"]):
    try:
        pred = call_langchain(query)
    except Exception as error:
        # Best effort: record an empty answer and keep going, but report what
        # went wrong. Catching `Exception` (not a bare `except`) still lets
        # KeyboardInterrupt stop the run.
        pred = ""
        print(f"Error: {error!r}")
    current_ans.append(pred)

current_data['approach_2'] = current_ans

In [None]:
# call_chatgpt
# NOTE: these results are stored under 'approach_3' even though the section
# headings earlier in the notebook call chatgpt "Approach 2"; the column name
# is kept because the scoring cell reads 'approach_3'.
# Iterate over the question column directly instead of indexing by position.
current_ans = [call_chatgpt(query) for query in tqdm(current_data["questions"])]

current_data['approach_3'] = current_ans

In [None]:
# Approach #4: rag
# Iterate over the question column directly instead of indexing by position.
current_ans = [rag(query) for query in tqdm(current_data["questions"])]

current_data['approach_4'] = current_ans

In [None]:
%%time

# Score each approach's answers against the reference answers, row by row.
for answer_column, score_column in (
    ('approach_1', 'score_approach_1'),
    ('approach_2', 'score_approach_2'),
    ('approach_3', 'score_approach_3'),
    ('approach_4', 'score_approach_4'),
):
    current_data[score_column] = current_data.apply(
        lambda row: calculate_sts_openai_score(row[answer_column], row['answers']),
        axis=1,
    )

In [None]:
# Save the table (questions, answers, per-approach predictions and scores) for this topic.
# NOTE(review): pandas writes the DataFrame index as an extra unnamed column by
# default — pass index=False if that column is not wanted.
current_data.to_csv(f"/content/drive/MyDrive/Colab Notebooks/AI Research/Students/xxx/data/final_score_{nom}.csv")