## Library

In [None]:
pip install langchain openai google-search-results chromadb pypdf sentence_transformers

In [None]:
from langchain.agents import load_tools
from langchain.agents import initialize_agent
from langchain.agents import AgentType
from langchain.llms import OpenAI as l_OpenAI

## Enter Keys

In [None]:
import os

# Prefer credentials from the environment so real keys are never saved inside the notebook.
# The placeholder literals are only fallbacks: set the environment variables
# (or replace the placeholders locally) before running the cells that call the APIs.
SERPAPI_API_KEY = os.environ.get("SERPAPI_API_KEY", "xxx")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "sk-xxx")

In [None]:
import openai

In [None]:
from typing import List, Dict, Any

In [None]:
# Single shared OpenAI client, reused by the completion, chat and embedding helpers.
openai_client = openai.OpenAI(
    api_key=OPENAI_API_KEY,
)

## Approach 1

This is an API call to a fine-tuned model.

In [None]:
def model_finetune(query: str, max_tokens: int = 256) -> str:
    """
    Complete a prompt with the fine-tuned ``davinci-002`` model.

    Args:
        query (str): Prompt text, sent unchanged to the completions endpoint.
        max_tokens (int, optional): Upper bound on the number of generated tokens.
            When omitted, the completions endpoint falls back to its own default
            of 16 tokens, which cuts answers off mid-sentence, so an explicit and
            larger limit is always sent. Defaults to 256.

    Returns:
        str: The text of the first completion choice.
    """
    completion = openai_client.completions.create(
        model="ft:davinci-002:personal::8JEsV0S6",  # fine tuned model using 12 csvs
        prompt=query,
        max_tokens=max_tokens,
    )

    return completion.choices[0].text

## Approach 2

This is an API call that asks `chatgpt` directly.

In [None]:
def call_chatgpt(query: str, model: str = "gpt-3.5-turbo") -> str:
    """
    Ask an OpenAI chat model a single question and return the reply text.

    The query is sent as the user turn in the form ``"Question: <query>."``,
    preceded by a generic "helpful assistant" system message.

    Args:
        query (str): The question to put to the model.
        model (str, optional): Name of the chat model. Defaults to "gpt-3.5-turbo".

    Returns:
        str: The message content of the first choice in the response.
    """
    # Build the two conversation turns separately, then send them in order.
    system_turn: Dict[str, str] = {"role": "system", "content": "You are a helpful assistant."}
    user_turn: Dict[str, str] = {"role": "user", "content": f"Question: {query}."}

    chat_result: Any = openai_client.chat.completions.create(
        model=model,
        messages=[system_turn, user_turn],
    )

    # Only the first choice is used.
    first_choice = chat_result.choices[0]
    return first_choice.message.content


## Approach 3

This uses `langchain` to give the model internet access.

In [None]:
def call_langchain(prompt: str) -> str:
    """
    Answer a prompt with a LangChain zero-shot ReAct agent.

    The agent is built fresh on every call from a temperature-0 OpenAI LLM and
    the "serpapi" and "llm-math" tools, and runs with ``verbose=True`` so its
    intermediate reasoning is printed.

    Args:
        prompt (str): The question or instruction handed to the agent.

    Returns:
        str: Whatever ``agent.run`` returns for the prompt.
    """
    search_llm = l_OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
    agent_executor = initialize_agent(
        load_tools(["serpapi", "llm-math"], llm=search_llm, serpapi_api_key=SERPAPI_API_KEY),
        search_llm,
        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        verbose=True,
    )
    return agent_executor.run(prompt)

## Approach 4

Here we use RAG.

PDF -> Chroma DB -> Vector DB -> A list of numbers (embeddings) and document IDs

User asks: query

`chroma_collection.query`: This function returns the text most relevant to the query, based on the vector DB. Results: a list of relevant documents and their numerical form (embeddings).

In [None]:
# Path of the source PDF that is handed to load_chroma; its file stem also names
# the collection and the CSV files. Placeholder value: point it at a real file.
pdf_path = "/path/to/file/file_name.pdf"

In [None]:
import os

In [None]:
# Switch the working directory to the shared Drive folder.
# NOTE(review): presumably this is where helper_utils.py lives, so that the import
# in the following cell resolves — confirm.
os.chdir("/content/drive/MyDrive/Colab Notebooks/AI Research/Students/xxx/lectures/2024")

In [None]:
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from helper_utils import load_chroma, word_wrap, project_embeddings

In [None]:
pdf_path.split('/')[-1].split('.')[0]

'Definition_Of_Homeless'

In [None]:
%%time
embedding_function = SentenceTransformerEmbeddingFunction()

nom = pdf_path.split('/')[-1].split('.')[0]
chroma_collection = load_chroma(filename=pdf_path, collection_name=f'{nom}', embedding_function=embedding_function)
chroma_collection.count()

CPU times: user 1.65 s, sys: 749 ms, total: 2.4 s
Wall time: 3.73 s


5

In [None]:
def rag(query: str, n_results: int = 10) -> str:
    """
    Answer a question with retrieval-augmented generation.

    The passages most similar to ``query`` are fetched from ``chroma_collection``
    and placed in the prompt that is sent to ``call_chatgpt``.

    Args:
        query (str): The user's question.
        n_results (int, optional): Maximum number of passages to retrieve. It is
            capped at the number of items in the collection. Defaults to 10.

    Returns:
        str: The answer returned by ``call_chatgpt`` for the augmented prompt.
    """
    # Never ask for more passages than the collection holds.
    available = chroma_collection.count()
    if available:
        results = chroma_collection.query(
            query_texts=[query],
            n_results=min(n_results, available),
            include=['documents'],  # the embeddings were fetched but never used
        )
        # One query text was sent, so the matching passages are in the first result list.
        retrieved_docs = results['documents'][0]
    else:
        retrieved_docs = []

    # Insert the passages as plain text rather than as a Python list repr
    # (brackets, quotes and escaped newlines would otherwise end up in the prompt).
    context = "\n\n".join(retrieved_docs)

    updated_query = f"""
        Answer the question: {query}
        Based on the document provided: {context}
    """
    response = call_chatgpt(updated_query)
    return response

## Get Data

This assumes we get one `.csv` per topic.

In [None]:
print(nom)

Definition_Of_Homeless


In [None]:
# CSV for the current topic, named after the PDF file stem held in `nom`
# (the directory part is a placeholder).
path_of_csv = f"file/path/{nom}.csv"

In [None]:
import pandas as pd

In [None]:
# Load the topic's table; later cells read its `questions` and `answers` columns.
current_data = pd.read_csv(path_of_csv)

In [None]:
current_data.head(2)

Unnamed: 0.1,Unnamed: 0,context,questions,answers
0,0,Interested\nin\napplying\nto\nbe\na\nresident\...,1. What are the eligibility requirements for t...,1. The eligibility requirements for the Tiny H...
1,1,The\nfirst\ndefinition\nof\na\n“literally”\nho...,1. What is the first definition of a homeless ...,"1. The first definition of a ""literally"" homel..."


## Test

In [None]:
# Take the row labelled 0 as a single worked example: its question and reference answer.
query = current_data["questions"][0]
true_ans = current_data["answers"][0]

In [None]:
# Answer the sample question with the fine-tuned completion model;
# the bare name on the last line displays the answer as the cell result.
ans_finetune = model_finetune(query)
ans_finetune

'1. The focus of the YSA program is on homeless and low-income young'

In [None]:
# Answer the sample question with the LangChain agent; call_langchain runs the agent
# with verbose=True, so its intermediate steps are printed as cell output.
ans_langchain = call_langchain(query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I should use a search engine to find information about the YSA program
Action: Search
Action Input: YSA program[0m
Observation: [36;1m[1;3m['YSA provides grant funding to support state & local partner organizations and youth-led projects. Over the last 10 years, YSA awarded over $5.8 million in ...', 'Earn college credit during your last two years of high school! ... The Young Scholars Academy is a selective partnership between the Northern Kentucky Cooperative ...', 'YSA is a leading global nonprofit that activates young people, ages 5-25, to find their voice, take action, and acquire powerful civic and 21st Century skills ...', 'Our after-school and summer programs for youth ranging from elementary to middle levels. YSA strives to provide both laboratory and field research ...', 'Pompano Beach High School will host a two-week; four-day per week summer camp for gifted students entering grades four through eight. This pro

In [None]:
ans_langchain

'The YSA program focuses on activating young people to find their voice, take action, and acquire civic and 21st century skills. The target communities of the YSA program are underserved, low-income communities and the program helps young people by providing resources, training, and recognition opportunities for youth-led projects and initiatives.'

In [None]:
# Answer the sample question by asking the chat model directly (no retrieval, no search).
ans_chatgpt = call_chatgpt(query)
ans_chatgpt

'1. The focus of the YSA (Youth Service America) program is to engage and empower young people to create positive change in their communities through service and volunteering. The program aims to inspire young individuals to take action on pressing social issues and make a difference in the world.\n\n2. The YSA program targets young people between the ages of 5 and 25 from diverse backgrounds and communities. It seeks to involve youth from various socioeconomic statuses, ethnicities, and geographic locations. The program is open to young individuals globally, as it aims to foster a sense of global citizenship and encourage youth engagement on a global scale.\n\n3. The YSA program helps young people in several ways. Firstly, it provides them with opportunities to contribute to their communities and make a meaningful impact through service projects. By participating in these projects, young people develop valuable skills such as leadership, problem-solving, teamwork, and empathy.\n\nAddi

In [None]:
# Answer the sample question with rag(), which adds passages retrieved from the
# Chroma collection to the prompt before calling call_chatgpt.
ans_rag = rag(query)
ans_rag



'1. The focus of the YSA program is to help young people from bipoc and lgbtqia+ communities gain important art and business skills that can lead to job opportunities. \n2. The target communities of the YSA program are bipoc and lgbtqia+ communities. \n3. The YSA program helps young people by providing job training that understands their life experiences and helps them develop skills for earning income. It also assists youth in achieving their personal and professional goals in life, teaches youth about managing money and understanding finances, builds confidence and helps young people transform their lives, encourages positive relationships within families and communities as examples for young people, advocates for the well-being of youth and promotes nonviolence, especially for people of color and lgbtqia+ community members, and empowers youth by helping them realize their own ability to make positive changes.'

## Measure it

In [None]:
import numpy as np
from scipy.spatial.distance import cosine

In [None]:
def get_embedding(text, model="text-embedding-3-small"):
    """
    Embed a piece of text with the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced by spaces before the request is made.

    Args:
        text (str): The text to embed.
        model (str, optional): Embedding model name. Defaults to "text-embedding-3-small".

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = " ".join(text.split("\n"))
    response = openai_client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

In [None]:
def calculate_sts_openai_score(sentence1: str, sentence2: str) -> float:
    """
    Score the semantic similarity of two texts via OpenAI embeddings.

    Both texts are embedded with ``get_embedding`` and compared with cosine
    similarity (``1 - cosine distance``).

    Args:
        sentence1 (str): First text, e.g. a model's answer.
        sentence2 (str): Second text, e.g. the reference answer.

    Returns:
        float: Cosine similarity of the two embeddings. ``0.0`` is returned,
        without calling the API, when either input is not a string or is blank.
    """
    # The embeddings endpoint rejects empty input, and a failed approach is recorded
    # as "" elsewhere in the notebook. Score such rows as 0.0 instead of letting one
    # bad row abort the whole scoring run.
    for sentence in (sentence1, sentence2):
        if not isinstance(sentence, str) or not sentence.strip():
            return 0.0

    # Embed both texts and convert them to arrays for the distance computation.
    embedding1 = np.asarray(get_embedding(sentence1))
    embedding2 = np.asarray(get_embedding(sentence2))

    # scipy's `cosine` is a distance, so similarity is its complement.
    similarity_score = 1 - cosine(embedding1, embedding2)

    return float(similarity_score)

In [None]:
# Similarity of each approach's sample answer to the reference answer, printed in
# the order: fine-tuned model, LangChain agent, plain ChatGPT, RAG.
for candidate_answer in (ans_finetune, ans_langchain, ans_chatgpt, ans_rag):
    print(calculate_sts_openai_score(candidate_answer, true_ans))

0.8358783850038842
0.8190586759217205
0.7988415981725308
0.8750538851770955


## Test on Entire `.csv`

In [None]:
from tqdm import tqdm

In [None]:
# Approach #1: model_finetune
# Iterate over the question values themselves rather than looking them up by the
# labels 0..n-1, so the cell works with any DataFrame index and no longer overwrites
# the notebook-level `query` variable.
current_data['approach_1'] = [
    model_finetune(question) for question in tqdm(current_data['questions'])
]

100%|██████████| 5/5 [00:01<00:00,  2.54it/s]


In [None]:
# Approach #2: call_langchain
# NOTE(review): the markdown sections number ChatGPT as approach 2 and LangChain as
# approach 3, while the column names here use the opposite order. The names are kept
# because renaming them would change the schema of the saved CSV.
approach_2_answers = []

for position, question in enumerate(tqdm(current_data['questions'])):
    try:
        prediction = call_langchain(question)
    except Exception as exc:
        # Best effort: record an empty answer so rows stay aligned, but report what
        # failed. Catching Exception (not a bare except) lets Ctrl-C interrupt the run.
        prediction = ""
        print(f"Error on row {position}: {exc!r}")
    approach_2_answers.append(prediction)

current_data['approach_2'] = approach_2_answers

In [None]:
# Approach #3: call_chatgpt
# NOTE(review): the markdown sections number ChatGPT as approach 2 and LangChain as
# approach 3, while the column names here use the opposite order. The names are kept
# because renaming them would change the schema of the saved CSV.
# Iterate over the question values themselves rather than looking them up by the
# labels 0..n-1, so the cell works with any DataFrame index and no longer overwrites
# the notebook-level `query` variable.
current_data['approach_3'] = [
    call_chatgpt(question) for question in tqdm(current_data['questions'])
]

100%|██████████| 5/5 [00:28<00:00,  5.78s/it]


In [None]:
# Approach #4: rag
# Iterate over the question values themselves rather than looking them up by the
# labels 0..n-1, so the cell works with any DataFrame index and no longer overwrites
# the notebook-level `query` variable.
current_data['approach_4'] = [
    rag(question) for question in tqdm(current_data['questions'])
]

100%|██████████| 5/5 [00:16<00:00,  3.38s/it]


In [None]:
%%time

current_data['score_approach_1'] = current_data.apply(lambda x: calculate_sts_openai_score(x['approach_1'], x['answers']), axis=1)
current_data['score_approach_2'] = current_data.apply(lambda x: calculate_sts_openai_score(x['approach_2'], x['answers']), axis=1)
current_data['score_approach_3'] = current_data.apply(lambda x: calculate_sts_openai_score(x['approach_3'], x['answers']), axis=1)
current_data['score_approach_4'] = current_data.apply(lambda x: calculate_sts_openai_score(x['approach_4'], x['answers']), axis=1)

CPU times: user 266 ms, sys: 20.7 ms, total: 286 ms
Wall time: 10.9 s


In [None]:
# Save the answers and scores for this topic. The index is not written: it is a plain
# row counter, and writing it is what adds an extra "Unnamed: 0" column each time a
# CSV is saved and read back (the loaded table already carries two of them).
current_data.to_csv(
    f"/content/drive/MyDrive/Colab Notebooks/AI Research/Students/xxx/data/final_score_{nom}.csv",
    index=False,
)