In [None]:
# Install packages

! pip install -q openai datasets tiktoken

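# set openai api key (NOTE(review): nothing in this cell assigns it; presumably it is supplied via the OPENAI_API_KEY environment variable - confirm)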
import os


In [None]:
# Load the SQuAD v2 training split from Hugging Face, shuffle it with a fixed
# seed so the sample is reproducible, and keep the first 100 rows.

from datasets import load_dataset

first_hundred = (
    load_dataset("squad_v2", split="train")
    .shuffle(seed=0)
    .select(range(100))
)

# Keep only the rows whose answer list holds exactly one text; the result is a
# plain Python list, so it may contain fewer than 100 examples.
train_data = [row for row in first_hundred if len(row["answers"]["text"]) == 1]
len(train_data)

In [None]:
from pprint import pprint

# Pretty-print the first filtered example to inspect its fields.
pprint(train_data[0])

In [None]:
def create_chat_messages(context, question, *examples):
    """Build the chat message list for a few-shot question-answering prompt.

    The list starts with a system instruction asking for terse, verbatim
    answers. Each example then contributes three messages (its context and its
    question as user messages, its single answer as the assistant message), and
    the list ends with the target context and question as user messages.

    Args:
        context: Reference text for the question being asked.
        question: The question to answer.
        *examples: Mappings with "context", "question" and
            ``["answers"]["text"]`` entries; each must hold exactly one answer.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dictionaries.

    Raises:
        AssertionError: If an example does not have exactly one answer text.
    """
    system_prompt = "You never respond in full sentences and you use as few words as possible. You usually answer with a single word or number repeating the reference text verbatim."
    chat = [{"role": "system", "content": system_prompt}]

    for shot in examples:
        answer_texts = shot["answers"]["text"]
        assert len(answer_texts) == 1, "We only will support single answers for now"
        chat.append({"role": "user", "content": shot["context"]})
        chat.append({"role": "user", "content": shot["question"]})
        chat.append({"role": "assistant", "content": answer_texts[0]})

    # The actual query goes last, in the same context-then-question order.
    chat.append({"role": "user", "content": context})
    chat.append({"role": "user", "content": question})
    return chat

def get_llm_response(messages, model="gpt-3.5-turbo", temperature=0.9):
    """Send chat messages to the OpenAI chat-completions API and return the reply text.

    Supports both generations of the ``openai`` package: releases from 1.0 on
    removed ``openai.ChatCompletion``, so the module-level
    ``openai.chat.completions`` interface is used when it is available and the
    legacy call is kept as a fallback for older installs.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dictionaries.
        model: Name of the chat model to query.
        temperature: Sampling temperature passed to the API. The default is
            kept at 0.9 for backward compatibility; pass 0 for more repeatable
            evaluation runs.

    Returns:
        The text content of the first returned choice.
    """
    import openai

    if hasattr(openai, "OpenAI"):
        # openai>=1.0: the module-level client honours the same key
        # configuration (openai.api_key / OPENAI_API_KEY) as the legacy API.
        response = openai.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
        )
        return response.choices[0].message.content

    # openai<1.0: legacy module-level ChatCompletion interface.
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    return response.choices[0]["message"]["content"]

# Smoke test: a trivial context and question with no few-shot examples, so the
# prompt contains only the system message and the two user messages.
messages = create_chat_messages("The answer is 42", "What is the answer?")
pprint(messages)
correct_answer = "42"
print(correct_answer)
# This line performs a live API call through get_llm_response.
print(f"model response: {get_llm_response(messages)}")

In [None]:
# Try one real example: item 2 is the query, items 0 and 1 are passed as the
# few-shot examples.
index = 2

context = train_data[index]["context"]
question = train_data[index]["question"]
messages = create_chat_messages(
    context,
    question,
    train_data[0], train_data[1]
)
pprint(messages)
# Guard for the single-answer assumption before reading answer [0].
assert len(train_data[index]["answers"]["text"]) == 1, "We only will support single answers for now"
correct_answer = train_data[index]["answers"]["text"][0]
print(f"correct answer: {correct_answer}")
# correct_answer and llm_answer stay defined as globals and are reused by the
# scoring calls further down.
llm_answer = get_llm_response(messages)
print(f"model response: {llm_answer}")

In [None]:
# set up the tokenizer used to convert answers to tokens when calculating the
# exact-match and F1 scores
import openai
import tiktoken
# tiktoken encoding for the gpt-3.5-turbo model; the scoring functions read
# this module-level name.
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')


In [None]:
def exact_match_score(correct_answer, llm_answer):
    """Return 1.0 if both answers encode to the same token sequence, else 0.0.

    Both strings are tokenized with the module-level ``encoding`` and the
    resulting token lists are compared for equality.
    """
    reference_tokens = encoding.encode(correct_answer)
    candidate_tokens = encoding.encode(llm_answer)
    is_match = reference_tokens == candidate_tokens
    return float(is_match)

# Exact-match score for the most recent correct_answer / llm_answer pair.
# NOTE: this is not the cell's last expression, so by default the notebook
# does not display its value.
exact_match_score(correct_answer, llm_answer)

def f1_score(correct_answer, llm_answer, tokenize=None):
    """Compute the token-level F1 overlap between a reference and a model answer.

    Tokens are compared as multisets: the overlap counts each shared token up
    to the number of times it occurs in both answers. Precision divides the
    overlap by the total number of tokens in the model answer, and recall by
    the total number of tokens in the reference.

    Args:
        correct_answer: Reference answer text.
        llm_answer: Model answer text.
        tokenize: Optional callable mapping a string to an iterable of tokens.
            Defaults to the module-level ``encoding.encode``.

    Returns:
        The F1 score in [0, 1]; 0.0 when the answers share no tokens.
    """
    from collections import Counter

    if tokenize is None:
        tokenize = encoding.encode

    gold_tokens = list(tokenize(correct_answer))
    pred_tokens = list(tokenize(llm_answer))

    overlap = Counter(gold_tokens) & Counter(pred_tokens)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0.0

    # Divide by total token counts, not by the number of distinct tokens:
    # len() of a Counter ignores repeats and can push the score above 1.
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)

# Token-level F1 for the same correct_answer / llm_answer pair; as the cell's
# last expression, this is the value the notebook displays.
f1_score(correct_answer, llm_answer)

In [None]:
import time

# Per-example results; each entry holds the question, both answers and the
# two scores computed on the raw (un-normalized) strings.
responses = []

# Items 0 and 1 are used as the few-shot examples, so evaluation starts at 2.
# The upper bound is clamped to the dataset size: train_data is what remains
# of 100 rows after the single-answer filter, so it may hold fewer than 60.
for index in range(2, min(60, len(train_data))):
    print(f"index: {index}")
    context = train_data[index]["context"]
    question = train_data[index]["question"]
    messages = create_chat_messages(
        context,
        question,
        train_data[0], train_data[1]
    )
    print(f"question: {question}")
    correct_answer = train_data[index]["answers"]["text"][0]
    print(f"correct answer: {correct_answer}")
    llm_answer = get_llm_response(messages)
    print(f"model response: {llm_answer}")
    responses.append(
        {
            "question": question,
            "correct_answer": correct_answer,
            "llm_answer": llm_answer,
            "f1_score": f1_score(correct_answer, llm_answer),
            "exact_match_score": exact_match_score(correct_answer, llm_answer)
        }
    )
    # sleep for 5 seconds to avoid rate limiting
    time.sleep(5)
    print("")

In [None]:
import pandas as pd
# One row per evaluated question, built from the dictionaries collected in
# `responses`.
df = pd.DataFrame(responses)
df

In [None]:
def normalize(string):
    """Normalize an answer string before scoring.

    Steps, in order: lowercase, drop the articles "a", "an" and "the", replace
    punctuation with spaces, collapse runs of whitespace into a single space,
    and strip leading and trailing whitespace.

    Args:
        string: The raw answer text.

    Returns:
        The normalized text.
    """
    import re

    # Lowercase first so that capitalized articles ("The", "A") are removed too.
    string = string.lower()
    # remove articles
    string = re.sub(r"\b(a|an|the)\b", " ", string)
    # remove punctuation
    string = re.sub(r"[^\w\s]", " ", string)
    # Collapse whitespace last: the two substitutions above insert spaces, so
    # collapsing any earlier would leave double spaces behind.
    string = re.sub(r"\s+", " ", string)
    # strip leading and trailing whitespace
    return string.strip()
# Re-score every row on normalized text, overwriting the raw-string scores
# stored in the two score columns.
normalized_pairs = [
    (normalize(gold), normalize(pred))
    for gold, pred in zip(df["correct_answer"], df["llm_answer"])
]
df["f1_score"] = [f1_score(gold, pred) for gold, pred in normalized_pairs]
df["exact_match_score"] = [exact_match_score(gold, pred) for gold, pred in normalized_pairs]


# Average of each metric over all evaluated questions.
df[['f1_score', 'exact_match_score']].mean()