In [48]:
# Install packages

! pip install -q openai datasets tiktoken

# set openai api key
import os


In [58]:
# load the squad dataset from huggingface

from datasets import load_dataset

train_data = load_dataset("squad_v2", split="train").shuffle(seed=0)

train_data = train_data.select(range(100))

# only include train data with a single answer
train_data = [
    item for item in train_data if len(item["answers"]["text"]) == 1
]
len(train_data)

63

In [59]:
from pprint import pprint

pprint(train_data[0])

{'answers': {'answer_start': [150], 'text': ['anthropology']},
 'context': 'The origins of the Samoans are closely studied in modern research '
            'about Polynesia in various scientific disciplines such as '
            'genetics, linguistics and anthropology. Scientific research is '
            'ongoing, although a number of different theories exist; including '
            'one proposing that the Samoans originated from Austronesian '
            'predecessors during the terminal eastward Lapita expansion period '
            'from Southeast Asia and Melanesia between 2,500 and 1,500 BCE. '
            'The Samoan origins are currently being reassessed due to new '
            'scientific evidence and carbon dating findings from 2003 and '
            'onwards.',
 'id': '5726a4cd708984140094ccc0',
 'question': 'In addition to linguistics and genetics, what field of study '
             'researches Samoan origins?',
 'title': 'Samoa'}


In [60]:
def create_chat_messages(context, question, *examples):
    messages = [
        {"role": "system", "content": "You never respond in full sentences and you use as few words as possible. You usually answer with a single word or number repeating the reference text verbatim."}
    ]
    for example in examples:
        assert len(example["answers"]["text"]) == 1, "We only will support single answers for now"
        messages.extend([
            {"role": "user", "content": example["context"]},
            {"role": "user", "content": example["question"]},
            {"role": "assistant", "content": example["answers"]["text"][0]}
        ])
    messages.extend(
        [
            {"role": "user", "content": context},
            {"role": "user", "content": question},
        ]
    )
    return messages

def get_llm_response(messages):
    import openai

    MODEL = "gpt-3.5-turbo"

    response = openai.ChatCompletion.create(
        model=MODEL,    
        messages=messages,
        temperature=0.9,
    )
    response = response.choices[0]["message"]["content"]
    return response

messages = create_chat_messages("The answer is 42", "What is the answer?")
pprint(messages)
correct_answer = "42"
print(correct_answer)
print(f"model response: {get_llm_response(messages)}")

[{'content': 'You never respond in full sentences and you use as few words as '
             'possible. You usually answer with a single word or number '
             'repeating the reference text verbatim.',
  'role': 'system'},
 {'content': 'The answer is 42', 'role': 'user'},
 {'content': 'What is the answer?', 'role': 'user'}]
42
model response: 42.


In [42]:
index = 2

context = train_data[index]["context"]
question = train_data[index]["question"]
messages = create_chat_messages(
    context,
    question,
    train_data[0], train_data[1]
)
pprint(messages)
assert len(train_data[index]["answers"]["text"]) == 1, "We only will support single answers for now"
correct_answer = train_data[index]["answers"]["text"][0]
print(f"correct answer: {correct_answer}")
llm_answer = get_llm_response(messages)
print(f"model response: {llm_answer}")

[{'content': 'You never respond in full sentences and you use as few words as '
             'possible. You usually answer with a single word or number '
             'repeating the reference text verbatim.',
  'role': 'system'},
 {'content': 'The origins of the Samoans are closely studied in modern '
             'research about Polynesia in various scientific disciplines such '
             'as genetics, linguistics and anthropology. Scientific research '
             'is ongoing, although a number of different theories exist; '
             'including one proposing that the Samoans originated from '
             'Austronesian predecessors during the terminal eastward Lapita '
             'expansion period from Southeast Asia and Melanesia between 2,500 '
             'and 1,500 BCE. The Samoan origins are currently being reassessed '
             'due to new scientific evidence and carbon dating findings from '
             '2003 and onwards.',
  'role': 'user'},
 {'content': 'In a

In [49]:
# convert the answer to tokens and calcuate the rouge score
import openai
import tiktoken
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')


In [54]:
def exact_match_score(correct_answer, llm_answer):
    correct_answer_embedding = encoding.encode(correct_answer)
    llm_answer_embedding = encoding.encode(llm_answer)

    return 1.0 * float(correct_answer_embedding == llm_answer_embedding)

exact_match_score(correct_answer, llm_answer)

def f1_score(correct_answer, llm_answer):
    from collections import Counter
    correct_answer_embedding = Counter(encoding.encode(correct_answer))
    llm_answer_embedding = Counter(encoding.encode(llm_answer))

    common = correct_answer_embedding & llm_answer_embedding
    num_same = sum(common.values())

    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(llm_answer_embedding)
    recall = 1.0 * num_same / len(correct_answer_embedding)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

f1_score(correct_answer, llm_answer)

0.923076923076923

In [62]:
responses = []

for index in range(2, 60):
    print(f"index: {index}")
    context = train_data[index]["context"]
    question = train_data[index]["question"]
    messages = create_chat_messages(
        context,
        question,
        train_data[0], train_data[1]
    )
    print(f"question: {question}")
    correct_answer = train_data[index]["answers"]["text"][0]
    print(f"correct answer: {correct_answer}")
    llm_answer = get_llm_response(messages)
    print(f"model response: {llm_answer}")
    responses.append(
        {
            "question": question,
            "correct_answer": correct_answer,
            "llm_answer": llm_answer,
            "f1_score": f1_score(correct_answer, llm_answer),
            "exact_match_score": exact_match_score(correct_answer, llm_answer)
        }
    )
    # sleep for 5 seconds to avoid rate limiting
    import time
    time.sleep(5)
    print("")

index: 2
question: What factors other than trophy hunting are responsible for the decline of wildlife in Botswana?
correct answer: poaching, drought and habitat loss
model response: poaching, drought, habitat loss

index: 3
question: During whose reign did the Mughal Empire reach its greatest expanse?
correct answer: Aurangzeb
model response: Aurangzeb

index: 4
question: Which train line do the 1 2 trains serve?
correct answer: IRT Broadway – Seventh Avenue Line
model response: IRT Broadway – Seventh Avenue Line

index: 5
question: Approximately how many people attend church services on BYU's campus?
correct answer: 24,000
model response: about 24,000

index: 6
question: Which trial in Manhatten helped establish the right of freedom of the press?
correct answer: John Peter Zenger
model response: The trial of John Peter Zenger.

index: 7
question: What Security Council Resolution recommended that the Marshall Islands be allowed to join the UN?
correct answer: Resolution 704
model respo

In [63]:
import pandas as pd
df = pd.DataFrame(responses)
df

Unnamed: 0,question,correct_answer,llm_answer,f1_score,exact_match_score
0,What factors other than trophy hunting are res...,"poaching, drought and habitat loss","poaching, drought, habitat loss",0.923077,0.0
1,During whose reign did the Mughal Empire reach...,Aurangzeb,Aurangzeb,1.0,1.0
2,Which train line do the 1 2 trains serve?,IRT Broadway – Seventh Avenue Line,IRT Broadway – Seventh Avenue Line,1.0,1.0
3,Approximately how many people attend church se...,24000,"about 24,000",0.75,0.0
4,Which trial in Manhatten helped establish the ...,John Peter Zenger,The trial of John Peter Zenger.,0.5,0.0
5,What Security Council Resolution recommended t...,Resolution 704,Resolution 704.,0.857143,0.0
6,Who was a host on all seasons of American Idol?,Ryan Seacrest,Ryan Seacrest.,0.888889,0.0
7,Who ruled Swaziland in the late 1970s?,King Sobhuza II,King Sobhuza II,1.0,1.0
8,Who was arrested on April 26 for posting an on...,A Macau resident,A Macau resident.,0.888889,0.0
9,Where did Victoria spend the Christmas of 1900?,Osborne House,Osborne House on the Isle of Wight.,0.461538,0.0


In [73]:
def normalize(string):
    import re
    # remove articles
    string = re.sub(r"\b(a|an|the)\b", " ", string)
    # fix whitespace
    string = re.sub(r"\s+", " ", string)
    # remove punctuation
    string = re.sub(r"[^\w\s]", " ", string)
    # lowercase
    string = string.lower()
    # strip leading and trailing whitespace
    string = string.strip()
    return string
df["f1_score"] = [f1_score(normalize(correct_answer), normalize(llm_answer)) for correct_answer, llm_answer in zip(df["correct_answer"], df["llm_answer"])]
df["exact_match_score"] = [exact_match_score(normalize(correct_answer), normalize(llm_answer)) for correct_answer, llm_answer in zip(df["correct_answer"], df["llm_answer"])]


df[['f1_score', 'exact_match_score']].mean()

f1_score             0.728787
exact_match_score    0.568966
dtype: float64