In [62]:
import json
import pandas as pd
import pickle
import openai
import functools

# Process the Squad JSON to extract what we need

In [112]:
# Read the Squad JSON file and keep only its top-level 'data' column
# (one entry per element, each carrying a 'paragraphs' list used below).
d = pd.read_json('data/squad2.json')['data']

In [117]:
# Flatten every paragraph's 'context' text into a single list, in file order.
# A paragraph's position in this list serves as its paragraph id.
all_paragraphs = [paragraph['context'] for ent in d for paragraph in ent['paragraphs']]

In [118]:
# Build (paragraph_id, question, first answer text) triples for every answerable
# question. A list comprehension is possible here but gets unwieldy, so use loops.
# paragraph_id counts paragraphs in the same order all_paragraphs was flattened,
# so it indexes directly into that list.
all_qa = []
paragraph_id = 0
for ent in d:
    for paragraph in ent['paragraphs']:
        for qa in paragraph['qas']:
            # Keep only questions that have at least one answer and are not
            # flagged as impossible.
            if qa['answers'] and not qa['is_impossible']:
                all_qa.append((paragraph_id, qa['question'], qa['answers'][0]['text']))
        # Advance once per paragraph, whether or not it contributed any question.
        paragraph_id += 1

In [119]:
len(all_qa)

86821

In [120]:
len(all_paragraphs)

19035

In [123]:
all_paragraphs[0]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [122]:
all_qa[0:5]

[(0, 'When did Beyonce start becoming popular?', 'in the late 1990s'),
 (0,
  'What areas did Beyonce compete in when she was growing up?',
  'singing and dancing'),
 (0,
  "When did Beyonce leave Destiny's Child and become a solo singer?",
  '2003'),
 (0, 'In what city and state did Beyonce  grow up? ', 'Houston, Texas'),
 (0, 'In which decade did Beyonce become famous?', 'late 1990s')]

In [124]:
# Persist both lists back-to-back in one pickle file. They must be read back
# with two pickle.load calls in this same order.
# The context manager closes the file even if a dump raises.
with open('data/paragraphs_and_qa.pickle', 'wb') as f:
    pickle.dump(all_paragraphs, f)
    pickle.dump(all_qa, f)

# Baseline OpenAI QA. Knowing exactly the paragraphs to use

In [125]:
# Reload the two lists in the order they were dumped: paragraphs first, then QA triples.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
# The context manager closes the file even if a load raises.
with open('data/paragraphs_and_qa.pickle', 'rb') as f:
    all_paragraphs = pickle.load(f)
    all_qa = pickle.load(f)

In [93]:
# Shared OpenAI client used by every request below, configured with up to
# 5 retries and a timeout of 10 (presumably seconds — confirm against the openai docs).
client = openai.OpenAI(max_retries=5,timeout=10)

In [90]:
# Module-level memo for question_answerer: maps a key built from the question and
# its context to the model's answer. Re-running this cell empties the cache.
qa_cache = {}

In [136]:
def question_answerer(client, question, context):
    """Answer `question` from `context` via a chat-completion call, memoized in qa_cache.

    Args:
        client: object exposing `chat.completions.create(...)` (the openai.OpenAI
            client in this notebook).
        question: the question text.
        context: the passage the model is told to answer from.

    Returns:
        The message content of the first returned choice. Repeated calls with the
        same (context, question) pair return the cached value without a new request.
    """
    # Key on the pair, not the concatenation: "ab" + "c" and "a" + "bc" would
    # otherwise share a single cache entry and return the wrong answer.
    key = (context, question)
    if key in qa_cache:
        return qa_cache[key]

    system_prompt = "You are an assistant for question-answering tasks. Use the provided pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Provide just the answer in as few words as possible. Do not use complete sentences."
    user_prompt = f"Question: {question} \nContext: {context} \nAnswer:"
    completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        model="gpt-3.5-turbo",
        # temperature 0 keeps answers as repeatable as the API allows
        temperature=0.0,
    )
    answer_text = completion.choices[0].message.content
    qa_cache[key] = answer_text
    return answer_text

In [129]:
# Sanity check: unpack the first QA triple and show it next to the paragraph
# that its paragraph id points at.
para_id, question, answer = all_qa[0]
print(all_paragraphs[para_id])
print(question)
print(answer)

Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
When did Beyonce start becoming popular?
in the late 1990s


In [137]:
# Ask the model the sample question, giving it the question's own paragraph as context.
response = question_answerer(client, question, all_paragraphs[para_id])
print(response)

Late 1990s


## Coming up with a simple comparison evaluation

In [66]:
@functools.cache
def is_same(client, question, a1, a2):
    """Ask the chat model whether two answers to `question` mean the same thing.

    Results are memoized by functools.cache, so every argument (including
    `client`) must be hashable, and a repeated call with the same arguments
    returns the stored result without a new request.

    Args:
        client: object exposing `chat.completions.create(...)`.
        question: the question both answers respond to.
        a1: first answer.
        a2: second answer.

    Returns:
        True if the model's reply is "Yes" (ignoring case, surrounding
        whitespace and a trailing period), otherwise False.
    """
    system_prompt = "You are an assistant for scoring answers. Two answers to a hypothetical question are provided. Say 'Yes' if both answers have the same meaning, and 'No' otherwise."
    # Each field goes on its own line. The original "\Answer 1" was a broken
    # escape that put a literal backslash in the prompt instead of a newline.
    user_prompt = f"Question: {question} \nAnswer 1: {a1} \nAnswer 2: {a2}"
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        model="gpt-3.5-turbo",
        temperature=0.0,
    )
    verdict = response.choices[0].message.content or ""
    # Tolerate "yes", "Yes." or stray whitespace rather than silently scoring
    # them as a mismatch.
    return verdict.strip().rstrip(".").lower() == "yes"
    

In [67]:
answer

'in the late 1990s'

In [70]:
# `question` is the sample question unpacked from all_qa[0]; `qn` was never defined.
is_same(client, question, answer, response)

True

In [72]:
# Negative check with a deliberately wrong decade; `qn` was never defined, so use `question`.
is_same(client, question, answer, "Late 1980s")

False

# Evaluate Everything
80k questions at about 1k tokens per question == ~ 0.0010 * 80000 or about $80. That is a bit pricey for a quick test. We will subsample every 2000th question (about 44 questions, roughly 0.05% of the set).

In [141]:
import numpy as np
# Evenly spaced subsample: take every 2000th QA triple, starting from the first.
sub_qa = all_qa[::2000]

In [139]:
import concurrent.futures

In [140]:
def do_work(client, question, true_answer, context):
    """Answer one question from its context, then grade the answer against the reference.

    Returns:
        A `(true_answer, model_response, is_match)` tuple, where `is_match` is
        whatever `is_same` returns for the reference and the model response.
    """
    predicted = question_answerer(client, question, context)
    return true_answer, predicted, is_same(client, question, true_answer, predicted)

In [142]:
# Run do_work for every sampled question on a small thread pool, then gather the
# model responses and their evaluations into parallel lists.
responses = []
evaluations = []
futures = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Submit everything first so up to 5 requests are in flight at once.
    for ent in sub_qa:
        para_id, question, true_answer = ent
        future = executor.submit(do_work, client, question, true_answer, all_paragraphs[para_id])
        futures.append(future)
    # Walk the futures in submission order (not completion order) so the printed
    # rows and the two result lists line up with sub_qa. fut.result() blocks until
    # that task is done and re-raises any exception it hit.
    for fut in futures:
        true_answer, response, evaluation = fut.result()
        print(f"Correct Answer: {true_answer}, Response: {response}, Eval: {evaluation}")
        responses.append(response)
        evaluations.append(evaluation)

Correct Answer: in the late 1990s, Response: Late 1990s, Eval: True
Correct Answer: Peter Oppenheimer, Response: Peter Oppenheimer, Eval: True
Correct Answer: gender and class, Response: gender and class, Eval: True
Correct Answer: guitar, Response: guitar, Eval: True
Correct Answer: CFS Leitrim in Ottawa, Response: CFS Leitrim in Ottawa., Eval: True
Correct Answer: The Campus, Response: The Campus, Eval: True
Correct Answer: Project Mercury, Response: Project Mercury, Eval: True
Correct Answer: incandescent, Response: Incandescent lighting, Eval: True
Correct Answer: early Christian liturgical music,, Response: Early Christian liturgical music., Eval: True
Correct Answer: September 21, 19 BC, Response: September 21, 19 BC., Eval: True
Correct Answer: ambiguity, Response: Ambiguity., Eval: True
Correct Answer: 1931, Response: 1931, Eval: True
Correct Answer: one of the top two, Response: Top two., Eval: True
Correct Answer: Electromagnetic Aircraft Launch System (EMALS), Response: Elec