In [2]:
import pandas as pd
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration
from utils import run_model

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Checkpoint names of the three UnifiedQA-v2 T5 sizes compared in this notebook
# (each is passed to `from_pretrained` further down).
base_model_v2, large_model_v2, threeb_model_v2 = (
    'allenai/unifiedqa-v2-t5-base-1251000',
    'allenai/unifiedqa-v2-t5-large-1251000',
    'allenai/unifiedqa-v2-t5-3b-1251000',
)

In [4]:
# Load the tokenizer and the T5ForConditionalGeneration model for the
# checkpoint named by `base_model_v2`.
base_tokenizer = T5Tokenizer.from_pretrained(base_model_v2)
base_model = T5ForConditionalGeneration.from_pretrained(base_model_v2)

In [5]:
# Load the tokenizer and the T5ForConditionalGeneration model for the
# checkpoint named by `large_model_v2`.
large_tokenizer = T5Tokenizer.from_pretrained(large_model_v2)
large_model = T5ForConditionalGeneration.from_pretrained(large_model_v2)

In [6]:
# Load the tokenizer and the T5ForConditionalGeneration model for the
# checkpoint named by `threeb_model_v2`.
# NOTE(review): the base, large and 3b models are all kept loaded at once;
# presumably this needs a lot of memory -- confirm on the target machine.
threeb_tokenizer = T5Tokenizer.from_pretrained(threeb_model_v2)
threeb_model = T5ForConditionalGeneration.from_pretrained(threeb_model_v2)

In [7]:
# Matches questions shaped like "when did <A> marry <B>?".
# Group 1 captures <A> and group 2 captures <B>; they are later used to
# rephrase the question as "Did <A> marry <B> ...?".
# The pattern is case-sensitive and both groups are greedy.
q_regex = re.compile(r'when did (.*) marry (.*)[?]')

In [8]:
# Relative path of the annotated questions CSV that is read with pd.read_csv.
annotated_file  = '../data/questions-for-annotation-annotated.csv'

In [9]:
# Maps a zero-padded, two-character month number to its English month name.
month_dict = dict((
    ('01', 'January'),
    ('02', 'February'),
    ('03', 'March'),
    ('04', 'April'),
    ('05', 'May'),
    ('06', 'June'),
    ('07', 'July'),
    ('08', 'August'),
    ('09', 'September'),
    ('10', 'October'),
    ('11', 'November'),
    ('12', 'December'),
))

In [10]:
# Read the annotations, then replace missing cells with empty strings so that
# later string operations (such as .split on 'other_dates') never meet a NaN.
annotated_df = pd.read_csv(annotated_file)
annotated_df = annotated_df.fillna("")

In [11]:
def _date_phrase(date_str, precision):
    """Turn a ``YYYY-MM-DD`` string into the date phrase used in a question.

    Args:
        date_str: Date as three '-'-separated fields (year, month, day).
            Surrounding whitespace is ignored.
        precision: 'year' or 'month'; any other value is treated as day
            precision.

    Returns:
        "in <year>", "in <Month> <year>" or "on <day> <Month> <year>", with
        the month name looked up in ``month_dict`` and the day printed
        without zero padding.

    Raises:
        ValueError: if ``date_str`` does not split into exactly three fields,
            or (day precision) the day field is not an integer.
        KeyError: if (month or day precision) the month field is not a key of
            ``month_dict``.
    """
    year, month, day = date_str.strip().split('-')
    if precision == 'year':
        return f"in {year}"
    if precision == 'month':
        return f"in {month_dict[month]} {year}"
    # int() drops the zero padding ("01" -> 1).
    return f"on {int(day)} {month_dict[month]} {year}"


# Build yes/no questions from every answerable annotated row: one "yes"
# question for the annotated answer date and one "no" question for each date
# listed in 'other_dates', all phrased at the row's precision.
new_questions = []
for _row_label, row in annotated_df.iterrows():
    if row['answerable'] != 'yes':
        continue

    precision = row['precision']
    r_answer = _date_phrase(row['answer'], precision)

    rematches = q_regex.match(row['question'])
    if rematches is None:
        # Fail with the offending text instead of an AttributeError on None.
        raise ValueError(
            f"question does not match {q_regex.pattern!r}: {row['question']!r}"
        )
    node1, node2 = rematches.groups()

    # Fields shared by the positive and every negative question of this row.
    shared = {'dpr_answer': row['dpr_answer'], 'precision': precision}

    new_questions.append({
        'question': f'Did {node1} marry {node2} {r_answer}?',
        'answer': 'yes',
        **shared,
    })

    # 'other_dates' is a comma-separated list; it is "" when there are none.
    for od in row['other_dates'].split(','):
        od = od.strip()
        if not od:
            continue
        o_answer = _date_phrase(od, precision)
        new_questions.append({
            'question': f'Did {node1} marry {node2} {o_answer}?',
            'answer': 'no',
            **shared,
        })

            

In [12]:
# Total number of generated questions ('yes' and 'no' records together).
len(new_questions)

350

In [13]:
# Number of generated questions whose gold answer is 'yes'.
sum(1 for record in new_questions if record['answer'] == 'yes')

85

In [14]:
# Number of generated questions whose gold answer is 'no'.
sum(1 for record in new_questions if record['answer'] == 'no')

265

In [15]:
# Run every question through the three models and store the first returned
# answer of each under its own key on the question record.
# NOTE(review): the question and passage are joined with a real newline here;
# UnifiedQA's published examples appear to use a literal "\\n" separator --
# confirm which one `run_model` expects.
model_runs = (
    ('base_answer', base_tokenizer, base_model),
    ('large_answer', large_tokenizer, large_model),
    ('threeb_answer', threeb_tokenizer, threeb_model),
)
for record in new_questions:
    prompt = f"{record['question']}\n{record['dpr_answer']}"
    for answer_key, tokenizer, model in model_runs:
        record[answer_key] = run_model(prompt, tokenizer, model)[0]

In [20]:
# Spot-check one record (index 45): question, gold answer, passage, precision
# and the three model answers.
new_questions[45]

{'question': 'Did Lisa Hartman Black marry Clint in 2001?',
 'answer': 'no',
 'dpr_answer': 'Lisa Hartman Black My Heart Stops has already been reissued on CD on Wounded Bird Records. In May 2012, Hartman starred in  a movie from Twentieth Century Fox Home Entertainment. She plays the mother of a budding equestrian rider (Kacey Rohl). In 2005, Hartman starred in a made-for-TV film, Back to You and Me, on the Hallmark channel. Hartman grew up in Houston, Texas. In 1991, she married musician Clint Black; and, in 2001, the couple had a daughter, Lily Pearl Black. They have lived in Nashville, Tennessee since 2002 after living in Laurel Canyon, Los Angeles, California. Lisa Hartman Black Lisa',
 'precision': 'year',
 'base_answer': 'no',
 'large_answer': 'no',
 'threeb_answer': 'no'}

In [21]:
# One row per question record; the columns come from the record dict keys.
new_questions_df = pd.DataFrame(new_questions)

In [22]:
# Show the assembled table of questions, gold answers and model answers.
new_questions_df

Unnamed: 0,question,answer,dpr_answer,precision,base_answer,large_answer,threeb_answer
0,Did Tommy Dorsey marry Patricia in 1943?,yes,Tommy Dorsey New Jersey. They divorced in 1943...,year,yes,yes,yes
1,Did Tommy Dorsey marry Patricia in 1956?,no,Tommy Dorsey New Jersey. They divorced in 1943...,year,no,no,no
2,Did Tommy Dorsey marry Patricia in 1947?,no,Tommy Dorsey New Jersey. They divorced in 1943...,year,no,no,no
3,Did Tommy Dorsey marry Patricia in 1948?,no,Tommy Dorsey New Jersey. They divorced in 1943...,year,yes,no,no
4,Did Tommy Dorsey marry Patricia in 2022?,no,Tommy Dorsey New Jersey. They divorced in 1943...,year,no,no,no
...,...,...,...,...,...,...,...
345,Did Carol Wayne marry Burt in 1970?,no,"Carol Wayne Heartbreakers. In January 1984, Wa...",year,no,no,no
346,Did Carol Wayne marry Burt in 1984?,no,"Carol Wayne Heartbreakers. In January 1984, Wa...",year,no,no,no
347,Did Ava Gardner marry Mickey on 10 January 1942?,yes,Ava Gardner Knots Landing (both 1985). Soon af...,day,yes,yes,yes
348,Did Ava Gardner marry Mickey on 1 July 1943?,no,Ava Gardner Knots Landing (both 1985). Soon af...,day,no,no,no


In [23]:
def _prediction_matches(row, prediction_column):
    """Return whether ``row['answer']`` equals ``row[prediction_column]``."""
    return row['answer'] == row[prediction_column]


def b_match_answer(row):
    """Return True when the base model's answer equals the gold answer."""
    return _prediction_matches(row, 'base_answer')


def l_match_answer(row):
    """Return True when the large model's answer equals the gold answer."""
    return _prediction_matches(row, 'large_answer')


def t_match_answer(row):
    """Return True when the 3b model's answer equals the gold answer."""
    return _prediction_matches(row, 'threeb_answer')

In [24]:
# Flag, per model, whether its answer equals the gold answer.
# Whole columns are compared at once rather than calling a Python function
# per row via apply(axis=1); the resulting booleans are the same, and the
# comparison is also well-defined when the frame has no rows.
new_questions_df['bm'] = new_questions_df['answer'] == new_questions_df['base_answer']
new_questions_df['lm'] = new_questions_df['answer'] == new_questions_df['large_answer']
new_questions_df['tm'] = new_questions_df['answer'] == new_questions_df['threeb_answer']

In [25]:
# Overall accuracy of the base model: share of rows whose 'bm' flag is True.
base_correct = new_questions_df[new_questions_df['bm'] == True]
print(len(base_correct) / len(new_questions_df))

0.8628571428571429


In [26]:
# Overall accuracy of the large model: share of rows whose 'lm' flag is True.
large_correct = new_questions_df[new_questions_df['lm'] == True]
print(len(large_correct) / len(new_questions_df))

0.9028571428571428


In [27]:
# Overall accuracy of the 3b model: share of rows whose 'tm' flag is True.
threeb_correct = new_questions_df[new_questions_df['tm'] == True]
print(len(threeb_correct) / len(new_questions_df))

0.9742857142857143


In [28]:
# Accuracy per precision level: for each group print one line per model
# (base, large, 3b in that order) with the group name, its size and the
# share of rows flagged as correct.
for precision_level, group in new_questions_df.groupby(by='precision'):
    group_size = len(group)
    for flag in ('bm', 'lm', 'tm'):
        correct = group[group[flag] == True]
        print(precision_level, group_size, len(correct) / group_size)

day 128 0.859375
day 128 0.9375
day 128 0.9609375
month 10 1.0
month 10 0.8
month 10 0.9
year 212 0.8584905660377359
year 212 0.8867924528301887
year 212 0.9858490566037735


In [1]:
# NOTE(review): the recorded output of this cell is a NameError because it was
# executed with execution count 1, i.e. before the cells that define
# new_questions_df had run in that kernel session. Run the notebook top to
# bottom to get the table here.
new_questions_df

NameError: name 'new_questions_df' is not defined