In [78]:
import json
from transformers import T5Tokenizer, T5ForConditionalGeneration
import spacy
from utils import run_model, match_dates_based_on_precision, parse_date, month_dict
import pandas as pd
import random
from IPython.display import HTML

In [22]:
import warnings
# Suppress every warning for the rest of the kernel session so that library
# noise does not clutter cell output.
# NOTE(review): this is a blanket filter (no category/module given), so it also
# hides deprecation and numerical warnings — narrow it if those matter.
warnings.filterwarnings("ignore")

In [16]:
# Load the spaCy pipeline 'en_core_web_trf'. The resulting `nlp` object is a
# notebook global that `create_mcq_run_model` passes to `parse_date`.
nlp = spacy.load('en_core_web_trf')

In [2]:
# Checkpoint names of the three UnifiedQA-v2 T5 variants compared in this
# notebook (base, large, 3B). Each is passed to `from_pretrained` below.
base_model_v2 = 'allenai/unifiedqa-v2-t5-base-1251000'
large_model_v2  = 'allenai/unifiedqa-v2-t5-large-1251000'
threeb_model_v2 = 'allenai/unifiedqa-v2-t5-3b-1251000'

In [3]:
# Tokenizer and weights for the base checkpoint. Both are notebook globals
# read by `create_mcq_run_model`.
base_tokenizer = T5Tokenizer.from_pretrained(base_model_v2)
base_model = T5ForConditionalGeneration.from_pretrained(base_model_v2)

In [4]:
# Tokenizer and weights for the large checkpoint. Both are notebook globals
# read by `create_mcq_run_model`.
large_tokenizer = T5Tokenizer.from_pretrained(large_model_v2)
large_model = T5ForConditionalGeneration.from_pretrained(large_model_v2)

In [5]:
# Tokenizer and weights for the 3B checkpoint. Both are notebook globals
# read by `create_mcq_run_model`.
threeb_tokenizer = T5Tokenizer.from_pretrained(threeb_model_v2)
threeb_model = T5ForConditionalGeneration.from_pretrained(threeb_model_v2)

In [24]:
# Read the annotated question set. Missing cells are replaced with '' so that
# string operations such as `row['other_dates'].split(',')` work on every row.
annotated_df = pd.read_csv('../data/questions-for-annotation-annotated.csv').fillna('')
# Number of annotated questions loaded.
len(annotated_df)

100

In [25]:
# Preview the first rows to check the columns consumed by
# `create_mcq_run_model` (question, answer, precision, dpr_answer, other_dates).
annotated_df.head()

Unnamed: 0,question,answer,precision,dpr_answer,rank,filtered_rank,answerable,other_dates
0,when did Tommy Dorsey marry Patricia?,1943-01-01,year,Tommy Dorsey New Jersey. They divorced in 1943...,1,1,yes,"1956-11-26,1947-07-01,1948-03-27,2022-06-29"
1,when did Edward Everett Hale marry Emily Baldwin?,1852-01-01,year,"Edward Everett Hale Unity in Worcester, Massac...",1,1,yes,"1856-07-01,1853-07-01"
2,when did Denys Rhodes marry Margaret?,1950-07-31,day,Denys Rhodes Rhodes. Rhodes served in the Seco...,24,1,yes,"1946-07-01,1950-07-01"
3,when did Ronald Tree marry Marietta Peabody?,1947-01-01,year,"Ronald Tree improved, notably including coveri...",3,1,yes,"1942-09-26,1949-07-01,1943-07-01"
4,"when did Francis Jeune, 1st Baron St Helier ma...",1881-08-17,day,"Francis Jeune, 1st Baron St Helier Bath (KCB)....",1,1,yes,"1902-08-08,1905-02-01,1902-07-01,1902-06-26"


In [36]:
def create_other_dates(some_date, num_dates, based_on=None, max_year=2019):
    """Generate synthetic distractor dates by shifting one field of ``some_date``.

    A single field (day, month or year) is shifted by ``num_dates``,
    ``num_dates - 1``, ..., ``1``; the two remaining fields are copied through
    unchanged as strings.

    Unlike the previous implementation, shifts that run past the allowed range
    no longer collapse onto one value (day -> 1, month -> 1, year -> cap),
    which produced duplicate distractors and, for the year cap, could return
    ``some_date`` itself. Days and months now wrap around modulo their range,
    and years that would exceed ``max_year`` are shifted backwards instead.
    The generated dates are therefore distinct from each other and from
    ``some_date`` for ``num_dates`` up to 11 (month) / 25 (day).

    NOTE(review): the untouched fields are not re-validated, so a month shift
    can still yield a calendar-invalid date such as ``1950-04-31``.

    Args:
        some_date: date string with three ``-``-separated numeric fields,
            ``YYYY-MM-DD``.
        num_dates: how many dates to generate; values <= 0 yield ``[]``.
        based_on: ``'day'``, ``'month'`` or ``'year'`` to force which field is
            shifted. ``None`` (default) picks one at random, as before.
        max_year: latest year a generated date may fall in (default 2019, the
            cap that was previously hard-coded).

    Returns:
        list[str]: ``YYYY-MM-DD`` strings, largest shift first.

    Raises:
        ValueError: if ``based_on`` is not one of the accepted values, or if
            ``some_date`` does not split into exactly three numeric fields.
    """
    max_day = 26  # highest day the original logic ever produced (it wrapped at >= 27)

    if based_on is None:
        based_on = random.choice(['day', 'month', 'year'])
    elif based_on not in ('day', 'month', 'year'):
        raise ValueError(f"based_on must be 'day', 'month' or 'year', got {based_on!r}")

    year, month, day = some_date.split('-')
    new_dates = []
    # Count down so the output order matches the original while-loop.
    for offset in range(num_dates, 0, -1):
        if based_on == 'day':
            # Wrap within 1..max_day instead of collapsing every overflow to 1.
            new_day = (int(day) + offset - 1) % max_day + 1
            new_dates.append(f"{year}-{month}-{new_day:0>2}")
        elif based_on == 'month':
            # Wrap within 1..12 instead of collapsing every overflow to 1.
            new_month = (int(month) + offset - 1) % 12 + 1
            new_dates.append(f"{year}-{new_month:0>2}-{day}")
        else:
            new_year = int(year) + offset
            if new_year > max_year:
                # Clamping to max_year could reproduce the input date (or the
                # same distractor twice); step backwards instead.
                new_year = int(year) - offset
            new_dates.append(f"{new_year}-{month}-{day}")
    return new_dates

In [43]:
def format_dates(some_date, precision):
    """Render a ``YYYY-MM-DD`` string at the requested precision.

    Args:
        some_date: date string made of three ``-``-separated fields.
        precision: ``'day'`` gives ``"<day> <month name> <year>"``,
            ``'month'`` gives ``"<month name> <year>"``; any other value
            gives just the year.

    Returns:
        str: the formatted date. Month names are looked up in ``month_dict``
        using the month field exactly as it appears in ``some_date``.

    Raises:
        ValueError: if ``some_date`` does not split into exactly three fields.
    """
    yyyy, mm, dd = some_date.split('-')

    wants_day = precision == 'day'
    wants_month = precision == 'month'
    if not (wants_day or wants_month):
        # Year precision (and anything unrecognised) never touches month_dict.
        return f"{yyyy}"

    month_name = month_dict[mm]
    if wants_day:
        return f"{dd} {month_name} {yyyy}"
    return f"{month_name} {yyyy}"

In [69]:
def _build_mcq_options(answer, precision, other_dates_field, max_attempts=50):
    """Return the three dates behind options (A)-(C) of one question.

    The list holds up to two annotated distractors (in file order), then the
    correct answer, then synthetic padding from ``create_other_dates`` when
    fewer than two annotated distractors exist.

    Padding is only accepted if it renders differently from every option
    already chosen at the given ``precision``. Without this check a day- or
    month-shifted date shown at ``'year'`` precision would duplicate the
    answer (e.g. ``(B) 2012 (C) 2012``). If no distinct padding is found
    within ``max_attempts`` draws, unfiltered padding is used so the prompt
    always has three options.
    """
    options = [d for d in other_dates_field.split(',') if d != ''][:2]
    options.append(answer)

    shown = {format_dates(d, precision) for d in options}
    attempts = 0
    while len(options) < 3 and attempts < max_attempts:
        attempts += 1
        # Each call re-draws which field is shifted, so retrying can land on a
        # granularity that is visible at this precision.
        for candidate in create_other_dates(answer, 3 - len(options)):
            label = format_dates(candidate, precision)
            if label not in shown:
                options.append(candidate)
                shown.add(label)

    if len(options) < 3:
        options.extend(create_other_dates(answer, 3 - len(options)))
    return options


def create_mcq_run_model(annotated_df):
    """Build a multiple-choice prompt per row and score three models on it.

    For every row a prompt of the form
    ``"<question>\\n(A) .. (B) .. (C) .. (D) no answer\\n<dpr_answer>"`` is
    built and sent to the base, large and 3B models via ``run_model``. The
    first element of each model's output is stored, parsed with
    ``parse_date`` and, if at least one date was parsed, the first parsed date
    is compared to the gold answer with ``match_dates_based_on_precision``.

    NOTE(review): the options are not shuffled — the correct answer always
    follows the annotated distractors (option C whenever two exist), which may
    introduce a position bias.

    Args:
        annotated_df: DataFrame with at least the columns ``question``,
            ``answer`` (``YYYY-MM-DD`` string), ``precision``,
            ``other_dates`` (comma-separated ``YYYY-MM-DD`` strings, possibly
            empty) and ``dpr_answer``.

    Returns:
        pandas.DataFrame: one row per input row (same index) with the added
        columns ``mcq`` and, for each of ``base``/``large``/``threeb``,
        ``<name>_answer`` and ``<name>_matched`` (``False`` when no date
        could be parsed from the model output).
    """
    # (column prefix, tokenizer, model), evaluated in this order. These are
    # notebook globals created in the loading cells.
    models = (
        ('base', base_tokenizer, base_model),
        ('large', large_tokenizer, large_model),
        ('threeb', threeb_tokenizer, threeb_model),
    )

    out = []
    for _, row in annotated_df.iterrows():
        answer = row['answer']
        precision = row['precision']
        # Year-precision answers only need a year in the parsed prediction;
        # finer precisions also need a month.
        requires = ['year'] if precision == 'year' else ['year', 'month']

        options = _build_mcq_options(answer, precision, row['other_dates'])
        labels = [format_dates(option, precision) for option in options]
        mcq = f"(A) {labels[0]} (B) {labels[1]} (C) {labels[2]} (D) no answer"
        row['mcq'] = mcq
        i_string = f"{row['question']}\n{mcq}\n{row['dpr_answer']}"

        for prefix, tokenizer, model in models:
            generated = run_model(i_string, tokenizer, model)
            row[f'{prefix}_answer'] = generated[0]
            parsed_dates = parse_date(generated[0], nlp, requires=requires)
            if len(parsed_dates) > 0:
                matched, _ = match_dates_based_on_precision(answer, precision, parsed_dates[0])
                row[f'{prefix}_matched'] = matched
            else:
                row[f'{prefix}_matched'] = False

        out.append(row)
    return pd.DataFrame(out)
        
            
    

In [70]:
# Build the MCQ prompt for every annotated row and run the base, large and 3B
# models on it. The result adds `mcq`, `<model>_answer` and `<model>_matched`
# columns to the annotated data.
answer_df = create_mcq_run_model(annotated_df)

In [71]:
# Accuracy of the base model on answerable questions:
# (answerable rows whose prediction matched) / (all answerable rows).
is_answerable = answer_df['answerable'] == 'yes'
base_is_hit = answer_df['base_matched'] == True
len(answer_df[base_is_hit & is_answerable]) / len(answer_df[is_answerable])

0.8352941176470589

In [72]:
# Accuracy of the large model on answerable questions:
# (answerable rows whose prediction matched) / (all answerable rows).
is_answerable = answer_df['answerable'] == 'yes'
large_is_hit = answer_df['large_matched'] == True
len(answer_df[large_is_hit & is_answerable]) / len(answer_df[is_answerable])

0.9529411764705882

In [73]:
# Accuracy of the 3B model on answerable questions:
# (answerable rows whose prediction matched) / (all answerable rows).
is_answerable = answer_df['answerable'] == 'yes'
threeb_is_hit = answer_df['threeb_matched'] == True
len(answer_df[threeb_is_hit & is_answerable]) / len(answer_df[is_answerable])

0.9647058823529412

In [88]:
# Answerable questions the large model got wrong, kept for manual inspection.
is_answerable = answer_df['answerable'] == 'yes'
large_is_miss = answer_df['large_matched'] == False
ddf = answer_df[is_answerable & large_is_miss]
ddf

Unnamed: 0,question,answer,precision,dpr_answer,rank,filtered_rank,answerable,other_dates,mcq,base_answer,base_matched,large_answer,large_matched,threeb_answer,threeb_matched
47,when did Charlie Hunnam marry Katharine?,1999-01-01,year,Charlie Hunnam a screenplay based on a 2011 Ro...,69,1,yes,"2022-06-15,2011-07-01",(A) 2022 (B) 2011 (C) 1999 (D) no answer,2022,False,2022,False,2011,False
50,"when did Archduchess Maria Isabella, Countess ...",1850-04-10,day,"Archduchess Maria Isabella of Austria Spain, o...",2,1,yes,"1834-05-21,1901-07-14",(A) 21 May 1834 (B) 14 July 1901 (C) 10 April ...,21 May 1834,False,14 July 1901,False,10 April 1850,True
52,when did Dan Savage marry Terry?,2012-01-01,year,Dan Savage for two seasons before its cancella...,7,1,yes,2005-07-01,(A) 2005 (B) 2012 (C) 2013 (D) no answer,2005,False,2005,False,2005,False
96,when did Doutzen Kroes marry Sunnery James?,2010-01-01,year,Doutzen Kroes is actively involved with the no...,85,1,yes,"2011-01-21,2009-07-01",(A) 2011 (B) 2009 (C) 2010 (D) no answer,2010,True,2011,False,2010,True
