In [12]:
import json
from transformers import T5Tokenizer, T5ForConditionalGeneration
import spacy
import dateparser
import pandas as pds
from utils import parse_date, match_dates_based_on_precision

In [2]:
# Load the spaCy `en_core_web_trf` English pipeline once; it is passed to
# parse_date() further down when the generated answers are parsed.
nlp = spacy.load('en_core_web_trf')

In [3]:
# Sanity check: show which pipeline class spacy.load() returned.
type(nlp)

spacy.lang.en.English

In [4]:
# Hugging Face hub identifier of the UnifiedQA (T5-base) checkpoint loaded below.
model_name = "allenai/unifiedqa-t5-base" 

In [5]:
# Instantiate the tokenizer and the seq2seq model for `model_name`.
# On a machine without a local cache this downloads the checkpoint files
# (the progress bars in this cell's output).
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Downloading: 100%|████████████████████████████████████████████████████████████████████| 773k/773k [00:00<00:00, 2.65MB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████| 1.74k/1.74k [00:00<00:00, 766kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████| 25.0/25.0 [00:00<00:00, 12.0kB/s]
Downloading: 100%|███████████████████████████████████████████████████████████████████| 1.21k/1.21k [00:00<00:00, 563kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████| 850M/850M [00:17<00:00, 50.8MB/s]


In [44]:
def run_model(input_string, **generator_args):
    """Generate answer text for one prompt with the module-level UnifiedQA model.

    Args:
        input_string: The full prompt. Elsewhere in this notebook it is built as
            ``"<question>\\n<context>"``.
        **generator_args: Forwarded unchanged to ``model.generate``.

    Returns:
        list[str]: The decoded generations with special tokens stripped, one
        string per sequence returned by ``model.generate``.
    """
    # Relies on the notebook-level `tokenizer` and `model` objects.
    encoded_prompt = tokenizer.encode(input_string, return_tensors="pt")
    generated_sequences = model.generate(encoded_prompt, **generator_args)
    decoded_answers = tokenizer.batch_decode(generated_sequences, skip_special_tokens=True)
    return decoded_answers

In [7]:
# Smoke test on a single hand-written "<question>\n<context>" prompt before the full run.
run_model("when did Muhammad V of Kelantan marry Oksana?\n\"Muhammad V of Kelantan\" 2008. As a divorcee, he was the first Yang di-Pertuan Agong in Malaysian history to reign without a Raja Permaisuri Agong. On 22 November 2018, he married Rihana Oxana Gorbatenko [\"\"Cik\"\" Rihana binti Abdullah] (n\u00e9e Oksana Andreievna Gorbatenko on 20 January 1994) from Russia, a 2015 Miss Moscow beauty pageant winner, in Barvikha, Moscow Oblast. She is the elder daughter of Dr. Andrei Ivanovich Gorbatenko, of Taganrog, Rostov-on-Don, orthopaedic surgeon and traumatologist, sometime Assistant of the Department of Traumatology & Orthopedics of the Rostov State Medical University, originally from Astrakhan, by his wife, Lyudmila, n\u00e9e Voevodina, concert pianist. Sultan Muhammad")

['22 November 2018']

In [45]:
# Ground-truth question records; each one carries its retrieved DPR passages
# under 'matched_dpr_answers'. Opened in a `with` block so the file handle is
# closed as soon as the JSON has been parsed.
with open('../data/dpr-post-process-filtered-matched-gt.json') as gt_file:
    gt_questions = json.load(gt_file)

In [46]:
# Peek at one record to see the fields available (question, gold date, precision, DPR passages).
gt_questions[3131]

{'q': 'when did Sujarinee Vivacharawongse marry Vajiralongkorn I of?',
 'a': '1994-02-01',
 'n1': 'Q16140394',
 'n2': 'Q548733',
 'n1_label': 'Sujarinee Vivacharawongse',
 'n2_label': 'Vajiralongkorn I of Thailand',
 'url': 'http://en.wikipedia.org/wiki/Sujarinee_Vivacharawongse',
 'precision': '10',
 'spacy_matched': True,
 'matched_dpr_answers': [{'dpr_answer': '"Sujarinee Vivacharawongse"\nhaeng kam"" and ""Manut pralat"". In 1977 she co-starred in the films ""Sip ha yok yok sip hok mai yon"" and ""Lueat nai din"". In 1978 and 1979 she was a leading actress in the film ""Saen saep"", ""Ai thuek"" and ""Hua chai thi chom din"". In August 1979, she announced her retirement from the entertainment industry. When Vajiralongkorn was introduced to Yuvadhida Polpraserth, she was an aspiring actress. She became his steady companion and gave birth to four sons and a daughter: They were married at a palace ceremony in February 1994, where they were blessed by the King',
   'dpr_score': 78.2723

In [47]:
%%time
# Ask UnifiedQA to answer every question against each of its retrieved DPR passages.
# NOTE: the passage dicts are annotated in place, so `unified_answers` holds the very
# same (now mutated) record objects as `gt_questions`.
unified_answers = []
for question in gt_questions:
    q = question['q']
    answers = question['matched_dpr_answers']
    for a in answers:
        # 'dpr_answer' looks like '"<title>"\n<passage text>'; only the passage line is
        # used as context. If a passage has no title line, fall back to the whole string
        # instead of raising IndexError and aborting this long-running loop.
        dpr_parts = a['dpr_answer'].split('\n')
        dpr_a = dpr_parts[1] if len(dpr_parts) > 1 else dpr_parts[0]
        unified_qa_string = f"{q}\n{dpr_a}"
        unified_ans = run_model(unified_qa_string)
        a['unified_qa_answers'] = unified_ans
    unified_answers.append(question)
    

CPU times: user 14h 50min 40s, sys: 35min 32s, total: 15h 26min 13s
Wall time: 46min 24s


In [31]:
# Every question is appended above, so this should equal len(gt_questions).
len(unified_answers)

13355

In [48]:
# Run parse_date() over every generated answer and store the combined results on each
# passage under 'unified_parsed_dates'. Passages are annotated in place, so the records
# collected here are the same objects as in `unified_answers`.
parsed_unified_qa_answers = []
for q in unified_answers:
    for passage in q['matched_dpr_answers']:
        dates_for_passage = []
        for generated_answer in passage['unified_qa_answers']:
            found_dates = parse_date(generated_answer, nlp)
            # parse_date may return nothing usable; only non-empty results are kept.
            if found_dates:
                dates_for_passage.extend(found_dates)
        passage['unified_parsed_dates'] = dates_for_passage
    parsed_unified_qa_answers.append(q)

In [33]:
# Spot-check one annotated record.
parsed_unified_qa_answers[13300]

{'q': 'when did William Le Poer Trench marry Sarah?',
 'a': '1800-01-01',
 'n1': 'Q8014388',
 'n2': 'Q75301211',
 'n1_label': 'William Le Poer Trench',
 'n2_label': 'Sarah Cuppage',
 'url': 'http://en.wikipedia.org/wiki/William_Le_Poer_Trench_(Royal_Navy_officer)',
 'precision': '9',
 'matched_dpr_answers': [{'dpr_answer': '"William Le Poer Trench (Royal Navy officer)"\nCustoms and Port Duties in Ireland. He was married twice, first on 8 March 1800 to Sarah Cuppage, daughter of John Loftus Cuppage. Sarah died in June 1834, and on 1 February 1837 William married a second time to Margaret Downing, daughter of Dawson Downing and Anne Boyd. William Le Poer Trench (Royal Navy officer) Rear-Admiral The Hon. William Le Poer Trench (4 July 1771 – 14 August 1846) was born in Garbally, Galway, Ireland to William Power Keating Trench, 1st Earl of Clancarty and Anne Gardiner. He acted for a considerable period as the agent of the estates of',
   'dpr_score': 70.29373931884766,
   'dpr_docid': '125

In [34]:
# One entry is appended per input question, so this should equal len(unified_answers).
len(parsed_unified_qa_answers)

13355

In [49]:
# Persist the annotated records as one JSON array; match_dpr_dates() reads this file back.
# The `with` block guarantees the data is flushed and the handle closed before that read.
with open('../data/dpr-post-process-unified-qa.json', 'w') as unified_qa_file:
    unified_qa_file.write(json.dumps(parsed_unified_qa_answers))

21544360

In [36]:
# Lookup keyed by question text; match_dpr_dates() passes q_to_p_dict[question] to
# match_dates_based_on_precision(). Presumably the values are date precisions — confirm.
# Opened in a `with` block so the file handle is closed after parsing.
with open('../data/questions_o_precision_dict.json') as precision_file:
    q_to_p_dict = json.load(precision_file)

In [50]:
def match_dpr_dates(input_file, output_file, dpr_answer_field='matched_dpr_answers'):
    """Flag which passages yielded a UnifiedQA date that matches the gold answer.

    Reads a JSON array of question records from ``input_file``. For every passage
    under ``dpr_answer_field`` the dates in ``'unified_parsed_dates'`` are tried in
    order with ``match_dates_based_on_precision(gold, q_to_p_dict[question], date)``
    until one matches. Each passage gets ``'uqa_matched'`` (and ``'uqa_prov'`` on a
    match). A question record gets ``'uqa_matched'`` only when at least one of its
    passages matched. Every record is then written to ``output_file`` as one JSON
    object per line.

    Depends on the notebook-level ``q_to_p_dict`` and on
    ``match_dates_based_on_precision`` imported from ``utils``.

    Args:
        input_file: Path to the JSON file holding a list of question records.
        output_file: Path of the JSON-lines file to (over)write.
        dpr_answer_field: Key under which each record stores its passages.

    Returns:
        list: The question records for which no passage matched.

    Raises:
        KeyError: If a question text is missing from ``q_to_p_dict`` or a record
            lacks ``dpr_answer_field``.
    """
    # Read the whole input first so its handle is closed before writing starts.
    with open(input_file) as source:
        questions = json.load(source)

    unmatched_questions = []
    # The context manager ensures the output is flushed and closed even on error.
    with open(output_file, 'w') as out:
        for question in questions:
            any_match = False
            for ans in question[dpr_answer_field]:
                matched = False
                # A missing or empty date list leaves the passage unmatched.
                for candidate_date in ans.get('unified_parsed_dates') or []:
                    matched, prov = match_dates_based_on_precision(
                        question['a'],
                        q_to_p_dict[question['q']],
                        candidate_date)
                    if matched:
                        ans['uqa_prov'] = prov
                        any_match = True
                        break  # the first matching date is enough for this passage
                ans['uqa_matched'] = matched
            if any_match:
                # Question-level flag is only written for matched questions.
                question['uqa_matched'] = any_match
            else:
                unmatched_questions.append(question)
            out.write(json.dumps(question))
            out.write('\n')
    return unmatched_questions

In [51]:
# Match the parsed UnifiedQA dates against the gold answers: writes every annotated
# record to the .jl file and keeps the questions that had no matching passage.
no_match_unified_qa = match_dpr_dates('../data/dpr-post-process-unified-qa.json',
                    '../data/dpr-post-process-unified-qa-matched.jl')

In [52]:
# Number of questions for which no passage produced a date matching the gold answer.
len(no_match_unified_qa)

1639

In [41]:
# Inspect one unmatched question to see why no extracted date agreed with the gold answer.
no_match_unified_qa[3]

{'q': 'when did Joshua Kushner marry Karlie?',
 'a': '2018-10-01',
 'n1': 'Q6289985',
 'n2': 'Q292749',
 'n1_label': 'Joshua Kushner',
 'n2_label': 'Karlie Kloss',
 'url': 'http://en.wikipedia.org/wiki/Joshua_Kushner',
 'precision': '10',
 'matched_dpr_answers': [{'dpr_answer': '"Joshua Kushner"\nis a technology platform designed to help certain types of clients, such as family offices and endowments, invest in real estate. Kushner dated model Karlie Kloss since 2012. They got engaged in July 2018, after six years of dating. Kloss converted to Judaism (Kushner\'s faith) in 2018. Kushner and Kloss got married on October 18, 2018. Although he is the brother of President Trump\'s Senior Advisor and son-in-law, Jared Kushner, Joshua stated through his spokesman that ""he loved his brother and did not want to say anything that might embarrass him. Nevertheless, the spokesman also said that Josh is a',
   'dpr_score': 75.5737075805664,
   'dpr_docid': '14991307',
   'rank': 5,
   'parsed_dat

In [42]:
def calculate_mrr(input_file, answer_field='matched_dpr_answers'):
    """Print the mean reciprocal rank of the first date-matched passage per question.

    ``input_file`` is a JSON-lines file with one question record per line. For each
    record the answers under ``answer_field`` are scanned in stored order; the first
    one whose ``'uqa_matched'`` flag is truthy contributes ``1 / rank`` (taken from
    its ``'rank'`` value). Records without a matched answer contribute 0 but still
    count in the denominator.

    Prints the MRR followed by the number of records. Blank lines are skipped, and
    an input with no records reports an MRR of 0.0 instead of dividing by zero.

    Args:
        input_file: Path to the JSON-lines file to score.
        answer_field: Key under which each record stores its answers.

    Returns:
        dict: Maps each rank to the number of records whose first matched answer
        had that rank.
    """
    question_count = 0
    reciprocal_rank_sum = 0
    rank_distribution = {}
    # `with` closes the handle even if a record is malformed.
    with open(input_file) as records:
        for raw_line in records:
            payload = raw_line.strip()
            if not payload:
                # Tolerate blank lines rather than failing in json.loads.
                continue
            question_count += 1
            record = json.loads(payload)
            for answer in record[answer_field]:
                if answer['uqa_matched']:
                    rank = answer['rank']
                    reciprocal_rank_sum += 1.0 / float(rank)
                    rank_distribution[rank] = rank_distribution.get(rank, 0) + 1
                    break  # only the first matched answer counts for this question
    # Guard against an empty file, where the average is undefined.
    mrr = reciprocal_rank_sum / question_count if question_count else 0.0
    print(mrr)
    print(question_count)
    return rank_distribution

In [43]:
# Score the matched file: prints the MRR and the record count, and keeps the
# rank -> count distribution for the plot below.
rd = calculate_mrr('../data/dpr-post-process-unified-qa-matched.jl')

0.39311308231469877
13355


In [None]:
# NOTE(review): unexplained hard-coded ratio; neither number is derived anywhere in this
# notebook (the runs above report 13355 records). Presumably a coverage fraction from an
# earlier run — confirm, then compute it from data or remove this cell.
13382/14797

In [None]:
# One row per rank with the number of questions whose first matched passage had that
# rank, ordered by rank.
rank_df = pds.DataFrame(
    [{'rank': rank, 'count': count} for rank, count in rd.items()]
).sort_values(by='rank')

In [None]:
# Line chart of how many questions were first matched at each rank.
rank_df.plot(kind='line', x='rank', y='count', ylabel='count')