In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from utils import parse_date, match_dates_based_on_precision, month_dict, extract_node1_node2, create_other_dates, format_dates, precision_dict
import spacy
from glob import glob
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nlp = spacy.load('en_core_web_trf')

In [3]:
input_file = '../data/dpr-post-process-unified-qa-matched.jl'
annotated_file = '../data/questions-for-annotation-annotated.csv'
evaluation_input_file = '../data/annotated-questions-for-evaluation.tsv'
evaluation_output_file = '../data/eval_output/output.tsv-95000'

In [4]:
annotated_df = pd.read_csv(annotated_file)

In [5]:
annotated_questions = set(annotated_df['question'].unique())

In [6]:
len(annotated_questions)

100

In [19]:
def load_filter_questions(input_file):
    all_questions = list()
    with open(input_file) as f:
        for line in f:
            j = json.loads(line.strip())
            answer = j['a']
            question = j['q']
            if answer < '2020-01-01':
                if question not in annotated_questions:
                    all_questions.append(j)

    questions_with_spacy_matched = []
    for aq in all_questions:
        dpr_answers = aq['matched_dpr_answers']
        for da in dpr_answers:
            if da['matched']:
                aq['matched_dpr_answer'] = da['dpr_answer']
                aq.pop('matched_dpr_answers')
                questions_with_spacy_matched.append(aq)
                break
    return questions_with_spacy_matched

In [20]:
questions_with_spacy_matched = load_filter_questions(input_file)

In [21]:
len(questions_with_spacy_matched)

13254

In [22]:
questions_with_spacy_matched[34]

{'q': 'when did Frederick Hervey, 8th Marquess of Bristol marry Meredith?',
 'a': '2018-05-11',
 'n1': 'Q5498020',
 'n2': 'Q76353883',
 'n1_label': 'Frederick Hervey, 8th Marquess of Bristol',
 'n2_label': 'Meredith Dunn',
 'url': 'http://en.wikipedia.org/wiki/Frederick_Hervey,_8th_Marquess_of_Bristol',
 'precision': '11',
 'spacy_matched': True,
 'uqa_matched': True,
 'matched_dpr_answer': '"Frederick Hervey, 8th Marquess of Bristol"\nthe National Trust for not reselling what would have been the remaining term of that leasehold to him, arguing that the 7th Marquess could only sell his own life interest, not that of his descendants. This was disputed by the National Trust who have since converted the East Wing into a hotel. However, in 2009 Sir Simon Jenkins, the National Trust\'s new chairman, stated, ""I think it is in our interest for the Marquesses of Bristol to be living there."" On 11 May 2018 Lord Bristol married Meredith Dunn, an American art consultant, in a Roman Catholic wed

In [13]:
def pick_100_questions_for_manual_annotation():
    random_questions = random.choices(questions_with_spacy_matched, k=100)
    print(len(random_questions))
    random_df = pd.DataFrame(random_questions)
    random_df.rename(columns={'q': 'question', 
                              'a': 'answer', 
                              'matched_dpr_answer': 'dpr_answer_a'}, inplace=True)
    random_df['dpr_answer'] = random_df['dpr_answer_a'].map(lambda x: x.split('\n')[1])
    random_df['precision'] = random_df['precision'].map(lambda x: precision_dict.get(x))
    random_df.drop(columns=['n1', 'n2', 'n1_label', 'n2_label','url', 'spacy_matched', 'uqa_matched', 'dpr_answer_a'], inplace=True)
    r = pd.concat([annotated_df, random_df]).fillna("")
    r.to_csv(annotated_file_v2, index=False)
                   

In [34]:
pick_100_questions_for_manual_annotation()

100


In [23]:
def add_spacy_matched_answers(list_questions):
    c = 1
    for q in list_questions:
        if c%1000 == 0:
            print(c)
        c += 1
        precision = q['precision']
        spacy_dates = parse_date(q['matched_dpr_answer'], nlp, 'year') # precision year because we want to extract any dates
        for sd in spacy_dates:
            matched, prov  = match_dates_based_on_precision(q['a'], 'year', sd)
            if matched:
                q['spacy_matched_date'] =  sd['orig_date']
                break
    return list_questions

In [24]:
questions_with_spacy_matched_2 = add_spacy_matched_answers(questions_with_spacy_matched)

  date_obj = stz.localize(date_obj)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000


In [25]:
questions_with_spacy_matched_2[560]

{'q': 'when did Aaron Paul marry Lauren?',
 'a': '2013-05-26',
 'n1': 'Q302491',
 'n2': 'Q40459846',
 'n1_label': 'Aaron Paul',
 'n2_label': 'Lauren Parsekian',
 'url': 'http://en.wikipedia.org/wiki/Aaron_Paul',
 'precision': '11',
 'spacy_matched': True,
 'uqa_matched': True,
 'matched_dpr_answer': '"Aaron Paul"\nParis on January 1, 2012. The two met at Coachella in Indio, California. They were married on May 26, 2013, in a 1920s Parisian carnival-themed wedding, in Malibu, California; music was provided by Foster the People and John Mayer. Paul emailed the song ""Beauty"" by The Shivers to everyone on the guest list and asked them to learn the words so they could sing along during the ceremony. In September 2017, Paul announced that he and Parsekian were expecting their first child. Their daughter, Story, was born in February 2018. To commemorate the final episode of ""Breaking Bad"", Paul and',
 'spacy_matched_date': 'May 26, 2013'}

In [93]:
train, test = train_test_split(questions_with_spacy_matched_2, test_size=0.25)

In [94]:
print(len(train), len(test))

9865 3289


In [26]:
def create_unified_qa_model_input_files(list_questions, bart_file_name, t5_file_name):
    bart_o = open(bart_file_name, 'w')
    t5_o = open(t5_file_name, 'w')
    actual_answer = 0
    no_answer = 0
    ojson = []
    for q in list_questions:
        question = q['q']
        answer = q.get('spacy_matched_date', None)
        if answer is None:
            answer = '<no answer>'
            no_answer += 1
        dpr_a  = q['matched_dpr_answer'].split('\n')[1]
        w_dpr_a = ''
        n2_trunc = ''
        try:
            w_dpr_a = q['wrong_matched_dpr_answer'].split('\n')[1]
        except:
            w_dpr_a = q['wrong_matched_dpr_answer']
        w_d_dpr_a = q.get('wrong_date_dpr_answer', None)
        n2_trunc = q['n2_label_trunc']
        
        
        if n2_trunc in dpr_a:
            bart_o.write(f"{question}\\n{dpr_a}\t{answer}\n")
            ojson.append({'input': f"{question}\n{dpr_a}", 'output':answer, 'precision': q['precision']})
            actual_answer += 1
        else:
            bart_o.write(f"{question}\\n{dpr_a}\t<no answer>\n")
            ojson.append({'input': f"{question}\n{dpr_a}", 'output':"<no answer>", 'precision': q['precision']})
            no_answer += 1

        bart_o.write(f"{question}\\n{w_dpr_a}\t<no answer>\n")
        ojson.append({'input': f"{question}\n{w_dpr_a}", 'output':"<no answer>", 'precision': q['precision']})
        no_answer += 1
        if w_d_dpr_a is not None:
            bart_o.write(f"{question}\\n{w_d_dpr_a}\t<no answer>\n")
        
    print(actual_answer, no_answer)
    t5_o.write(json.dumps(ojson))
    bart_o.close()
    t5_o.close()
        

In [None]:
create_unified_qa_model_input_files(train, 
                                    '../data/unified_qa_input/v2/bart_train.tsv',
                                   '../data/unified_qa_input/v2/t5_train.json')

In [None]:
create_unified_qa_model_input_files(test, '../data/unified_qa_input/v2/bart_test.tsv',
                                   '../data/unified_qa_input/v2/f5_test.json')

In [None]:
def create_input_for_evaluation(annotated_file, evaluation_input_file):
    df = pd.read_csv(annotated_file)
    o = open(evaluation_input_file, 'w')
    for question, para in list(zip(df['question'], df['dpr_answer'])):
        fstring = f"{question}\\n{para}\n"
        o.write(fstring)
    o.close()
    

In [None]:
create_input_for_evaluation(annotated_file, evaluation_input_file)

In [60]:
def check_names_overlap(name1: str, name2: str):
    name_parts = name2.split()
    for name_part in name_parts:
        if name_part in name1:
            return True
    return False

In [97]:
def helper(question_set, node_set, create_wrong_date=False):
    c = 0
    new_set = []
    for q in question_set:
        c += 1
        if c%1000 == 0:
            print(c)
        node1, node2 = extract_node1_node2(q['q'])
        if create_wrong_date:
            precision = str(q['precision'])
            requires = ['year', 'month'] if precision in ('10','11') else ['year']
            parsed_dates =  parse_date(q['matched_dpr_answer'], nlp)
            wrong_date_dpr_answer = ''
            for pd in parsed_dates:
                matched, p = match_dates_based_on_precision(q['a'], precision, pd)
                if matched:
                    new_date = format_dates(create_other_dates(pd['date'], 1, based_on=precision)[0], precision)
                    # print(pd['date'], q['a'], new_date, precision)
                    wrong_date_dpr_answer = q['matched_dpr_answer'].replace(pd['orig_date'], new_date)
                    # print(wrong_date_dpr_answer)
                    break
            if wrong_date_dpr_answer != '':
                q['wrong_date_dpr_answer'] = wrong_date_dpr_answer
        q['n2_label_trunc'] = node2
        newn2 = ''
        spacy_docs = nlp(q['matched_dpr_answer'])
        spacy_persons = [x.text for x in spacy_docs.ents if x.label_ == 'PERSON']

        while True:
            newn2 = random.choice(node_set)
            # if newn2 != node1 and newn2 != node2:
            if not check_names_overlap(node1, newn2) and not check_names_overlap(node2, newn2):
                break
 
        found_node2 = False
        for sp in spacy_persons:
            if node2 in sp:
                q['wrong_matched_dpr_answer'] = q['matched_dpr_answer'].replace(sp, newn2)
                found_node2 = True
                break
        if not found_node2:
            q['wrong_matched_dpr_answer'] = q['matched_dpr_answer'].replace(node2, newn2)
            
        new_set.append(q)
    return new_set

In [98]:
def create_no_answer_training_dataset(train_set, test_set):
    
    train_nodes = set()
    test_nodes  = set()
    for question in train_set:
        node1, node2 = extract_node1_node2(question['q'])
        train_nodes.add(node2)
    for question in test_set:
        node1, node2 = extract_node1_node2(question['q'])
        test_nodes.add(node2)

    train_nodes = list(train_nodes)
    test_nodes = list(test_nodes)
    
    
    
    new_train_set = helper(train_set, train_nodes)
    new_test_set = helper(test_set, test_nodes)

    assert len(train_set) == len(new_train_set)
    assert len(test_set) == len(new_test_set)
    return new_train_set, new_test_set
        
        
        
        

In [100]:
new_train, new_test = create_no_answer_training_dataset(train, test)

1000
2000
3000
4000
5000
6000
7000
8000
9000
1000
2000
3000


In [103]:
create_unified_qa_model_input_files(new_train, '../data/unified_qa_input/v9/bart_train.tsv', '../data/unified_qa_input/v9/t5_train.json')

7620 12110


In [104]:
create_unified_qa_model_input_files(new_test, '../data/unified_qa_input/v9/bart_test.tsv', '../data/unified_qa_input/v9/t5_test.json')

2582 3997


In [77]:
create_unified_qa_model_input_files(train, '../data/unified_qa_input/v7/train.tsv')

In [78]:
create_unified_qa_model_input_files(test, '../data/unified_qa_input/v7/test.tsv')