In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
def flatten_data(file_directory):
    """Load an annotated question file and flatten it to one row per answer.

    The JSON file is expected to hold a list of entries, each with the keys
    ``'id'``, ``'question'`` and ``'annotations'``. Every annotation carries a
    ``'type'``:

    * ``'multipleQAs'`` -- contributes one row per item in ``'qaPairs'``, using
      the pair's ``'question'`` as the disambiguated question and the first
      element of its ``'answer'`` list as the answer.
    * ``'singleAnswer'`` -- contributes a single row whose disambiguated
      question is ``NaN`` and whose answer is the first element of ``'answer'``.

    Annotations of any other type are skipped.

    Parameters
    ----------
    file_directory : str or path-like
        Path to the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``ambiguous_question``, ``disambiguated_question`` and
        ``answer``. The columns are present even when no rows were produced.
        An empty answer list yields ``NaN`` in ``answer`` instead of raising.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If an entry or annotation lacks one of the keys read above.
    """
    # Decode explicitly as UTF-8 so non-ASCII question text is read the same
    # way on every platform instead of using the locale's default codec.
    with open(file_directory, 'r', encoding='utf-8') as file:
        data = json.load(file)

    columns = ['id', 'ambiguous_question', 'disambiguated_question', 'answer']
    flattened_data = []

    # iterate through each entry in the data
    for entry in data:
        question_id = entry['id']
        ambiguous_question = entry['question']

        # check the type of annotations and process accordingly
        for annotation in entry['annotations']:
            if annotation['type'] == 'multipleQAs':
                for qa_pair in annotation['qaPairs']:
                    answers = qa_pair['answer']
                    flattened_data.append({
                        'id': question_id,
                        'ambiguous_question': ambiguous_question,
                        'disambiguated_question': qa_pair['question'],
                        # only the first listed answer is kept
                        'answer': answers[0] if answers else np.nan
                    })
            elif annotation['type'] == 'singleAnswer':
                answers = annotation['answer']
                flattened_data.append({
                    'id': question_id,
                    'ambiguous_question': ambiguous_question,
                    'disambiguated_question': np.nan,
                    'answer': answers[0] if answers else np.nan
                })

    # Passing the column list keeps the schema stable when no rows were found.
    ambiguous_df = pd.DataFrame(flattened_data, columns=columns)
    return ambiguous_df

In [3]:
# Flatten the training annotations into one row per (question, answer) record.
ambiguous_train_df = flatten_data(
    '../../data/train_light.json'
)
# Preview the first five rows.
ambiguous_train_df.head(n=5)

Unnamed: 0,id,ambiguous_question,disambiguated_question,answer
0,-4469503464110108318,When did the simpsons first air on television?,When did the Simpsons first air on television ...,"April 19, 1987"
1,-4469503464110108318,When did the simpsons first air on television?,When did the Simpsons first air as a half-hour...,"December 17, 1989"
2,4790842463458965203,Who played george washington in the john adams...,,David Morse
3,-6631915997977101143,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par...",18 years of age
4,-6631915997977101143,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par...",19


In [4]:
# Flatten the development annotations into one row per (question, answer) record.
ambiguous_dev_df = flatten_data(
    '../../data/dev_light.json'
)
# Preview the first five rows.
ambiguous_dev_df.head(n=5)

Unnamed: 0,id,ambiguous_question,disambiguated_question,answer
0,-807825952267713091,Who plays the doctor in dexter season 1?,,Tony Goldwyn
1,8266116451988110240,How often does spermatogeneis—the production o...,,usually continues uninterrupted until death
2,8266116451988110240,How often does spermatogeneis—the production o...,,constant
3,7336174019902289593,When was the first remote control tv invented?,,1950
4,7336174019902289593,When was the first remote control tv invented?,,1950


In [5]:
# Keep only the training rows that have a disambiguated question.
ambiguous_train_cleaned_df = ambiguous_train_df.dropna(
    axis='index',
    subset=['disambiguated_question'],
)
# Number of rows that survived the filter.
print(ambiguous_train_cleaned_df.shape[0])

14082


In [6]:
# Keep only the development rows that have a disambiguated question.
ambiguous_dev_cleaned_df = ambiguous_dev_df.dropna(
    axis='index',
    subset=['disambiguated_question'],
)
# Number of rows that survived the filter.
print(ambiguous_dev_cleaned_df.shape[0])

4856


In [7]:
# Build the cleaned frame from the raw training frame rather than from itself,
# so re-running this cell gives the same result instead of raising KeyError on
# columns that were already dropped. dropna() without a subset removes every
# row with any missing value, which includes rows lacking a disambiguated
# question.
ambiguous_train_cleaned_df = ambiguous_train_df.dropna()
num_train_disambiguous = len(ambiguous_train_cleaned_df)
# Keep only the (ambiguous, disambiguated) question pair; drop() returns a new
# frame, so nothing is mutated in place.
ambiguous_train_cleaned_df = ambiguous_train_cleaned_df.drop(columns=['id', 'answer'])
ambiguous_train_cleaned_df.head()

Unnamed: 0,ambiguous_question,disambiguated_question
0,When did the simpsons first air on television?,When did the Simpsons first air on television ...
1,When did the simpsons first air on television?,When did the Simpsons first air as a half-hour...
3,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par..."
4,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par..."
5,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par..."


In [8]:
# Build the cleaned frame from the raw development frame rather than from
# itself, so re-running this cell gives the same result instead of raising
# KeyError on columns that were already dropped. dropna() without a subset
# removes every row with any missing value, which includes rows lacking a
# disambiguated question.
ambiguous_dev_cleaned_df = ambiguous_dev_df.dropna()
num_dev_disambiguous = len(ambiguous_dev_cleaned_df)
# Keep only the (ambiguous, disambiguated) question pair; drop() returns a new
# frame, so nothing is mutated in place.
ambiguous_dev_cleaned_df = ambiguous_dev_cleaned_df.drop(columns=['id', 'answer'])
ambiguous_dev_cleaned_df.head()

Unnamed: 0,ambiguous_question,disambiguated_question
6,Why did the st louis cardinals move to arizona?,What ability caused the st louis cardinals mov...
7,Why did the st louis cardinals move to arizona?,What physical issue caused the st louis cardin...
8,Why did the st louis cardinals move to arizona?,What fan issue caused the st louis cardinals m...
13,Who is the current chairman of african union c...,Who is the 4th chairman of african union commi...
14,Who is the current chairman of african union c...,Who is the 3rd chairman of african union commi...


In [9]:
# Both figures count DataFrame rows (one per flattened record), not distinct
# ambiguous questions.
print(
    "There are",
    num_train_disambiguous,
    "ambiguous questions with disambiguated versions in the training set.",
)
# Rows in the raw training frame that did not survive cleaning.
print(
    ambiguous_train_df.shape[0] - num_train_disambiguous,
    "ambiguous questions were removed from the training set.",
)

There are 14082 ambiguous questions with disambiguated versions in the training set.
5459 ambiguous questions were removed from the training set.


In [10]:
# Both figures count DataFrame rows (one per flattened record), not distinct
# ambiguous questions.
print(
    "There are",
    num_dev_disambiguous,
    "ambiguous questions with disambiguated versions in the development set.",
)
# Rows in the raw development frame that did not survive cleaning.
print(
    ambiguous_dev_df.shape[0] - num_dev_disambiguous,
    "ambiguous questions were removed from the development set.",
)

There are 4856 ambiguous questions with disambiguated versions in the development set.
1776 ambiguous questions were removed from the development set.


In [11]:
# Persist the cleaned question pairs; index=False leaves the row index out of
# the CSV files.
ambiguous_train_cleaned_df.to_csv(
    path_or_buf='../../data/ambig_disambig_train.csv',
    index=False,
)
ambiguous_dev_cleaned_df.to_csv(
    path_or_buf='../../data/ambig_disambig_dev.csv',
    index=False,
)