In [43]:
import numpy as np
import pandas as pd
import json

In [44]:
# Load the raw training annotations.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default (which is not UTF-8 everywhere and can corrupt or reject
# non-ASCII text in the questions/answers).
with open('train_light.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [45]:
# Flatten the nested annotation structure into one flat record per answer:
#   - 'multipleQAs' annotations yield one record per disambiguated QA pair;
#   - 'singleAnswer' annotations yield one record with no rewrite (NaN).
# Only the first entry of each answer list is kept.
flattened_data = []

for item in data:
    qid, original_question = item['id'], item['question']

    for ann in item['annotations']:
        kind = ann['type']

        if kind == 'multipleQAs':
            # one record per rewrite of the ambiguous question
            flattened_data.extend(
                {
                    'id': qid,
                    'ambiguous_question': original_question,
                    'disambiguated_question': pair['question'],
                    'answer': pair['answer'][0],
                }
                for pair in ann['qaPairs']
            )
        elif kind == 'singleAnswer':
            # no rewrite exists; NaN marks the row so it can be filtered later
            record = {
                'id': qid,
                'ambiguous_question': original_question,
                'disambiguated_question': np.nan,
                'answer': ann['answer'][0],
            }
            flattened_data.append(record)

ambiguous_df = pd.DataFrame(flattened_data)

In [46]:
# Preview the first rows of the flattened frame to sanity-check its columns.
ambiguous_df.head()

Unnamed: 0,id,ambiguous_question,disambiguated_question,answer
0,-4469503464110108318,When did the simpsons first air on television?,When did the Simpsons first air on television ...,"April 19, 1987"
1,-4469503464110108318,When did the simpsons first air on television?,When did the Simpsons first air as a half-hour...,"December 17, 1989"
2,4790842463458965203,Who played george washington in the john adams...,,David Morse
3,-6631915997977101143,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par...",18 years of age
4,-6631915997977101143,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par...",19


In [47]:
# Total number of rows in the flattened frame (rows with and without a rewrite).
print(len(ambiguous_df))

19541


In [48]:
# Keep only the rows that carry a disambiguated rewrite; rows whose
# 'disambiguated_question' is NaN are discarded.
ambiguous_cleaned_df = ambiguous_df.dropna(subset=['disambiguated_question'])

# Compute the count in this cell: it was previously only assigned by a later
# cell, so printing it here raised NameError on a fresh kernel (Restart & Run All).
num_disambiguous = len(ambiguous_cleaned_df)
print("Number of disambiguous rewrites (length): ", num_disambiguous)

Number of disambiguous rewrites (length):  14082


In [49]:
# Build the final (ambiguous, disambiguated) pair frame directly from the
# flattened frame in one non-mutating chain. Dropping rows with NaN in any
# column also covers the 'disambiguated_question' filter, so the result is the
# same as filtering on that column first and then calling dropna() again.
# Avoiding inplace drops keeps the cell idempotent: re-running it no longer
# raises KeyError because 'id' / 'answer' were already removed.
ambiguous_cleaned_df = (
    ambiguous_df
    .dropna()
    .drop(columns=['id', 'answer'])
)
num_disambiguous = len(ambiguous_cleaned_df)
ambiguous_cleaned_df.head()

Unnamed: 0,ambiguous_question,disambiguated_question
0,When did the simpsons first air on television?,When did the Simpsons first air on television ...
1,When did the simpsons first air on television?,When did the Simpsons first air as a half-hour...
3,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par..."
4,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par..."
5,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par..."


In [50]:
# Report how many rows were kept and how many were filtered out.
# NOTE(review): both figures are row counts of the flattened frame, not counts
# of unique questions — confirm the wording of the messages is intended.
num_removed = len(ambiguous_df) - num_disambiguous
print("There are", num_disambiguous, "ambiguous questions with disambiguated versions.")
print(num_removed, "ambiguous questions were removed.")

There are 14082 ambiguous questions with disambiguated versions.
5459 ambiguous questions were removed.


In [51]:
# Write the cleaned frame to CSV; index=False omits the row index column.
# NOTE(review): the filename reads 'amib' — presumably 'ambig' was intended;
# left unchanged because other code may already read this path.
ambiguous_cleaned_df.to_csv('amib_disambig_train.csv', index=False)