In [1]:
import numpy as np
import pandas as pd
import json

In [2]:

def read_json_in_chunks(file_path, chunk_size=1000):
    """Lazily yield the records of a JSON Lines file in lists of ``chunk_size``.

    Every non-blank line of the file is decoded with ``json.loads`` and
    counted as one record.  Blank or whitespace-only lines are skipped.

    Note: the input must hold one JSON document per line.  A file that stores
    a single JSON array on one line is NOT split up -- the whole array comes
    back as one "record".

    Args:
        file_path: Path of the JSON Lines file to read.
        chunk_size: Number of records per yielded list (default 1000).

    Yields:
        list: Decoded records in file order.  For ``chunk_size >= 1`` every
        chunk has ``chunk_size`` records except possibly the last one, which
        holds the remainder.  Nothing is yielded for a file with no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        chunk = []
        for line in file:
            # A blank line carries no record; json.loads('') would raise.
            if not line.strip():
                continue
            chunk.append(json.loads(line))
            if len(chunk) >= chunk_size:
                yield chunk
                chunk = []
        # Flush the final, possibly shorter, chunk.
        if chunk:
            yield chunk

# Example usage
# NOTE: `df` is reassigned on every iteration, so after the loop it holds only
# the LAST chunk (and is undefined if the generator yields nothing).
# NOTE(review): the recorded output is a single row with ~10k columns, which
# suggests this file is one JSON array on a single line rather than JSON
# Lines; flatten_data() reads the same path with json.load -- confirm.
for chunk in read_json_in_chunks('../../data/LIGHT/train_light.json'):
    # Process each chunk here
    # For example, convert chunk to DataFrame and process
    df = pd.DataFrame(chunk)
    # Your processing logic...
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10026,10027,10028,10029,10030,10031,10032,10033,10034,10035
0,"{'annotations': [{'type': 'multipleQAs', 'qaPa...","{'annotations': [{'type': 'singleAnswer', 'ans...","{'annotations': [{'type': 'multipleQAs', 'qaPa...","{'annotations': [{'type': 'multipleQAs', 'qaPa...","{'annotations': [{'type': 'multipleQAs', 'qaPa...","{'annotations': [{'type': 'singleAnswer', 'ans...","{'annotations': [{'type': 'singleAnswer', 'ans...","{'annotations': [{'type': 'multipleQAs', 'qaPa...","{'annotations': [{'type': 'multipleQAs', 'qaPa...","{'annotations': [{'type': 'singleAnswer', 'ans...",...,"{'annotations': [{'type': 'multipleQAs', 'qaPa...","{'annotations': [{'type': 'singleAnswer', 'ans...","{'annotations': [{'type': 'multipleQAs', 'qaPa...","{'annotations': [{'type': 'multipleQAs', 'qaPa...","{'annotations': [{'type': 'multipleQAs', 'qaPa...","{'annotations': [{'type': 'multipleQAs', 'qaPa...","{'annotations': [{'type': 'multipleQAs', 'qaPa...","{'annotations': [{'type': 'singleAnswer', 'ans...","{'annotations': [{'type': 'multipleQAs', 'qaPa...","{'annotations': [{'type': 'multipleQAs', 'qaPa..."


In [3]:
# Despite the name, this is a single cell: the value at row 0, column 0.
first_row = df.iloc[0, 0]
print(first_row)

{'annotations': [{'type': 'multipleQAs', 'qaPairs': [{'question': 'When did the Simpsons first air on television as an animated short on the Tracey Ullman Show?', 'answer': ['April 19, 1987']}, {'question': 'When did the Simpsons first air as a half-hour prime time show?', 'answer': ['December 17, 1989']}]}], 'id': '-4469503464110108318', 'question': 'When did the simpsons first air on television?'}


In [4]:
# Same lookup for the neighbouring cell: the value at row 0, column 1.
first_row = df.iloc[0, 1]
print(first_row)

{'annotations': [{'type': 'singleAnswer', 'answer': ['David Morse']}], 'id': '4790842463458965203', 'question': 'Who played george washington in the john adams series?'}


In [5]:
def flatten_data(file_directory):
    """Flatten an annotated question file into one row per answer entry.

    The file must contain a JSON array of entries, each with an ``id``, a
    ``question`` and a list of ``annotations``.  Annotations are expanded by
    their ``type``:

    * ``multipleQAs``: one row per item of ``qaPairs``; the item's question
      becomes ``disambiguated_question``.
    * ``singleAnswer``: one row with ``disambiguated_question`` set to NaN.
    * any other type: ignored.

    Only the first element of each ``answer`` list is kept; an empty list
    yields NaN instead of raising ``IndexError``.

    Args:
        file_directory: Path of the JSON file to read (a file path, despite
            the parameter name).

    Returns:
        pandas.DataFrame: Columns ``id``, ``ambiguous_question``,
        ``disambiguated_question`` and ``answer``, in that order.  The
        columns are present even when no rows are produced, so column-based
        operations on the result do not fail for empty input.
    """
    columns = ['id', 'ambiguous_question', 'disambiguated_question', 'answer']

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_directory, 'r', encoding='utf-8') as file:
        data = json.load(file)

    flattened_data = []

    # iterate through each entry in the data
    for entry in data:
        question_id = entry['id']
        ambiguous_question = entry['question']

        # check the type of annotations and process accordingly
        for annotation in entry['annotations']:
            if annotation['type'] == 'multipleQAs':
                for qa_pair in annotation['qaPairs']:
                    answers = qa_pair['answer']
                    flattened_data.append({
                        'id': question_id,
                        'ambiguous_question': ambiguous_question,
                        'disambiguated_question': qa_pair['question'],
                        'answer': answers[0] if answers else np.nan
                    })
            elif annotation['type'] == 'singleAnswer':
                answers = annotation['answer']
                flattened_data.append({
                    'id': question_id,
                    'ambiguous_question': ambiguous_question,
                    'disambiguated_question': np.nan,
                    'answer': answers[0] if answers else np.nan
                })

    # Passing the columns explicitly keeps the schema stable for empty input.
    ambiguous_df = pd.DataFrame(flattened_data, columns=columns)
    return ambiguous_df

In [6]:
# Flatten the training annotations: one row per QA pair or single answer.
ambiguous_train_df = flatten_data('../../data/LIGHT/train_light.json')
ambiguous_train_df.head()

Unnamed: 0,id,ambiguous_question,disambiguated_question,answer
0,-4469503464110108318,When did the simpsons first air on television?,When did the Simpsons first air on television ...,"April 19, 1987"
1,-4469503464110108318,When did the simpsons first air on television?,When did the Simpsons first air as a half-hour...,"December 17, 1989"
2,4790842463458965203,Who played george washington in the john adams...,,David Morse
3,-6631915997977101143,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par...",18 years of age
4,-6631915997977101143,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par...",19


In [7]:
# Number of flattened training rows (not distinct questions).
print(ambiguous_train_df.shape[0])

19541


In [8]:
# Flatten the development annotations: one row per QA pair or single answer.
ambiguous_dev_df = flatten_data('../../data/LIGHT/dev_light.json')
ambiguous_dev_df.head()

Unnamed: 0,id,ambiguous_question,disambiguated_question,answer
0,-807825952267713091,Who plays the doctor in dexter season 1?,,Tony Goldwyn
1,8266116451988110240,How often does spermatogeneis—the production o...,,usually continues uninterrupted until death
2,8266116451988110240,How often does spermatogeneis—the production o...,,constant
3,7336174019902289593,When was the first remote control tv invented?,,1950
4,7336174019902289593,When was the first remote control tv invented?,,1950


In [9]:
# Number of flattened development rows (not distinct questions).
print(ambiguous_dev_df.shape[0])

6632


In [10]:
# Keep only rows that carry a disambiguated question; rows built from
# 'singleAnswer' annotations have NaN in that column.
train_has_disambiguation = ambiguous_train_df['disambiguated_question'].notna()
ambiguous_train_cleaned_df = ambiguous_train_df.loc[train_has_disambiguation]
print(len(ambiguous_train_cleaned_df))

14082


In [11]:
# Keep only rows that carry a disambiguated question; rows built from
# 'singleAnswer' annotations have NaN in that column.
dev_has_disambiguation = ambiguous_dev_df['disambiguated_question'].notna()
ambiguous_dev_cleaned_df = ambiguous_dev_df.loc[dev_has_disambiguation]
print(len(ambiguous_dev_cleaned_df))

4856


In [12]:
# Drop rows with a missing value in any column, record how many rows remain,
# then keep only the question pair.  The columns are removed without
# inplace=True and with errors='ignore' so that re-running this cell is a
# no-op rather than a KeyError on the already-removed 'id'/'answer' columns.
ambiguous_train_cleaned_df = ambiguous_train_cleaned_df.dropna()
num_train_disambiguous = len(ambiguous_train_cleaned_df)
ambiguous_train_cleaned_df = ambiguous_train_cleaned_df.drop(
    columns=['id', 'answer'], errors='ignore'
)
ambiguous_train_cleaned_df.head()

Unnamed: 0,ambiguous_question,disambiguated_question
0,When did the simpsons first air on television?,When did the Simpsons first air on television ...
1,When did the simpsons first air on television?,When did the Simpsons first air as a half-hour...
3,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par..."
4,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par..."
5,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par..."


In [13]:
# Drop rows with a missing value in any column, record how many rows remain,
# then keep only the question pair.  The columns are removed without
# inplace=True and with errors='ignore' so that re-running this cell is a
# no-op rather than a KeyError on the already-removed 'id'/'answer' columns.
ambiguous_dev_cleaned_df = ambiguous_dev_cleaned_df.dropna()
num_dev_disambiguous = len(ambiguous_dev_cleaned_df)
ambiguous_dev_cleaned_df = ambiguous_dev_cleaned_df.drop(
    columns=['id', 'answer'], errors='ignore'
)
ambiguous_dev_cleaned_df.head()

Unnamed: 0,ambiguous_question,disambiguated_question
6,Why did the st louis cardinals move to arizona?,What ability caused the st louis cardinals mov...
7,Why did the st louis cardinals move to arizona?,What physical issue caused the st louis cardin...
8,Why did the st louis cardinals move to arizona?,What fan issue caused the st louis cardinals m...
13,Who is the current chairman of african union c...,Who is the 4th chairman of african union commi...
14,Who is the current chairman of african union c...,Who is the 3rd chairman of african union commi...


In [14]:
# Number of development rows left after cleaning.
print(ambiguous_dev_cleaned_df.shape[0])

4856


In [None]:
# Pull validation out of train instead of test



In [12]:
# Split the cleaned dev frame into two positional halves (row order is kept,
# nothing is shuffled): the first half becomes the test split and the second
# half replaces the dev split.
# NOTE: this cell rebinds ambiguous_dev_cleaned_df to its own second half, so
# it is not idempotent -- running it again halves the dev split once more.
# Rebuild ambiguous_dev_cleaned_df before re-running this cell.

# Calculate the index to split the DataFrame into two parts
halfway_idx = len(ambiguous_dev_cleaned_df) // 2
print(halfway_idx)
# Keep a reference to the unsplit frame; this must happen before the
# rebinding below.
ambiguous_dev_cleaned_full_df = ambiguous_dev_cleaned_df

# Split the DataFrame into two halves
ambiguous_test_cleaned_df = ambiguous_dev_cleaned_df.iloc[:halfway_idx]
ambiguous_dev_cleaned_df = ambiguous_dev_cleaned_df.iloc[halfway_idx:]
      
print(len(ambiguous_test_cleaned_df))
print(len(ambiguous_dev_cleaned_df))

2428
2428
2428


In [13]:
# Both figures are row counts of the flattened frame (one row per QA pair or
# single answer), not counts of distinct ambiguous questions.
num_train_removed = len(ambiguous_train_df) - num_train_disambiguous
print("There are", num_train_disambiguous, "ambiguous questions with disambiguated versions in the training set.")
print(num_train_removed, "ambiguous questions were removed from the training set.")

There are 14082 ambiguous questions with disambiguated versions in the training set.
5459 ambiguous questions were removed from the training set.


In [14]:
# Both figures are row counts of the flattened frame (one row per QA pair or
# single answer), not counts of distinct ambiguous questions.
num_dev_removed = len(ambiguous_dev_df) - num_dev_disambiguous
print("There are", num_dev_disambiguous, "ambiguous questions with disambiguated versions in the development set.")
print(num_dev_removed, "ambiguous questions were removed from the development set.")

There are 4856 ambiguous questions with disambiguated versions in the development set.
1776 ambiguous questions were removed from the development set.


In [15]:
# Write every split to its own CSV, without the index column.  At this point
# ambiguous_dev_cleaned_df is whatever the split cell left bound to it, and
# ambiguous_dev_cleaned_full_df is the frame as it was before that split.
csv_outputs = {
    '../../data/ambig_disambig_train.csv': ambiguous_train_cleaned_df,
    '../../data/ambig_disambig_dev.csv': ambiguous_dev_cleaned_df,
    '../../data/ambig_disambig_test.csv': ambiguous_test_cleaned_df,
    '../../data/ambig_disambig_dev_full.csv': ambiguous_dev_cleaned_full_df,
}
for csv_path, split_df in csv_outputs.items():
    split_df.to_csv(csv_path, index=False)

In [16]:
# Look up by index label 5 (labels survive the earlier row drops).
ambiguous_train_cleaned_df.loc[5, 'ambiguous_question']


'What is the legal age of marriage in usa?'

In [17]:
# Disambiguated counterpart of the row with index label 5.
ambiguous_train_cleaned_df.loc[5, 'disambiguated_question']


'What is the legal age of marriage, without parental consent or other authorization, in Mississippi?'

In [19]:
# Disambiguated question of the row with index label 6.
ambiguous_train_cleaned_df.loc[6, 'disambiguated_question']


'What is youngest legal age of marriage possible in some US states when circumstances permit?'