In [26]:
import numpy as np
import pandas as pd
import json
from sklearn.cluster import KMeans

In [27]:
def flatten_data_LARGE(file_directory):

    with open(file_directory, 'r') as file:
        data = json.load(file)

    flattened_data = []

    # iterate through each entry in the data
    for entry in data:
        question_id = entry['id']
        ambiguous_question = entry['question']

        # check the type of annotations and process accordingly
        for annotation in entry['annotations']:
            if annotation['type'] == 'multipleQAs':
                for qa_pair in annotation['qaPairs']:
                    flattened_data.append({
                        'id': question_id,
                        'viewed_doc_titles': entry['viewed_doc_titles'][0],
                        'ambiguous_question': ambiguous_question,
                        'disambiguated_question': qa_pair['question'],
                        'answer': qa_pair['answer'][0]
                    })
            elif annotation['type'] == 'singleAnswer':
                flattened_data.append({
                    'id': question_id,
                    'ambiguous_question': ambiguous_question,
                    'disambiguated_question': np.nan,
                    'answer': annotation['answer'][0]
                })

    ambiguous_df = pd.DataFrame(flattened_data)
    return ambiguous_df

In [28]:
file_path = '../../data/LARGE/train.json'
ambiguous_train_LARGE_df = flatten_data_LARGE(file_path)
ambiguous_train_LARGE_df.head()

Unnamed: 0,id,viewed_doc_titles,ambiguous_question,disambiguated_question,answer
0,-4469503464110108318,The Simpsons,When did the simpsons first air on television?,When did the Simpsons first air on television ...,"April 19, 1987"
1,-4469503464110108318,The Simpsons,When did the simpsons first air on television?,When did the Simpsons first air as a half-hour...,"December 17, 1989"
2,4790842463458965203,,Who played george washington in the john adams...,,David Morse
3,-6631915997977101143,Marriage age in the United States,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par...",18 years of age
4,-6631915997977101143,Marriage age in the United States,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par...",19


In [29]:
print(len(ambiguous_train_LARGE_df))

19541


In [30]:
def read_json_in_chunks(file_path, chunk_size=1000):
    with open(file_path, 'r') as file:
        chunk = []
        for line in file:
            chunk.append(json.loads(line))
            if len(chunk) >= chunk_size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk

# Example usage
for chunk in read_json_in_chunks(file_path):
    # Process each chunk here
    # For example, convert chunk to DataFrame and process
    df = pd.DataFrame(chunk)
    # Your processing logic...


In [31]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10026,10027,10028,10029,10030,10031,10032,10033,10034,10035
0,"{'viewed_doc_titles': ['The Simpsons'], 'used_...",{'viewed_doc_titles': ['John Adams (miniseries...,{'viewed_doc_titles': ['Marriage age in the Un...,"{'viewed_doc_titles': ['Barefoot in the Park',...",{'viewed_doc_titles': ['Timeline of the Manhat...,"{'viewed_doc_titles': ['Frozen Ever After'], '...","{'viewed_doc_titles': ['Deccan Plateau'], 'use...",{'viewed_doc_titles': ['Georgia Bulldogs baske...,{'viewed_doc_titles': ['Play That Funky Music'...,{'viewed_doc_titles': ['History of aerial warf...,...,"{'viewed_doc_titles': ['Lookin' for Love'], 'u...",{'viewed_doc_titles': ['Make You Feel My Love'...,{'viewed_doc_titles': ['Presiding Officer of t...,{'viewed_doc_titles': ['Friday the 13th (1980 ...,"{'viewed_doc_titles': ['Charles O'Hara', 'Char...",{'viewed_doc_titles': ['Category:Summer holida...,{'viewed_doc_titles': ['10 Things I Hate About...,{'viewed_doc_titles': ['List of people who wer...,"{'viewed_doc_titles': ['Wonder Woman', 'Wonder...","{'viewed_doc_titles': ['Air Jordan'], 'used_qu..."


In [32]:
# first_row = df.iloc[0][0]
# print(first_row)

In [33]:
file_path = '../../data/LARGE/dev.json'
ambiguous_dev_LARGE_df = flatten_data_LARGE(file_path)
ambiguous_dev_LARGE_df.head()

Unnamed: 0,id,ambiguous_question,disambiguated_question,answer,viewed_doc_titles
0,-807825952267713091,Who plays the doctor in dexter season 1?,,Tony Goldwyn,
1,8266116451988110240,How often does spermatogeneis—the production o...,,usually continues uninterrupted until death,
2,8266116451988110240,How often does spermatogeneis—the production o...,,constant,
3,7336174019902289593,When was the first remote control tv invented?,,1950,
4,7336174019902289593,When was the first remote control tv invented?,,1950,


In [34]:
print(len(ambiguous_dev_LARGE_df))

6632


In [35]:
ambiguous_train_LARGE_cleaned_df = ambiguous_train_LARGE_df.dropna(subset=['disambiguated_question'])
print(len(ambiguous_train_LARGE_cleaned_df))

14082


In [36]:
ambiguous_dev_LARGE_cleaned_df = ambiguous_dev_LARGE_df.dropna(subset=['disambiguated_question'])
print(len(ambiguous_dev_LARGE_cleaned_df))

4856


In [37]:
ambiguous_train_LARGE_cleaned_df = ambiguous_train_LARGE_cleaned_df.dropna()
num_train_disambiguous = len(ambiguous_train_LARGE_cleaned_df)
ambiguous_train_LARGE_cleaned_df.drop('id', axis=1, inplace=True)
ambiguous_train_LARGE_cleaned_df.drop('answer', axis=1, inplace=True)
ambiguous_train_LARGE_cleaned_df.head()

Unnamed: 0,viewed_doc_titles,ambiguous_question,disambiguated_question
0,The Simpsons,When did the simpsons first air on television?,When did the Simpsons first air on television ...
1,The Simpsons,When did the simpsons first air on television?,When did the Simpsons first air as a half-hour...
3,Marriage age in the United States,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par..."
4,Marriage age in the United States,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par..."
5,Marriage age in the United States,What is the legal age of marriage in usa?,"What is the legal age of marriage, without par..."


In [38]:
print(len(ambiguous_train_LARGE_cleaned_df))

14082


In [43]:
print("There are", len(ambiguous_train_LARGE_cleaned_df['viewed_doc_titles'].unique()), "topics.")

There are 4749 topics.


array(['The Simpsons', 'Marriage age in the United States',
       'Barefoot in the Park', ..., 'Category:Summer holidays',
       'Wonder Woman', 'Air Jordan'], dtype=object)