# ELI5 Dataset Conversion

Converts the combined ELI5 JSON file into a CSV table with one row per answer (questions without answers are skipped).

In [1]:
import json
import pandas as pd
from pathlib import Path

## Conversion Function

In [2]:
def json_to_dataframe(json_file):
    """Flatten an ELI5 JSON dump into a DataFrame with one row per answer.

    The file must contain a JSON array of question entries. Each entry may
    carry ``q_id``, ``title``, ``category`` and ``subreddit`` fields plus an
    ``answers`` object holding parallel lists ``a_id``, ``text`` and ``score``.

    Parameters
    ----------
    json_file : str or path-like
        Path to the UTF-8 encoded JSON file.

    Returns
    -------
    pandas.DataFrame
        Columns ``q_id, title, category, subreddit, a_id, text, score`` (always
        present, even when no rows are produced). One row is emitted per answer
        id; questions without answers contribute no rows. Missing question
        fields, and ``text``/``score`` lists shorter than ``a_id``, are filled
        with ``''``.
    """
    columns = ['q_id', 'title', 'category', 'subreddit', 'a_id', 'text', 'score']

    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    rows = []
    for entry in data:
        # `or` (rather than a .get default) also covers explicit JSON nulls,
        # which would otherwise raise on `.get` / `len` below.
        answers = entry.get('answers') or {}
        a_ids = answers.get('a_id') or []
        if not a_ids:
            continue  # question has no answers -> nothing to emit

        texts = answers.get('text') or []
        scores = answers.get('score') or []

        # Question-level fields are repeated on every answer row.
        question = {
            'q_id': entry.get('q_id', ''),
            'title': entry.get('title', ''),
            'category': entry.get('category', ''),
            'subreddit': entry.get('subreddit', ''),
        }

        # a_id drives the row count; text/score are padded with '' if shorter.
        for i, a_id in enumerate(a_ids):
            rows.append({
                **question,
                'a_id': a_id,
                'text': texts[i] if i < len(texts) else '',
                'score': scores[i] if i < len(scores) else '',
            })

    # Passing `columns` keeps the schema stable even when `rows` is empty, so
    # downstream column lookups such as df['q_id'] never raise KeyError.
    return pd.DataFrame(rows, columns=columns)

## Get Combined JSON File

In [3]:
# Location of the combined ELI5 JSON file, relative to the notebook's working directory.
json_file = Path("output/eli5_combined.json")
# Bare last expression so the notebook displays the Path's repr.
json_file

WindowsPath('output/eli5_combined.json')

## Convert JSON to CSV

In [4]:
# The CSV is written next to the source JSON: same name, '.csv' suffix.
csv_file = json_file.with_suffix('.csv')

# Flatten the JSON to one row per answer, then save it without the
# DataFrame index so the CSV contains only the data columns.
df = json_to_dataframe(str(json_file))
df.to_csv(
    csv_file,
    index=False,
    encoding='utf-8',
)

## Preview Data

In [5]:
# Display the first 10 rows of the per-answer table as a quick sanity check.
df.head(10)

Unnamed: 0,q_id,title,category,subreddit,a_id,text,score
0,5lchat,Why there was a 'leap second' added to the end...,Other,explainlikeimfive,dbuoyxl,the rotation of the earth is not a constant. i...,44
1,5lchat,Why there was a 'leap second' added to the end...,Other,explainlikeimfive,dbur7gi,The Earth's rotation is not regular. It varies...,5
2,5lchat,Why there was a 'leap second' added to the end...,Other,explainlikeimfive,dbuotht,Because the Earth's rotation is slowing. If yo...,4
3,5lcjq6,How do you claim undiscovered land?,Other,explainlikeimfive,dbuplm8,Imagine you are out walking in the woods near ...,195
4,5lcjq6,How do you claim undiscovered land?,Other,explainlikeimfive,dbuocvb,"By force. Historically, nations have defended ...",39
5,5lcjq6,How do you claim undiscovered land?,Other,explainlikeimfive,dbux9vf,With a flag and an force/money to back the cla...,5
6,5lcl43,Why do we fail to do realistic human CGI (like...,Technology,explainlikeimfive,dbuns7l,It's more that we're really good at picking up...,34
7,5lcl43,Why do we fail to do realistic human CGI (like...,Technology,explainlikeimfive,dbunw2c,Probably because we are so adapted to human sk...,11
8,5lcl43,Why do we fail to do realistic human CGI (like...,Technology,explainlikeimfive,dbup34d,When was the last time you compared an Orc IRL...,7
9,5lcl43,Why do we fail to do realistic human CGI (like...,Technology,explainlikeimfive,dbuo2f1,It's a phenomenon known as the uncanny valley....,7


## Data Summary

In [6]:
# Headline counts: answer rows, distinct questions, distinct categories.
print(
    f"Total rows: {len(df)}",
    f"Unique questions: {df['q_id'].nunique()}",
    f"Unique categories: {df['category'].nunique()}",
    sep="\n",
)

Total rows: 261214
Unique questions: 105004
Unique categories: 12
