# ELI5 Dataset - ChatGPT Answer Generation

This notebook processes the ELI5 (Explain Like I'm 5) dataset and generates answers using OpenAI's ChatGPT.

## Dataset Information
- **Source**: HuggingFace dataset `rexarski/eli5_category`
- **Period**: January 2017 - June 2021
- **Content**: Human-written questions and answers from the ELI5 subreddit
- **Purpose**: Expand dataset with ChatGPT-generated answers for comparison and analysis

#### **<span style="color:red">IMPORTANT:</span>**
1. Finalize which columns to keep in the final dataset
    * Current: drop score & subreddit column ONLY
2. Finalize what models to use
    * Current: gpt-4o-mini
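3. The current ChatGPT df generation only saves the successful attempts to df_chatgpt: a failed question is skipped and the next question is tried, so df_chatgpt ends up shorter than the sample size only if too few questions succeed overall (i.e. sample size = 10, but only 8 questions succeed in total, final df length is 8)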

## 1. Install and Import Required Libraries

In [None]:
# Install required packages (run once)
!pip install pandas numpy datasets
!pip install openai
!pip install gdown matplotlib seaborn tqdm
!pip install fastparquet

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time
import json
import os
from datetime import datetime
import textwrap

# OpenAI API
from openai import OpenAI

# Display options: None removes pandas' limit on shown columns, cell text
# width and overall output width.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.width = None

## 2. Set Up OpenAI API Key

In [None]:
import dotenv

# Load variables from a local .env file (if any) into the process environment,
# then read the key from there so it never has to be written in the notebook.
dotenv.load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Always bind `client`: without this the name is undefined when the key is
# missing, and re-running the cell after removing the key would silently keep
# a client created by an earlier execution.
client = None

if OPENAI_API_KEY:
    client = OpenAI(api_key=OPENAI_API_KEY)
    print("OpenAI API key configured successfully")
else:
    print("Warning: OpenAI API key not set")

## 3. Load the ELI5 Dataset

#### **<span style="color:red">IMPORTANT:</span>**
You must run the scripts in the human_data folder to get the ELI5 Dataset

In [None]:
path="./human_data/output/eli5_combined.csv"

# Fail fast with an actionable message: the CSV is produced by the scripts in
# the human_data folder and does not exist on a fresh checkout.
if not os.path.isfile(path):
    raise FileNotFoundError(
        f"ELI5 dataset not found at {path!r}. "
        "Run the scripts in the human_data folder first to generate it."
    )

df = pd.read_csv(path)
print(f"Dataset loaded with {len(df)} records")
df.head()

## 4. Explore Dataset Structure

In [None]:
# Print the frame's schema summary (df.info()), then preview the first rows
print("\n".join(["=" * 80, "DATASET INFORMATION", "=" * 80]))
df.info()
print("\n".join(["", "=" * 80, "FIRST FEW ROWS", "=" * 80]))
df.head()

In [None]:
# Summary statistics for every column; include='all' requests non-numeric columns too
print("\n".join(["=" * 80, "STATISTICAL SUMMARY", "=" * 80]))
df.describe(include='all')

In [None]:
# Count missing values per column and show only the columns that have any
print("\n".join(["=" * 80, "MISSING VALUES", "=" * 80]))
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)
missing_df = pd.DataFrame({'Missing Count': missing, 'Percentage': missing_pct})
print(missing_df.loc[missing_df['Missing Count'].gt(0)])

## 5. Data Cleaning and Filtering

In [None]:
# Work on a copy so the raw frame `df` stays available for comparison.
df_clean = df.copy()

print(f"Original dataset size: {len(df_clean)} rows")

# 1. Remove duplicates (rows identical in every column)
df_clean = df_clean.drop_duplicates()
print(f"After removing duplicates: {len(df_clean)} rows")

# 2. Filter by text length (keep only answers with <= MAX_ANSWER_CHARS characters)
# Justification: mean is approx. 600 and median approx. 400 characters, while the
# longest answers are around 9000. The cap is measured in characters, not tokens.
# Rows whose 'text' is missing are dropped here as well: their length is NaN and
# the comparison below evaluates to False for them.
MAX_ANSWER_CHARS = 1000
df_clean = df_clean[df_clean['text'].str.len() <= MAX_ANSWER_CHARS]
print(f"After filtering text length: {len(df_clean)} rows")

# 3. Drop unnecessary columns (score and subreddit); errors='ignore' tolerates
# inputs that do not have them.
df_clean = df_clean.drop(columns=['score', 'subreddit'], errors='ignore')

# Guard the percentage so an empty input frame does not raise ZeroDivisionError.
removed_rows = len(df) - len(df_clean)
removed_pct = (removed_rows / len(df) * 100) if len(df) else 0.0

print(f"\nFinal cleaned dataset size: {len(df_clean)} rows")
print(f"Removed: {removed_rows} rows ({removed_pct:.2f}%)")

## 6. Load questions to an array

In [None]:
# One question (the row's title) per cleaned row, in df_clean's row order
questions = df_clean['title'].tolist()

print(f"Question num: {len(questions)}")

## 7. Generate Answers with ChatGPT

In [None]:
# Number of ChatGPT answers requested from batch_generate_chatgpt below;
# None makes it target one answer per question.
sample_size = 2

In [None]:
def generate_chatgpt_answer(question, model="gpt-4o-mini", max_retries=3):
    """
    Generate an ELI5-style answer using ChatGPT.

    Args:
        question: The question to answer
        model: OpenAI model to use (default: gpt-4o-mini)
        max_retries: Maximum number of attempts; must be at least 1

    Returns:
        The generated answer as a stripped, non-empty string. On failure a
        string starting with "ERROR:" is returned instead (never None), so
        callers can detect failures with a simple prefix check.
    """
    if not OPENAI_API_KEY:
        return "ERROR: OpenAI API key not configured"

    if max_retries < 1:
        # Without this guard the retry loop is skipped entirely and None
        # leaks out, which breaks callers that call .startswith on the result.
        return "ERROR: max_retries must be at least 1"

    system_prompt = """You are answering questions in the style of the ELI5 (Explain Like I'm 5) subreddit. 
Provide a clear, simple explanation that a 5-year-old could understand, but still be informative.
Keep everything as one block of text."""

    user_prompt = f"Question: {question}\n\nAnswer:"

    last_error = "no attempt was made"
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0.7,
                max_tokens=1000,
            )
            content = response.choices[0].message.content
            # The response content may be None or blank; treat that as a
            # failed attempt instead of returning an empty "answer".
            if content is None or not content.strip():
                raise ValueError("model returned an empty answer")
            return content.strip()

        except Exception as e:
            last_error = str(e)
            print(f"Attempt {attempt + 1} failed: {last_error}")
            if attempt < max_retries - 1:
                # Exponential backoff between attempts: 1s, 2s, 4s, ...
                time.sleep(2 ** attempt)

    return f"ERROR: {last_error}"

# Smoke test: one real request with a fixed question to confirm the setup works
print("TEST")
test_question = "Why is the sky blue?"
test_answer = generate_chatgpt_answer(test_question)
print(f"\nQuestion: {test_question}", f"Answer: {test_answer}", sep="\n")

In [None]:
# Generate ChatGPT answers for the dataset

def batch_generate_chatgpt(df, questions, sample_size=None, delay=1, max_consecutive_failures=5):
    """
    Generate ChatGPT answers for the questions, keeping only successful ones.

    Questions are processed in order. A failed question is skipped and the
    next one is tried, until `sample_size` answers were collected or the
    questions run out.

    Args:
        df: Source frame; row i must correspond to questions[i]
        questions: List of question strings
        sample_size: Number of successful answers wanted (None = all questions)
        delay: Seconds to wait between two consecutive requests
        max_consecutive_failures: Stop early after this many failures in a row,
            since that indicates a systemic problem (missing key, outage) and
            continuing would only walk through the whole dataset. None disables
            the check.

    Returns:
        (df_chatgpt, successful_indices): df_chatgpt holds a copy of each
        successful source row with 'text' replaced by the generated answer;
        successful_indices lists the positions of those rows in `df`.
    """
    # Never index past the end of either input.
    limit = min(len(questions), len(df))
    target_count = limit if sample_size is None else sample_size

    successful_rows = []
    successful_indices = []  # Track which rows succeeded
    consecutive_failures = 0
    idx = 0

    # Keep generating until we have enough successful answers
    with tqdm(total=target_count, desc="Generating ChatGPT answers") as pbar:
        while len(successful_rows) < target_count and idx < limit:
            answer = generate_chatgpt_answer(questions[idx])

            # A usable answer is a string that does not carry the ERROR prefix;
            # anything else (including None) counts as a failure.
            if isinstance(answer, str) and not answer.startswith("ERROR"):
                row = df.iloc[idx].copy()
                row['text'] = answer
                successful_rows.append(row)
                successful_indices.append(idx)
                consecutive_failures = 0
                pbar.update(1)
            else:
                consecutive_failures += 1
                print(f"\nSkipping row {idx} due to error")

            idx += 1

            if (max_consecutive_failures is not None
                    and consecutive_failures >= max_consecutive_failures):
                print(f"\nAborting after {consecutive_failures} consecutive failures")
                break

            # Only pause when another request will actually follow.
            if len(successful_rows) < target_count and idx < limit:
                time.sleep(delay)

    # Convert list of rows to DataFrame; keep the source columns even when
    # nothing succeeded so downstream cells still find e.g. the 'text' column.
    if successful_rows:
        df_chatgpt = pd.DataFrame(successful_rows).reset_index(drop=True)
    else:
        df_chatgpt = df.iloc[0:0].copy()

    print(f"\nSuccessfully generated {len(df_chatgpt)} answers")
    if len(df_chatgpt) < target_count:
        print(f"Warning: Only got {len(df_chatgpt)} successful answers out of {target_count} requested")

    return df_chatgpt, successful_indices

# Run the batch generation; change `sample_size` (assigned in an earlier cell)
# to control how many answers are requested.
df_chatgpt, chatgpt_indices = batch_generate_chatgpt(df_clean, questions, sample_size, delay=1)

In [None]:
# Inspect the first generated row: a copy of a df_clean row whose 'text' was
# replaced by the ChatGPT answer.
df_chatgpt.iloc[0]

## 7. Compare Human vs ChatGPT Answers

In [None]:
# Calculate answer lengths for both human and ChatGPT datasets
def calculate_answer_stats(df_human, df_chatgpt):
    """Summarise answer lengths (in characters) per answer source.

    Returns a dict keyed by 'human' and/or 'chatgpt'; a source is only present
    when its frame has a 'text' column. Each entry holds the mean, median,
    max and min length of the stringified 'text' values.
    """
    summary = {}

    for label, frame in (('human', df_human), ('chatgpt', df_chatgpt)):
        if 'text' not in frame.columns:
            continue

        # Values are cast to str first, so non-string cells are measured too.
        char_counts = frame['text'].astype(str).str.len()
        summary[label] = {
            'mean_length': char_counts.mean(),
            'median_length': char_counts.median(),
            'max_length': char_counts.max(),
            'min_length': char_counts.min()
        }

    return summary

# Compute the per-source length statistics
stats = calculate_answer_stats(df_clean, df_chatgpt)

# Print one section per source, every metric rounded to two decimals
print("\n".join(["=" * 80, "ANSWER LENGTH STATISTICS", "=" * 80]))
for source, metrics in stats.items():
    print(f"\n{source.upper()}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.2f}")

In [None]:
# Visualize answer length comparison

def plot_answer_length_comparison(df_human, df_chatgpt):
    """Plot answer-length distributions (box plot) and mean lengths (bar chart).

    A source is plotted only when its frame has a 'text' column and at least
    one row; if no source qualifies a message is printed and nothing is drawn.

    Args:
        df_human: Frame with the human answers in column 'text'
        df_chatgpt: Frame with the ChatGPT answers in column 'text'

    Returns:
        None. The figure is shown via plt.show().
    """
    # Prepare data for plotting. The colour is tied to the source so it stays
    # the same even when only one of the two sources is present.
    candidates = (
        ('Human', df_human, 'blue'),
        ('ChatGPT', df_chatgpt, 'green'),
    )
    plot_data = []
    for source, frame, color in candidates:
        # Skip empty frames: they would produce an empty box and a NaN mean.
        if 'text' not in frame.columns or frame.empty:
            continue
        plot_data.append({
            'source': source,
            'lengths': frame['text'].astype(str).str.len().tolist(),
            'color': color
        })

    if not plot_data:
        print("No data available for plotting yet.")
        return

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    sources = [d['source'] for d in plot_data]
    lengths = [d['lengths'] for d in plot_data]
    colors = [d['color'] for d in plot_data]

    # Box plot. Tick labels are set separately because boxplot's `labels`
    # keyword is deprecated in recent Matplotlib releases.
    axes[0].boxplot(lengths)
    axes[0].set_xticklabels(sources)
    axes[0].set_title('Answer Length Distribution by Source')
    axes[0].set_ylabel('Answer Length (characters)')
    axes[0].grid(True, alpha=0.3)

    # Bar chart of mean lengths
    mean_lengths = [np.mean(d['lengths']) for d in plot_data]
    axes[1].bar(sources, mean_lengths, color=colors)
    axes[1].set_title('Mean Answer Length by Source')
    axes[1].set_ylabel('Mean Length (characters)')
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Render the box plot and mean-length bar chart for the human and ChatGPT answers
plot_answer_length_comparison(df_clean, df_chatgpt)

In [None]:
# Sample comparison - View actual answers side by side
def compare_answers(df_human, df_chatgpt, index=0, width=100, source_indices=None):
    """Display a human answer and the ChatGPT answer for the same question.

    Args:
        df_human: Frame with the human answers ('title' and 'text' columns)
        df_chatgpt: Frame with the generated answers ('title' and 'text' columns)
        index: Position of the ChatGPT row to show
        width: Line width used for wrapping and for the separator rules
        source_indices: Optional list mapping each df_chatgpt position to the
            position of its source row in df_human (as returned by
            batch_generate_chatgpt). When given, it selects the human row.

    The generated frame skips rows whose generation failed, so position
    `index` in df_chatgpt is not guaranteed to be position `index` in
    df_human. Without `source_indices`, the positional human row is used only
    if its title matches the ChatGPT row's title; otherwise the first human
    row with the same title is shown instead.
    """
    row_chatgpt = df_chatgpt.iloc[index]

    if source_indices is not None:
        row_human = df_human.iloc[source_indices[index]]
    else:
        row_human = df_human.iloc[index]
        if row_human['title'] != row_chatgpt['title']:
            # Positions drifted apart; realign on the question text. If no
            # human row shares the title, fall back to the positional row.
            same_question = df_human[df_human['title'] == row_chatgpt['title']]
            if not same_question.empty:
                row_human = same_question.iloc[0]

    print(f"QUESTION: {row_human['title']}")
    print()

    print("-" * width)
    print("HUMAN ANSWER")
    print("-" * width)
    wrapped_text = textwrap.fill(str(row_human['text']), width=width)
    print(wrapped_text)
    print()

    print("-" * width)
    print("CHATGPT ANSWER")
    print("-" * width)
    wrapped_answer = textwrap.fill(str(row_chatgpt['text']), width=width)
    print(wrapped_answer)
    print()

# Print the question, a human answer and the ChatGPT answer for the first generated row
compare_answers(df_clean, df_chatgpt, index=0)

## 8. Save Enhanced Dataset

In [None]:
# Save the enhanced dataset with ALL human and ChatGPT answers combined and shuffled

# Prepare human answers dataset
df_human_labeled = df_clean.copy()
df_human_labeled['answer_source'] = 'human'

# Prepare ChatGPT answers dataset
df_chatgpt_labeled = df_chatgpt.copy()
df_chatgpt_labeled['answer_source'] = 'chatgpt'

# Combine ALL answers (human + ChatGPT)
df_combined = pd.concat([df_human_labeled, df_chatgpt_labeled], ignore_index=True)

# Shuffle the combined dataset; random_state pins the row order of the shuffle
np.random.seed(42)  # Set seed for reproducibility
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Count each source once and guard the percentages against an empty frame.
total_rows = len(df_combined)
human_count = int((df_combined['answer_source'] == 'human').sum())
chatgpt_count = int((df_combined['answer_source'] == 'chatgpt').sum())
human_pct = human_count / total_rows * 100 if total_rows else 0.0
chatgpt_pct = chatgpt_count / total_rows * 100 if total_rows else 0.0

print(f"Combined dataset created:")
print(f"  Total rows: {total_rows}")
print(f"  Human answers: {human_count} ({human_pct:.1f}%)")
print(f"  ChatGPT answers: {chatgpt_count} ({chatgpt_pct:.1f}%)")
print(f"  Columns: {list(df_combined.columns)}")
print()

# Create output folder if it doesn't exist
output_folder = "openai-output"
os.makedirs(output_folder, exist_ok=True)

# Generate filename with timestamp so earlier exports are never overwritten
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = os.path.join(output_folder, f"eli5_combined_{timestamp}.csv")

# Save to CSV
df_combined.to_csv(output_filename, index=False)
print(f"Combined dataset saved to: {output_filename}")

# Optional: Save as parquet for better compression and faster loading.
# This export is best-effort: a missing parquet engine must not prevent the
# summary below from being written after the CSV already exists.
parquet_filename = os.path.join(output_folder, f"eli5_combined_{timestamp}.parquet")
try:
    df_combined.to_parquet(parquet_filename, index=False)
    print(f"Combined dataset saved to: {parquet_filename}")
except ImportError as e:
    print(f"Skipping optional parquet export: {e}")

# Save summary statistics
summary_filename = os.path.join(output_folder, f"eli5_summary_{timestamp}.json")
summary = {
    'total_rows': total_rows,
    'columns': list(df_combined.columns),
    'human_answers': human_count,
    'chatgpt_answers': chatgpt_count,
    'timestamp': timestamp
}

with open(summary_filename, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print(f"Summary statistics saved to: {summary_filename}")
print("\n" + "=" * 80)
print("DATASET COMBINATION COMPLETE!")
print("=" * 80)