# ELI5 Dataset - ChatGPT Answer Generation

This notebook processes the ELI5 (Explain Like I'm 5) dataset and generates LLM answers using OpenAI's ChatGPT.

## Dataset Information
- **Source**: HuggingFace dataset `rexarski/eli5_category`
- **Period**: January 2017 - June 2021
- **Content**: Human-written questions and answers from the ELI5 subreddit
- **Purpose**: Generate one LLM answer per unique question and merge with human responses

####  **<span style="color:red">IMPORTANT:</span>**

**Workflow & Integration:**
1. This notebook: Load dataset → Test API → Generate ChatGPT answers → Merge with human answers
2. Model: gpt-4o-mini
3. Each unique question gets ONE LLM answer replicated across all human responses
4. **To combine with Gemini:**
   - Run this notebook first (generates human + chatgpt answers)
   - Run `gemini_generate_dataset_clean.ipynb` (generates gemini answers)
   - Use the merge function in Cell 10 to combine both without duplicating human answers
   - Final dataset: human + chatgpt + gemini sources

## 1. Install and Import Required Libraries

In [None]:
# Install required packages (run once)
# !pip install pandas numpy datasets
# !pip install openai
# !pip install python-dotenv
# !pip install gdown matplotlib seaborn tqdm
# !pip install fastparquet

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import json
import os
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

# OpenAI API
from openai import OpenAI

# Lift pandas' display limits so wide frames and long answer texts are shown in full
for display_option in ('display.max_columns', 'display.max_colwidth', 'display.width'):
    pd.set_option(display_option, None)

## 2. Set Up OpenAI API Key

In [None]:
import dotenv

# Pull variables from a local .env file (if one exists) into the environment,
# then read the key from there so it never has to be hardcoded in the notebook.
dotenv.load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Always bind `client`: without this, the missing-key path leaves the name
# undefined on a fresh kernel, and re-running the cell after removing the key
# would silently keep a stale client from the previous run.
client = None

if OPENAI_API_KEY:
    client = OpenAI(api_key=OPENAI_API_KEY)
    print("OpenAI API key configured successfully")
else:
    print("OpenAI API key not set")

## 3. Load the ELI5 Dataset

## Configuration - Set Parameters Here

In [None]:
# Run parameters -- edit these before executing the cells below
num_questions_to_generate = 1000   # upper bound on unique questions sent to the API
delay_between_api_calls = 0.7      # seconds
test_sample_size = 2               # questions used by the API smoke test
max_workers = 6  # Number of concurrent API calls

openai_model = "gpt-4o-mini"

# Echo the effective settings so they are recorded in the notebook output
print(
    "Configuration:",
    f"  Questions to generate: {num_questions_to_generate}",
    f"  Delay between calls: {delay_between_api_calls}s",
    f"  Max concurrent workers: {max_workers}",
    f"  Test sample size: {test_sample_size}",
    f"  Model: {openai_model}",
    sep="\n",
)

In [None]:
# Human-written answers, one row per (question, answer) pair
path = "./human_data/output/eli5_cleaned.csv"
df_human = pd.read_csv(path)
print(f"✓ Dataset loaded with {len(df_human)} records")

# Collapse to one row per distinct (q_id, title) pair
unique_questions = (
    df_human
    .loc[:, ['q_id', 'title']]
    .drop_duplicates()
    .reset_index(drop=True)
)
print(f"Found {len(unique_questions)} unique questions")
print(f"Average answers per question: {len(df_human) / len(unique_questions):.1f}")

In [None]:
# Preview the three leading rows (question id, question title, answer text)
print("\nFirst few rows of dataset:", "=" * 80, sep="\n")
df_human.head(3)[['q_id', 'title', 'text']]

## 4. Define LLM Answer Generation Function

In [None]:
def generate_chatgpt_answer(question, model="gpt-4o-mini", max_retries=3):
    """
    Generate an ELI5-style answer using ChatGPT.

    Failures are reported in-band: the function never raises for API errors
    and always returns a string. Callers detect failure by checking whether
    the result starts with "ERROR".

    Args:
        question: The question to answer
        model: OpenAI model to use (default: gpt-4o-mini)
        max_retries: Number of attempts before giving up. Values below 1
            make no API call and yield an error string.

    Returns:
        The generated answer with surrounding whitespace stripped, or a
        string beginning with "ERROR: " describing the last failure.
    """
    if not OPENAI_API_KEY:
        return "ERROR: OpenAI API key not configured"

    system_prompt = """You are answering questions in the style of the ELI5 (Explain Like I'm 5) subreddit. 
Provide a clear, simple explanation that a 5-year-old could understand, but still be informative.
Keep everything as one block of text."""

    user_prompt = f"Question: {question}\n\nAnswer:"

    # Reported if the loop below never runs (max_retries < 1); previously that
    # path returned None, which broke callers that call .startswith() on it.
    last_error = "no attempts were made (max_retries < 1)"

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0.7,
                max_tokens=1000,
            )
            content = response.choices[0].message.content
            # The API may return a message without text; treat that (and a
            # blank completion) as a retryable failure with a readable reason
            # rather than an AttributeError or an empty "answer".
            if content is None or not content.strip():
                raise ValueError("model returned no text content")
            return content.strip()

        except Exception as e:
            last_error = str(e)
            print(f"  Attempt {attempt + 1} failed: {last_error[:100]}")
            if attempt < max_retries - 1:
                # Exponential backoff: 1s, 2s, 4s, ...
                time.sleep(2 ** attempt)

    return f"ERROR: {last_error}"

## 5. Test API with Sample Questions

In [None]:
# Smoke-test the API on a few questions before committing to the full run
print(f"Testing API with {test_sample_size} sample questions...")
print("=" * 80)

test_questions = unique_questions.head(test_sample_size)

for idx, row in test_questions.iterrows():
    question = row['title']
    print(f"\n[Test {idx + 1}] {question[:70]}...")
    # Pass the configured model explicitly; otherwise the test exercises the
    # function's default model rather than the one selected in the config cell.
    answer = generate_chatgpt_answer(question, model=openai_model)

    # Failures come back in-band as strings starting with "ERROR"
    if answer.startswith("ERROR"):
        print(f"{answer}")
    else:
        print(f"Generated ({len(answer)} chars): {answer[:100]}...")

    # Short pause between the sequential test calls
    time.sleep(1)

print("\n" + "=" * 80)
print("API test complete!")

## 6. Generate LLM Answer for Each Unique Question

In [None]:
# Generate one LLM answer per unique question using parallel processing
llm_answers_map = {}  # Map from q_id to llm_answer

questions_to_process = unique_questions.head(min(num_questions_to_generate, len(unique_questions)))

def process_single_question(row):
    """
    Generate the answer for a single question row.

    Runs inside a worker thread. After each call the worker pauses for
    `delay_between_api_calls` seconds, so the pool as a whole issues at most
    `max_workers` requests per delay window. The pause lives here because a
    sleep in the result-consuming loop cannot slow down tasks that have
    already been submitted to the executor.

    Args:
        row: Row exposing 'q_id' and 'title'.

    Returns:
        (q_id, result, success) tuple. On success `result` is the generated
        answer; on failure it is an error message truncated for logging.
    """
    q_id = row['q_id']
    question = row['title']

    try:
        # Use the configured model so the saved metadata matches what was called
        llm_answer = generate_chatgpt_answer(question, model=openai_model)

        if not llm_answer.startswith("ERROR"):
            return (q_id, llm_answer, True)
        return (q_id, llm_answer[:80], False)
    except Exception as e:
        return (q_id, f"ERROR: {str(e)[:80]}", False)
    finally:
        # Per-worker throttle; runs on every exit path before the result is delivered
        time.sleep(delay_between_api_calls)

# Use ThreadPoolExecutor for concurrent API calls
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit all tasks up front; at most `max_workers` run at any moment
    futures = [executor.submit(process_single_question, row)
               for _, row in questions_to_process.iterrows()]

    # Collect results in completion order, with a progress bar
    with tqdm(total=len(futures), desc="Generating answers") as pbar:
        for future in as_completed(futures):
            q_id, result, success = future.result()

            if success:
                llm_answers_map[q_id] = result
            else:
                print(f"\nSkipped {q_id}: {result}")

            pbar.update(1)

print(f"\n{'=' * 80}")
print(f"Generated {len(llm_answers_map)} / {len(questions_to_process)} LLM answers")

## 7. Create LLM Answers Dataset

In [None]:
# Create LLM answers dataset
llm_columns = ['q_id', 'title', 'text', 'source']

# Build the q_id -> title lookup once instead of scanning `unique_questions`
# for every answer. Keeping the first row per q_id matches the previous
# `.iloc[0]` behaviour if a q_id ever appears with more than one title.
title_by_qid = (
    unique_questions
    .drop_duplicates(subset='q_id')
    .set_index('q_id')['title']
    .to_dict()
)

llm_rows_list = [
    {
        'q_id': q_id,
        'title': title_by_qid[q_id],
        'text': llm_answer,
        'source': 'chatgpt'
    }
    for q_id, llm_answer in llm_answers_map.items()
]

# Passing `columns` keeps the schema stable even when no answer was generated;
# otherwise an empty frame has no 'q_id' column and the summary below raises.
df_llm = pd.DataFrame(llm_rows_list, columns=llm_columns).reset_index(drop=True)

print("LLM Answers Dataset Summary:")
print("=" * 80)
print(f"Total rows: {len(df_llm)}")
print(f"Unique questions: {df_llm['q_id'].nunique()}")
print(f"Columns: {list(df_llm.columns)}")
print(f"\nDataset shape: {df_llm.shape}")

In [None]:
# Print a truncated preview of the first three generated answers
print("\nSample LLM answers:")
print("=" * 80)

for sample in df_llm.head(3).itertuples():
    # `sample.Index` is the row label, so numbering follows the frame's index
    print(f"\n[{sample.Index + 1}] Question: {sample.title[:70]}...")
    print(f"    Answer: {sample.text[:100]}...")
    print(f"    Source: {sample.source}")

## 8. Save LLM Answers Dataset

In [None]:
# Destination directory for everything this run produces
output_folder = "openai-output"
os.makedirs(output_folder, exist_ok=True)

# A single timestamp ties the CSV, Parquet and metadata files together
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# --- CSV copy ---
csv_filename = os.path.join(output_folder, "eli5_chatgpt_answers_" + timestamp + ".csv")
df_llm.to_csv(csv_filename, index=False)
print(f"CSV saved to: {csv_filename}")

# --- Parquet copy ---
parquet_filename = os.path.join(output_folder, "eli5_chatgpt_answers_" + timestamp + ".parquet")
df_llm.to_parquet(parquet_filename, index=False)
print(f"Parquet saved to: {parquet_filename}")

# --- Run metadata (counts are cast to plain int before JSON encoding) ---
metadata = dict(
    timestamp=timestamp,
    total_rows=int(len(df_llm)),
    unique_questions=int(df_llm['q_id'].nunique()),
    chatgpt_answers=int(len(df_llm)),
    llm_model=openai_model,
    max_workers=max_workers,
    columns=list(df_llm.columns),
)

metadata_filename = os.path.join(output_folder, "metadata_" + timestamp + ".json")
with open(metadata_filename, 'w') as metadata_file:
    metadata_file.write(json.dumps(metadata, indent=2))
print(f"Metadata saved to: {metadata_filename}")
