# LLM Answers Merging

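This notebook merges the LLM-generated answers from different models (OpenAI, Gemini, and DeepSeek) into a single consolidated dataset with one row per question and one answer column per model, and saves it to `llm-dataset/eli5_all_llm_answers_merged.csv`.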

## Import Libraries

In [15]:
import pandas as pd
import os
from pathlib import Path

## Load Merged CSV Files

In [16]:
# Define paths to merged CSV files
openai_file = 'openai-output/eli5_chatgpt_answers_merged.csv'
gemini_file = 'gemini-output/eli5_gemini_answers_merged.csv'
deepseek_file = 'deepseek-output/eli5_deepseek_answers_merged.csv'


def load_answers(path, label, lead="", missing_msg=None):
    """Read one model's merged-answers CSV if it exists and report what was loaded.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    label : str
        Human-readable model name used in the progress message.
    lead : str, optional
        Text printed immediately before the "Loaded ..." message (used to
        insert a blank line between consecutive sources).
    missing_msg : str, optional
        Message printed when ``path`` does not exist. Defaults to
        ``"Warning: <path> not found"``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when the file is missing.
    """
    if not os.path.exists(path):
        print(missing_msg if missing_msg is not None else f"Warning: {path} not found")
        return None

    frame = pd.read_csv(path)
    print(f"{lead}Loaded {label} data: {len(frame)} rows")
    print(f"Columns: {frame.columns.tolist()}")
    return frame


# Each name is always (re)bound, to a DataFrame or to None, so a re-run with a
# missing file can never pick up a stale frame left over in the kernel.
df_openai = load_answers(openai_file, "OpenAI")
df_gemini = load_answers(gemini_file, "Gemini", lead="\n")
df_deepseek = load_answers(
    deepseek_file,
    "DeepSeek",
    lead="\n",
    missing_msg=f"\nNote: {deepseek_file} not found (will be included when available)",
)

# Keep only the sources that were actually found, in OpenAI -> Gemini -> DeepSeek order
dataframes = [frame for frame in (df_openai, df_gemini, df_deepseek) if frame is not None]

Loaded OpenAI data: 5000 rows
Columns: ['q_id', 'title', 'text', 'source']

Loaded Gemini data: 5000 rows
Columns: ['q_id', 'title', 'text', 'source']

Loaded DeepSeek data: 5000 rows
Columns: ['q_id', 'title', 'text', 'source']


## Merge All Datasets

In [17]:
# Stack every loaded source (long format) and pivot to one row per question
# with one answer column per source value.
if dataframes:
    df_all = pd.concat(dataframes, ignore_index=True)
    print(f"Total merged rows: {len(df_all)}")
    print(f"\nData by source:")
    print(df_all['source'].value_counts())

    # aggfunc='first' below keeps a single answer per (q_id, source) pair and
    # discards the rest without any notice, so report duplicates explicitly.
    n_duplicates = int(df_all.duplicated(subset=['q_id', 'source']).sum())
    if n_duplicates:
        print(f"\nWarning: {n_duplicates} duplicate (q_id, source) rows found; "
              f"only the first answer of each pair is kept")

    df_all['answer'] = df_all['text']

    # Create a pivot table with q_id as index and source as columns
    df_merged = df_all.pivot_table(
        index=['q_id', 'title'],
        columns='source',
        values='answer',
        aggfunc='first'
    ).reset_index()
    df_merged.columns.name = None  # Remove the 'source' label from column names

    # The pivot groups on (q_id, title): rows with a missing key are dropped and
    # a q_id whose title differs between sources is split across several rows.
    # Either case makes the row count diverge from the number of distinct q_ids.
    n_questions = df_all['q_id'].nunique()
    if len(df_merged) != n_questions:
        print(f"\nWarning: pivot produced {len(df_merged)} rows for {n_questions} "
              f"unique q_id values; check for missing or inconsistent titles")

    print(f"\n{'='*50}")
    print(f"Pivoted dataset shape: {df_merged.shape}")
    print(f"Columns: {df_merged.columns.tolist()}")
    print(f"\nFirst few rows:")
    display(df_merged.head())
else:
    # NOTE(review): df_merged is not (re)defined on this path, so the cells
    # below will fail on a fresh kernel when no input file was found.
    print("No data files found to merge!")

Total merged rows: 15000

Data by source:
source
chatgpt     5000
gemini      5000
deepseek    5000
Name: count, dtype: int64

Pivoted dataset shape: (5000, 5)
Columns: ['q_id', 'title', 'chatgpt', 'deepseek', 'gemini']

First few rows:


Unnamed: 0,q_id,title,chatgpt,deepseek,gemini
0,5lchat,Why there was a 'leap second' added to the end...,"Okay, so imagine you have a big clock that hel...",Think of Earth like a spinning top that's slow...,Imagine we have a super-duper perfect clock th...
1,5lcjq6,How do you claim undiscovered land?,Claiming undiscovered land means saying that a...,You can't just go and claim any empty land you...,"Imagine you're exploring a giant playground, a..."
2,5lcl43,Why do we fail to do realistic human CGI (like...,"Okay, so imagine you have a really cool toy th...",We can make amazing CGI animals and monsters b...,Imagine you have a picture of your mommy or da...
3,5lcr1h,Why is it that we calm down when we take a dee...,"When you take a deep breath, it's like giving ...","When you feel scared or upset, your body gets ...",You know how sometimes when you're super excit...
4,5lcsyf,Why does 1080p on a 4k TV look better than 108...,"Okay, imagine you have two different kinds of ...",Imagine your TV screen is made of tiny light s...,Imagine your TV screen is like a giant board m...


In [18]:
# Count the null cells in every column of the pivoted frame and report them
null_counts = df_merged.isna().sum()
print("Missing values per column:")
print(null_counts)


Missing values per column:
q_id        0
title       0
chatgpt     0
deepseek    0
gemini      0
dtype: int64


## Save Merged Dataset

In [19]:
# Create output directory if it doesn't exist
output_dir = 'llm-dataset'
os.makedirs(output_dir, exist_ok=True)

# Defensive drop of a 'num_answers' helper column. No visible cell creates it,
# and errors='ignore' makes this a no-op (apart from copying) when it is absent.
df_output = df_merged.drop(columns=['num_answers'], errors='ignore')

# Save the merged dataset (one row per question, no index column)
output_file = os.path.join(output_dir, 'eli5_all_llm_answers_merged.csv')
df_output.to_csv(output_file, index=False)

print(f"Total rows (questions): {len(df_output)}")
print(f"Total columns: {len(df_output.columns)}")
print(f"\nColumn names: {df_output.columns.tolist()}")
print(f"\nDataset summary:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df_output.info()

Total rows (questions): 5000
Total columns: 5

Column names: ['q_id', 'title', 'chatgpt', 'deepseek', 'gemini']

Dataset summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   q_id      5000 non-null   object
 1   title     5000 non-null   object
 2   chatgpt   5000 non-null   object
 3   deepseek  5000 non-null   object
 4   gemini    5000 non-null   object
dtypes: object(5)
memory usage: 195.4+ KB
None
