# VERL Data Preprocessing Notebook

This notebook helps you prepare datasets for RL training with verl.

## Supported Datasets
- **GSM8K** - Grade school math problems
- **MATH** - Competition-level math problems
- **HH-RLHF** - Helpful and Harmless dialogue
- **Custom datasets** - Your own data

## Output Format
All datasets are converted to **Parquet** format with the following columns:
- `data_source` (required): Dataset name
- `prompt` (required): Input prompt
- `ability` (optional): Task category
- Other metadata fields (optional), e.g. `reward_model`, `extra_info`

## How to Use
1. Run the installation cell
2. Choose and run the section for your dataset
3. The processed data will be saved to the specified output directory

---
## Installation

In [None]:
# Install required packages
!pip install datasets pandas pyarrow huggingface_hub -q

print("✅ Dependencies installed!")

---
## Section 1: GSM8K Dataset

Grade School Math 8K (GSM8K) - grade-school-level math word problems.

**Source**: HuggingFace `openai/gsm8k`

In [None]:
from datasets import load_dataset
import pandas as pd
import os

# ===================================================================
# GSM8K CONFIGURATION - EDIT OUTPUT PATH
# ===================================================================

# Settings for the GSM8K cell. 'output_dir' is the value meant to be edited;
# the other two entries are handed to load_dataset() below.
GSM8K_CONFIG = {
    'output_dir': os.path.expanduser('~/data/gsm8k'),  # Edit this ('~' is expanded to the home directory)
    'dataset_name': 'openai/gsm8k',  # First argument to load_dataset()
    'subset': 'main',  # Second argument to load_dataset()
}

# ===================================================================

# Create the output directory up front; exist_ok=True means an already
# existing directory is not an error, so the cell can be re-run.
os.makedirs(GSM8K_CONFIG['output_dir'], exist_ok=True)

# Load dataset
print(f"Loading GSM8K dataset from HuggingFace...")
dataset = load_dataset(GSM8K_CONFIG['dataset_name'], GSM8K_CONFIG['subset'])

# The loaded object is indexed by split name; 'train' and 'test' are the two
# splits this cell reads.
print(f"Train samples: {len(dataset['train'])}")
print(f"Test samples: {len(dataset['test'])}")

# Convert one GSM8K split (train or test) to verl records
def process_gsm8k(examples):
    """Convert GSM8K examples into verl-format records.

    Args:
        examples: Column-addressable container (one split of the loaded
            dataset, or a plain dict of lists) exposing parallel 'question'
            and 'answer' sequences.

    Returns:
        list[dict]: One record per example with the keys 'data_source',
        'prompt', 'ability', 'reward_model' and 'extra_info'.
        reward_model['ground_truth'] is the final answer that follows the
        '####' marker (thousands separators removed); extra_info['answer']
        keeps the complete, unmodified answer text.
    """
    processed = []

    for question, answer in zip(examples['question'], examples['answer']):
        # GSM8K answers consist of the worked solution followed by
        # "#### <final answer>". A rule-based reward has to compare against
        # the final value only, so split it off here; storing the whole
        # solution text as ground truth would never match a model's answer.
        _, marker, final_answer = answer.rpartition('####')
        final_answer = final_answer.strip().replace(',', '')
        if marker and final_answer:
            ground_truth = final_answer
        else:
            # No usable marker: keep the raw text rather than guess.
            ground_truth = answer

        processed.append({
            'data_source': 'gsm8k',
            'prompt': question,
            'ability': 'math',
            'reward_model': {
                'style': 'rule',
                'ground_truth': ground_truth,
            },
            'extra_info': {
                'answer': answer,
            }
        })

    return processed

# Process and save train split
# Each split goes through the same steps: convert to a list of record dicts,
# wrap in a DataFrame, and write it as Parquet into GSM8K_CONFIG['output_dir'].
train_data = process_gsm8k(dataset['train'])
train_df = pd.DataFrame(train_data)
train_path = os.path.join(GSM8K_CONFIG['output_dir'], 'train.parquet')
train_df.to_parquet(train_path, index=False)  # index=False: do not store the DataFrame index
print(f"✅ Train data saved to: {train_path}")

# Process and save test split
test_data = process_gsm8k(dataset['test'])
test_df = pd.DataFrame(test_data)
test_path = os.path.join(GSM8K_CONFIG['output_dir'], 'test.parquet')
test_df.to_parquet(test_path, index=False)
print(f"✅ Test data saved to: {test_path}")

# Show sample
# Only the first two rows of the train split are displayed.
print("\nSample data:")
print(train_df.head(2))

---
## Section 2: MATH Dataset

Competition-level mathematics problems.

**Source**: HuggingFace `lighteval/MATH`

In [None]:
from datasets import load_dataset
import pandas as pd
import os

# ===================================================================
# MATH DATASET CONFIGURATION - EDIT OUTPUT PATH
# ===================================================================

# Settings for the MATH cell. 'output_dir' is the value meant to be edited;
# 'dataset_name' is handed to load_dataset() below.
# NOTE(review): confirm that this repository id is still available on the
# HuggingFace Hub and loads without an explicit configuration name.
MATH_CONFIG = {
    'output_dir': os.path.expanduser('~/data/math'),  # Edit this ('~' is expanded to the home directory)
    'dataset_name': 'lighteval/MATH',  # Argument to load_dataset()
}

# ===================================================================

# Create the output directory up front; exist_ok=True means an already
# existing directory is not an error, so the cell can be re-run.
os.makedirs(MATH_CONFIG['output_dir'], exist_ok=True)

# Load dataset
print(f"Loading MATH dataset from HuggingFace...")
dataset = load_dataset(MATH_CONFIG['dataset_name'])

# The loaded object is indexed by split name; 'train' and 'test' are the two
# splits this cell reads.
print(f"Train samples: {len(dataset['train'])}")
print(f"Test samples: {len(dataset['test'])}")

# Convert one MATH split (train or test) to verl records
def _column_or_default(examples, name, length, default='unknown'):
    """Return column *name*, or *length* copies of *default* if it is absent."""
    try:
        return examples[name]
    except KeyError:
        # Subscript access is the only lookup the caller's container is known
        # to support (it need not offer dict.get), so a missing column is
        # detected through the KeyError it raises.
        return [default] * length


def process_math(examples):
    """Convert MATH examples into verl-format records.

    Args:
        examples: Column-addressable container (one split of the loaded
            dataset, or a plain dict of lists) exposing parallel 'problem'
            and 'solution' sequences, and optionally 'level' and 'type'.

    Returns:
        list[dict]: One record per example with the keys 'data_source',
        'prompt', 'ability', 'reward_model' and 'extra_info'. When the
        'level' or 'type' column is missing, every record gets 'unknown'
        for that field.
    """
    # Fetch every column exactly once instead of once per row.
    problems = examples['problem']
    solutions = examples['solution']
    levels = _column_or_default(examples, 'level', len(problems))
    types = _column_or_default(examples, 'type', len(problems))

    # NOTE(review): 'ground_truth' is the full worked solution. A rule-based
    # scorer presumably needs only the final boxed answer -- confirm against
    # the reward function used for training.
    return [
        {
            'data_source': 'math',
            'prompt': problem,
            'ability': 'math',
            'reward_model': {
                'style': 'rule',
                'ground_truth': solution,
            },
            'extra_info': {
                'level': level,
                'type': kind,
                'solution': solution,
            }
        }
        for problem, solution, level, kind in zip(problems, solutions, levels, types)
    ]

# Process and save train split
# Each split goes through the same steps: convert to a list of record dicts,
# wrap in a DataFrame, and write it as Parquet into MATH_CONFIG['output_dir'].
train_data = process_math(dataset['train'])
train_df = pd.DataFrame(train_data)
train_path = os.path.join(MATH_CONFIG['output_dir'], 'train.parquet')
train_df.to_parquet(train_path, index=False)  # index=False: do not store the DataFrame index
print(f"✅ Train data saved to: {train_path}")

# Process and save test split
test_data = process_math(dataset['test'])
test_df = pd.DataFrame(test_data)
test_path = os.path.join(MATH_CONFIG['output_dir'], 'test.parquet')
test_df.to_parquet(test_path, index=False)
print(f"✅ Test data saved to: {test_path}")

# Show sample
# Only the first two rows of the train split are displayed.
print("\nSample data:")
print(train_df.head(2))

---
## Section 3: HH-RLHF Dataset

Helpful and Harmless dialogue dataset for RLHF.

**Source**: HuggingFace `Anthropic/hh-rlhf`

In [None]:
from datasets import load_dataset
import pandas as pd
import os

# ===================================================================
# HH-RLHF CONFIGURATION - EDIT OUTPUT PATH
# ===================================================================

# Settings for the HH-RLHF cell. 'output_dir' is the value meant to be edited;
# 'dataset_name' is handed to load_dataset() below.
HH_RLHF_CONFIG = {
    'output_dir': os.path.expanduser('~/data/hh_rlhf'),  # Edit this ('~' is expanded to the home directory)
    'dataset_name': 'Anthropic/hh-rlhf',  # Argument to load_dataset()
}

# ===================================================================

# Create the output directory up front; exist_ok=True means an already
# existing directory is not an error, so the cell can be re-run.
os.makedirs(HH_RLHF_CONFIG['output_dir'], exist_ok=True)

# Load dataset
print(f"Loading HH-RLHF dataset from HuggingFace...")
dataset = load_dataset(HH_RLHF_CONFIG['dataset_name'])

# The loaded object is indexed by split name; 'train' and 'test' are the two
# splits this cell reads.
print(f"Train samples: {len(dataset['train'])}")
print(f"Test samples: {len(dataset['test'])}")

# Process function
def extract_prompt(conversation):
    """Return the first human turn of an HH-RLHF style transcript.

    The text between the first "Human:" marker (preceded by a blank line) and
    whichever comes first of the next "Human:" marker, the next "Assistant:"
    marker, or the end of the string is returned with surrounding whitespace
    removed. A string that contains no "Human:" marker is returned unchanged.
    """
    human_tag = '\n\nHuman:'
    assistant_tag = '\n\nAssistant:'

    _, found, remainder = conversation.partition(human_tag)
    if not found:
        # Not in the expected transcript layout: hand the input back as-is.
        return conversation

    # Limit the text to the first human segment before looking for the reply.
    first_segment = remainder.partition(human_tag)[0]
    return first_segment.partition(assistant_tag)[0].strip()

def process_hh_rlhf(examples):
    """Build verl-format records from the 'chosen' transcripts.

    Args:
        examples: Column-addressable container exposing a 'chosen' sequence
            of conversation strings.

    Returns:
        list[dict]: One record per transcript. 'prompt' is whatever
        extract_prompt() returns for it, and the untouched transcript is kept
        under extra_info['full_conversation'].
    """
    return [
        {
            'data_source': 'hh_rlhf',
            'prompt': extract_prompt(transcript),
            'ability': 'conversation',
            'extra_info': {
                'full_conversation': transcript,
            },
        }
        for transcript in examples['chosen']
    ]

# Process and save train split
# Each split goes through the same steps: convert to a list of record dicts,
# wrap in a DataFrame, and write it as Parquet into HH_RLHF_CONFIG['output_dir'].
train_data = process_hh_rlhf(dataset['train'])
train_df = pd.DataFrame(train_data)
train_path = os.path.join(HH_RLHF_CONFIG['output_dir'], 'train.parquet')
train_df.to_parquet(train_path, index=False)  # index=False: do not store the DataFrame index
print(f"✅ Train data saved to: {train_path}")

# Process and save test split
test_data = process_hh_rlhf(dataset['test'])
test_df = pd.DataFrame(test_data)
test_path = os.path.join(HH_RLHF_CONFIG['output_dir'], 'test.parquet')
test_df.to_parquet(test_path, index=False)
print(f"✅ Test data saved to: {test_path}")

# Show sample
# Only the first two rows of the train split are displayed.
print("\nSample data:")
print(train_df.head(2))

---
## Section 4: Custom Dataset

Process your own custom dataset.

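**Requirements**:
- Your data must contain at least a prompt column (named `prompt` by default; change `prompt_column` in the configuration if yours differs)
- The input can be CSV, JSON, JSONL, or any other format pandas can read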

In [None]:
import pandas as pd
import os

# ===================================================================
# CUSTOM DATASET CONFIGURATION - EDIT THESE
# ===================================================================

# Settings for the custom-dataset cell; every entry is meant to be edited.
CUSTOM_CONFIG = {
    'input_file': '/path/to/your/data.csv',  # Edit this (only read by the commented-out loaders below)
    'output_dir': os.path.expanduser('~/data/custom'),  # Edit this ('~' is expanded to the home directory)
    'data_source_name': 'my_custom_dataset',  # Edit this (written to the 'data_source' column)
    'prompt_column': 'prompt',  # Column name containing prompts
    'ability': 'custom',  # Task category
}

# ===================================================================

# Create the output directory up front; exist_ok=True means an already
# existing directory is not an error, so the cell can be re-run.
os.makedirs(CUSTOM_CONFIG['output_dir'], exist_ok=True)

# Load your data (adjust based on your file format)
# NOTE: as written, none of the loaders below is active, so 'input_file' is
# never read and the placeholder frame further down is what gets processed.
# Uncomment the loader matching your file and remove the placeholder.
# For CSV:
# df = pd.read_csv(CUSTOM_CONFIG['input_file'])

# For JSON:
# df = pd.read_json(CUSTOM_CONFIG['input_file'])

# For JSONL:
# df = pd.read_json(CUSTOM_CONFIG['input_file'], lines=True)

# Example placeholder
# Two-row stand-in so the cell runs end to end before real data is wired in.
df = pd.DataFrame({
    'prompt': ['Example prompt 1', 'Example prompt 2'],
    'other_field': ['value1', 'value2'],
})

# Convert to verl format
def process_custom(row):
    """Turn one input row into a verl-format record.

    Args:
        row: Mapping-like row (as yielded by DataFrame.iterrows()) that holds
            the column named by CUSTOM_CONFIG['prompt_column'].

    Returns:
        dict: Record with 'data_source' and 'ability' taken from
        CUSTOM_CONFIG, 'prompt' taken from the configured prompt column, and
        every other field of the row collected under 'extra_info'.
    """
    prompt_key = CUSTOM_CONFIG['prompt_column']

    record = {
        'data_source': CUSTOM_CONFIG['data_source_name'],
        'prompt': row[prompt_key],
        'ability': CUSTOM_CONFIG['ability'],
    }

    # Carry every non-prompt field along as metadata.
    extras = {}
    for field, value in row.items():
        if field != prompt_key:
            extras[field] = value
    record['extra_info'] = extras

    return record

# Convert row by row; iterrows() yields (index, row) pairs and the index is
# not needed here.
processed_data = [process_custom(row) for _, row in df.iterrows()]
processed_df = pd.DataFrame(processed_data)

# Save
# The whole custom dataset is written to a single 'data.parquet' file (no
# train/test split is made in this cell).
output_path = os.path.join(CUSTOM_CONFIG['output_dir'], 'data.parquet')
processed_df.to_parquet(output_path, index=False)  # index=False: do not store the DataFrame index
print(f"✅ Custom data saved to: {output_path}")

# Show sample
print("\nSample processed data:")
print(processed_df.head())

---
## Section 5: Data Validation

Validate your processed data before training.

In [None]:
# `os` is imported here as well: this cell calls os.path.expanduser() and
# os.path.exists(), and must not rely on an earlier cell having been run.
import os

import pandas as pd

# ===================================================================
# VALIDATION CONFIGURATION - EDIT PATH
# ===================================================================

# Parquet file to check; '~' is expanded to the home directory.
DATA_TO_VALIDATE = os.path.expanduser('~/data/gsm8k/train.parquet')  # Edit this

# ===================================================================

def validate_verl_data(file_path):
    """Print a format report for a verl Parquet file.

    Reads *file_path* with pandas and prints, in order: whether the
    'data_source' and 'prompt' columns are present, the sample count and
    column names, prompt-length statistics, value counts for 'data_source'
    and 'ability' (each only when the column exists), and the first three
    rows.

    Args:
        file_path: Path of the Parquet file to inspect.

    Returns:
        None. The report is written to standard output.
    """
    rule = "="*70
    print(f"Validating: {file_path}")
    print(rule)

    # Load data
    frame = pd.read_parquet(file_path)
    columns = frame.columns

    # Check required columns
    absent = [name for name in ('data_source', 'prompt') if name not in columns]
    if not absent:
        print(f"✅ All required columns present")
    else:
        print(f"❌ Missing required columns: {absent}")

    # Basic statistics
    print(f"\nTotal samples: {len(frame)}")
    print(f"Columns: {list(columns)}")

    # Prompt length statistics
    if 'prompt' in columns:
        lengths = frame['prompt'].str.len()
        print(f"\nPrompt length statistics:")
        shortest = lengths.min()
        print(f"  Min: {shortest}")
        longest = lengths.max()
        print(f"  Max: {longest}")
        average = lengths.mean()
        print(f"  Mean: {average:.2f}")
        middle = lengths.median()
        print(f"  Median: {middle:.2f}")

    # Per-column value counts, reported only for columns that exist
    distributions = (
        ('data_source', f"\nData source distribution:"),
        ('ability', f"\nAbility distribution:"),
    )
    for column, heading in distributions:
        if column in columns:
            print(heading)
            print(frame[column].value_counts())

    # Show sample
    print(f"\nSample records:")
    print(frame.head(3))

    print(rule)
    print("✅ Validation complete!")

# Validate the configured file, or report that it does not exist
if not os.path.exists(DATA_TO_VALIDATE):
    print(f"❌ File not found: {DATA_TO_VALIDATE}")
else:
    validate_verl_data(DATA_TO_VALIDATE)

---
## Section 6: Merge Multiple Datasets

Combine multiple datasets for multi-task training.

In [None]:
import pandas as pd
import os

# ===================================================================
# MERGE CONFIGURATION - EDIT THESE
# ===================================================================

# Input Parquet files to combine and the path of the merged result.
MERGE_CONFIG = {
    'datasets_to_merge': [
        os.path.expanduser('~/data/gsm8k/train.parquet'),
        os.path.expanduser('~/data/math/train.parquet'),
    ],
    'output_path': os.path.expanduser('~/data/merged/train.parquet'),
}

# ===================================================================

# Make sure the folder that will hold the merged file exists.
os.makedirs(os.path.dirname(MERGE_CONFIG['output_path']), exist_ok=True)

# Collect every input that exists; missing ones are reported and skipped.
dfs = []
for dataset_path in MERGE_CONFIG['datasets_to_merge']:
    if not os.path.exists(dataset_path):
        print(f"⚠️  File not found: {dataset_path}")
        continue
    df = pd.read_parquet(dataset_path)
    dfs.append(df)
    print(f"✅ Loaded {len(df)} samples from {dataset_path}")

if not dfs:
    print("❌ No datasets to merge")
else:
    # Concatenate, then shuffle with a fixed seed so the row order is
    # reproducible between runs.
    merged_df = (
        pd.concat(dfs, ignore_index=True)
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    # Save
    merged_df.to_parquet(MERGE_CONFIG['output_path'], index=False)

    print(f"\n✅ Merged {len(merged_df)} total samples")
    print(f"✅ Saved to: {MERGE_CONFIG['output_path']}")

    # Show how many samples each source contributed
    print(f"\nData source distribution:")
    print(merged_df['data_source'].value_counts())

---
## Summary

You've now prepared your data for verl training!

**Next steps**:
1. Go to `1_verl_complete_training.ipynb`
2. Set your data paths in Section 3
3. Start training with your preferred algorithm

**Data format reminder**:
- All data is in Parquet format
- Required columns: `data_source`, `prompt`
- Optional: `ability`, `reward_model`, `extra_info`