# Data Preparation for Conditional SAE Analysis

This notebook prepares synthetic and natural datasets for training sparse autoencoders (SAEs) on conditional features.
Run this locally before uploading to Colab for training.

In [None]:
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
from pathlib import Path
import yaml

# Import our data utilities
from src.data_utils import (
    ConditionalDatasetGenerator,
    create_minimal_test_set,
    save_dataset,
    load_dataset
)

## 1. Load Configuration

In [None]:
# Read the training configuration from the YAML file that sits next to the
# notebooks directory.
config_path = '../configs/training_config.yaml'
with open(config_path, 'r') as config_stream:
    config = yaml.safe_load(config_stream)

# Echo the two settings this notebook depends on.
print("Configuration:")
print(f"  Number of samples: {config['data']['n_samples']}")
print(f"  Data directory: {config['paths']['data_dir']}")

## 2. Generate Synthetic Dataset

In [None]:
# Build the generator with a fixed seed.
generator = ConditionalDatasetGenerator(seed=42)

# The requested corpus size comes from the loaded configuration.
n_samples = config['data']['n_samples']
print(f"Generating {n_samples} sentences...")
df = generator.generate_dataset(n_samples=n_samples)

# Report the conditional / control split.
print("\nDataset statistics:")
print(f"  Total sentences: {len(df)}")
is_conditional = df['has_conditional']
print(f"  Conditionals: {is_conditional.sum()}")
print(f"  Controls: {(~is_conditional).sum()}")

# Per-type breakdown.
print("\nSentence types:")
print(df['type'].value_counts())

In [None]:
# Print up to three example sentences for every sentence type, in the order
# the types first appear in the dataset.
print("Sample sentences by type:\n")
for kind in df['type'].unique():
    print(f"{kind.upper()}:")
    examples = df.loc[df['type'] == kind, 'text'].head(3)
    for rank, sentence in enumerate(examples, start=1):
        print(f"  {rank}. {sentence}")
    print()

## 3. Create Test Sets for Inference

In [None]:
# Build the small hand-checkable test set used for logical inference.
test_df = create_minimal_test_set()

# Dump every case: its type, the premise/fact pair, the conclusion and the
# validity label stored alongside it.
print("Test cases for inference:")
for _, case in test_df.iterrows():
    print(f"\nType: {case['type']}")
    print(f"  Premise: {case['premise']}")
    print(f"  Fact: {case['fact']}")
    print(f"  Conclusion: {case['conclusion']}")
    print(f"  Valid: {case['valid']}")

## 4. Analyze Dataset Properties

In [None]:
# Per-sentence size metrics: character count and whitespace-separated
# token count.
df['text_length'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().str.len()

print("Text statistics:")
print(df[['text_length', 'word_count']].describe())

# Plot distributions
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# One histogram per metric: (column, number of bins, x-axis label, title).
histogram_specs = [
    ('text_length', 20, 'Character Count', 'Text Length Distribution'),
    ('word_count', 15, 'Word Count', 'Word Count Distribution'),
]
for ax, (column, n_bins, x_label, title) in zip(axes, histogram_specs):
    ax.hist(df[column], bins=n_bins, edgecolor='black')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Analyze keyword frequencies
import re

keywords = ['if', 'then', 'and', 'or', 'not', 'because', 'when']

# Lowercase the text once rather than once per keyword.
lowered_text = df['text'].str.lower()

for keyword in keywords:
    # Match whole words only. A plain substring test over-counts: 'or' occurs
    # inside "for"/"work", 'if' inside "different", 'not' inside "nothing",
    # 'and' inside "hand". re.escape keeps the keyword literal in the pattern.
    whole_word = rf'\b{re.escape(keyword)}\b'
    df[f'has_{keyword}'] = lowered_text.str.contains(whole_word, regex=True)

# One row per keyword: number of sentences containing it and the share of
# the dataset that represents.
keyword_stats = pd.DataFrame({
    'keyword': keywords,
    'count': [df[f'has_{keyword}'].sum() for keyword in keywords],
    'percentage': [100 * df[f'has_{keyword}'].mean() for keyword in keywords]
})

print("Keyword frequencies:")
print(keyword_stats.to_string(index=False))

## 5. Save Dataset

In [None]:
# Resolve the configured data directory relative to the parent folder and
# create it (including missing parents) when it is not there yet.
data_dir = Path('..', config['paths']['data_dir'])
data_dir.mkdir(parents=True, exist_ok=True)

# Full generated dataset.
main_dataset_path = data_dir.joinpath('conditionals_dataset.csv')
save_dataset(df, str(main_dataset_path))
print(f"Main dataset saved to: {main_dataset_path}")

# Inference test cases.
test_dataset_path = data_dir.joinpath('inference_test_set.csv')
save_dataset(test_df, str(test_dataset_path))
print(f"Test set saved to: {test_dataset_path}")

# First 100 rows under a .json name for quick manual inspection.
# NOTE(review): presumably save_dataset picks the format from the file
# extension -- confirm in src.data_utils.
json_path = data_dir.joinpath('conditionals_dataset.json')
inspection_sample = df.head(100)
save_dataset(inspection_sample, str(json_path))
print(f"Sample saved to JSON: {json_path}")

## 6. Create Extended Datasets (Optional)

In [None]:
# Subset containing only the rows typed as simple conditionals.
simple_conditionals = df[df['type'] == 'simple_conditional']
print(f"Simple conditionals: {len(simple_conditionals)}")

# Balanced dataset: draw the same number of rows from each class, sized by
# the smaller class, then shuffle. All draws use a fixed seed.
# NOTE(review): if one class is missing entirely, sampling the empty side
# with n > 0 will raise -- confirm the generator always emits both classes.
n_per_type = df['has_conditional'].value_counts().min()
conditional_half = df[df['has_conditional']].sample(n=n_per_type, random_state=42)
control_half = df[~df['has_conditional']].sample(n=n_per_type, random_state=42)
balanced_df = pd.concat([conditional_half, control_half])
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Balanced dataset: {len(balanced_df)} (50% conditional, 50% control)")

# Persist both derived datasets next to the main one.
save_dataset(simple_conditionals, str(data_dir / 'simple_conditionals.csv'))
save_dataset(balanced_df, str(data_dir / 'balanced_dataset.csv'))

## 7. Prepare for GitHub Upload

In [None]:
# Report the on-disk size of every CSV written to the data directory.
import os

print("File sizes:")
for csv_path in data_dir.glob('*.csv'):
    kilobytes = os.path.getsize(csv_path) / 1024
    print(f"  {csv_path.name}: {kilobytes:.1f} KB")

# Reminder of the manual follow-up steps.
follow_up_lines = (
    "\nThese files are small enough to commit to GitHub.",
    "Next steps:",
    "  1. git add -A",
    "  2. git commit -m 'Add prepared datasets'",
    "  3. git push origin main",
    "  4. Upload notebook 02_sae_training.ipynb to Google Colab",
)
for line in follow_up_lines:
    print(line)
print("  4. Upload notebook 02_sae_training.ipynb to Google Colab")

## Summary

We've successfully created:
1. **Main dataset** with various types of conditionals and control sentences
2. **Test set** for logical inference evaluation
3. **Balanced dataset** for controlled experiments
4. **Simple conditionals** subset for focused analysis

The data is ready to be pushed to GitHub and used in the Colab training notebook.