# Training Notebook

Concise training pipeline for the Knowledge vs Reasoning Separation project.

**Purpose**: Train models with different ε-masking values and save results to GCS for evaluation.


In [None]:
# Imports
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
import pandas as pd
import numpy as np
import json
import time
from typing import List, Dict, Optional
import os

# GCS imports
try:
    import gcsfs
    import pyarrow.parquet as pq
    GCS_AVAILABLE = True
except ImportError:
    GCS_AVAILABLE = False
    print("Warning: GCS not available")

print("✅ Imports successful")


In [None]:
# Configuration
# Masking probabilities to sweep over; one model is trained per value.
EPSILON_VALUES = [0.0, 0.1, 0.3, 0.5, 0.7]
# Identifier handed to the from_pretrained calls for tokenizer and model.
MODEL_NAME = "gpt2"
# Bucket that is searched for input parquet files.
BUCKET_NAME = "parquet_v2_openwebtext-with-pos-ner"
# Bucket that receives the training summary.
OUTPUT_BUCKET = "model-training-results"
# Passed to gcsfs as the authentication token.
CREDENTIALS_PATH = "eastern-bridge-credentials.json"

# Training parameters
MAX_LENGTH = 512      # tokenizer truncation limit
BATCH_SIZE = 8        # per-device training batch size
LEARNING_RATE = 5e-5
NUM_EPOCHS = 1
MAX_SAMPLES = 1000    # cap on training examples so a run stays quick

# Echo the settings that matter most for a run, one per line.
print(
    "\n".join(
        [
            "Training configuration:",
            f"  Epsilon values: {EPSILON_VALUES}",
            f"  Model: {MODEL_NAME}",
            f"  Max samples: {MAX_SAMPLES}",
            f"  Batch size: {BATCH_SIZE}",
        ]
    )
)


In [None]:
# GCS Data Loader
class GCSDataLoader:
    """Load training samples from parquet files stored in a GCS bucket."""

    def __init__(self, bucket_name: str, credentials_path: str):
        """Create an authenticated filesystem handle for the bucket.

        Args:
            bucket_name: Bucket (or bucket/prefix) searched for parquet files.
            credentials_path: Value passed to ``gcsfs.GCSFileSystem`` as
                ``token``.

        Raises:
            ImportError: If the optional GCS imports were not available.
        """
        if not GCS_AVAILABLE:
            raise ImportError("GCS not available")

        # Authenticated handle; every listing and read goes through it.
        self.fs = gcsfs.GCSFileSystem(token=credentials_path)
        self.bucket_name = bucket_name
        print(f"✅ Connected to GCS bucket: {bucket_name}")

    def load_sample_data(self, max_samples: int = 1000) -> pd.DataFrame:
        """Load up to ``max_samples`` rows from the bucket's parquet files.

        At most the first five files (in sorted path order) are read, and
        reading stops as soon as enough rows have been collected. If more
        rows than requested were read, a fixed-seed random sample of
        ``max_samples`` rows is returned with a fresh 0..n-1 index.

        Args:
            max_samples: Upper bound on the number of rows returned.

        Returns:
            The collected rows, or an empty DataFrame if anything went wrong
            (no files found, read error, non-positive ``max_samples``). The
            error is printed rather than raised so callers can test
            ``.empty``.
        """
        try:
            # List parquet files
            files = sorted(self.fs.glob(f"{self.bucket_name}/**/*.parquet"))
            print(f"Found {len(files)} files")

            if not files:
                raise ValueError("No parquet files found")

            # Load first few files
            dfs = []
            total_samples = 0

            for file_path in files[:5]:  # Load first 5 files
                if total_samples >= max_samples:
                    break

                # Read through self.fs so the token given to __init__ is
                # used. Handing pandas a bare gs:// URL makes it open the
                # file through its own filesystem object, which never sees
                # that token.
                with self.fs.open(file_path, "rb") as handle:
                    df = pd.read_parquet(handle)
                dfs.append(df)
                total_samples += len(df)
                print(f"Loaded {len(df)} samples from {file_path}")

            if not dfs:
                # Only reachable when max_samples <= 0: the loop stopped
                # before reading anything.
                raise ValueError("max_samples must be positive")

            combined_df = pd.concat(dfs, ignore_index=True)
            # Sample if we have too many
            if len(combined_df) > max_samples:
                combined_df = combined_df.sample(
                    n=max_samples, random_state=42
                ).reset_index(drop=True)  # drop the shuffled source index

            print(f"✅ Loaded {len(combined_df)} total samples")
            return combined_df

        except Exception as e:
            # Deliberate best-effort: report and hand back an empty frame.
            print(f"❌ Error loading data: {e}")
            return pd.DataFrame()

# Initialize data loader
# Only build the loader when the optional GCS imports succeeded; a None
# placeholder lets later code detect that GCS is unavailable.
data_loader = GCSDataLoader(BUCKET_NAME, CREDENTIALS_PATH) if GCS_AVAILABLE else None


In [None]:
# ε-Masking Function (simplified from tokenization notebook)
def apply_epsilon_masking(
    text: str,
    epsilon: float,
    rng: Optional[np.random.Generator] = None,
) -> str:
    """Replace words of ``text`` with ``"<mask>"`` with probability ``epsilon``.

    The text is split on whitespace and every word longer than two characters
    is independently replaced by the literal string ``"<mask>"`` with
    probability ``epsilon``. Shorter words are always kept. No part-of-speech
    information is used; word length is the only filter.

    Note that for any non-zero ``epsilon`` the result is re-joined with single
    spaces, so runs of whitespace and newlines in the input are collapsed.
    With ``epsilon == 0.0`` the input is returned untouched.

    Args:
        text: Input text.
        epsilon: Per-word masking probability.
        rng: Optional NumPy generator to draw from, for reproducible masking.
            When omitted, the global ``np.random`` state is used.

    Returns:
        The masked text.
    """
    if epsilon == 0.0:
        return text

    draw = np.random.random if rng is None else rng.random

    # The draw comes before the length check so that exactly one random
    # number is consumed per word, whether or not the word can be masked.
    return " ".join(
        "<mask>" if draw() < epsilon and len(word) > 2 else word
        for word in text.split()
    )

# Tokenizer setup
# Load the tokenizer that belongs to MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token for padding so that the padding=True call
# in tokenize_function has a pad token to work with.
# NOTE(review): presumably required because this tokenizer ships without a
# pad token of its own — confirm for the configured model.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch of examples for training.

    Args:
        examples: Mapping whose ``"text"`` entry holds the strings to
            tokenize.

    Returns:
        The tokenizer output for the batch, truncated to ``MAX_LENGTH``
        tokens and padded to the longest sequence in the batch.
    """
    # No return_tensors here: this function is used with a batched
    # Dataset.map, and building torch tensors only for them to be stored
    # back into the dataset is wasted work.
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
    )

print("✅ Masking function and tokenizer ready")


In [None]:
# Training Function
def train_model_for_epsilon(epsilon: float, train_dataset: Dataset) -> Dict:
    """Train a fresh model on ``train_dataset`` masked with ``epsilon``.

    The ``"text"`` column is masked with ``apply_epsilon_masking``, tokenized,
    and used to train a new ``MODEL_NAME`` causal language model. The final
    weights are written to ``./model_epsilon_<epsilon>``.

    Args:
        epsilon: Masking probability applied to every training text.
        train_dataset: Dataset with a ``"text"`` column.

    Returns:
        Dict with the keys ``"epsilon"``, ``"model"``, ``"trainer"``,
        ``"training_time"`` (seconds spent in ``trainer.train()``) and
        ``"num_samples"``. The dict keeps the model and trainer alive, so
        callers that collect several results hold all of those models in
        memory.
    """
    # Imported here because the notebook's import cell does not provide it.
    from transformers import DataCollatorForLanguageModeling

    print(f"\n🚀 Training model for ε = {epsilon}")

    # Apply masking to dataset
    masked_texts = [apply_epsilon_masking(text, epsilon) for text in train_dataset["text"]]
    masked_dataset = Dataset.from_dict({"text": masked_texts})

    # Tokenize dataset
    tokenized_dataset = masked_dataset.map(tokenize_function, batched=True)

    # Initialize model
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

    # The tokenized examples carry no "labels", and without labels the model
    # returns no loss for the Trainer to optimize. With mlm=False this
    # collator copies input_ids into labels for next-token prediction and
    # excludes pad positions from the loss.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Training arguments
    # Evaluation is off by default, so the deprecated evaluation_strategy
    # keyword is simply not passed; that keeps this working across
    # transformers releases that renamed it.
    output_dir = f"./model_epsilon_{epsilon}"
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        logging_steps=10,
        save_steps=100,
        save_total_limit=1,
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Train
    start_time = time.time()
    trainer.train()
    training_time = time.time() - start_time

    # Periodic checkpoints only land on multiples of save_steps, so the
    # weights after the last step would otherwise never reach disk.
    trainer.save_model(output_dir)

    print(f"✅ Training completed for ε = {epsilon} in {training_time:.2f}s")

    return {
        "epsilon": epsilon,
        "model": model,
        "trainer": trainer,
        "training_time": training_time,
        "num_samples": len(train_dataset)
    }

print("✅ Training function ready")


In [None]:
# GCS Model Saver
class GCSModelSaver:
    """Write a JSON summary of training runs to a GCS bucket."""

    def __init__(self, bucket_name: str, credentials_path: str):
        """Create an authenticated filesystem handle for the output bucket.

        Args:
            bucket_name: Bucket the summary file is written to.
            credentials_path: Value passed to ``gcsfs.GCSFileSystem`` as
                ``token``.

        Raises:
            ImportError: If the optional GCS imports were not available.
        """
        if not GCS_AVAILABLE:
            raise ImportError("GCS not available")

        self.fs = gcsfs.GCSFileSystem(token=credentials_path)
        self.bucket_name = bucket_name
        print(f"✅ Model saver connected to bucket: {bucket_name}")

    def save_model_results(self, results: List[Dict]) -> Optional[str]:
        """Save a summary of the training results to GCS.

        Only the run configuration and each result's ``"epsilon"``,
        ``"training_time"`` and ``"num_samples"`` are written; model and
        trainer objects in ``results`` are not uploaded.

        Args:
            results: Result dicts containing at least the three keys above.

        Returns:
            The path of the written summary file, or ``None`` if saving
            failed (the error is printed, not raised).
        """
        try:
            # Read the clock once so the timestamp stored in the summary and
            # the one embedded in the file name always agree.
            timestamp = time.time()

            # Create results summary
            summary = {
                "training_timestamp": timestamp,
                "model_name": MODEL_NAME,
                "epsilon_values": EPSILON_VALUES,
                "training_config": {
                    "max_length": MAX_LENGTH,
                    "batch_size": BATCH_SIZE,
                    "learning_rate": LEARNING_RATE,
                    "num_epochs": NUM_EPOCHS,
                    "max_samples": MAX_SAMPLES
                },
                "results": [
                    {
                        "epsilon": result["epsilon"],
                        "training_time": result["training_time"],
                        "num_samples": result["num_samples"],
                    }
                    for result in results
                ],
            }

            # Save summary to GCS
            summary_path = f"{self.bucket_name}/training_summary_{int(timestamp)}.json"
            with self.fs.open(summary_path, 'w') as f:
                json.dump(summary, f, indent=2)

            print(f"✅ Training summary saved to: {summary_path}")
            return summary_path

        except Exception as e:
            # Best-effort upload: report the problem and signal it with None.
            print(f"❌ Error saving results: {e}")
            return None

# Initialize model saver
# Only build the saver when the optional GCS imports succeeded; a None
# placeholder lets later code skip the upload step.
model_saver = GCSModelSaver(OUTPUT_BUCKET, CREDENTIALS_PATH) if GCS_AVAILABLE else None


In [None]:
# Main Training Pipeline
def run_training_pipeline():
    """Run the complete training pipeline.

    Loads training data (from GCS, or a small built-in dummy corpus when no
    data loader exists), trains one model per value in ``EPSILON_VALUES`` and
    uploads a summary when a model saver exists.

    A failure while training one epsilon value is printed and skipped so the
    remaining values still run.

    Returns:
        List of the result dicts of the runs that succeeded. The list is
        empty when no usable training data was available.
    """
    print("🚀 Starting Training Pipeline")
    print("=" * 50)

    # Load data
    if data_loader is None:
        print("❌ GCS not available - using dummy data")
        # Create dummy data for testing
        base_texts = [
            "The quick brown fox jumps over the lazy dog.",
            "Machine learning is transforming artificial intelligence.",
            "Natural language processing enables computers to understand text.",
            "Deep learning models require large amounts of training data.",
            "Transformers have revolutionized the field of NLP."
        ]
        # Repeat just often enough to reach MAX_SAMPLES (ceiling division),
        # so raising the limit is not silently capped by the dummy corpus.
        repeats = -(-MAX_SAMPLES // len(base_texts))
        dummy_texts = (base_texts * repeats)[:MAX_SAMPLES]
        train_data = pd.DataFrame({"text": dummy_texts})
    else:
        train_data = data_loader.load_sample_data(MAX_SAMPLES)

    if train_data.empty:
        print("❌ No training data available")
        return []

    if "text" not in train_data.columns:
        # Fail once here rather than once per epsilon inside the loop below.
        print("❌ Training data has no 'text' column")
        return []

    # Convert to Dataset
    # preserve_index=False keeps a non-default DataFrame index (for example
    # one left over from sampling) from becoming an extra dataset column.
    train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
    print(f"✅ Training dataset ready: {len(train_dataset)} samples")

    # Train models for each epsilon value
    results = []
    for epsilon in EPSILON_VALUES:
        try:
            result = train_model_for_epsilon(epsilon, train_dataset)
            results.append(result)
        except Exception as e:
            print(f"❌ Training failed for ε = {epsilon}: {e}")

    # Save results
    summary_path = None
    if model_saver and results:
        summary_path = model_saver.save_model_results(results)

    print(f"\n✅ Training pipeline completed!")
    print(f"📊 Trained {len(results)} models")
    if summary_path:
        print(f"💾 Results saved to: {summary_path}")
    else:
        # Also covers a failed upload, which yields no path.
        print("⚠️ Results not saved to GCS")

    return results

# Run the pipeline
if __name__ == "__main__":
    results = run_training_pipeline()
