# 🎯 SAMo Multi-Dataset Emotion Classification Pipeline

## **COMPREHENSIVE MULTI-DATASET TRAINING WITH SCIENTIFIC LOGGING**

**GOAL**: Achieve >60% F1-macro on combined GoEmotions, SemEval, ISEAR, and MELD datasets

**FEATURES**:
- ✅ **Multi-Dataset Integration**: Combines 4 emotion datasets (70K+ samples)
- ✅ **Scientific Logging**: Comprehensive experiment tracking and metrics
- ✅ **Google Drive Backup**: Automatic backup of all results and logs
- ✅ **Robust Training**: Error handling, progress monitoring, and recovery
- ✅ **Clean Workflow**: Streamlined 2-step process

**DATASETS**:
- 📊 **GoEmotions**: 48,836 samples (28 emotions)
- 📊 **SemEval**: 802 samples (11 emotions) 
- 📊 **ISEAR**: 7,500 samples (7 emotions)
- 📊 **MELD**: 13,708 samples (7 emotions)

**TOTAL**: 70,846+ samples across 28 emotion classes

---

## 🚀 **QUICK START - 2-STEP WORKFLOW**

1. **Cell 1**: Environment Setup & Data Preparation
2. **Cell 2**: Training Execution with Live Progress

- **Expected Training Time**: 6-8 hours on 2x RTX 3090
- **Expected Performance**: >60% F1-macro (vs 51.79% baseline)

## 🔧 **STEP 1: ENVIRONMENT SETUP & DATA PREPARATION**

**Run this cell first to:**
- ✅ Verify environment and dependencies
- ✅ Prepare combined multi-dataset
- ✅ Validate data integrity
- ✅ Set up logging and backup systems

In [None]:
# 🔧 STEP 1: ENVIRONMENT SETUP & DATA PREPARATION
# Announces the step, imports the standard-library modules this cell uses,
# switches to the project directory and reports the active interpreter.
print("🚀 SAMo Multi-Dataset Pipeline - Environment Setup", "=" * 60, sep="\n")

import os
import sys
import json
import subprocess
import time
from pathlib import Path
from datetime import datetime

# The rest of the notebook uses paths relative to this directory.
os.chdir('/workspace')
print("📁 Working directory: " + os.getcwd())

# Interpreter details
print("\n🔍 Verifying Environment...")
print("Python:", sys.executable)
print("Version:", sys.version)

# Conda environment; the literal string 'None' is shown when the variable is unset.
conda_env = os.environ.get('CONDA_DEFAULT_ENV', 'None')
print("Conda env: " + conda_env)

if not conda_env == 'deberta-v3':
    print("⚠️  WARNING: Switch to 'Python (deberta-v3)' kernel for best results")

# Check critical packages
# Each library is probed on its own so that one missing package does not hide
# the status of the others; a failed import is reported and the cell continues.
print("\n📦 Checking Dependencies...")
try:
    import torch
    print(f"✅ PyTorch {torch.__version__}")
    cuda_ok = torch.cuda.is_available()
    print(f"   CUDA Available: {cuda_ok}")
    gpu_total = torch.cuda.device_count()
    print(f"   GPU Count: {gpu_total}")
    if cuda_ok:
        for gpu_index in range(gpu_total):
            print(f"   GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")
except ImportError as exc:
    print(f"❌ PyTorch missing: {exc}")

try:
    import transformers
    print("✅ Transformers " + str(transformers.__version__))
except ImportError as exc:
    print(f"❌ Transformers missing: {exc}")

try:
    import sklearn
    print("✅ Scikit-learn " + str(sklearn.__version__))
except ImportError as exc:
    print(f"❌ Scikit-learn missing: {exc}")

# Check GPU status
# nvidia-smi is queried for name / total memory / used memory as header-less CSV.
print("\n🖥️  GPU Status:")
nvidia_query = [
    'nvidia-smi',
    '--query-gpu=name,memory.total,memory.used',
    '--format=csv,noheader,nounits',
]
try:
    result = subprocess.run(nvidia_query, capture_output=True, text=True, timeout=10)
    if result.returncode != 0:
        print("⚠️  nvidia-smi not available")
    else:
        print(result.stdout.strip())
except Exception as exc:
    # Best-effort diagnostic: a missing binary or a timeout must not stop setup.
    print(f"⚠️  GPU check failed: {exc}")

# Data preparation
# Reuses an existing combined dataset when both splits are present; otherwise
# runs the preparation script and verifies that it actually produced them.
print("\n📊 Preparing Multi-Dataset...")
print("   This will combine GoEmotions, SemEval, ISEAR, and MELD datasets")


def count_jsonl_samples(path):
    """Return the number of lines in the file at *path* (one sample per line)."""
    # Explicitly closed so repeated runs of the cell do not leak file handles.
    with open(path, 'r', encoding='utf-8', errors='replace') as handle:
        return sum(1 for _ in handle)


# Check if data already exists
data_dir = Path('data/combined')
train_file = data_dir / 'train.jsonl'
val_file = data_dir / 'val.jsonl'

if train_file.exists() and val_file.exists():
    print("✅ Combined dataset already exists")

    # Show dataset statistics
    train_samples = count_jsonl_samples(train_file)
    val_samples = count_jsonl_samples(val_file)
    total_samples = train_samples + val_samples

    print(f"   📈 Training samples: {train_samples:,}")
    print(f"   📈 Validation samples: {val_samples:,}")
    print(f"   📈 Total samples: {total_samples:,}")

    # Load metadata (optional file; statistics above do not depend on it)
    metadata_file = data_dir / 'metadata.json'
    if metadata_file.exists():
        with open(metadata_file, 'r') as f:
            metadata = json.load(f)
        print(f"   🎯 Emotion classes: {metadata.get('emotion_count', 'Unknown')}")
        print(f"   📊 Datasets: {', '.join(metadata.get('datasets_included', []))}")
else:
    print("🔄 Running data preparation...")

    # Run the script with the interpreter that is executing this notebook, so
    # it sees the same environment and packages as the kernel. A bare
    # 'python3' is resolved through PATH and may be a different installation.
    # subprocess.TimeoutExpired is deliberately left to propagate after 300 s.
    start_time = time.time()
    result = subprocess.run([sys.executable, 'prepare_datasets_minimal.py'],
                            capture_output=True, text=True, timeout=300)

    if result.returncode != 0:
        print(f"❌ Data preparation failed:")
        print(f"   Error: {result.stderr}")
        raise RuntimeError("Data preparation failed")

    # A zero exit code is not proof that the splits were written; check them
    # explicitly instead of failing later with an unrelated-looking error.
    missing_outputs = [str(p) for p in (train_file, val_file) if not p.exists()]
    if missing_outputs:
        print(f"❌ Data preparation failed:")
        print(f"   Error: expected output not found: {', '.join(missing_outputs)}")
        raise RuntimeError("Data preparation failed")

    print("✅ Data preparation completed successfully")
    print(f"⏱️  Time taken: {time.time() - start_time:.1f} seconds")

    # Show final statistics
    train_samples = count_jsonl_samples(train_file)
    val_samples = count_jsonl_samples(val_file)
    total_samples = train_samples + val_samples

    print(f"\n📊 Final Dataset Statistics:")
    print(f"   📈 Training samples: {train_samples:,}")
    print(f"   📈 Validation samples: {val_samples:,}")
    print(f"   📈 Total samples: {total_samples:,}")

# Verify training scripts exist
# Missing scripts are collected so the closing summary can report the real
# state instead of unconditionally claiming that everything is ready.
print("\n🔍 Verifying Training Scripts...")
scripts = [
    'prepare_datasets_minimal.py',
    'train_multidataset_deberta.py',
    'train_comprehensive_multidataset.sh',
    'backup_to_gdrive.sh'
]

missing_scripts = []
for script in scripts:
    if Path(script).exists():
        print(f"✅ {script}")
    else:
        print(f"❌ {script} missing")
        missing_scripts.append(script)

# Create necessary directories
print("\n📁 Creating Directories...")
directories = ['outputs', 'logs', 'models']
for dir_name in directories:
    # exist_ok keeps the cell re-runnable.
    Path(dir_name).mkdir(exist_ok=True)
    print(f"✅ {dir_name}/")

print("\n🎉 ENVIRONMENT SETUP COMPLETE!")
print("=" * 60)
print("✅ Environment verified")
print("✅ Multi-dataset prepared")
if missing_scripts:
    print(f"❌ Training scripts missing: {', '.join(missing_scripts)}")
else:
    print("✅ Training scripts ready")
print("✅ Directories created")
if missing_scripts:
    print("\n⚠️  Restore the missing scripts before running Cell 2.")
else:
    print("\n🚀 Ready for training! Proceed to Cell 2.")

## 🏃 **STEP 2: TRAINING EXECUTION WITH LIVE PROGRESS**

**Run this cell to:**
- ✅ Start comprehensive multi-dataset training
- ✅ Monitor live progress with real-time metrics
- ✅ Enable automatic Google Drive backup
- ✅ Track scientific logs and experiment data

**Expected Results:**
- 🎯 **Target F1-macro**: >60% (vs 51.79% baseline)
- ⏱️ **Training Time**: 6-8 hours
- 📊 **Final Dataset**: 70K+ samples across 28 emotions
- ☁️ **Backup**: Automatic Google Drive backup every 30 minutes

In [None]:
# 🏃 STEP 2: TRAINING EXECUTION WITH LIVE PROGRESS
# Prints the run header and confirms that the combined dataset exists before
# any training is started.
print("🚀 SAMo Multi-Dataset Training - Starting...")
print("=" * 60)

import subprocess
import time
from datetime import datetime
# Imported here as well so this cell also works on a fresh kernel; without it
# the dataset check below fails with NameError instead of its own message.
from pathlib import Path

# Training configuration
# One timestamp feeds both the experiment ID and the printed start time so the
# two can never disagree.
# NOTE(review): EXPERIMENT_NAME is only printed in this notebook; confirm that
# the training script derives or receives the same ID.
started_at = datetime.now()
EXPERIMENT_NAME = f"MultiDataset_BCE_{started_at.strftime('%Y%m%d_%H%M%S')}"
print(f"📊 Experiment ID: {EXPERIMENT_NAME}")
print(f"⏰ Start time: {started_at.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"🎯 Target: >60% F1-macro (vs 51.79% baseline)")
print(f"⏱️  Expected duration: 6-8 hours")
print()

# Verify data exists
data_dir = Path('data/combined')
train_file = data_dir / 'train.jsonl'
val_file = data_dir / 'val.jsonl'
if not (train_file.exists() and val_file.exists()):
    print("❌ Combined dataset not found! Run Cell 1 first.")
    raise FileNotFoundError("Combined dataset not found")

# Show dataset statistics (one sample per line; handles are closed explicitly)
with open(train_file, 'r', encoding='utf-8', errors='replace') as f:
    train_samples = sum(1 for _ in f)
with open(val_file, 'r', encoding='utf-8', errors='replace') as f:
    val_samples = sum(1 for _ in f)
total_samples = train_samples + val_samples

print(f"📊 Dataset Statistics:")
print(f"   📈 Training samples: {train_samples:,}")
print(f"   📈 Validation samples: {val_samples:,}")
print(f"   📈 Total samples: {total_samples:,}")
print(f"   🎯 Emotion classes: 28")
print()

# Check GPU availability
# One line is printed per row returned by nvidia-smi; any failure is reported
# and the cell carries on.
print("🖥️  GPU Status:")
try:
    smi = subprocess.run(
        ['nvidia-smi', '--query-gpu=name,memory.total,memory.used', '--format=csv,noheader,nounits'],
        capture_output=True,
        text=True,
        timeout=10,
    )
    if smi.returncode != 0:
        print("   ⚠️  nvidia-smi not available")
    else:
        for gpu_index, gpu_row in enumerate(smi.stdout.strip().split('\n')):
            print(f"   GPU {gpu_index}: {gpu_row}")
except Exception as err:
    print(f"   ⚠️  GPU check failed: {err}")

# Static description of what the training run is expected to report.
print()
for notice in (
    "🏃 Starting Training...",
    "   📝 Live progress will be shown below",
    "   📊 Metrics will be logged every 100 steps",
    "   💾 Checkpoints saved every 1000 steps",
    "   ☁️  Google Drive backup every 30 minutes",
    "   ⏱️  Evaluation every 500 steps",
):
    print(notice)
print()

banner = "=" * 60
print(banner)
print("🎯 LIVE TRAINING PROGRESS")
print(banner)

# Start training with visible progress
# Runs the training shell script in the foreground (its output streams into the
# cell), then summarises the outcome based on the script's exit code.
start_time = time.time()

try:
    # Run training script with live output
    training_result = subprocess.run(
        ['bash', 'train_comprehensive_multidataset.sh'],
        capture_output=False,  # Show live progress
        text=True,
        timeout=28800  # 8 hour timeout
    )

    training_time = time.time() - start_time

    # The banner reflects the exit code, so a failed run is never announced
    # as completed.
    print("\n" + "="*60)
    if training_result.returncode == 0:
        print("🎉 TRAINING COMPLETED!")
    else:
        print("❌ TRAINING FAILED!")
    print("=" * 60)
    print(f"⏱️  Total training time: {training_time/3600:.2f} hours")
    print(f"📊 Exit code: {training_result.returncode}")

    if training_result.returncode == 0:
        print("✅ Training completed successfully!")

        # Check for results
        output_dir = Path('outputs/multidataset')
        if output_dir.exists():
            print(f"\n📁 Model saved to: {output_dir}")

            # List saved files
            model_files = list(output_dir.glob('*'))
            if model_files:
                print("   📄 Saved files:")
                for file in model_files:
                    print(f"      - {file.name}")

        # Check logs
        log_dir = Path('logs')
        if log_dir.exists():
            log_files = list(log_dir.glob('*.log'))
            if log_files:
                print(f"\n📊 Logs saved to: {log_dir}")
                print("   📄 Log files:")
                for log_file in log_files:
                    print(f"      - {log_file.name}")

        # Check Google Drive backup
        # NOTE(review): nothing here verifies the backup; the messages below
        # assume the training script performed it. Confirm with the analysis cell.
        print("\n☁️  Google Drive Backup:")
        print("   📤 Backup completed automatically")
        print("   📁 Location: drive:00_Projects/🎯 TechLabs-2025/Final_Project/TRAINING/")

        print("\n🎯 NEXT STEPS:")
        print("   1. Check final F1-macro score in logs")
        print("   2. Verify model performance >60% F1-macro")
        print("   3. Download model from Google Drive if needed")
        print("   4. Use model for inference on new data")

    else:
        print(f"❌ Training failed with exit code: {training_result.returncode}")
        print("\n🔍 Troubleshooting:")
        print("   1. Check logs/train_comprehensive_multidataset.log")
        print("   2. Verify GPU memory availability")
        print("   3. Check disk space")
        print("   4. Review error messages above")

except subprocess.TimeoutExpired:
    # subprocess.run kills the bash process on timeout; processes that the
    # script itself launched may keep running, hence the hint below.
    print("\n⏰ Training timeout (8 hours exceeded)")
    print("   📊 Training may still be running in background")
    print("   🔍 Check logs for progress")

except Exception as e:
    # Top-level boundary of the cell: report and fall through to the footer.
    print(f"\n❌ Training error: {e}")
    print("   🔍 Check error messages above")

print("\n" + "="*60)
print(f"🏁 Training session ended at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("=" * 60)

## 📊 **RESULTS ANALYSIS & MONITORING**

**Use this cell to:**
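- ✅ Monitor training progress (Cell 2 blocks its kernel until training ends, so run this cell from a second notebook/kernel for a mid-training view)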
- ✅ Analyze final results and metrics
- ✅ Check Google Drive backup status
- ✅ Verify model performance against baseline

In [None]:
# 📊 RESULTS ANALYSIS & MONITORING
# Prints the header and shows the most recent F1-related lines of the training log.
print("📊 SAMo Multi-Dataset Results Analysis")
print("=" * 50)

import json
import pandas as pd
from collections import deque
from pathlib import Path
import subprocess


def is_metric_line(line):
    """Return True when *line* mentions an F1 metric (case-insensitive)."""
    lowered = line.lower()
    return 'f1_macro' in lowered or 'eval_f1' in lowered


# Check training logs
print("🔍 Checking Training Logs...")
log_file = Path('logs/train_comprehensive_multidataset.log')
if log_file.exists():
    print(f"✅ Training log found: {log_file}")

    # Filter first, then keep the last 10 matches. Truncating to the last 10
    # raw lines before filtering usually leaves no metric lines to show.
    # The file is streamed rather than loaded whole, and errors='replace'
    # tolerates a partially written multi-byte character at the end of a log
    # that is still being appended to.
    with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
        recent_metrics = deque(
            (line.strip() for line in f if is_metric_line(line)),
            maxlen=10,
        )

    print("\n📈 Recent Training Progress:")
    if recent_metrics:
        for line in recent_metrics:
            print(f"   {line}")
    else:
        print("   ⚠️  No F1 metrics found in the log yet")
else:
    print("⚠️  Training log not found")

# Check scientific logs
# Locates the most recently modified experiment directory under logs/ and
# prints its evaluation history and summary. Missing directories, unreadable
# JSON and non-numeric metric values are reported instead of crashing the cell.
print("\n🔬 Checking Scientific Logs...")


def format_metric(value):
    """Format a metric for display: numbers get four decimals, anything else str()."""
    if isinstance(value, (int, float)):
        return f"{value:.4f}"
    return str(value)


def load_json_or_none(path):
    """Load JSON from *path*; print a warning and return None if unreadable or invalid."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError is a ValueError; a file that is being rewritten
        # while this cell runs can be temporarily invalid.
        print(f"   ⚠️  Could not read {path.name}: {e}")
        return None


logs_dir = Path('logs')
# iterdir() raises when the directory does not exist, so guard it.
if logs_dir.is_dir():
    experiment_dirs = [d for d in logs_dir.iterdir() if d.is_dir() and 'MultiDataset_BCE' in d.name]
else:
    experiment_dirs = []

if experiment_dirs:
    latest_experiment = max(experiment_dirs, key=lambda x: x.stat().st_mtime)
    print(f"✅ Latest experiment: {latest_experiment.name}")

    # Check evaluation log
    eval_log = latest_experiment / 'evaluation_log.json'
    eval_data = load_json_or_none(eval_log) if eval_log.exists() else None
    if isinstance(eval_data, list):
        print(f"\n📊 Evaluation Results:")
        for entry in eval_data[-3:]:  # Last 3 evaluations
            if not isinstance(entry, dict):
                continue
            metrics = entry.get('metrics') or {}
            epoch = entry.get('epoch', 'Unknown')
            print(f"   Epoch {epoch}:")
            for metric, value in metrics.items():
                print(f"      {metric}: {format_metric(value)}")
    elif eval_data is not None:
        print("   ⚠️  evaluation_log.json does not contain a list of evaluations")

    # Check experiment summary
    summary_file = latest_experiment / 'experiment_summary.json'
    summary = load_json_or_none(summary_file) if summary_file.exists() else None
    if isinstance(summary, dict):
        print(f"\n📋 Experiment Summary:")
        print(f"   🆔 ID: {summary.get('experiment_id', 'Unknown')}")
        print(f"   📅 Timestamp: {summary.get('timestamp', 'Unknown')}")
        print(f"   📈 Training steps: {summary.get('total_training_steps', 'Unknown')}")
        print(f"   📊 Evaluations: {summary.get('total_evaluations', 'Unknown')}")

        final_metrics = summary.get('final_metrics')
        if isinstance(final_metrics, dict) and final_metrics:
            print(f"\n🎯 Final Metrics:")
            for metric, value in final_metrics.items():
                print(f"   {metric}: {format_metric(value)}")

            # Check against baseline
            f1_macro = final_metrics.get('f1_macro', 0)
            baseline = 0.5179  # 51.79% baseline
            if not isinstance(f1_macro, (int, float)):
                print(f"\n⚠️  F1-macro is not numeric: {f1_macro!r}")
            elif f1_macro > baseline:
                improvement = ((f1_macro - baseline) / baseline) * 100
                print(f"\n🎉 SUCCESS! F1-macro improved by {improvement:.1f}% over baseline")
            else:
                print(f"\n⚠️  F1-macro ({f1_macro:.4f}) below baseline ({baseline:.4f})")
    elif summary is not None:
        print("   ⚠️  experiment_summary.json does not contain a JSON object")

else:
    print("⚠️  No experiment logs found")

# Check model outputs
# Lists the direct children of the model output directory with their size in
# MB; entries that are not regular files are shown with a size of 0.
print("\n🤖 Checking Model Outputs...")
output_dir = Path('outputs/multidataset')
if not output_dir.exists():
    print("⚠️  Model output directory not found")
else:
    print(f"✅ Model output directory: {output_dir}")

    # List model files
    model_files = list(output_dir.glob('*'))
    if not model_files:
        print("   ⚠️  No model files found")
    else:
        print("   📄 Model files:")
        for entry in model_files:
            if entry.is_file():
                size_mb = entry.stat().st_size / (1024 * 1024)
            else:
                size_mb = 0
            print(f"      - {entry.name} ({size_mb:.1f} MB)")

# Check Google Drive backup
# Lists the remote training folder with rclone and shows up to the last three
# entries whose listing line mentions this experiment family.
print("\n☁️  Checking Google Drive Backup...")
remote_path = 'drive:00_Projects/🎯 TechLabs-2025/Final_Project/TRAINING/'
try:
    listing = subprocess.run(['rclone', 'ls', remote_path],
                             capture_output=True, text=True, timeout=30)
    if listing.returncode != 0:
        print("⚠️  Google Drive not accessible")
    else:
        print("✅ Google Drive accessible")

        # Look for our experiment
        multidataset_backups = [
            entry
            for entry in listing.stdout.strip().split('\n')
            if 'MultiDataset_BCE' in entry
        ]

        if not multidataset_backups:
            print("   ⚠️  No MultiDataset backups found")
        else:
            print(f"   📤 Found {len(multidataset_backups)} MultiDataset backups:")
            for entry in multidataset_backups[-3:]:  # Last 3
                print(f"      - {entry}")
except Exception as exc:
    # Covers a missing rclone binary and timeouts; the check is best-effort.
    print(f"⚠️  Google Drive check failed: {exc}")

footer = "=" * 50
print("\n" + footer)
print("📊 Analysis Complete!")
print(footer)

## 🔧 **TROUBLESHOOTING & MAINTENANCE**

**Use this cell for:**
- ✅ System diagnostics and health checks
- ✅ Cleanup and maintenance tasks
- ✅ Manual backup and recovery
- ✅ Performance optimization

In [None]:
# 🔧 TROUBLESHOOTING & MAINTENANCE
# Prints the header and reports free space / usage of the root filesystem.
print("🔧 SAMo Multi-Dataset Troubleshooting", "=" * 45, sep="\n")

import shutil
import subprocess
from pathlib import Path

# System diagnostics
print("🖥️  System Diagnostics:")

# Check disk space
try:
    usage = shutil.disk_usage('/')
    free_gb = usage.free / (1024**3)
    used_percent = (usage.used / usage.total) * 100

    print(f"   💾 Disk space: {free_gb:.1f}GB free, {used_percent:.1f}% used")

    # Low absolute free space takes precedence over a high usage percentage.
    if free_gb < 10:
        verdict = "   ⚠️  WARNING: Low disk space!"
    elif used_percent > 85:
        verdict = "   ⚠️  WARNING: High disk usage!"
    else:
        verdict = "   ✅ Disk space OK"
    print(verdict)
except Exception as exc:
    print(f"   ❌ Disk check failed: {exc}")

# Check GPU memory
# Reports per-GPU memory usage from nvidia-smi. Each CSV row is parsed on its
# own, so one malformed row no longer hides the rows for the remaining GPUs.
print("\n🎮 GPU Memory Status:")


def parse_gpu_memory(line):
    """Parse one "total, used, free" CSV row into a tuple of ints.

    Returns None when the row does not hold exactly three integers or when
    the total is not positive (which would make the percentage undefined).
    """
    fields = [field.strip() for field in line.split(',')]
    if len(fields) != 3:
        return None
    try:
        total, used, free = (int(field) for field in fields)
    except ValueError:
        return None
    if total <= 0:
        return None
    return total, used, free


try:
    result = subprocess.run(['nvidia-smi', '--query-gpu=memory.total,memory.used,memory.free', '--format=csv,noheader,nounits'],
                          capture_output=True, text=True, timeout=10)
    if result.returncode == 0:
        gpu_memory = [row for row in result.stdout.strip().split('\n') if row.strip()]
        if not gpu_memory:
            print("   ⚠️  nvidia-smi reported no GPUs")
        for i, memory in enumerate(gpu_memory):
            parsed = parse_gpu_memory(memory)
            if parsed is None:
                print(f"   GPU {i}: ⚠️  Could not parse memory info: {memory!r}")
                continue
            total, used, _free = parsed
            used_percent = (used / total) * 100
            print(f"   GPU {i}: {used}MB/{total}MB used ({used_percent:.1f}%)")

            if used_percent > 90:
                print(f"      ⚠️  WARNING: High GPU memory usage!")
            elif used_percent > 70:
                print(f"      ⚠️  GPU memory getting full")
            else:
                print(f"      ✅ GPU memory OK")
    else:
        print("   ⚠️  nvidia-smi not available")
except Exception as e:
    # Missing binary or timeout: report and continue with the other checks.
    print(f"   ❌ GPU check failed: {e}")

# Check running processes
# Scans `ps aux` output for lines mentioning both "python" and "train" and
# shows at most the first three matches.
print("\n🔄 Running Processes:")
try:
    ps_result = subprocess.run(['ps', 'aux'], capture_output=True, text=True, timeout=10)
    if ps_result.returncode != 0:
        print("   ❌ Process check failed")
    else:
        python_processes = [
            entry
            for entry in ps_result.stdout.split('\n')
            if 'python' in entry and 'train' in entry
        ]
        if not python_processes:
            print("   ✅ No active training processes")
        else:
            print("   🔄 Active training processes:")
            for entry in python_processes[:3]:  # Show first 3
                print(f"      {entry.strip()}")
except Exception as exc:
    print(f"   ❌ Process check failed: {exc}")

# Cleanup options
# Reports the on-disk size of the main working directories and flags any that
# exceed 5 GB.
print("\n🧹 Cleanup Options:")
print("   📁 Check large directories:")


def directory_size_bytes(root):
    """Return the total size in bytes of all regular files below *root*.

    Files that disappear or become unreadable between discovery and stat()
    are skipped, so a scan that races with checkpoint rotation still returns
    a total instead of raising.
    """
    total = 0
    for entry in Path(root).rglob('*'):
        try:
            if entry.is_file():
                total += entry.stat().st_size
        except OSError:
            continue
    return total


directories_to_check = ['outputs', 'logs', 'models', 'data']
for dir_name in directories_to_check:
    dir_path = Path(dir_name)
    if dir_path.exists():
        total_size = directory_size_bytes(dir_path)
        size_gb = total_size / (1024**3)
        print(f"      {dir_name}/: {size_gb:.2f}GB")

        if size_gb > 5:
            print(f"         ⚠️  Large directory - consider cleanup")
    else:
        print(f"      {dir_name}/: Not found")

# Manual backup
# Static help text: how to trigger a backup by hand, followed by general
# performance advice and the closing footer of the troubleshooting cell.
for text in (
    "\n☁️  Manual Backup:",
    "   To manually backup to Google Drive:",
    "   ```bash",
    "   bash backup_to_gdrive.sh",
    "   ```",
):
    print(text)

# Performance tips
performance_tips = (
    "Use smaller batch size if GPU memory is low",
    "Reduce max_length if training is slow",
    "Enable gradient checkpointing for memory efficiency",
    "Use mixed precision training (fp16)",
    "Monitor GPU utilization with nvidia-smi",
)
print("\n⚡ Performance Tips:")
for tip in performance_tips:
    print("   • " + tip)

footer = "=" * 45
print("\n" + footer)
print("🔧 Troubleshooting Complete!")
print(footer)