# 🏥 mHealth Privacy-Utility: Complete Training Pipeline

**Executa todos os cenários de treino:**
- ✅ Baseline (sem privacidade)
- ✅ DP (Differential Privacy, ε=1.0)
- ✅ FL (Federated Learning, 5 clientes)
- ✅ FL+DP (Federated + Differential Privacy)

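**Pressupostos:**
- Dados pré-processados em `MyDrive/mhealth-data/processed/`
- Projeto em `MyDrive/mhealth-privacy/` (nota: a secção 3 clona o repositório para `/content/mhealth-data-privacy/`, e é a partir dessa pasta que os scripts de treino são executados)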

## 1. Setup: Mount Drive

In [None]:
from google.colab import drive
import os
from pathlib import Path

# Attach Google Drive to the Colab filesystem so later cells can read from it.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)
print('✅ Google Drive mounted')

## 2. Setup: Define Paths & Verify Structure

In [None]:
from pathlib import Path

# Locations on the mounted Drive that the rest of the notebook refers to.
DRIVE_BASE = Path('/content/drive/MyDrive')
PROJECT_DIR = DRIVE_BASE / 'mhealth-privacy'
DATA_BASE = DRIVE_BASE / 'mhealth-data'

# Processed dataset folders.
sleep_edf_dir = DATA_BASE / 'processed' / 'sleep-edf'
wesad_dir = DATA_BASE / 'processed' / 'wesad'


def _report_data_files(directory, limit=5):
    """Print the number of .npy/.pkl files in `directory` and the first `limit` names."""
    found = [*directory.glob('*.npy'), *directory.glob('*.pkl')]
    print(f'      Files: {len(found)}')
    for entry in sorted(found)[:limit]:
        print(f'      - {entry.name}')


print('📁 Drive Structure Check:')
print(f'   Project: {PROJECT_DIR.exists()} → {PROJECT_DIR}')
print(f'   Data: {DATA_BASE.exists()} → {DATA_BASE}')

# Report each processed dataset folder; files are listed only when it exists.
print(f'\n📊 Processed Data Check:')
print(f'   Sleep-EDF: {sleep_edf_dir.exists()}')
if sleep_edf_dir.exists():
    _report_data_files(sleep_edf_dir)

print(f'\n   WESAD: {wesad_dir.exists()}')
if wesad_dir.exists():
    _report_data_files(wesad_dir)

# Check that the expected project sub-folders are present.
print(f'\n🏗️  Project Structure Check:')
for _subdir in ('src', 'scripts', 'experiments', 'src/configs'):
    print(f'   {_subdir}/: {(PROJECT_DIR / _subdir).exists()}')

## 3. Setup: Clone Project (if needed)

In [None]:
import subprocess
import os
from pathlib import Path

# Clone the project into the Colab VM (if it is not there yet) and make it the
# working directory, so later cells can use relative paths such as
# `scripts/...` and `./results`.
#
# NOTE: DRIVE_BASE and PROJECT_DIR are rebound here. From this cell onwards they
# point at the local checkout under /content, no longer at the mounted Drive.
REPO_URL = 'https://github.com/vasco-fernandes21/mhealth-data-privacy.git'
DRIVE_BASE = Path('/content')
PROJECT_DIR = DRIVE_BASE / 'mhealth-data-privacy'

if not PROJECT_DIR.exists():
    print('📥 Cloning project...')
    # Clone straight into PROJECT_DIR; no chdir is needed before cloning.
    result = subprocess.run(
        ['git', 'clone', REPO_URL, str(PROJECT_DIR)],
        capture_output=True,
        text=True,
    )

    if result.returncode != 0:
        print(f'❌ Clone failed: {result.stderr}')
        # Stop here: continuing would fail in os.chdir() below with a
        # FileNotFoundError that hides the real cause.
        raise RuntimeError(
            f'git clone failed with exit code {result.returncode}'
        )
    print('✅ Project cloned')
else:
    print('✅ Project already exists')

os.chdir(PROJECT_DIR)
print(f'✅ Working directory: {PROJECT_DIR}')

## 4. Setup: Install Dependencies

In [None]:
# Install minimal dependencies (most should be in Colab).
# `!` hands the line to the shell: pip installs/upgrades (-U) opacus,
# scikit-learn, pyyaml and tqdm with reduced output (-q).
!pip install -q  opacus scikit-learn pyyaml tqdm -U
# NOTE: this message is printed unconditionally; pip's exit status is not checked.
print('✅ Dependencies installed')

## 5. Setup: Python Environment

In [None]:
import sys
import torch
import random
import numpy as np

# Put the project checkout first on the import path.
sys.path.insert(0, str(PROJECT_DIR))

# Seed the Python, NumPy and PyTorch RNGs with the same value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Select the compute device; CUDA RNGs are seeded only when a GPU is present.
_has_cuda = torch.cuda.is_available()
if _has_cuda:
    torch.cuda.manual_seed_all(SEED)
DEVICE = 'cuda' if _has_cuda else 'cpu'

# Report the environment; GPU details are shown only when running on CUDA.
_env_report = [
    f'✅ Environment setup:',
    f'   Device: {DEVICE}',
    f'   Seed: {SEED}',
]
if DEVICE == 'cuda':
    _env_report.append(f'   GPU: {torch.cuda.get_device_name(0)}')
    _env_report.append(
        f'   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB'
    )
for _line in _env_report:
    print(_line)

## 6. Configuration: Select Dataset & Scenarios

In [None]:
# ============================================================================
# ⚙️ CONFIGURE HERE
# ============================================================================

# Dataset to use: 'sleep-edf' or 'wesad'.
DATASET = 'sleep-edf'

# Switches selecting which training scenarios the following cells execute.
RUN_SCENARIOS = dict(
    baseline=True,  # no privacy
    dp=True,        # Differential Privacy
    fl=True,        # Federated Learning
    fl_dp=True,     # FL + DP
)

# Parameters shared by all scenarios (epochs reduced for Colab).
TRAIN_PARAMS = dict(
    epochs=20,
    batch_size=32,
    learning_rate=0.001,
    seed=SEED,
    device=DEVICE,
)

# Differential-privacy parameters.
DP_PARAMS = dict(
    epsilon=1.0,
    delta=1e-5,
    max_grad_norm=1.0,
)

# Federated-learning parameters.
FL_PARAMS = dict(
    n_clients=5,
    local_epochs=3,
    global_rounds=20,
)

# Echo the active configuration.
_enabled = [name for name, switched_on in RUN_SCENARIOS.items() if switched_on]
print('\n'.join([
    '⚙️ Configuration:',
    f'   Dataset: {DATASET}',
    f'   Scenarios: {_enabled}',
    f'   Epochs: {TRAIN_PARAMS["epochs"]}',
    f'   DP Epsilon: {DP_PARAMS["epsilon"]}',
    f'   FL Clients: {FL_PARAMS["n_clients"]}',
]))

## 7. Training: Baseline

In [None]:
# Run the baseline (no privacy) training script as a child process and report
# whether it succeeded.
if RUN_SCENARIOS['baseline']:
    # Imported here so the cell does not rely on imports made by other cells.
    import shlex
    import subprocess
    import sys

    print('\n' + '='*70)
    print('🚀 BASELINE TRAINING')
    print('='*70)

    # Argument list (no shell): values are passed verbatim, so nothing is
    # re-interpreted by a shell. sys.executable runs the script with the same
    # interpreter as this notebook.
    # NOTE(review): only dataset, seed and device are forwarded; the other
    # TRAIN_PARAMS entries (epochs, batch_size, learning_rate) are not passed
    # to the script — confirm it takes them from its own configuration.
    cmd = [
        sys.executable, 'scripts/train_baseline.py',
        '--dataset', str(DATASET),
        '--seed', str(TRAIN_PARAMS['seed']),
        '--device', str(TRAIN_PARAMS['device']),
    ]

    print(f'Command: {" ".join(shlex.quote(part) for part in cmd)}\n')
    # returncode is the script's real exit code (os.system returns an encoded
    # wait status on Unix, e.g. 256 for exit code 1).
    result = subprocess.run(cmd, check=False).returncode

    if result == 0:
        print('\n✅ Baseline training completed')
    else:
        print(f'\n❌ Baseline training failed (code: {result})')
else:
    print('⏭️  Baseline training skipped')

## 8. Training: Differential Privacy

In [None]:
# Run the differential-privacy training script as a child process and report
# whether it succeeded.
if RUN_SCENARIOS['dp']:
    # Imported here so the cell does not rely on imports made by other cells.
    import shlex
    import subprocess
    import sys

    print('\n' + '='*70)
    print('🔐 DIFFERENTIAL PRIVACY TRAINING')
    print('='*70)

    # Argument list (no shell): values are passed verbatim. sys.executable runs
    # the script with the same interpreter as this notebook.
    # NOTE(review): only epsilon is forwarded from DP_PARAMS; delta and
    # max_grad_norm are not passed — confirm the script's own defaults match.
    cmd = [
        sys.executable, 'scripts/train_dp.py',
        '--dataset', str(DATASET),
        '--epsilon', str(DP_PARAMS['epsilon']),
        '--seed', str(TRAIN_PARAMS['seed']),
        '--device', str(TRAIN_PARAMS['device']),
    ]

    print(f'Command: {" ".join(shlex.quote(part) for part in cmd)}\n')
    # returncode is the script's real exit code (os.system returns an encoded
    # wait status on Unix, e.g. 256 for exit code 1).
    result = subprocess.run(cmd, check=False).returncode

    if result == 0:
        print(f'\n✅ DP training completed (ε={DP_PARAMS["epsilon"]})')
    else:
        print(f'\n❌ DP training failed (code: {result})')
else:
    print('⏭️  DP training skipped')

## 9. Training: Federated Learning

In [None]:
# Run the federated-learning training script as a child process and report
# whether it succeeded.
if RUN_SCENARIOS['fl']:
    # Imported here so the cell does not rely on imports made by other cells.
    import shlex
    import subprocess
    import sys

    print('\n' + '='*70)
    print('🤝 FEDERATED LEARNING TRAINING')
    print('='*70)

    # Argument list (no shell): values are passed verbatim. sys.executable runs
    # the script with the same interpreter as this notebook.
    # NOTE(review): only n_clients is forwarded from FL_PARAMS; local_epochs
    # and global_rounds are not passed — confirm the script's own defaults.
    cmd = [
        sys.executable, 'scripts/train_fl.py',
        '--dataset', str(DATASET),
        '--n_clients', str(FL_PARAMS['n_clients']),
        '--seed', str(TRAIN_PARAMS['seed']),
        '--device', str(TRAIN_PARAMS['device']),
    ]

    print(f'Command: {" ".join(shlex.quote(part) for part in cmd)}\n')
    # returncode is the script's real exit code (os.system returns an encoded
    # wait status on Unix, e.g. 256 for exit code 1).
    result = subprocess.run(cmd, check=False).returncode

    if result == 0:
        print(f'\n✅ FL training completed ({FL_PARAMS["n_clients"]} clients)')
    else:
        print(f'\n❌ FL training failed (code: {result})')
else:
    print('⏭️  FL training skipped')

## 10. Training: Federated Learning + Differential Privacy

In [None]:
# Run the combined federated-learning + differential-privacy training script
# as a child process and report whether it succeeded.
if RUN_SCENARIOS['fl_dp']:
    # Imported here so the cell does not rely on imports made by other cells.
    import shlex
    import subprocess
    import sys

    print('\n' + '='*70)
    print('🔒 FEDERATED LEARNING + DIFFERENTIAL PRIVACY')
    print('='*70)

    # Argument list (no shell): values are passed verbatim. sys.executable runs
    # the script with the same interpreter as this notebook.
    # NOTE(review): only n_clients and epsilon are forwarded; the remaining
    # FL_PARAMS / DP_PARAMS entries are not passed — confirm script defaults.
    cmd = [
        sys.executable, 'scripts/train_fl_dp.py',
        '--dataset', str(DATASET),
        '--n_clients', str(FL_PARAMS['n_clients']),
        '--epsilon', str(DP_PARAMS['epsilon']),
        '--seed', str(TRAIN_PARAMS['seed']),
        '--device', str(TRAIN_PARAMS['device']),
    ]

    print(f'Command: {" ".join(shlex.quote(part) for part in cmd)}\n')
    # returncode is the script's real exit code (os.system returns an encoded
    # wait status on Unix, e.g. 256 for exit code 1).
    result = subprocess.run(cmd, check=False).returncode

    if result == 0:
        print(f'\n✅ FL+DP training completed')
    else:
        print(f'\n❌ FL+DP training failed (code: {result})')
else:
    print('⏭️  FL+DP training skipped')

## 11. Results: Load & Compare

In [None]:
import json
import pandas as pd
from pathlib import Path

# Collect the metrics each enabled scenario wrote to ./results (relative to the
# current working directory) into `results_dict`, then print a comparison table.
RESULTS_BASE = Path('./results')

print('\n' + '='*70)
print('📊 RESULTS SUMMARY')
print('='*70)

# scenario name -> {'accuracy', 'f1_score', 'training_time'}
results_dict = {}

# Follow the configured epsilon instead of a hard-coded 1.0, so the lookup
# still works after DP_PARAMS['epsilon'] is changed.
# Assumes the scripts name the directory `epsilon_<value>` using the same text
# form of epsilon that is passed on the command line — TODO confirm.
epsilon_dir = f'epsilon_{DP_PARAMS["epsilon"]}'

# (scenario, results file relative to RESULTS_BASE)
scenarios_to_check = [
    ('baseline', f'baseline/{DATASET}/results.json'),
    ('dp', f'dp/{epsilon_dir}/{DATASET}/results.json'),
    ('fl', f'fl/{DATASET}/results.json'),
    ('fl_dp', f'fl_dp/{epsilon_dir}/{DATASET}/results.json')
]

# Scenarios trained with differential privacy; their epsilon is reported too.
private_scenarios = ('dp', 'fl_dp')

for scenario_name, result_path in scenarios_to_check:
    if not RUN_SCENARIOS.get(scenario_name, True):
        continue

    full_path = RESULTS_BASE / result_path

    if not full_path.exists():
        print(f'⚠️  {scenario_name}: Results file not found at {full_path}')
        continue

    try:
        with open(full_path, encoding='utf-8') as f:
            results = json.load(f)

        # Metrics missing from the file fall back to 0.
        metrics = {
            'accuracy': results.get('accuracy', 0),
            'f1_score': results.get('f1_score', 0),
            'training_time': results.get('training_time_seconds', 0)
        }

        # Format before storing, so a file with non-numeric metrics is
        # reported here and never reaches the table or the plots.
        report = [
            f'\n✅ {scenario_name.upper()}',
            f'   Accuracy: {metrics["accuracy"]:.4f}',
            f'   F1-Score: {metrics["f1_score"]:.4f}',
            f'   Time: {metrics["training_time"]:.1f}s',
        ]

        if scenario_name in private_scenarios:
            epsilon = results.get('final_epsilon', results.get('epsilon', 'N/A'))
            report.append(f'   Privacy (ε): {epsilon}')

    except (OSError, ValueError, TypeError, AttributeError) as e:
        # Unreadable file, invalid JSON, or unexpected JSON structure/types.
        print(f'❌ Error loading {scenario_name}: {e}')
        continue

    results_dict[scenario_name] = metrics
    print('\n'.join(report))

# Create comparison table (one row per scenario).
if results_dict:
    df = pd.DataFrame(results_dict).T
    print(f'\n' + '='*70)
    print('COMPARISON TABLE')
    print('='*70)
    print(df.round(4).to_string())

## 12. Results: Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw side-by-side bar charts (accuracy, F1-score) for every scenario that
# has loaded results, save the figure as results_comparison.png and show it.
if results_dict:
    sns.set_style('whitegrid')
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    scenarios = list(results_dict.keys())

    # Colours are tied to the scenario name, so a scenario keeps its colour
    # even when others are skipped; unknown names fall back to gray.
    scenario_colors = {
        'baseline': 'green',
        'dp': 'orange',
        'fl': 'blue',
        'fl_dp': 'red',
    }
    bar_colors = [scenario_colors.get(name, 'gray') for name in scenarios]

    # (key in results_dict entries, label used on the axis and in the title)
    panels = [('accuracy', 'Accuracy'), ('f1_score', 'F1-Score')]

    for ax, (metric_key, metric_label) in zip(axes, panels):
        values = [results_dict[name][metric_key] for name in scenarios]

        bars = ax.bar(scenarios, values, color=bar_colors,
                      alpha=0.7, edgecolor='black', linewidth=2)
        ax.set_ylabel(metric_label, fontsize=12, fontweight='bold')
        ax.set_title(f'{DATASET.upper()} - {metric_label} Comparison',
                     fontsize=13, fontweight='bold')

        # Keep the [0.5, 1.0] window when every value fits in it; otherwise
        # lower the floor so bars below 0.5 remain visible.
        lowest = min(values)
        y_floor = 0.5 if lowest >= 0.5 else max(0.0, lowest - 0.05)
        ax.set_ylim([y_floor, 1.0])
        ax.grid(axis='y', alpha=0.3)

        # Print each value just above its bar.
        for bar, value in zip(bars, values):
            ax.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
                    f'{value:.3f}', ha='center', va='bottom',
                    fontsize=10, fontweight='bold')

    plt.tight_layout()
    plt.savefig('results_comparison.png', dpi=150, bbox_inches='tight')
    plt.show()
    print('✅ Visualization saved as results_comparison.png')
else:
    print('⚠️  No results to visualize')

## 13. Download Results

In [None]:
from google.colab import files
import shutil

print('\n' + '='*70)
print('📥 EXPORTING RESULTS')
print('='*70)

# Zip the results directory and send the archive to the browser for download.
if not RESULTS_BASE.exists():
    print('⚠️  No results directory found')
else:
    print('\nCreating results archive...')
    # Writes mhealth_results.zip into the current working directory.
    shutil.make_archive('mhealth_results', 'zip', RESULTS_BASE)
    print('✅ Archive created: mhealth_results.zip')

    files.download('mhealth_results.zip')
    print('✅ Downloaded to your computer!')

# The comparison figure is downloaded as well when it has been generated.
_figure_available = os.path.exists('results_comparison.png')
if _figure_available:
    files.download('results_comparison.png')
    print('✅ Downloaded visualization!')

## 14. Summary

In [None]:
# Final recap: what was configured, which scenarios produced results, the
# baseline-vs-DP accuracy gap, and where the results are stored.
print('\n' + '='*70)
print('✅ PIPELINE COMPLETE')
print('='*70)

print(f'\n📊 Execution Summary:')
print(f'   Dataset: {DATASET}')
print(f'   Scenarios run: {[s for s, v in RUN_SCENARIOS.items() if v]}')
# NOTE(review): this is the notebook's configured value; the training commands
# in this notebook do not pass it to the scripts — confirm it matches what the
# scripts actually used.
print(f'   Epochs: {TRAIN_PARAMS["epochs"]}')
print(f'   Results scenarios: {list(results_dict.keys()) if results_dict else "None"}')

if results_dict:
    # A missing scenario yields 0, which disables the comparison below.
    baseline_acc = results_dict.get('baseline', {}).get('accuracy', 0)
    dp_acc = results_dict.get('dp', {}).get('accuracy', 0)

    if baseline_acc > 0 and dp_acc > 0:
        # Absolute accuracy difference, expressed in percentage points.
        drop = (baseline_acc - dp_acc) * 100
        print(f'\n📈 Privacy-Utility Tradeoff:')
        print(f'   Baseline Accuracy: {baseline_acc:.4f}')
        # Report the configured epsilon rather than a hard-coded 1.0.
        print(f'   DP Accuracy (ε={DP_PARAMS["epsilon"]}): {dp_acc:.4f}')
        print(f'   Accuracy Drop: {drop:.2f}%')

print(f'\n📁 Results Location:')
# Show the absolute path: RESULTS_BASE is relative to the working directory.
print(f'   Local: {RESULTS_BASE.resolve()}')
# The previous message pointed at {DRIVE_BASE}/mhealth-privacy/results, a path
# nothing in this notebook writes to (DRIVE_BASE is /content after the clone
# cell). State the actual situation instead.
print(f'   Drive: not written by this notebook (copy the folder above to Drive to keep it)')

print(f'\n🚀 Next Steps:')
print(f'   1. Analyze results in results_comparison.png')
print(f'   2. Run with different epsilon values: [0.5, 1.0, 2.0, 5.0]')
print(f'   3. Vary n_clients for FL experiments')
print(f'   4. Generate paper plots and tables')

print('\n' + '='*70)