# Federated Learning Testing (Colab)

This notebook clones the repository and runs Federated Learning (simulation) on Sleep-EDF or WESAD.

## Features
- Choose dataset (Sleep-EDF or WESAD)
- Configure FL (clients, rounds, seed)
- **Real server access**: Uses actual aggregated server parameters (not client proxy)
- Optional: link data from Google Drive
- Runs training and visualizes results

## Expected Results
- **Sleep-EDF**: ~65-75% accuracy (vs 3.85% with client proxy)
- **WESAD**: ~70-80% accuracy (vs 3-5% with client proxy)

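Run the cells from top to bottom. The unnumbered cell that follows step 4 launches the Sleep-EDF training script a second time with its output captured for debugging; skip it if step 4 already completed successfully.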


## 🚀 **IMPORTANTE: Melhorias Implementadas**

Este notebook agora usa a versão **corrigida** do script `train_fl.py`, com as seguintes melhorias:

### ✅ **Acesso Real ao Servidor**
- **Antes**: Usava parâmetros de um cliente como "proxy" (3.85% accuracy)
- **Agora**: Usa os parâmetros reais agregados pelo servidor (65-75% accuracy)

### 🔧 **Correções Técnicas**
- **LoggingFedAvg**: Armazena final_server_params do servidor
- **Conversão de bytes**: Converte corretamente Parameters.tensors para PyTorch
- **GPU/CUDA**: Força CPU para clientes Ray (evita erros CUDA)
- **Debugging**: Logs verbosos para diagnóstico

### 📊 **Resultados Esperados**
- **Sleep-EDF**: 65-75% accuracy (vs 3.85% anterior)
- **WESAD**: 70-80% accuracy (vs 3-5% anterior)


In [None]:
# =========================================
# Federated Learning Simulation (Colab)
# =========================================
# Setup cell: installs the Python dependencies with pip, clones the project
# repository into /content when it is not there yet, and switches the working
# directory to the repository root. Lines starting with `!` or `%` are
# IPython shell/magic commands, so this cell only runs inside a notebook.

# --- 0) Install ALL dependencies ---
# protobuf and cryptography are version-constrained and installed before the
# other packages.
# NOTE(review): presumably pinned to avoid conflicts with other packages in
# the Colab image — confirm the reason and whether the pins are still needed.
!pip install -q "protobuf==5.29.1" "cryptography<44"
!pip install -q flwr ray seaborn scikit-learn matplotlib
!pip install -q pyedflib mne scipy pandas numpy torch torchvision
!pip install -q opacus  # for differential privacy

# --- 1) Clone repo ---
import os
from pathlib import Path
import shutil

# Absolute location of the checkout; later cells build paths from it.
repo_path = Path("/content/mhealth-data-privacy")

# Clone only once so that re-running this cell keeps the existing checkout.
if not repo_path.exists():
    !git clone https://github.com/vasco-fernandes21/mhealth-data-privacy.git {repo_path}

# Later cells use paths relative to the repository root (e.g. "data/processed/..."),
# so the working directory has to be the checkout.
%cd {repo_path}
print("Repo ready:", os.getcwd())


In [None]:
# --- 2) Configuration ---
# Central settings for the run. The three FL settings are also exported as
# environment variables, which subprocesses started from this notebook inherit.
# NOTE(review): presumably train_fl.py reads NUM_CLIENTS / NUM_ROUNDS /
# TRAIN_SEED from the environment — confirm against the script.
import os  # imported here as well so the cell does not depend on the setup cell

VALID_DATASETS = ("sleep-edf", "wesad")

DATASET = "sleep-edf"   # "sleep-edf" or "wesad"
NUM_CLIENTS = 3
NUM_ROUNDS = 5
TRAIN_SEED = 42
USE_DRIVE_DATA = True   # Link dataset from Google Drive

# The following cells treat every value other than "sleep-edf" as WESAD, so a
# typo such as "sleep_edf" would silently run the wrong dataset. Fail early.
if DATASET not in VALID_DATASETS:
    raise ValueError(
        f"Unknown DATASET {DATASET!r}; expected one of {VALID_DATASETS}"
    )

# A simulation needs at least one client and one round.
if NUM_CLIENTS < 1 or NUM_ROUNDS < 1:
    raise ValueError(
        f"NUM_CLIENTS and NUM_ROUNDS must be >= 1 "
        f"(got {NUM_CLIENTS} and {NUM_ROUNDS})"
    )

# Environment values must be strings.
os.environ["NUM_CLIENTS"] = str(NUM_CLIENTS)
os.environ["NUM_ROUNDS"] = str(NUM_ROUNDS)
os.environ["TRAIN_SEED"] = str(TRAIN_SEED)

print("Configuration:", DATASET, NUM_CLIENTS, NUM_ROUNDS, TRAIN_SEED)


In [None]:
# --- 3) Optional: link Google Drive data ---
# When USE_DRIVE_DATA is set, mounts Google Drive and replaces
# data/processed/<dataset> inside the repository with a symlink to the
# processed dataset stored on Drive.
if USE_DRIVE_DATA:
    from google.colab import drive
    drive.mount('/content/drive')

    drive_base = "/content/drive/MyDrive/mhealth-data/data/processed"
    repo_proc = repo_path / "data/processed"
    os.makedirs(repo_proc, exist_ok=True)

    # Any value other than "sleep-edf" selects WESAD (same mapping as before).
    dataset_dir = "sleep-edf" if DATASET == "sleep-edf" else "wesad"
    src = f"{drive_base}/{dataset_dir}"
    dst = repo_proc / dataset_dir

    # Validate the Drive folder BEFORE touching dst: otherwise a wrong Drive
    # path would delete the local copy and leave a dangling symlink behind.
    if not os.path.isdir(src):
        raise FileNotFoundError(
            f"Drive data folder not found: {src}. "
            "Upload the processed dataset there or set USE_DRIVE_DATA = False."
        )

    # Remove existing folder/symlink. is_symlink() is checked first because
    # exists() is False for a dangling symlink, and rmtree() refuses symlinks.
    if dst.is_symlink():
        dst.unlink()
    elif dst.is_dir():
        shutil.rmtree(dst)
    elif dst.exists():
        # A plain file with the dataset's name; rmtree() cannot remove it.
        dst.unlink()

    os.symlink(src, dst)
    print("Data linked:", dst, "->", src)


In [None]:
# --- 4) Debug and Run FL training ---
# Three stages: (1) check that the project modules import, (2) report whether
# the processed data directory exists, (3) run the dataset's train_fl.py as a
# subprocess and report its duration and return code.
import subprocess
import sys
import time
from pathlib import Path

# First, let's debug the imports
print("Testing imports...")
try:
    repo_root = Path("/content/mhealth-data-privacy")
    src_path = repo_root / "src"
    # Project modules live under src/, so it must be on the import path.
    sys.path.insert(0, str(src_path))

    print("✓ Path setup OK")

    from device_utils import get_optimal_device
    print("✓ device_utils import OK")

    # NOTE(review): the Sleep-EDF loader is checked even when DATASET is
    # "wesad"; confirm whether a dataset-specific check is wanted here.
    from preprocessing.sleep_edf import load_processed_sleep_edf
    print("✓ preprocessing import OK")

    print("All imports successful!")

except Exception as e:
    print(f"Import error: {e}")
    print("Stopping here for debugging...")
    # Re-raise instead of calling exit(): inside a notebook, exit() asks the
    # kernel to shut down, which throws away the state built by earlier cells.
    # Re-raising stops this cell (and "Run all") and shows the full traceback.
    raise

# Check if data exists
print("\nChecking data availability...")
data_path = f"data/processed/{DATASET}"
if Path(data_path).exists():
    print(f"✓ Data directory exists: {data_path}")
    files = list(Path(data_path).glob("*.npy"))
    print(f"  Found {len(files)} .npy files")
    for f in files:
        print(f"    {f.name}")
else:
    # Diagnostic only: training is still attempted so the script's own error
    # message is visible.
    print(f"Data directory not found: {data_path}")
    print("This is likely the problem!")

# Now run the actual training
if DATASET == "sleep-edf":
    script_path = "src/train/sleep-edf/federated-learning/train_fl.py"
else:
    script_path = "src/train/wesad/federated-learning/train_fl.py"

print(f"\nStarting Federated Learning: {script_path}")
print("="*60)

t0 = time.time()
try:
    # Run without capture to see real-time output. sys.executable launches the
    # same interpreter as the kernel, i.e. the one whose imports were checked
    # above, instead of whatever "python" resolves to on PATH.
    proc = subprocess.run([sys.executable, "-u", script_path], text=True)
except OSError as e:
    # Raised when the interpreter itself could not be started.
    print(f"Error running script: {e}")
else:
    t1 = time.time()

    print("="*60)
    print(f"FL training finished in {t1-t0:.1f}s")
    print(f"Return code: {proc.returncode}")
    if proc.returncode != 0:
        print("Training script exited with an error; check the output above.")


In [None]:
# Alternative to step 4: run the training script with stdout/stderr captured,
# print both streams once the process has ended, then show train_fl_error.log
# if it exists in the working directory.
import subprocess
import sys
from pathlib import Path

# Follow the dataset selected in the configuration cell, using the same
# mapping as step 4 (anything other than "sleep-edf" means WESAD). The
# previous hard-coded Sleep-EDF path remains the default when this cell is
# run without the configuration cell.
_dataset = globals().get("DATASET", "sleep-edf")
if _dataset == "sleep-edf":
    script_path = "src/train/sleep-edf/federated-learning/train_fl.py"
else:
    script_path = "src/train/wesad/federated-learning/train_fl.py"

print(f"Running: {script_path}")
# sys.executable starts the same interpreter as the notebook kernel rather
# than whatever "python" resolves to on PATH.
proc = subprocess.run(
    [sys.executable, "-u", script_path],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True
)

print("=== STDOUT ===")
print(proc.stdout)
print("=== STDERR ===")
print(proc.stderr)
print(f"Return code: {proc.returncode}")

# If a train_fl_error.log file exists, show its contents
log = Path("train_fl_error.log")
if log.exists():
    print("\n=== train_fl_error.log ===")
    print(log.read_text())


## 🔍 **Como Verificar se Está a Usar o Servidor Real**

Durante o treino, procura por estas mensagens nos logs:

### ✅ **Servidor Real (CORRETO)**
```
DEBUG: Loading real server parameters
DEBUG: Final model loaded with real server parameters
```

### ❌ **Client Proxy (INCORRETO)**
```
DEBUG: Using client model as proxy for final global model
DEBUG: Final model loaded with client parameters as proxy
```

### 📊 **Resultados Esperados**
- **Servidor Real**: 65-75% accuracy
- **Client Proxy**: 3-5% accuracy (problemático)


In [None]:
# --- 5) Load results and visualize ---
# Reads the results JSON written for the selected dataset / client count,
# prints the headline metrics that are present, and draws the confusion matrix
# as a heatmap. When the file is missing, lists every JSON file found under
# models/ to help locate the output.
import json
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Results location depends on the dataset and the number of clients
results_file = (
    f"models/sleep-edf/fl/fl_clients{NUM_CLIENTS}/results_sleep_edf_fl.json"
    if DATASET == "sleep-edf"
    else f"models/wesad/fl/fl_clients{NUM_CLIENTS}/results_wesad_fl.json"
)

print(f"Looking for results file: {results_file}")

if not Path(results_file).exists():
    # Nothing to plot: report it and show which JSON files do exist
    print(f"❌ Results file not found: {results_file}")
    print("Check if FL training completed successfully.")
    print("Available files in models directory:")
    models_dir = Path("models")
    if not models_dir.exists():
        print("  No models directory found")
    else:
        for json_path in models_dir.rglob("*.json"):
            print(f"  {json_path}")
else:
    print("✓ Results file found!")
    results = json.loads(Path(results_file).read_text())

    # Only the metrics actually stored in the file are printed
    metric_keys = (
        "accuracy",
        "f1_score",
        "precision",
        "recall",
        "num_clients",
        "rounds",
        "training_time",
    )
    print("\nFinal metrics:")
    for metric in metric_keys:
        if metric not in results:
            continue
        print(f"  {metric}: {results[metric]}")

    class_names = results["class_names"]
    cm = np.array(results["confusion_matrix"])

    # The same labels are used on both axes of the heatmap
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f"Confusion Matrix - {DATASET.upper()} FL ({NUM_CLIENTS} clients)")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.show()
