# AFML Part 1 - Team 44_XLR8 (v2 OPTIMIZED)
## Target: Val Loss < 0.005 and NMSE < 0.3

**Strategy**: NO normalization + Ensemble + Deeper network

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Fix both RNG seeds so data splits and weight initialisation are repeatable.
np.random.seed(42)
torch.manual_seed(42)

# Probe the accelerators in priority order (Apple MPS, then CUDA) and fall
# back to the CPU when neither reports itself as available.
_backends = (
    (torch.backends.mps.is_available, 'mps', "✅ M2 GPU"),
    (torch.cuda.is_available, 'cuda', "✅ CUDA GPU"),
)
for _is_available, _name, _banner in _backends:
    if _is_available():
        device = torch.device(_name)
        print(_banner)
        break
else:
    device = torch.device('cpu')
    print("⚠️  CPU")

print(f"Device: {device}")

In [None]:
def _read_matrix(csv_path):
    """Read a CSV file and return its values as a float32 NumPy array."""
    return pd.read_csv(csv_path).values.astype(np.float32)


print("Loading...")
# Clean targets, their noisy counterparts, and the noisy test inputs.
train_clean = _read_matrix('train-part1-clean.csv')
train_noise = _read_matrix('train-part1-noise.csv')
test_data = _read_matrix('test-part1.csv')

print(f"Clean: {train_clean.shape}")
print(f"Noisy: {train_noise.shape}")
print(f"Test: {test_data.shape}")

## NO NORMALIZATION - Direct Learning

In [None]:
# Hold out 15% of the (noisy, clean) pairs for validation. The fixed
# random_state keeps the split reproducible; values are left unscaled.
X_train, X_val, y_train, y_val = train_test_split(
    train_noise,
    train_clean,
    test_size=0.15,
    random_state=42,
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}")

# Wrap every array as a float tensor in one pass (order matches the targets).
X_train_t, y_train_t, X_val_t, y_val_t, test_t = (
    torch.FloatTensor(array)
    for array in (X_train, y_train, X_val, y_val, test_data)
)

print("✓ No normalization - raw data")

## Deeper Model with More Capacity

In [None]:
class DeepDenoiser(nn.Module):
    """Fully connected encoder/decoder mapping a noisy vector to a denoised one.

    The encoder goes ``input_dim -> 512 -> 1024 -> 1024 -> 512`` and the
    decoder goes ``512 -> 512 -> 256 -> input_dim``. Every hidden layer is
    followed by ReLU and ``Dropout(0.05)``; the final projection back to
    ``input_dim`` has no activation.

    Args:
        input_dim: Width of both the input and the output vectors.
    """

    def __init__(self, input_dim=20):
        super().__init__()

        def hidden_stack(widths):
            # One Linear -> ReLU -> Dropout triple per consecutive width pair.
            layers = []
            for n_in, n_out in zip(widths[:-1], widths[1:]):
                layers += [nn.Linear(n_in, n_out), nn.ReLU(), nn.Dropout(0.05)]
            return layers

        self.encoder = nn.Sequential(
            *hidden_stack([input_dim, 512, 1024, 1024, 512])
        )
        self.decoder = nn.Sequential(
            *hidden_stack([512, 512, 256]),
            nn.Linear(256, input_dim),
        )

    def forward(self, x):
        """Run ``x`` through the encoder and then the decoder."""
        return self.decoder(self.encoder(x))

print("Creating ensemble of 5 models...")
# Size the networks from the loaded data rather than the class default, so the
# ensemble still matches the inputs if the CSVs do not have exactly 20 columns.
models = [
    DeepDenoiser(input_dim=train_noise.shape[1]).to(device) for _ in range(5)
]
# Every member shares one architecture, so count the parameters once.
params_per_model = sum(p.numel() for p in models[0].parameters())
print(f"Params per model: {params_per_model:,}")
print(f"Total params: {params_per_model * len(models):,}")

## Training Setup - Aggressive

In [None]:
# Hyperparameters shared by every ensemble member.
BATCH_SIZE = 4096  # samples per optimisation step
NUM_EPOCHS = 120   # maximum number of passes over the training set
LR = 0.0003        # initial AdamW learning rate

# Training batches are reshuffled every epoch; the trailing partial batch is
# dropped. Validation keeps its order and every sample.
train_loader = DataLoader(
    TensorDataset(X_train_t, y_train_t),
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)

val_loader = DataLoader(
    TensorDataset(X_val_t, y_val_t),
    batch_size=BATCH_SIZE,
    shuffle=False,
)

criterion = nn.MSELoss()

# One optimizer per model, and one plateau scheduler per optimizer that halves
# the learning rate after 8 epochs without improvement in the stepped metric.
optimizers = []
schedulers = []
for model in models:
    optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-6)
    optimizers.append(optimizer)
    schedulers.append(
        optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.5, patience=8
        )
    )

print(f"Batches/epoch: {len(train_loader)}")

## Train Ensemble

In [None]:
# Per-model bookkeeping is sized from the ensemble itself, not a literal 5.
n_models = len(models)
best_val_losses = [float('inf')] * n_models
patience_counters = [0] * n_models
MAX_PATIENCE = 25

# An empty loader would otherwise surface later as a bare ZeroDivisionError
# (train_loader drops its last partial batch, so a small dataset can yield 0).
if len(train_loader) == 0 or len(val_loader) == 0:
    raise RuntimeError(
        "train_loader and val_loader must each yield at least one batch; "
        "reduce BATCH_SIZE or provide more data"
    )

for epoch in range(NUM_EPOCHS):
    # Train: every model sees the same batch, each with its own optimizer.
    for model in models:
        model.train()

    train_losses = [0.0] * n_models

    for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False):
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        for i, (model, optimizer) in enumerate(zip(models, optimizers)):
            pred = model(X_batch)
            loss = criterion(pred, y_batch)

            optimizer.zero_grad()
            loss.backward()
            # Cap the gradient norm at 0.5 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            train_losses[i] += loss.item()

    # Training batches all have the same size (drop_last=True), so the plain
    # mean over batches is the per-sample mean.
    train_losses = [tl / len(train_loader) for tl in train_losses]

    # Validate
    for model in models:
        model.eval()

    val_losses = [0.0] * n_models
    n_val_seen = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            batch_n = X_batch.size(0)
            n_val_seen += batch_n

            for i, model in enumerate(models):
                loss = criterion(model(X_batch), y_batch)
                # Weight by batch size: the final validation batch may be
                # smaller, and a mean of batch means would over-weight it.
                val_losses[i] += loss.item() * batch_n

    val_losses = [vl / n_val_seen for vl in val_losses]

    # Update schedulers with each model's own validation loss.
    for scheduler, val_loss in zip(schedulers, val_losses):
        scheduler.step(val_loss)

    # Save best: checkpoint a model whenever it beats its own record;
    # otherwise count one more epoch without improvement.
    saved = []
    for i, (model, val_loss) in enumerate(zip(models, val_losses)):
        if val_loss < best_val_losses[i]:
            best_val_losses[i] = val_loss
            torch.save(model.state_dict(), f'model_{i}.pth')
            patience_counters[i] = 0
            saved.append(i)
        else:
            patience_counters[i] += 1

    avg_train = np.mean(train_losses)
    avg_val = np.mean(val_losses)

    if saved:
        print(f"✓ Epoch {epoch+1} - Train: {avg_train:.6f}, Val: {avg_val:.6f} [SAVED: {saved}]")
    elif (epoch+1) % 10 == 0:
        print(f"  Epoch {epoch+1} - Train: {avg_train:.6f}, Val: {avg_val:.6f}")

    # Early stopping only once every model has exhausted its patience.
    if all(p >= MAX_PATIENCE for p in patience_counters):
        print(f"\nEarly stop at epoch {epoch+1}")
        break

print(f"\nBest val losses: {[f'{v:.6f}' for v in best_val_losses]}")
print(f"Average: {np.mean(best_val_losses):.6f}")

## Ensemble Prediction with TTA

In [None]:
# Restore each member's best checkpoint and switch it to inference mode so
# Dropout is disabled during prediction.
for i, model in enumerate(models):
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on one backend still loads on another.
    state_dict = torch.load(f'model_{i}.pth', map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

print("Predicting with ensemble + TTA...")

def predict_ensemble_tta(models, data_tensor, n_tta=3, noise_std=0.005):
    """Average ensemble predictions with test-time augmentation (TTA).

    Each model predicts once on the unmodified input and ``n_tta - 1`` more
    times on copies perturbed with Gaussian noise; all predictions from all
    models are averaged with equal weight.

    The input is moved to the device each model's parameters live on, so the
    function does not depend on any module-level device variable. Models are
    run in whatever mode they are already in; call ``model.eval()`` first if
    Dropout should be disabled.

    Args:
        models: Iterable of ``torch.nn.Module`` ensemble members.
        data_tensor: Input batch fed to every model.
        n_tta: Predictions per model. Values below 1 behave like 1 (only the
            clean prediction is made).
        noise_std: Standard deviation of the TTA noise added to the input.

    Returns:
        NumPy array holding the mean prediction.

    Raises:
        ValueError: If ``models`` is empty (the mean would be undefined).
    """
    models = list(models)
    if not models:
        raise ValueError("predict_ensemble_tta needs at least one model")

    inputs_by_device = {}  # avoid re-copying the input for models that share a device
    total = None
    n_preds = 0

    with torch.no_grad():
        for model in models:
            first_param = next(model.parameters(), None)
            target = data_tensor.device if first_param is None else first_param.device
            inputs = inputs_by_device.get(target)
            if inputs is None:
                inputs = inputs_by_device[target] = data_tensor.to(target)

            # Clean prediction first, then the noisy TTA passes, summed on the
            # model's device so only one transfer per model is needed.
            model_sum = model(inputs)
            n_model = 1
            for _ in range(n_tta - 1):
                noise = torch.randn_like(inputs) * noise_std
                model_sum = model_sum + model(inputs + noise)
                n_model += 1

            # Accumulate on the CPU so members on different devices can mix.
            model_sum = model_sum.cpu()
            total = model_sum if total is None else total + model_sum
            n_preds += n_model

        return (total / n_preds).numpy()

# Predict the test set first and the validation set second; the order fixes
# which random TTA noise each of them receives.
test_pred, val_pred = (
    predict_ensemble_tta(models, inputs, n_tta=3)
    for inputs in (test_t, X_val_t)
)

print("✓ Ensemble + TTA done")

## Calculate NMSE

In [None]:
# NMSE here is the validation MSE divided by the variance of the clean
# validation targets (both taken over all elements).
mse = np.square(y_val - val_pred).mean()
variance = np.var(y_val)
nmse = mse / variance

print(f"\n{'='*70}")
print(f"VALIDATION NMSE: {nmse:.6f}")
print(f"Target: < 0.3")
print(f"{'='*70}")

# Pick the verdict line first, then print it once.
if nmse < 0.3:
    verdict = f"🎉 SUCCESS! NMSE < 0.3!"
elif nmse < 0.5:
    verdict = f"⚠️  Close! NMSE = {nmse:.4f}"
else:
    verdict = f"❌ NMSE too high: {nmse:.4f}"
print(verdict)

print(f"\nMSE: {mse:.8f}")
print(f"Variance: {variance:.8f}")
print(f"Avg Val Loss: {np.mean(best_val_losses):.6f}")
print(f"Best individual: {min(best_val_losses):.6f}")

## Save

In [None]:
# Write the averaged test predictions to CSV without the row index.
submission = pd.DataFrame(data=test_pred)
submission.to_csv(path_or_buf='submission.csv', index=False)
print("\n✓ Saved: submission.csv", f"Shape: {submission.shape}", sep="\n")
# Last expression of the cell: shows the first rows in the notebook output.
submission.head()

## Visualize

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 4))

# Left panel: predicted vs. actual for the first 500 validation rows, one
# scatter series per feature column. The column count comes from the data
# rather than a hard-coded 20.
plt.subplot(1, 2, 1)
for i in range(y_val.shape[1]):
    plt.scatter(y_val[:500, i], val_pred[:500, i], alpha=0.5, s=2)
# Identity line spanning the full target range (a perfect prediction lies on it).
y_lo, y_hi = y_val.min(), y_val.max()
plt.plot([y_lo, y_hi], [y_lo, y_hi], 'r--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title(f'NMSE={nmse:.4f}')
plt.grid(True, alpha=0.3)

# Right panel: histogram of absolute errors over all elements, log-scaled
# counts so the tail stays visible.
plt.subplot(1, 2, 2)
errors = np.abs(y_val - val_pred)
plt.hist(errors.flatten(), bins=100, alpha=0.7, edgecolor='black')
plt.xlabel('Absolute Error')
plt.ylabel('Frequency')
plt.title('Error Distribution')
plt.yscale('log')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nFinal NMSE: {nmse:.6f}")
print(f"MAE: {np.mean(errors):.6f}")
print(f"Median AE: {np.median(errors):.6f}")

## Summary

**v2 OPTIMIZED Improvements:**
1. ✅ NO normalization (like v1 - fast learning)
2. ✅ 5 models ensemble (more averaging)
3. ✅ Deeper, wider network (7 linear layers, hidden widths up to 1024)
4. ✅ Lower dropout (0.05 vs 0.1)
5. ✅ Larger batches (4096)
6. ✅ More epochs (120)
7. ✅ TTA (3x per model = 15 predictions averaged)
8. ✅ ReduceLROnPlateau scheduler

**Expected:**
- Val Loss: 0.003-0.005 (like v1)
- NMSE: 0.20-0.30 (better than v1's 0.66 due to ensemble)