# AFML Part 1 - Team 44_XLR8 (v3 - AGGRESSIVE)
## New Strategy: Predict NOISE instead of CLEAN

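**Key Insight**: because `clean = noisy - noise`, the network only has to learn the noise residual, which is an easier target than the clean signal itself; the clean signal is then recovered by subtraction.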

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Seed NumPy and PyTorch with the same fixed value so runs are repeatable.
np.random.seed(42)
torch.manual_seed(42)

# Choose the compute backend: Apple MPS is tried first, then CUDA, and the
# CPU is the fallback. Each branch only records the backend name and the
# banner to print; the device object is built once afterwards.
if torch.backends.mps.is_available():
    _backend, _banner = 'mps', "✅ M2 GPU"
elif torch.cuda.is_available():
    _backend, _banner = 'cuda', "✅ CUDA GPU"
else:
    _backend, _banner = 'cpu', "⚠️  CPU"

device = torch.device(_backend)
print(_banner)

print(f"Device: {device}")

In [None]:
print("Loading...")


def _load_float32(csv_path):
    """Read ``csv_path`` with pandas and return its values as a float32 array."""
    return pd.read_csv(csv_path).values.astype(np.float32)


# Clean targets, their noisy counterparts, and the test inputs.
train_clean = _load_float32('train-part1-clean.csv')
train_noise = _load_float32('train-part1-noise.csv')
test_data = _load_float32('test-part1.csv')

# Report the shape of every array that was just loaded.
for _label, _array in (
    ("Clean", train_clean),
    ("Noisy", train_noise),
    ("Test", test_data),
):
    print(f"{_label}: {_array.shape}")

## NEW APPROACH: Predict NOISE, not CLEAN!

In [None]:
# Hold out 15% of the (noisy, clean) pairs for validation; the fixed
# random_state keeps the split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_noise,
    train_clean,
    test_size=0.15,
    random_state=42,
)

# Regression target is the residual: noise = noisy - clean.
noise_train, noise_val = X_train - y_train, X_val - y_val

print(f"Train: {X_train.shape}, Val: {X_val.shape}")
print(f"Noise range: [{noise_train.min():.4f}, {noise_train.max():.4f}]")

# Convert every array to a float tensor as-is (no normalization is applied).
X_train_t, noise_train_t, X_val_t, noise_val_t, test_t = (
    torch.FloatTensor(array)
    for array in (X_train, noise_train, X_val, noise_val, test_data)
)

print("✓ Predicting NOISE (residual learning)")

## Wider & Deeper Network

In [None]:
class NoisePredictor(nn.Module):
    """Fully-connected network that maps an input vector to a same-sized output.

    The network is a stack of ``Linear -> ReLU -> Dropout`` stages followed by
    a final ``Linear`` layer that projects back to ``input_dim``. In this
    notebook the output is trained to match the noise component of the input.

    Args:
        input_dim: Size of each input vector (and of each output vector).
        hidden_dims: Width of every hidden ``Linear`` layer, in order. The
            default reproduces the original fixed architecture
            (1024-2048-2048-1024-512), including the module indices inside
            ``self.net``, so previously saved ``state_dict`` files still load.
        dropout: Dropout probability applied after every hidden activation.
    """

    def __init__(self, input_dim=20, hidden_dims=(1024, 2048, 2048, 1024, 512), dropout=0.05):
        super().__init__()

        layers = []
        in_features = input_dim
        for width in hidden_dims:
            # One hidden stage; the three modules per stage keep the Linear
            # layers at indices 0, 3, 6, ... as in the hand-written version.
            layers += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout)]
            in_features = width

        # Output projection back to the input dimensionality (no activation).
        layers.append(nn.Linear(in_features, input_dim))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for ``x`` (last dimension ``input_dim``)."""
        return self.net(x)

print("Creating 7 models (more diversity)...")

# Build the seven ensemble members first, then move each one to the
# selected device (Module.to returns the same module object).
models = [NoisePredictor() for _ in range(7)]
models = [net.to(device) for net in models]

# All members share one architecture, so the first one is representative.
print(f"Params per model: {sum(weight.numel() for weight in models[0].parameters()):,}")

## Training Setup

In [None]:
# Hyperparameters shared by every ensemble member.
BATCH_SIZE, NUM_EPOCHS = 8192, 150
LR = 1e-4  # Lower LR for stability

# Training batches pair each noisy input with its noise target. They are
# reshuffled every epoch and the final partial batch is dropped.
train_loader = DataLoader(
    TensorDataset(X_train_t, noise_train_t),
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    num_workers=0,
)

# Validation batches keep their order and include the final partial batch.
val_loader = DataLoader(
    TensorDataset(X_val_t, noise_val_t),
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

# One shared loss function; one AdamW optimizer and one cosine-annealing
# schedule (spanning the full epoch budget) per model.
criterion = nn.MSELoss()
optimizers = [
    optim.AdamW(net.parameters(), lr=LR, weight_decay=1e-6)
    for net in models
]
schedulers = [
    optim.lr_scheduler.CosineAnnealingLR(adamw, T_max=NUM_EPOCHS, eta_min=1e-7)
    for adamw in optimizers
]

print(f"Batches/epoch: {len(train_loader)}")

## Train

In [None]:
# Train every ensemble member on the same batches, validate once per epoch,
# checkpoint each model whenever its own validation loss improves, and stop
# early once every model has gone MAX_PATIENCE epochs without improving.
num_models = len(models)  # derive the ensemble size instead of hard-coding it
best_val_losses = [float('inf')] * num_models
patience_counters = [0] * num_models
MAX_PATIENCE = 30

if len(train_loader) == 0:
    # With drop_last=True a training set smaller than one batch yields no
    # batches at all; fail with a clear message instead of dividing by zero.
    raise ValueError("train_loader yields no batches; reduce the batch size")

for epoch in range(NUM_EPOCHS):
    # ---- Train ----
    for model in models:
        model.train()

    train_losses = [0.0] * num_models  # per-model sum of batch losses

    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
    # Count batches explicitly: tqdm only refreshes `pbar.n` at its display
    # interval, so it can lag behind the number of batches really processed.
    for batch_idx, (X_batch, noise_batch) in enumerate(pbar, start=1):
        X_batch = X_batch.to(device)
        noise_batch = noise_batch.to(device)

        for i, (model, optimizer) in enumerate(zip(models, optimizers)):
            pred_noise = model(X_batch)
            loss = criterion(pred_noise, noise_batch)

            optimizer.zero_grad()
            loss.backward()
            # Cap the global gradient norm at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            train_losses[i] += loss.item()

        # Running mean loss across all models over the batches seen so far.
        pbar.set_postfix({'loss': f'{np.mean(train_losses) / batch_idx:.6f}'})

    train_losses = [tl / len(train_loader) for tl in train_losses]

    # ---- Validate ----
    for model in models:
        model.eval()

    val_losses = [0.0] * num_models  # per-model sum of (batch loss * rows)

    with torch.no_grad():
        for X_batch, noise_batch in val_loader:
            X_batch = X_batch.to(device)
            noise_batch = noise_batch.to(device)
            batch_rows = X_batch.size(0)

            for i, model in enumerate(models):
                pred_noise = model(X_batch)
                loss = criterion(pred_noise, noise_batch)
                # Weight by batch size so the smaller final batch does not
                # count as much as a full batch in the epoch average.
                val_losses[i] += loss.item() * batch_rows

    val_losses = [vl / len(val_loader.dataset) for vl in val_losses]

    # ---- Learning-rate schedules ----
    for scheduler in schedulers:
        scheduler.step()

    # ---- Checkpoint / patience bookkeeping ----
    saved = []
    for i, (model, val_loss) in enumerate(zip(models, val_losses)):
        if val_loss < best_val_losses[i]:
            best_val_losses[i] = val_loss
            torch.save(model.state_dict(), f'model_{i}.pth')
            patience_counters[i] = 0
            saved.append(i)
        else:
            patience_counters[i] += 1

    avg_train = np.mean(train_losses)
    avg_val = np.mean(val_losses)

    # Log every epoch that produced a checkpoint, otherwise every 10th epoch.
    if saved:
        print(f"✓ Epoch {epoch+1} - Train: {avg_train:.6f}, Val: {avg_val:.6f} [SAVED: {saved}]")
    elif (epoch+1) % 10 == 0:
        print(f"  Epoch {epoch+1} - Train: {avg_train:.6f}, Val: {avg_val:.6f}")

    # Stop only when every model has exhausted its patience.
    if all(p >= MAX_PATIENCE for p in patience_counters):
        print(f"\nEarly stop at epoch {epoch+1}")
        break

print(f"\nBest val losses: {[f'{v:.6f}' for v in best_val_losses]}")
print(f"Average: {np.mean(best_val_losses):.6f}")

## Predict: noise → clean

In [None]:
PRED_BATCH_SIZE = 8192  # rows per forward pass at inference time


def _predict_noise(model, inputs, target_device, batch_size=PRED_BATCH_SIZE):
    """Return ``model``'s predictions for ``inputs`` as a NumPy array.

    The inputs are processed in chunks of at most ``batch_size`` rows so that
    only one chunk's activations live on ``target_device`` at a time, instead
    of pushing the whole dataset through the network in one forward pass.

    Args:
        model: Module to evaluate; the caller is expected to have put it in
            eval mode.
        inputs: CPU tensor whose first dimension indexes the samples.
        target_device: Device the model lives on.
        batch_size: Maximum number of rows per forward pass.
    """
    outputs = []
    with torch.no_grad():
        for chunk in inputs.split(batch_size):
            outputs.append(model(chunk.to(target_device)).cpu())
    return torch.cat(outputs).numpy()


# Restore each model's best checkpoint. map_location places the weights on
# the current device even if the checkpoint was written from a different one.
for i, model in enumerate(models):
    model.load_state_dict(torch.load(f'model_{i}.pth', map_location=device))
    model.eval()

print("Predicting noise, then subtracting...")

# Predict noise with each model
test_noise_preds = [_predict_noise(model, test_t, device) for model in models]
val_noise_preds = [_predict_noise(model, X_val_t, device) for model in models]

# Average noise predictions across the ensemble
test_noise = np.mean(test_noise_preds, axis=0)
val_noise = np.mean(val_noise_preds, axis=0)

# Subtract noise to get clean: clean = noisy - noise
test_pred = test_data - test_noise
val_pred = X_val - val_noise

print("✓ Done")

## NMSE

In [None]:
# NMSE = mean squared error of the denoised validation predictions divided
# by the variance of the clean validation targets.
mse = np.mean(np.square(y_val - val_pred))
variance = y_val.var()
nmse = mse / variance

print("\n" + "=" * 70)
print(f"VALIDATION NMSE: {nmse:.6f}")
print("Target: < 0.3")
print("=" * 70)

# Three-way verdict: below 0.3 is a success, below 0.5 is close, anything
# else is reported as a miss.
print(
    "🎉 SUCCESS! NMSE < 0.3!" if nmse < 0.3
    else f"⚠️  Close! NMSE = {nmse:.4f}" if nmse < 0.5
    else f"❌ NMSE = {nmse:.4f}"
)

# Raw components, plus the mean of the per-model best validation losses
# (those are measured on the noise target, not on the clean signal).
print(f"\nMSE: {mse:.8f}")
print(f"Variance: {variance:.8f}")
print(f"Avg Val Loss (noise): {np.mean(best_val_losses):.6f}")

In [None]:
# Wrap the denoised test predictions in a DataFrame and write them to disk
# without the index column.
submission = pd.DataFrame(data=test_pred)
submission.to_csv('submission.csv', index=False)

print("✓ Saved: submission.csv", f"Shape: {submission.shape}", sep="\n")

# Last expression of the cell: shows the first rows in the notebook output.
submission.head()

In [None]:
import matplotlib.pyplot as plt

# Two diagnostic panels for the validation set: predicted-vs-actual scatter
# (left) and a log-scale histogram of absolute errors (right).
plt.figure(figsize=(10, 4))

# Left panel: one scatter series per output column, using the first 500
# validation rows. The column count is read from the data rather than
# hard-coded, so the plot also works when the feature count is not 20.
plt.subplot(1, 2, 1)
for i in range(y_val.shape[1]):
    plt.scatter(y_val[:500, i], val_pred[:500, i], alpha=0.5, s=2)

# Dashed y = x reference line spanning the full range of the targets.
lo, hi = y_val.min(), y_val.max()
plt.plot([lo, hi], [lo, hi], 'r--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title(f'NMSE={nmse:.4f}')
plt.grid(True, alpha=0.3)

# Right panel: distribution of absolute errors over all entries.
plt.subplot(1, 2, 2)
errors = np.abs(y_val - val_pred)
plt.hist(errors.flatten(), bins=100, alpha=0.7)
plt.xlabel('Error')
plt.ylabel('Frequency')
plt.yscale('log')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nFinal NMSE: {nmse:.6f}")

## v3 Key Changes:

1. ✅ **Predicts NOISE instead of CLEAN** (residual learning)
2. ✅ **7 models** (more diversity)
3. ✅ **Wider network** (2048 units)
4. ✅ **Deeper** (6 layers)
5. ✅ **Lower LR** (0.0001 for stability)
6. ✅ **Cosine annealing** (smooth decay)

**Why predicting noise works better:**
- Noise has simpler patterns than clean weights
- Network learns to identify and remove noise
- Formula: `clean = noisy - predicted_noise`

**Expected (projection, not a measured result): validation NMSE of roughly 0.15–0.25**