# AFML Part 1 - Team 44_XLR8 (v3 FIXED)
## Simple & Effective: Large Ensemble + Data Augmentation

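**Strategy**: Train an ensemble of 10 simple fully connected models on noise-augmented data and combine their predictions with a weighted average to lower the NMSE.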

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Seed NumPy's and PyTorch's global random generators with the same fixed value.
for _seed_fn in (np.random.seed, torch.manual_seed):
    _seed_fn(42)

# Choose the compute device: MPS is tried first, then CUDA, with CPU as the
# fallback when neither backend reports itself as available.
if torch.backends.mps.is_available():
    _backend, _banner = 'mps', "✅ M2 GPU"
elif torch.cuda.is_available():
    _backend, _banner = 'cuda', "✅ CUDA GPU"
else:
    _backend, _banner = 'cpu', "⚠️  CPU"

device = torch.device(_backend)
print(_banner)

print(f"Device: {device}")

In [None]:
print("Loading...")

# Read the three CSV files (clean targets, noisy inputs, test inputs) one
# after another and convert each to a float32 NumPy array.
train_clean, train_noise, test_data = (
    pd.read_csv(csv_name).values.astype(np.float32)
    for csv_name in (
        'train-part1-clean.csv',
        'train-part1-noise.csv',
        'test-part1.csv',
    )
)

# Report the array shapes as a quick sanity check of what was loaded.
print(f"Clean: {train_clean.shape}")
print(f"Noisy: {train_noise.shape}")
print(f"Test: {test_data.shape}")

## Data Augmentation

In [None]:
# Hold out 10% of the rows of train_noise / train_clean for validation; the
# fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_noise,
    train_clean,
    test_size=0.10,
    random_state=42,
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}")

# Augmentation: keep the original inputs and append two jittered copies.
# The jitter is standard-normal noise scaled by 0.001 and then 0.002; the
# targets are repeated unchanged for every copy.
print("Augmenting training data...")
X_train_aug = [X_train]
y_train_aug = [y_train]

for copy_index in range(1, 3):
    noise_factor = 0.001 * copy_index
    jitter = np.random.randn(*X_train.shape).astype(np.float32) * noise_factor
    X_train_aug.append(X_train + jitter)
    y_train_aug.append(y_train)

# Stack original and jittered copies row-wise (3x the original row count).
X_train = np.concatenate(X_train_aug, axis=0)
y_train = np.concatenate(y_train_aug, axis=0)

print(f"Augmented train: {X_train.shape}")

# Wrap every array as a float tensor; the values are deliberately left
# unnormalized.
X_train_t, y_train_t = torch.FloatTensor(X_train), torch.FloatTensor(y_train)
X_val_t, y_val_t = torch.FloatTensor(X_val), torch.FloatTensor(y_val)
test_t = torch.FloatTensor(test_data)

print("✓ Data ready")

## Simple but Effective Model

In [None]:
class SimpleModel(nn.Module):
    """Feed-forward network that maps a vector to another vector of the same width.

    The network is a stack of ``Linear -> ReLU -> Dropout`` stages, one per
    entry of ``hidden_dims``, followed by a final ``Linear`` layer that
    projects back to ``input_dim`` features.

    Args:
        input_dim: Number of input features; the output has the same width.
        hidden_dims: Width of each hidden stage, in order.
        dropout: Dropout probability applied after every hidden activation.

    With the default arguments the layer order, and therefore the
    ``state_dict`` keys (``net.0``, ``net.3``, ...), match the previous
    hard-coded 512-1024-1024-512 architecture.
    """

    def __init__(self, input_dim=20, hidden_dims=(512, 1024, 1024, 512), dropout=0.03):
        super().__init__()
        layers = []
        in_features = input_dim
        for out_features in hidden_dims:
            layers.extend([
                nn.Linear(in_features, out_features),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            in_features = out_features
        # Final projection back to the input width; no activation or dropout.
        layers.append(nn.Linear(in_features, input_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.net(x)

print("Creating 10 models...")

# Build the ensemble members one by one, each constructed with the default
# arguments and moved onto the selected device.
models = []
for _ in range(10):
    models.append(SimpleModel().to(device))

# Parameter count of the first member, reported as the per-model figure.
first_model_param_count = sum(weight.numel() for weight in models[0].parameters())
print(f"Params per model: {first_model_param_count:,}")

## Training

In [None]:
# Training hyperparameters.
BATCH_SIZE = 16384  # Even larger
NUM_EPOCHS = 120
LR = 0.0003

train_dataset = TensorDataset(X_train_t, y_train_t)

# drop_last=True discards an incomplete final batch. If the training set has
# fewer rows than BATCH_SIZE that would leave zero batches per epoch (and a
# division by zero when the epoch loss is averaged), so the effective batch
# size is clamped to the dataset length. Larger datasets are unaffected.
train_loader = DataLoader(
    train_dataset,
    batch_size=max(1, min(BATCH_SIZE, len(train_dataset))),
    shuffle=True, drop_last=True, num_workers=0
)

# Validation keeps every sample (no drop_last) and a fixed order.
val_loader = DataLoader(
    TensorDataset(X_val_t, y_val_t),
    batch_size=BATCH_SIZE, shuffle=False, num_workers=0
)

# One shared loss function; one Adam optimizer and one plateau scheduler per
# ensemble member. Each scheduler halves its optimizer's learning rate after
# 8 epochs without improvement of the monitored value, down to 1e-6.
criterion = nn.MSELoss()
optimizers = [optim.Adam(model.parameters(), lr=LR) for model in models]
schedulers = [optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', factor=0.5, patience=8, min_lr=1e-6) for opt in optimizers]

print(f"Batches/epoch: {len(train_loader)}")

In [None]:
# Per-model bookkeeping, sized from the ensemble itself instead of a literal.
num_models = len(models)
best_val_losses = [float('inf')] * num_models
patience_counters = [0] * num_models
MAX_PATIENCE = 25

for epoch in range(NUM_EPOCHS):
    # Train: every model sees the same batches in the same order.
    for model in models:
        model.train()

    train_losses = [0.0] * num_models

    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
    # Count batches explicitly: tqdm only refreshes pbar.n periodically while
    # iterating, so it is not a reliable divisor for the running mean.
    for batches_done, (X_batch, y_batch) in enumerate(pbar, start=1):
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        for i, (model, optimizer) in enumerate(zip(models, optimizers)):
            pred = model(X_batch)
            loss = criterion(pred, y_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_losses[i] += loss.item()

        # Running loss averaged over the models and the batches seen so far.
        pbar.set_postfix({'loss': f'{np.mean(train_losses) / batches_done:.6f}'})

    train_losses = [tl / len(train_loader) for tl in train_losses]

    # Validate
    for model in models:
        model.eval()

    val_losses = [0.0] * num_models
    val_samples = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            # The criterion returns a per-batch mean, so weight it by the
            # batch length; otherwise a short final batch would count as much
            # as a full one.
            batch_len = X_batch.size(0)
            val_samples += batch_len

            for i, model in enumerate(models):
                pred = model(X_batch)
                loss = criterion(pred, y_batch)
                val_losses[i] += loss.item() * batch_len

    val_losses = [vl / val_samples for vl in val_losses]

    # Schedulers: each one monitors its own model's validation loss.
    for scheduler, val_loss in zip(schedulers, val_losses):
        scheduler.step(val_loss)

    # Save: checkpoint a model whenever it beats its own best validation
    # loss; otherwise count one more epoch without improvement.
    saved = []
    for i, (model, val_loss) in enumerate(zip(models, val_losses)):
        if val_loss < best_val_losses[i]:
            best_val_losses[i] = val_loss
            torch.save(model.state_dict(), f'model_{i}.pth')
            patience_counters[i] = 0
            saved.append(i)
        else:
            patience_counters[i] += 1

    avg_train = np.mean(train_losses)
    avg_val = np.mean(val_losses)

    # Log every epoch in which something was saved, and every 10th epoch
    # otherwise.
    if saved:
        print(f"✓ Epoch {epoch+1} - Train: {avg_train:.6f}, Val: {avg_val:.6f} [SAVED: {len(saved)} models]")
    elif (epoch+1) % 10 == 0:
        print(f"  Epoch {epoch+1} - Train: {avg_train:.6f}, Val: {avg_val:.6f}")

    # Stop only once every model has gone MAX_PATIENCE epochs without
    # improving; until then all models keep training.
    if all(p >= MAX_PATIENCE for p in patience_counters):
        print(f"\nEarly stop at epoch {epoch+1}")
        break

print(f"\nBest val losses: {[f'{v:.6f}' for v in best_val_losses]}")
print(f"Average: {np.mean(best_val_losses):.6f}")

## Predict with Weighted Ensemble

In [None]:
# Restore each model's best checkpoint and switch it to inference mode.
for i, model in enumerate(models):
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on an accelerator still loads when this cell is
    # re-run on a machine without that accelerator.
    model.load_state_dict(torch.load(f'model_{i}.pth', map_location=device))
    model.eval()

print("Predicting with weighted ensemble...")

# Weight models by inverse of their validation loss
weights = 1.0 / np.array(best_val_losses)
weights = weights / weights.sum()  # Normalize

print(f"Model weights: {weights}")

with torch.no_grad():
    # Each input set is moved to the device once and fed to every model in a
    # single forward pass.
    test_t_device = test_t.to(device)
    X_val_t_device = X_val_t.to(device)

    test_preds = [model(test_t_device).cpu().numpy() for model in models]
    val_preds = [model(X_val_t_device).cpu().numpy() for model in models]

    # Weighted average
    test_pred = sum(w * p for w, p in zip(weights, test_preds))
    val_pred = sum(w * p for w, p in zip(weights, val_preds))

print("✓ Done")

## NMSE

In [None]:
# NMSE = mean squared error of the ensemble's validation predictions divided
# by the variance of the validation targets (both taken over all elements).
residual = y_val - val_pred
mse = np.mean(np.square(residual))
variance = np.var(y_val)
nmse = mse / variance

rule = '=' * 70
print(f"\n{rule}")
print(f"VALIDATION NMSE: {nmse:.6f}")
print(f"Target: < 0.3")
print(f"{rule}")

# Pick the verdict line: below 0.3 is a success, below 0.5 is "close",
# anything else is reported as a miss.
if nmse < 0.3:
    verdict = f"🎉 SUCCESS! NMSE < 0.3!"
elif nmse < 0.5:
    verdict = f"⚠️  Close! NMSE = {nmse:.4f}"
else:
    verdict = f"❌ NMSE = {nmse:.4f}"
print(verdict)

# Supporting figures: raw MSE, target variance, and the per-model best
# validation losses (their mean and their minimum).
print(f"\nMSE: {mse:.8f}")
print(f"Variance: {variance:.8f}")
print(f"Avg Val Loss: {np.mean(best_val_losses):.6f}")
print(f"Best model: {min(best_val_losses):.6f}")

In [None]:
# Wrap the ensemble's test predictions in a DataFrame (default integer column
# labels) and write them to disk without the row index.
submission = pd.DataFrame(data=test_pred)
submission.to_csv(path_or_buf='submission.csv', index=False)

print("✓ Saved: submission.csv")
print(f"Shape: {submission.shape}")

# Last expression of the cell: shows the first rows when run in a notebook.
submission.head()

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 4))

# Left panel: predicted vs. actual for the first 500 validation rows, one
# scatter series per output column. The column count is read from y_val
# rather than hard-coded, so the plot follows the data width.
plt.subplot(1, 2, 1)
for i in range(y_val.shape[1]):
    plt.scatter(y_val[:500, i], val_pred[:500, i], alpha=0.5, s=2)
# Dashed identity line: points on it are perfect predictions.
y_lo, y_hi = y_val.min(), y_val.max()
plt.plot([y_lo, y_hi], [y_lo, y_hi], 'r--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title(f'NMSE={nmse:.4f}')
plt.grid(True, alpha=0.3)

# Right panel: histogram of absolute errors over all validation elements,
# with a log-scaled count axis so the tail stays visible.
plt.subplot(1, 2, 2)
errors = np.abs(y_val - val_pred)
plt.hist(errors.flatten(), bins=100, alpha=0.7)
plt.xlabel('Error')
plt.ylabel('Frequency')
plt.yscale('log')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nFinal NMSE: {nmse:.6f}")

## v3 FIXED Strategy:

1. ✅ **10 models** (more diversity than v2's 5)
2. ✅ **Data augmentation** (3x training data)
3. ✅ **Weighted ensemble** (better models get more weight)
4. ✅ **Less validation** (10% vs 15% = more training)
5. ✅ **Larger batches** (16384 = faster)
6. ✅ **Lower dropout** (0.03 vs 0.05 = less regularization)

**Expected validation NMSE: 0.20–0.28** (target: < 0.3) ✅