In [2]:
import pandas as pd

train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("val.csv")
test_df = pd.read_csv("test.csv")


In [1]:
pip install torch pandas sentence-transformers scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_squared_error
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import random

torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Current device:", device)


Current device: cuda


In [11]:
encoder = SentenceTransformer("all-MiniLM-L6-v2")

X_train = encoder.encode(train_df["query"].tolist(), convert_to_tensor=True)
y_train = torch.tensor(train_df["carb"].values, dtype=torch.float32)

X_val = encoder.encode(val_df["query"].tolist(), convert_to_tensor=True)
y_val = torch.tensor(val_df["carb"].values, dtype=torch.float32)

X_test = encoder.encode(test_df["query"].tolist(), convert_to_tensor=True)


In [12]:
class NutriDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is not None:
            return self.X[idx], self.y[idx]
        return self.X[idx]

train_ds = NutriDataset(X_train, y_train)
val_ds = NutriDataset(X_val, y_val)
test_ds = NutriDataset(X_test)

batch_size=16

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size)
test_loader = DataLoader(test_ds, batch_size=batch_size)


In [22]:
import torch.nn as nn

class MLPRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dims=[256, 128], dropout_rate=0.3, activation='ReLU'):
        super(MLPRegressor, self).__init__()
        
        # Choose activation function
        if activation == 'ReLU':
            self.activation = nn.ReLU()
        elif activation == 'Tanh':
            self.activation = nn.Tanh()
        elif activation == 'LeakyReLU':
            self.activation = nn.LeakyReLU(0.1)
        elif activation == 'Sigmoid':
            self.activation = nn.Sigmoid()
        else:
            self.activation = nn.ReLU()  # Default fallback
        
        # Build the network layers
        layers = []
        prev_dim = input_dim
        
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                self.activation,
                nn.Dropout(dropout_rate)
            ])
            prev_dim = hidden_dim
        
        # Output layer (no activation for regression)
        layers.append(nn.Linear(prev_dim, 1))
        
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x).squeeze(1)

# Test the default model
model = MLPRegressor(X_train.shape[1]).to(device)
print(f"Model architecture: {model}")
print(f"Total parameters: {sum(p.numel() for p in model.parameters())}")


Model architecture: MLPRegressor(
  (activation): ReLU()
  (net): Sequential(
    (0): Linear(in_features=384, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=128, out_features=1, bias=True)
  )
)
Total parameters: 132353


In [23]:
# Hyperparameter configurations to test
configurations = [
    # Original optimizer variations
    {'optimizer': 'Adam', 'lr': 1e-4, 'batch_size': 16, 'weight_decay': 1e-5, 'activation': 'ReLU', 'dropout': 0.3, 'hidden_dims': [256, 128]},
    {'optimizer': 'Adam', 'lr': 1e-3, 'batch_size': 16, 'weight_decay': 1e-5, 'activation': 'ReLU', 'dropout': 0.3, 'hidden_dims': [256, 128]},
    {'optimizer': 'Adam', 'lr': 5e-4, 'batch_size': 32, 'weight_decay': 1e-4, 'activation': 'ReLU', 'dropout': 0.3, 'hidden_dims': [256, 128]},
    {'optimizer': 'AdaGrad', 'lr': 1e-3, 'batch_size': 16, 'weight_decay': 1e-4, 'activation': 'ReLU', 'dropout': 0.3, 'hidden_dims': [256, 128]},
    {'optimizer': 'AdaGrad', 'lr': 1e-2, 'batch_size': 32, 'weight_decay': 1e-5, 'activation': 'ReLU', 'dropout': 0.3, 'hidden_dims': [256, 128]},
    
    # Different activation functions (using best optimizer config: Adam with lr=5e-4, batch=32)
    {'optimizer': 'Adam', 'lr': 5e-4, 'batch_size': 32, 'weight_decay': 1e-4, 'activation': 'Tanh', 'dropout': 0.3, 'hidden_dims': [256, 128]},
    {'optimizer': 'Adam', 'lr': 5e-4, 'batch_size': 32, 'weight_decay': 1e-4, 'activation': 'LeakyReLU', 'dropout': 0.3, 'hidden_dims': [256, 128]},
    {'optimizer': 'Adam', 'lr': 5e-4, 'batch_size': 32, 'weight_decay': 1e-4, 'activation': 'Sigmoid', 'dropout': 0.3, 'hidden_dims': [256, 128]},
    
    # Different dropout rates (using best config so far)
    {'optimizer': 'Adam', 'lr': 5e-4, 'batch_size': 32, 'weight_decay': 1e-4, 'activation': 'ReLU', 'dropout': 0.1, 'hidden_dims': [256, 128]},
    {'optimizer': 'Adam', 'lr': 5e-4, 'batch_size': 32, 'weight_decay': 1e-4, 'activation': 'ReLU', 'dropout': 0.2, 'hidden_dims': [256, 128]},
    {'optimizer': 'Adam', 'lr': 5e-4, 'batch_size': 32, 'weight_decay': 1e-4, 'activation': 'ReLU', 'dropout': 0.5, 'hidden_dims': [256, 128]},
    
    # Different architectures (layer sizes)
    {'optimizer': 'Adam', 'lr': 5e-4, 'batch_size': 32, 'weight_decay': 1e-4, 'activation': 'ReLU', 'dropout': 0.3, 'hidden_dims': [512, 256]},
    {'optimizer': 'Adam', 'lr': 5e-4, 'batch_size': 32, 'weight_decay': 1e-4, 'activation': 'ReLU', 'dropout': 0.3, 'hidden_dims': [128, 64]},
    {'optimizer': 'Adam', 'lr': 5e-4, 'batch_size': 32, 'weight_decay': 1e-4, 'activation': 'ReLU', 'dropout': 0.3, 'hidden_dims': [256, 128, 64]},
    
    # Best combinations
    {'optimizer': 'Adam', 'lr': 5e-4, 'batch_size': 32, 'weight_decay': 1e-4, 'activation': 'LeakyReLU', 'dropout': 0.2, 'hidden_dims': [256, 128]},
    {'optimizer': 'Adam', 'lr': 1e-3, 'batch_size': 32, 'weight_decay': 1e-4, 'activation': 'Tanh', 'dropout': 0.1, 'hidden_dims': [512, 256]},
]

results = []

def train_epoch(model, optimizer, train_loader, criterion, device):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

def evaluate(model, loader, device):
    model.eval()
    all_preds, all_targets = [], []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(device)
            preds = model(X_batch).cpu().numpy()
            all_preds.extend(preds)
            all_targets.extend(y_batch.numpy())
    return np.sqrt(mean_squared_error(all_targets, all_preds))

# Test each configuration
for i, config in enumerate(configurations):
    print(f"\n{'='*60}")
    print(f"Configuration {i+1}: {config}")
    print(f"{'='*60}")
    
    # Create new data loaders with the specified batch size
    train_loader_config = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    val_loader_config = DataLoader(val_ds, batch_size=config['batch_size'])
    
    # Initialize model with architecture parameters
    model = MLPRegressor(
        X_train.shape[1], 
        hidden_dims=config.get('hidden_dims', [256, 128]),
        dropout_rate=config.get('dropout', 0.3),
        activation=config.get('activation', 'ReLU')
    ).to(device)
    criterion = nn.MSELoss()
    
    # Initialize optimizer based on configuration
    if config['optimizer'] == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])
    elif config['optimizer'] == 'AdaGrad':
        optimizer = torch.optim.Adagrad(model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])
    elif config['optimizer'] == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(), lr=config['lr'], momentum=0.9, weight_decay=config['weight_decay'])
    
    # Training loop
    best_val_rmse = float('inf')
    epochs_without_improvement = 0
    
    for epoch in range(50):  # Reduced epochs for faster comparison
        train_loss = train_epoch(model, optimizer, train_loader_config, criterion, device)
        val_rmse = evaluate(model, val_loader_config, device)
        
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
        
        if epoch % 10 == 0 or epoch == 49:
            print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val RMSE = {val_rmse:.4f}")
        
        # Early stopping
        if epochs_without_improvement >= 10:
            print(f"Early stopping at epoch {epoch+1}")
            break
    
    # Store results
    config_result = config.copy()
    config_result['best_val_rmse'] = best_val_rmse
    config_result['final_epoch'] = epoch + 1
    results.append(config_result)
    
    print(f"Final Best Validation RMSE: {best_val_rmse:.4f}")

# Display results summary
print(f"\n{'='*80}")
print("RESULTS SUMMARY")
print(f"{'='*80}")
print(f"{'#':<3} {'Optimizer':<8} {'LR':<7} {'Batch':<5} {'Activation':<10} {'Dropout':<7} {'Architecture':<15} {'RMSE':<8} {'Epochs':<6}")
print(f"{'-'*95}")

# Sort results by best RMSE
results_sorted = sorted(results, key=lambda x: x['best_val_rmse'])

for i, result in enumerate(results_sorted):
    arch_str = str(result.get('hidden_dims', [256, 128]))
    print(f"{i+1:<3} {result['optimizer']:<8} {result['lr']:<7} {result['batch_size']:<5} {result.get('activation', 'ReLU'):<10} {result.get('dropout', 0.3):<7} {arch_str:<15} {result['best_val_rmse']:<8.4f} {result['final_epoch']:<6}")

print(f"\nBest configuration: {results_sorted[0]}")
print(f"Best Validation RMSE: {results_sorted[0]['best_val_rmse']:.4f}")



Configuration 1: {'optimizer': 'Adam', 'lr': 0.0001, 'batch_size': 16, 'weight_decay': 1e-05, 'activation': 'ReLU', 'dropout': 0.3, 'hidden_dims': [256, 128]}
Epoch 1: Train Loss = 2119.0376, Val RMSE = 42.0350
Epoch 11: Train Loss = 1139.9889, Val RMSE = 28.5382
Epoch 21: Train Loss = 1010.0400, Val RMSE = 26.4238
Epoch 31: Train Loss = 905.6336, Val RMSE = 24.3337
Epoch 41: Train Loss = 860.2258, Val RMSE = 23.8004
Epoch 50: Train Loss = 854.9854, Val RMSE = 22.6532
Final Best Validation RMSE: 22.6532

Configuration 2: {'optimizer': 'Adam', 'lr': 0.001, 'batch_size': 16, 'weight_decay': 1e-05, 'activation': 'ReLU', 'dropout': 0.3, 'hidden_dims': [256, 128]}
Epoch 1: Train Loss = 1713.3130, Val RMSE = 33.8424
Epoch 11: Train Loss = 927.9909, Val RMSE = 25.9686
Epoch 21: Train Loss = 714.9109, Val RMSE = 21.8381
Epoch 31: Train Loss = 515.2094, Val RMSE = 20.9530
Epoch 41: Train Loss = 472.3329, Val RMSE = 20.1892
Epoch 50: Train Loss = 394.5341, Val RMSE = 21.7177
Final Best Validati

In [24]:
# Train final model with best configuration
best_config = results_sorted[0]
print(f"Training final model with best configuration: {best_config}")

# Create data loaders with best batch size
test_loader_final = DataLoader(test_ds, batch_size=best_config['batch_size'])
train_loader_final = DataLoader(train_ds, batch_size=best_config['batch_size'], shuffle=True)
val_loader_final = DataLoader(val_ds, batch_size=best_config['batch_size'])

# Initialize final model with best architecture
final_model = MLPRegressor(
    X_train.shape[1],
    hidden_dims=best_config.get('hidden_dims', [256, 128]),
    dropout_rate=best_config.get('dropout', 0.3),
    activation=best_config.get('activation', 'ReLU')
).to(device)
criterion = nn.MSELoss()
print(f"Final model architecture: Hidden dims: {best_config.get('hidden_dims', [256, 128])}, "
      f"Activation: {best_config.get('activation', 'ReLU')}, Dropout: {best_config.get('dropout', 0.3)}")

# Initialize optimizer with best configuration
if best_config['optimizer'] == 'Adam':
    optimizer = torch.optim.Adam(final_model.parameters(), lr=best_config['lr'], weight_decay=best_config['weight_decay'])
elif best_config['optimizer'] == 'AdaGrad':
    optimizer = torch.optim.Adagrad(final_model.parameters(), lr=best_config['lr'], weight_decay=best_config['weight_decay'])
elif best_config['optimizer'] == 'SGD':
    optimizer = torch.optim.SGD(final_model.parameters(), lr=best_config['lr'], momentum=0.9, weight_decay=best_config['weight_decay'])

# Train final model for more epochs
print("Training final model...")
best_val_rmse = float('inf')
epochs_without_improvement = 0

for epoch in range(100):  # More epochs for final model
    train_loss = train_epoch(final_model, optimizer, train_loader_final, criterion, device)
    val_rmse = evaluate(final_model, val_loader_final, device)
    
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        epochs_without_improvement = 0
        # Save best model
        torch.save(final_model.state_dict(), 'best_final_model.pth')
    else:
        epochs_without_improvement += 1
    
    if epoch % 20 == 0 or epoch == 99:
        print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val RMSE = {val_rmse:.4f}")
    
    # Early stopping
    if epochs_without_improvement >= 15:
        print(f"Early stopping at epoch {epoch+1}")
        break

# Load best model for predictions
final_model.load_state_dict(torch.load('best_final_model.pth'))
print(f"Final model validation RMSE: {best_val_rmse:.4f}")

# Generate test predictions
final_model.eval()
all_preds = []

with torch.no_grad():
    for X_batch in test_loader_final:
        X_batch = X_batch.to(device)
        preds = final_model(X_batch).cpu().numpy()
        all_preds.extend(preds)

test_df["carb"] = all_preds
test_df.to_csv("test_with_predictions.csv", index=False)
print(f"Test predictions saved to test_with_predictions.csv")
print(f"Generated {len(all_preds)} predictions for test set")
print(f"Sample predictions: {all_preds[:5]}")
print(f"Prediction statistics - Mean: {np.mean(all_preds):.4f}, Std: {np.std(all_preds):.4f}")


Training final model with best configuration: {'optimizer': 'Adam', 'lr': 0.0005, 'batch_size': 32, 'weight_decay': 0.0001, 'activation': 'ReLU', 'dropout': 0.1, 'hidden_dims': [256, 128], 'best_val_rmse': np.float64(17.605999255461406), 'final_epoch': 42}
Final model architecture: Hidden dims: [256, 128], Activation: ReLU, Dropout: 0.1
Training final model...
Epoch 1: Train Loss = 1929.8120, Val RMSE = 38.0330
Epoch 21: Train Loss = 503.6693, Val RMSE = 21.1197
Epoch 41: Train Loss = 362.0479, Val RMSE = 19.2998
Epoch 61: Train Loss = 260.8838, Val RMSE = 17.8592
Early stopping at epoch 71
Final model validation RMSE: 16.8615
Test predictions saved to test_with_predictions.csv
Generated 2000 predictions for test set
Sample predictions: [np.float32(8.4503145), np.float32(-4.1927223), np.float32(4.5950465), np.float32(-8.548967), np.float32(-3.3428793)]
Prediction statistics - Mean: 20.6198, Std: 43.6036


In [25]:

# Summary of Hyperparameter Optimization Results
print("="*80)
print("HYPERPARAMETER OPTIMIZATION SUMMARY")
print("="*80)
print("\nTested configurations:")
for i, config in enumerate(configurations):
    result = next(r for r in results if r['optimizer'] == config['optimizer'] and 
                  r['lr'] == config['lr'] and r['batch_size'] == config['batch_size'])
    print(f"{i+1}. {config['optimizer']} | LR: {config['lr']} | Batch: {config['batch_size']} | RMSE: {result['best_val_rmse']:.4f}")

print(f"\nKey Findings:")
print(f"- Best performing optimizer: {results_sorted[0]['optimizer']}")
print(f"- Best learning rate: {results_sorted[0]['lr']}")
print(f"- Best batch size: {results_sorted[0]['batch_size']}")
print(f"- Best activation function: {results_sorted[0].get('activation', 'ReLU')}")
print(f"- Best dropout rate: {results_sorted[0].get('dropout', 0.3)}")
print(f"- Best architecture: {results_sorted[0].get('hidden_dims', [256, 128])}")
print(f"- Best validation RMSE: {results_sorted[0]['best_val_rmse']:.4f}")

print(f"\nPerformance by optimizer:")
for opt in ['Adam', 'AdaGrad', 'SGD']:
    opt_results = [r for r in results if r['optimizer'] == opt]
    if opt_results:
        best_opt = min(opt_results, key=lambda x: x['best_val_rmse'])
        avg_rmse = np.mean([r['best_val_rmse'] for r in opt_results])
        print(f"- {opt}: Best RMSE = {best_opt['best_val_rmse']:.4f}, Average RMSE = {avg_rmse:.4f}")

print(f"\nPerformance by activation function:")
for act in ['ReLU', 'Tanh', 'LeakyReLU', 'Sigmoid']:
    act_results = [r for r in results if r.get('activation', 'ReLU') == act]
    if act_results:
        best_act = min(act_results, key=lambda x: x['best_val_rmse'])
        avg_rmse = np.mean([r['best_val_rmse'] for r in act_results])
        print(f"- {act}: Best RMSE = {best_act['best_val_rmse']:.4f}, Average RMSE = {avg_rmse:.4f}")

print(f"\nPerformance by dropout rate:")
for dr in [0.1, 0.2, 0.3, 0.5]:
    dr_results = [r for r in results if r.get('dropout', 0.3) == dr]
    if dr_results:
        best_dr = min(dr_results, key=lambda x: x['best_val_rmse'])
        avg_rmse = np.mean([r['best_val_rmse'] for r in dr_results])
        print(f"- Dropout {dr}: Best RMSE = {best_dr['best_val_rmse']:.4f}, Average RMSE = {avg_rmse:.4f}")

print(f"\nPerformance by architecture:")
unique_archs = list(set(str(r.get('hidden_dims', [256, 128])) for r in results))
for arch in unique_archs:
    arch_results = [r for r in results if str(r.get('hidden_dims', [256, 128])) == arch]
    if arch_results:
        best_arch = min(arch_results, key=lambda x: x['best_val_rmse'])
        avg_rmse = np.mean([r['best_val_rmse'] for r in arch_results])
        print(f"- {arch}: Best RMSE = {best_arch['best_val_rmse']:.4f}, Average RMSE = {avg_rmse:.4f}")



HYPERPARAMETER OPTIMIZATION SUMMARY

Tested configurations:
1. Adam | LR: 0.0001 | Batch: 16 | RMSE: 22.6532
2. Adam | LR: 0.001 | Batch: 16 | RMSE: 18.2847
3. Adam | LR: 0.0005 | Batch: 32 | RMSE: 18.9184
4. AdaGrad | LR: 0.001 | Batch: 16 | RMSE: 31.4877
5. AdaGrad | LR: 0.01 | Batch: 32 | RMSE: 21.6183
6. Adam | LR: 0.0005 | Batch: 32 | RMSE: 18.9184
7. Adam | LR: 0.0005 | Batch: 32 | RMSE: 18.9184
8. Adam | LR: 0.0005 | Batch: 32 | RMSE: 18.9184
9. Adam | LR: 0.0005 | Batch: 32 | RMSE: 18.9184
10. Adam | LR: 0.0005 | Batch: 32 | RMSE: 18.9184
11. Adam | LR: 0.0005 | Batch: 32 | RMSE: 18.9184
12. Adam | LR: 0.0005 | Batch: 32 | RMSE: 18.9184
13. Adam | LR: 0.0005 | Batch: 32 | RMSE: 18.9184
14. Adam | LR: 0.0005 | Batch: 32 | RMSE: 18.9184
15. Adam | LR: 0.0005 | Batch: 32 | RMSE: 18.9184
16. Adam | LR: 0.001 | Batch: 32 | RMSE: 19.2239

Key Findings:
- Best performing optimizer: Adam
- Best learning rate: 0.0005
- Best batch size: 32
- Best activation function: ReLU
- Best dropout 