In [1]:
from torch import nn
from torch import tensor
import pandas as pd
from torch.nn.functional import relu
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
from torch import nn
import torch
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.preprocessing import StandardScaler
import numpy as np

def preprocess_feats(feats, scaler=StandardScaler()):
    feats.replace([np.inf, -np.inf], np.nan, inplace=True)
    feats.fillna(-1e10, inplace=True)
    feats_columns = feats.columns
    feats.loc[:, feats_columns != 'id'] = scaler.fit_transform(feats.loc[:, feats_columns != 'id'])
    return feats

def train_valid_split(data_x, data_y, train_idx, valid_idx):
    x_train = data_x.iloc[train_idx]
    y_train = data_y[train_idx]
    x_valid = data_x.iloc[valid_idx]
    y_valid = data_y[valid_idx]
    return x_train, y_train, x_valid, y_valid

In [3]:
class MLPModel(nn.Module):
    def __init__(self, feat_dim) -> None:
        super().__init__()
        
        self.feat_dim = feat_dim
        self.input = nn.Linear(feat_dim, 64)
        self.linear = nn.Linear(64,1)


    def forward(self, x):
        x = (self.input(x))
        x = relu(self.linear(x))
        return (x)

In [4]:
train_feats = pd.read_pickle('feature_selection/train_feats.pkl')
train_feats.iloc[:,:-1] = preprocess_feats(train_feats.iloc[:,:-1])

x = tensor(train_feats.drop(['id','score'], axis=1).values.astype(np.float32))
y = tensor(train_feats['score'].values.astype(np.float32))

train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.2)

In [5]:
model = MLPModel(x.shape[1])
# Convert your data to TensorDataset for DataLoader
train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)

# DataLoader for batch processing
batch_size = 64  # You can adjust this
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size)

# Loss Function and Optimizer
criterion = nn.MSELoss()
optimizer = Adam(model.parameters())


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Training Loop
epochs = 500  # You can adjust this
for epoch in range(epochs):
    model.train()  # Set model to training mode
    total_loss = 0

    for inputs, targets in train_loader:
        optimizer.zero_grad()  # Zero the gradients
        outputs = model(inputs)  # Forward pass
        loss = torch.sqrt(criterion(outputs.squeeze(), targets))  # Calculate loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

        total_loss += loss.item()

    # Validation Loop
    model.eval()  # Set model to evaluation mode
    total_val_loss = 0
    with torch.no_grad():  # No need to track gradients
        for inputs, targets in valid_loader:
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            total_val_loss += loss.item()

    # Average losses and print RMSE
    avg_train_loss =torch.tensor(total_loss / len(train_loader))
    avg_val_loss = torch.tensor(total_val_loss / len(valid_loader))
    print(f'Epoch {epoch+1}/{epochs}, Train RMSE: {avg_train_loss:.4f}, Valid RMSE: {avg_val_loss:.4f}')


Epoch 1/500, Train RMSE: 0.6829, Valid RMSE: 0.4473
Epoch 2/500, Train RMSE: 0.6720, Valid RMSE: 0.4403
Epoch 3/500, Train RMSE: 0.6773, Valid RMSE: 0.4638
Epoch 4/500, Train RMSE: 0.6829, Valid RMSE: 0.4477
Epoch 5/500, Train RMSE: 0.6751, Valid RMSE: 0.4558
Epoch 6/500, Train RMSE: 0.6912, Valid RMSE: 0.4450
Epoch 7/500, Train RMSE: 0.6810, Valid RMSE: 0.4634
Epoch 8/500, Train RMSE: 0.6771, Valid RMSE: 0.4533
Epoch 9/500, Train RMSE: 0.6858, Valid RMSE: 0.4433
Epoch 10/500, Train RMSE: 0.6780, Valid RMSE: 0.4341
Epoch 11/500, Train RMSE: 0.6775, Valid RMSE: 0.4403
Epoch 12/500, Train RMSE: 0.6776, Valid RMSE: 0.4358
Epoch 13/500, Train RMSE: 0.6773, Valid RMSE: 0.4506
Epoch 14/500, Train RMSE: 0.6849, Valid RMSE: 0.4593
Epoch 15/500, Train RMSE: 0.6816, Valid RMSE: 0.4641
Epoch 16/500, Train RMSE: 0.6955, Valid RMSE: 0.5595
Epoch 17/500, Train RMSE: 0.6916, Valid RMSE: 0.4602
Epoch 18/500, Train RMSE: 0.6781, Valid RMSE: 0.4545
Epoch 19/500, Train RMSE: 0.6722, Valid RMSE: 0.4401
Ep

In [9]:
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, TensorDataset
import torch
import numpy as np
from sklearn.metrics import mean_squared_error

epochs = 100

def calculate_rmse(y, yhat):
    return mean_squared_error(y, yhat, squared=False)

def nn_pipeline(train, test, model_class, param, n_splits=10, iterations=5, batch_size=64):
    # Ensure inputs are tensors
    train_x = torch.tensor(train, dtype=torch.float32)
    train_y = torch.tensor(test, dtype=torch.float32)

    test_preds = []
    valid_preds = pd.DataFrame()
    criterion = nn.MSELoss()
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True)

    for iter in range(iterations):
        for i, (train_index, valid_index) in enumerate(skf.split(train_x.numpy(), train_y.numpy().astype(str))):
            # Splitting data
            x_train, y_train = train_x[train_index], train_y[train_index]
            x_valid, y_valid = train_x[valid_index], train_y[valid_index]

            # DataLoader for batch processing
            train_data = TensorDataset(x_train, y_train)
            valid_data = TensorDataset(x_valid, y_valid)
            train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
            valid_loader = DataLoader(valid_data, batch_size=batch_size)

            # Model setup
            model = model_class(**param)
            optimizer = torch.optim.Adam(model.parameters())

            # Training loop
            for epoch in range(epochs):
                model.train()
                for inputs, targets in train_loader:
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = torch.sqrt(criterion(outputs.squeeze(), targets))
                    loss.backward()
                    optimizer.step()

            # Validation predictions
            model.eval()
            valid_predictions = []
            with torch.no_grad():
                for inputs, _ in valid_loader:
                    outputs = model(inputs)
                    valid_predictions.extend(outputs.squeeze().tolist())

            tmp_df = pd.DataFrame({'id': valid_index, 'score': y_valid.numpy(), 'preds': valid_predictions})
            tmp_df['iteration'] = i + 1
            valid_preds = pd.concat([valid_preds, tmp_df])

    final_rmse = np.sqrt(mean_squared_error(valid_preds['score'], valid_preds['preds']))
    cv_rmse = valid_preds.groupby('iteration').apply(lambda g: np.sqrt(mean_squared_error(g['score'], g['preds'])))

    return valid_preds, final_rmse, model

In [10]:
param = {'feat_dim': train_x.shape[1]}

# Using the nn_pipeline function
valid_preds, final_rmse, trained_model = nn_pipeline(
    train_x=train_x.numpy(),  
    train_y=train_y.numpy(),
    model_class=MLPModel,
    param=param,
    n_splits=5,  
    iterations=3  
)

print(f"Final RMSE: {final_rmse}")

Final RMSE: 0.6952714319192461
