In [1]:
from torch import nn
from torch import tensor
import pandas as pd
from torch.nn.functional import relu
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
from torch import nn
import torch
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.preprocessing import StandardScaler
import numpy as np

def preprocess_feats(feats, scaler=StandardScaler()):
    feats.replace([np.inf, -np.inf], np.nan, inplace=True)
    feats.fillna(-1e10, inplace=True)
    feats_columns = feats.columns
    feats.loc[:, feats_columns != 'id'] = scaler.fit_transform(feats.loc[:, feats_columns != 'id'])
    return feats

def train_valid_split(data_x, data_y, train_idx, valid_idx):
    x_train = data_x.iloc[train_idx]
    y_train = data_y[train_idx]
    x_valid = data_x.iloc[valid_idx]
    y_valid = data_y[valid_idx]
    return x_train, y_train, x_valid, y_valid

In [3]:
class MLPModel(nn.Module):
    def __init__(self, feat_dim) -> None:
        super().__init__()
        
        self.feat_dim = feat_dim
        self.input = nn.Linear(feat_dim, 64)
        self.linear = nn.Linear(64, 32)
        self.out = nn.Linear(32,1)


    def forward(self, x):
        x = (self.input(x))
        x = relu(self.linear(x))
        x = relu(self.out(x))
        return (x)

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
train_feats = pd.read_pickle('feature_selection/train_feats.pkl')
test_feats = pd.read_pickle('feature_selection/test_feats.pkl')

train_feats.iloc[:,:-1] = preprocess_feats(train_feats.iloc[:,:-1])

# x = tensor(train_feats.drop(['id','score'], axis=1).values.astype(np.float32))
# y = tensor(train_feats['score'].values.astype(np.float32))

In [5]:
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, TensorDataset
import torch
import numpy as np
from sklearn.metrics import mean_squared_error

epochs = 500
model = MLPModel(train_feats.shape[1]).to(device)

def calculate_rmse(y, yhat):
    return mean_squared_error(y, yhat, squared=False)

def nn_pipeline(train, test, model_class, param, n_splits=10, iterations=5, batch_size=64):
    # Ensure inputs are tensors
    test_preds = []
    valid_preds = pd.DataFrame()
    criterion = nn.MSELoss()
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True)

    targets = train_feats['score'].values.copy()
    x = tensor(train_feats.drop(['id','score'], axis=1).values, dtype=torch.float).to(device)
    y = tensor(targets, dtype=torch.float).to(device)


    for iter in range(iterations):
        for i, (train_index, valid_index) in enumerate(skf.split(x, targets.astype(str))):
            # Splitting data
            x_train, y_train = x[train_index], y[train_index]
            x_valid, y_valid = x[valid_index], y[valid_index]
            
            # Model setup
            model = model_class(**param).to(device)
            optimizer = torch.optim.Adam(model.parameters())

            for epoch in range(epochs):
                model.train()
                optimizer.zero_grad()
                outputs = model(x_train)
                outputs = outputs.squeeze()  
                loss = criterion(outputs, y_train)  
                loss = torch.sqrt(loss)  
                loss.backward()
                optimizer.step()        

            # Validation predictions
            model.eval()
            valid_predictions = []
            with torch.no_grad():
                outputs = model(x_valid).squeeze()  # Ensure outputs are squeezed to match target dimensions
                valid_predictions.extend(outputs.tolist())

            tmp_df = pd.DataFrame({'id': valid_index, 'score': y_valid.cpu().numpy(), 'preds': valid_predictions})
            tmp_df['iteration'] = i + 1
            valid_preds = pd.concat([valid_preds, tmp_df])

    final_rmse = np.sqrt(mean_squared_error(valid_preds['score'], valid_preds['preds']))
    cv_rmse = valid_preds.groupby('iteration').apply(lambda g: np.sqrt(mean_squared_error(g['score'], g['preds'])))

    return valid_preds, final_rmse, model

In [6]:
param = {'feat_dim': train_feats.drop(['id','score'], axis=1).shape[1]}

# Using the nn_pipeline function
valid_preds, final_rmse, trained_model = nn_pipeline(
    train=train_feats,  
    test=test_feats,
    model_class=MLPModel,
    param=param,
    n_splits=10,  
    iterations=5  
)

print(f"Final RMSE: {final_rmse}")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
final_rmse

0.7811593622110575