In [1]:
import pandas as pd
# Notebook-wide pandas display settings.
# Every option is addressed by its full "display.*" key: abbreviated keys are
# resolved by pattern matching and can become ambiguous (and raise) if a
# future pandas release adds another option with a similar name.
pd.set_option('display.max_columns', None)       # never elide columns
pd.set_option('display.max_colwidth', None)      # never truncate cell contents
pd.set_option('display.max_seq_items', None)     # never truncate long sequences
pd.set_option('display.float_format', '{:.4f}'.format)  # floats shown with 4 decimals

import pipes

import numpy as np
from sklearn.impute import KNNImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from rapidfuzz import process, fuzz
import pickle

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold

In [2]:
# Seed PyTorch's global random number generator so runs are repeatable.
torch.manual_seed(123)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [3]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# NOTE: pickle.load can execute arbitrary code while loading; these files are
# assumed to be trusted artefacts produced by this project.
train = _load_pickle('data/train.pkl')
test = _load_pickle('data/test.pkl')
preprocess_pipeline = _load_pickle('data/pipeline.pkl')

# Fit the preprocessing pipeline on the training split only, then apply the
# fitted pipeline to the test split.
train = preprocess_pipeline.fit_transform(train)
test = preprocess_pipeline.transform(test)

In [4]:
# Sanity check: dimensions of the training data after preprocessing.
train.shape

(7930, 22)

In [5]:
# Feature matrix: every column except the target.
X = train.drop(columns='pressure').values
# Target: the 'pressure' column as a single-column 2-D array.
y = train['pressure'].values.reshape(-1, 1)

# Helper Functions

In [6]:
class MLP(nn.Module):
    """Fully connected network: ReLU hidden layers followed by one linear output unit."""

    def __init__(self, input_dim, hidden_layers=None):
        """
        - input_dim: number of neurons on input layer
        - hidden_layers: iterable of integers where each integer represents
                         the number of neurons in that hidden layer;
                         None (the default) means a single hidden layer of 100
        (ie. [100, 200]: input_dim -> 100 -> relu -> 200 -> relu -> output)
        """
        super().__init__()
        # A None sentinel replaces the former mutable list default, so no list
        # object is shared between calls.
        if hidden_layers is None:
            hidden_layers = [100]

        layers = []
        in_dim = input_dim
        for h in hidden_layers:
            layers.append(nn.Linear(in_dim, h))
            layers.append(nn.ReLU())
            in_dim = h

        layers.append(nn.Linear(in_dim, 1))    # output layer
        # Attribute name `layer` is kept: callers index it (model.layer[0]).
        self.layer = nn.Sequential(*layers)

    def forward(self, X):
        """Return the network output for the batch `X`; last dimension has size 1."""
        return self.layer(X)

In [7]:
def buildMLP(input_dim, hidden_layers):
    """Construct and return a new MLP for the given input width and hidden layer sizes."""
    network = MLP(input_dim=input_dim, hidden_layers=hidden_layers)
    return network

In [8]:
def trainMLP(model, X, y, criterion, optimizer, batch_size=32, num_epochs=200):
    """
    Train `model` in place with shuffled mini-batches.

    Input:
    - model: predefined model instance
    - X: features as a tensor
    - y: target as a tensor, with the same number of elements per sample as
         the model output (e.g. shape [N] or [N, 1])
    - criterion: loss function
    - optimizer: parameter optimizer algorithm, already bound to model.parameters()
    - batch_size: batch size for mini-batch gradient descent
    - num_epochs: number of epochs to train

    Output:
    - model: the same instance, trained and moved to the module-level `device`
    """

    train_dataset = TensorDataset(X, y)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model.to(device)
    for epoch in range(num_epochs):
        if (epoch+1) % 20 == 0:
            print(f'Training epoch {epoch+1}/{num_epochs}')
        model.train()

        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()

            # Give the predictions exactly the target's shape. A bare
            # .squeeze() turns a final batch of one sample into a 0-d tensor,
            # and a column-shaped target would be silently broadcast against
            # the squeezed output into a [batch, batch] loss.
            outputs = model(batch_X).reshape(batch_y.shape)
            loss = criterion(outputs, batch_y)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

    return model

In [9]:
def evalMLP(model, X, y, criterion, batch_size=32):
    """
    Compute the mean loss of `model` over a dataset without updating it.

    Input:
    - model: predefined model instance
    - X: features as a tensor
    - y: target as a tensor, with the same number of elements per sample as
         the model output (e.g. shape [N] or [N, 1])
    - criterion: loss function; the sample-weighted average below presumes it
                 returns the mean over the batch
    - batch_size: data loader batch size (not very important, it only bounds
                  how many samples are processed per loss calculation)

    Output:
    - mean_loss: mean loss from all tested samples
    """

    dataset = TensorDataset(X, y)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    model.to(device)
    model.eval()
    total_loss = 0.0
    total_samples = 0

    with torch.no_grad():
        for batch_X, batch_y in loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            # Give the predictions exactly the target's shape. A bare
            # .squeeze() turns a final batch of one sample into a 0-d tensor,
            # and a column-shaped target would be silently broadcast against
            # the squeezed output into a [batch, batch] loss.
            outputs = model(batch_X).reshape(batch_y.shape)
            loss = criterion(outputs, batch_y)

            # Weight each batch's loss by its size so a smaller last batch
            # does not skew the overall mean.
            total_loss += loss.item() * batch_X.size(0)
            total_samples += batch_X.size(0)

    return total_loss / total_samples

In [10]:
def crossvalidate(X, y, criterion, optimizer_class, optimizer_kwargs, 
                  hidden_layers=None, num_folds=5, batch_size=32, num_epochs=200):
    """
    Run shuffled K-fold cross-validation, training a fresh MLP on every fold.

    Input: 
    - X: features as a tensor
    - y: target as a tensor
    - criterion: loss function
    - optimizer_class: optimizer constructor (e.g. torch.optim.Adam)
    - optimizer_kwargs: dict of keyword arguments passed to optimizer_class
                        (e.g. {'lr': 0.0001})
    - hidden_layers: list of integers where each integer represents
                     the number of neurons in that hidden layer;
                     None (the default) means [100]
    - num_folds: number of cross validation folds
    - batch_size: batch size for DataLoader (mini-batch gradient descent)
    - num_epochs: number of epochs to train per fold
    
    Output
    - fold_train_loss: list of training losses for each fold
    - fold_test_loss: list of testing losses for each fold
    """
    # A None sentinel replaces the former mutable list default.
    if hidden_layers is None:
        hidden_layers = [100]

    # Fixed random_state so the fold assignment is the same on every call.
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=123)
    
    fold_train_loss = []
    fold_test_loss = []
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
        print(f'\nFold {fold+1}/{num_folds}')
        
        # Create new model and optimizer for each fold so that no weights or
        # optimizer state carry over between folds
        model = buildMLP(X.shape[1], hidden_layers)
        optimizer = optimizer_class(model.parameters(), **optimizer_kwargs)
        
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        # Train with current fold's data
        model = trainMLP(model, X_train, y_train, criterion, optimizer,
                        batch_size, num_epochs)
        
        # Evaluate on both train and test sets
        train_loss = evalMLP(model, X_train, y_train, criterion)
        test_loss = evalMLP(model, X_test, y_test, criterion)
        
        fold_train_loss.append(train_loss)
        fold_test_loss.append(test_loss)
        
        print(f'Fold {fold+1} - Train loss: {train_loss:.4f}, Test loss: {test_loss:.4f}')
    
    return fold_train_loss, fold_test_loss

# Initialisation

In [11]:
# Convert the feature matrix and target to float32 tensors.
X = torch.tensor(X, dtype=torch.float32)
# Flatten the target to 1-D. reshape(-1) always yields shape [N], whereas a
# bare .squeeze() would produce a 0-d tensor for a single-sample target.
y = torch.tensor(y, dtype=torch.float32).reshape(-1)

In [12]:
# Two hidden layers (100 then 200 units); the input width comes from the feature matrix.
model = buildMLP(input_dim=X.shape[1], hidden_layers=[100, 200])
print(model)

MLP(
  (layer): Sequential(
    (0): Linear(in_features=21, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=200, bias=True)
    (3): ReLU()
    (4): Linear(in_features=200, out_features=1, bias=True)
  )
)


In [13]:
# Loss function plus the optimiser class and settings that are passed to crossvalidate().
criterion = nn.MSELoss()
optimizer_class = torch.optim.Adam
optimizer_kwargs = {'lr': 0.0001}
# Optimiser for the stand-alone `model`, built from the same class and settings.
optimizer = optimizer_class(model.parameters(), **optimizer_kwargs)

# Testing Functions

In [14]:
# Weights of the first linear layer before any training (initial values).
print(model.layer[0].weight)

Parameter containing:
tensor([[-0.0890,  0.0072, -0.1084,  ..., -0.0515,  0.0700,  0.1543],
        [ 0.0407,  0.0597,  0.2106,  ...,  0.0144, -0.0407, -0.1170],
        [-0.0198,  0.2067, -0.0172,  ...,  0.0344,  0.1803, -0.2062],
        ...,
        [ 0.0322,  0.1624, -0.0922,  ..., -0.0731, -0.0767, -0.1934],
        [ 0.0461,  0.1464, -0.1214,  ...,  0.0707, -0.1888, -0.0771],
        [ 0.1751, -0.1484, -0.1763,  ..., -0.1771,  0.0013,  0.1413]],
       requires_grad=True)


In [15]:
# Train on the full training set using trainMLP's default batch size and epoch count.
model = trainMLP(model, X, y, criterion, optimizer)

Training epoch 20/200
Training epoch 40/200
Training epoch 60/200
Training epoch 80/200
Training epoch 100/200
Training epoch 120/200
Training epoch 140/200
Training epoch 160/200
Training epoch 180/200
Training epoch 200/200


In [16]:
# Weights of the first linear layer after training the model.
print(model.layer[0].weight)

Parameter containing:
tensor([[-0.1044,  0.0422, -0.1435,  ..., -0.1271,  0.2082,  0.2289],
        [ 0.0812,  0.0372,  0.1736,  ..., -0.1752, -0.0095, -0.0008],
        [-0.0611,  0.2747, -0.0505,  ...,  0.0721,  0.2734, -0.1974],
        ...,
        [ 0.0335,  0.2382, -0.0788,  ...,  0.0092, -0.1270, -0.1845],
        [ 0.0539,  0.1625, -0.0052,  ...,  0.0854, -0.0151, -0.0970],
        [ 0.1674, -0.1501, -0.1517,  ..., -0.2085, -0.0396,  0.2282]],
       requires_grad=True)


In [17]:
# Evaluated on the same X, y the model was trained on, so this is a training-set loss.
loss = evalMLP(model, X, y, criterion)

In [18]:
# Mean loss returned by evalMLP.
print(loss)

0.008619799806230231


In [19]:
# Cross-validate the [100, 200] architecture with the optimiser class and settings defined above.
train_loss, test_loss = crossvalidate(X, y, criterion, optimizer_class, optimizer_kwargs, hidden_layers=[100,200])



Fold 1/5
Training epoch 20/200
Training epoch 40/200
Training epoch 60/200
Training epoch 80/200
Training epoch 100/200
Training epoch 120/200
Training epoch 140/200
Training epoch 160/200
Training epoch 180/200
Training epoch 200/200
Fold 1 - Train loss: 0.0091, Test loss: 0.0291

Fold 2/5
Training epoch 20/200
Training epoch 40/200
Training epoch 60/200
Training epoch 80/200
Training epoch 100/200
Training epoch 120/200
Training epoch 140/200
Training epoch 160/200
Training epoch 180/200
Training epoch 200/200
Fold 2 - Train loss: 0.0084, Test loss: 0.0273

Fold 3/5
Training epoch 20/200
Training epoch 40/200
Training epoch 60/200
Training epoch 80/200
Training epoch 100/200
Training epoch 120/200
Training epoch 140/200
Training epoch 160/200
Training epoch 180/200
Training epoch 200/200
Fold 3 - Train loss: 0.0095, Test loss: 0.0289

Fold 4/5
Training epoch 20/200
Training epoch 40/200
Training epoch 60/200
Training epoch 80/200
Training epoch 100/200
Training epoch 120/200
Trainin

In [20]:
# Per-fold loss on each fold's training split.
print(train_loss)

[0.009103134811196997, 0.008402398863776785, 0.009535001671883602, 0.008745870433165251, 0.010464281365181429]


In [21]:
# Per-fold loss on each fold's held-out split.
print(test_loss)

[0.029074125628434543, 0.027287184023550827, 0.028891845842327776, 0.03060407891005165, 0.030849941845273912]
