This notebook will be the first one for training an MLP. I will create functions for training, testing, crossvalidation and randomsearch to have a seamless analysis. The baseline default value I will use for the initial MLP will come from sklearn's MLPRegressor

https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html
<br />
class sklearn.neural_network.MLPRegressor(hidden_layer_sizes=(100,), activation='relu', *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000)

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option("max_colwidth", None)
pd.set_option("max_seq_items", None)
pd.set_option('display.float_format', '{:.4f}'.format)
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from tqdm import tqdm


In [2]:
torch.manual_seed(123)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [3]:
train = pickle.load(open('data/train.pkl','rb'))
test = pickle.load(open('data/test.pkl','rb'))
preprocess_pipeline = pickle.load(open('data/pipeline.pkl', 'rb'))

train = preprocess_pipeline.fit_transform(train)
test = preprocess_pipeline.transform(test)

In [4]:
train.shape

(7930, 22)

In [5]:
X = train.drop('pressure',axis=1).values
y = train['pressure'].values

In [6]:
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

# Helper Functions

In [7]:
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_layers=None, dropout_prob=0):
        """
        - input_dim: number of neurons on input layer
        - hidden_layers: list of integers where each integer represents
                         the number of neurons in that hidden layer
        (ie. [100, 200]: input_dim -> 100 -> relu -> 200 -> relu -> output)
        """
        super(MLP, self).__init__()
        hidden_layers = hidden_layers or [100]
        layers = []
        in_dim = input_dim
        for h in hidden_layers:
            layers.append(nn.Linear(in_dim,h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(p=dropout_prob))    # dropout regularisation
            in_dim = h
          
        layers.append(nn.Linear(in_dim,1))    # output layer
        self.layer = nn.Sequential(*layers)
        
    
    def forward(self, X):
        return self.layer(X)

In [8]:
class EarlyStopper:
    def __init__(self, patience=1, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_val_loss = float("inf")
        self.count = 0

    def earlystop(self, val_loss):
        # Significant decrease in validation loss
        if (self.best_val_loss - val_loss) > self.delta:
            self.best_val_loss = val_loss
            self.count = 0

        # Insignificant decrease in validation loss
        elif (self.best_val_loss - val_loss) <= self.delta:
            self.count += 1
            if self.count > self.patience:
                return True
            
        return False

In [None]:
def trainMLP(model, X, y, criterion, optimizer, batch_size=32, num_epochs=200):
    """
    Input:
    - model: predefined model instance
    - X: features as a tensor
    - y: target as a tensor
    - criterion: loss function
    - optimizer: parameter optimizer algorithm
    - batch_size: batch size for mini-batch gradient descent
    - num_epochs: number of epochs to train
    
    Output:
    - model: trained model
    """
    
    train_dataset = TensorDataset(X, y)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    model.to(device)
    for epoch in tqdm(range(num_epochs), desc="Training epochs"):
        model.train()
            
        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
    
            optimizer.zero_grad()
            # .squeeze() to match target shape
            outputs = model(batch_X).squeeze()  # [batch_size, 1] => [batch_size]
            loss = criterion(outputs, batch_y)

            # Backwardprop and optimization
            loss.backward()
            optimizer.step()
    
    return model

In [10]:
def evalMLP(model, X, y, criterion, batch_size=32):
    """
    Input:
    - model: predefined model instance
    - X: features as a tensor
    - y: target as a tensor
    - criterion: loss function
    - batch_size: data loader batch size (not very important, only lightens the load for loss each calculation)
    
    Output:
    - mean_loss: mean loss from all tested samples
    """
        
    dataset = TensorDataset(X, y)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=True)
    
    model.to(device)
    model.eval()
    total_loss = 0.0
    total_samples = 0
    
    with torch.no_grad():
        for batch_X, batch_y in loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            
            # .squeeze() to match target shape
            outputs = model(batch_X).squeeze()  # [batch_size, 1] => [batch_size]
            loss = criterion(outputs, batch_y)
            
            total_loss += loss.item() * batch_X.size(0)
            total_samples += batch_X.size(0)
            
    return total_loss / total_samples

In [None]:
def trainMLP_earlystop(model, X, y, criterion, optimizer, patience=5, delta= 0, batch_size=32, num_epochs=200):
    """
    Input:
    - model: predefined model instance
    - X: features as a tensor
    - y: target as a tensor
    - criterion: loss function
    - optimizer: parameter optimizer algorithm
    - patience: number of consecutive insignificant decreases in validation loss until early stop to activate
    - delta: the magnitude a decrease needs to be significant (val_loss new - val_loss old) 
    - batch_size: batch size for mini-batch gradient descent
    - num_epochs: number of epochs to train
    
    Output:
    - model: trained model
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=123)

    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    
    es = EarlyStopper(patience, delta)
    
    model.to(device)
    for epoch in tqdm(range(num_epochs), desc="Training epochs"):
        model.train()
            
        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
    
            optimizer.zero_grad()
            # .squeeze() to match target shape
            outputs = model(batch_X).squeeze()  # [batch_size, 1] => [batch_size]
            loss = criterion(outputs, batch_y)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()
        
        # early stopping check
        val_loss = evalMLP(model, X_val, y_val, criterion, batch_size)
        if es.earlystop(val_loss):
            # Break epoch training loop if earlystop returns True
            print(f'Early Stopped at epoch {epoch}')
            break
        
    return model

In [None]:
def crossvalidate(X, y, criterion, optimizer_class, optimizer_kwargs, patience=15, delta= 0,
                  hidden_layers=[100], dropout_prob=0, num_folds=5, batch_size=32, num_epochs=200):
    """
    Input: 
    - X: features as a tensor
    - y: target as a tensor
    - criterion: loss functions
    - optimizer_class: gradient optimisation
    - optimizer_kwargs: learning rate as input
    - patience: number of consecutive insignificant decreases in validation loss until early stop to activate
    - delta: the magnitude a decrease needs to be significant (val_loss new - val_loss old) 
    - hidden_layers: list of integers where each integer represents
                     the number of neurons in that hidden layer
    - dropout_prob: probability of neurons dropping out
    - num_folds: number of cross validation folds
    - batch_size: batch size for DataLoader (mini-batch gradient descent)
    - num_epochs: number of epochs to train per fold
    
    Output
    - fold_train_loss: list of training losses for each fold
    - fold_test_loss: list of testing losses for each fold
    """
        
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=123)
    
    fold_train_loss = []
    fold_test_loss = []
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
        # Create new model and optimizer for each fold
        model = MLP(X.shape[1], hidden_layers, dropout_prob)
        optimizer = optimizer_class(model.parameters(), **optimizer_kwargs)
        
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        # Train with current fold's data
        model = trainMLP_earlystop(model, X_train, y_train, criterion, optimizer,
                                   patience, delta, batch_size, num_epochs)
        
        # Evaluate on both train and test sets
        train_loss = evalMLP(model, X_train, y_train, criterion)
        test_loss = evalMLP(model, X_test, y_test, criterion)
        
        fold_train_loss.append(train_loss)
        fold_test_loss.append(test_loss)
        
        print(f'Fold {fold+1} - Train loss: {train_loss:.4f}, Test loss: {test_loss:.4f}')
    
    return fold_train_loss, fold_test_loss

In [13]:
def sample_config(param_dict, random_state=None):
    """
    Samples a hyperparameter configuration from the given search space.
    
    Parameters:
    - param_dict (dict): Dictionary defining the distributions for hyperparameters.
        Expected keys and tuple values:
          'lr': (min, max) -> log-uniform sample
          'weight_decay': (min, max) -> log-uniform sample
          'num_layers': (min, max) -> integer uniform sample (inclusive)
          'num_neurons': (min, max) -> integer uniform sample for each layer (inclusive)
          'dropout_prob': (min, max) -> uniform sample
    - random_state (int, optional): Seed for the random number generator to ensure reproducibility.

    Returns:
    - config (dict): A dictionary containing a sampled configuration.
      For the neural network architecture, key 'hidden_layers' contains a 
      list with a random integer (number of neurons) for each hidden layer.
    """
    
    # Set the random state if provided
    if random_state is not None:
        np.random.seed(random_state)
    
    # Default hyperparameter values
    defaults = {
        'lr': 1e-2,
        'weight_decay': 0.0,
        'num_layers': 1,
        'num_neurons': 100,
        'dropout_prob': 0.0
    }
    
    config = {}
    
    # Sample learning rate from log-uniform distribution.
    if 'lr' in param_dict:
        low, high = param_dict['lr']
        config['lr'] = np.exp(np.random.uniform(np.log(low), np.log(high)))
    else:
        config['lr'] = defaults['lr']
        
    # Sample weight decay from log-uniform distribution.
    if 'weight_decay' in param_dict:
        low, high = param_dict['weight_decay']
        config['weight_decay'] = np.exp(np.random.uniform(np.log(low), np.log(high)))
    else:
        config['weight_decay'] = defaults['weight_decay']
        
    # Sample number of layers from an integer uniform distribution.
    if 'num_layers' in param_dict:
        low, high = param_dict['num_layers']
        # np.random.randint is exclusive on the upper bound so add 1.
        config['num_layers'] = np.random.randint(low, high + 1)
    else:
        config['num_layers'] = defaults['num_layers']
        
    # Sample dropout probability from a uniform distribution.
    if 'dropout_prob' in param_dict:
        low, high = param_dict['dropout_prob']
        config['dropout_prob'] = np.random.uniform(low, high)
    else:
        config['dropout_prob'] = defaults['dropout_prob']
    
    # Sample number of neurons for each hidden layer from an integer uniform distribution.
    if 'num_neurons' in param_dict:
        low, high = param_dict['num_neurons']
        n_layers = config.get('num_layers', defaults['num_layers'])
        config['hidden_layers'] = [np.random.randint(low, high + 1) for _ in range(n_layers)]
    else:
        n_layers = config.get('num_layers', defaults['num_layers'])
        config['hidden_layers'] = [defaults['num_neurons'] for _ in range(n_layers)]
        
    return config

In [14]:
############    Sample search space    ############

# param_dict = {
#     'lr': (1e-5, 1e-1),               # Log-uniform 
#     'weight_decay': (1e-7, 1e-3),     # Log-uniform 
#     'num_layers': (1,5),              # Integer uniform
#     'num_neurons': (50, 200),         # Integer uniform for each layer
#     'dropout_prob': (0.0, 0.5)        # Uniform 
# }

In [None]:
def random_search(X, y, param_dict, criterion, optimizer_class, num_trials=400, 
                  num_folds=5, patience=16, delta=0, batch_size=512, 
                  num_epochs=2000, random_state=None):
    """
    Perform random search hyperparameter optimization with cross-validation.
    
    Parameters:
    - X: Features tensor
    - y: Target tensor
    - param_dict: Hyperparameter search space configuration
    - criterion: Loss function
    - num_trials: Number of random configurations to try
    - num_folds: Number of cross-validation folds
    - Other params: Match crossvalidate() parameters
    
    Returns:
    - best_config: Dictionary of best hyperparameters
    - best_loss: lowest crossvalidation loss
    """
    best_config = None
    best_loss = float('inf')
    all_results = []
    
    for trial in range(num_trials):
        print(f"\n============   Trial {trial+1}/{num_trials}   ============")
        
        # Generate unique seed for each trial if random_state provided
        trial_seed = random_state + trial if random_state else None
        
        # Sample hyperparameter configuration
        config = sample_config(param_dict, random_state=trial_seed)
        
        # Model architecture
        model_params = {
            'hidden_layers': config['hidden_layers'],
            'dropout_prob': config['dropout_prob']
        }
        # Optimizer parameters
        optimizer_params = {
            'lr': config['lr'],
            'weight_decay': config['weight_decay']
        }
        
        # Cross-validation
        fold_train_loss, fold_test_loss = crossvalidate(
            X, y, criterion, optimizer_class, optimizer_params,
            patience=patience, delta=delta, **model_params,
            num_folds=num_folds, batch_size=batch_size, num_epochs=num_epochs
        )
        test_loss = np.mean(fold_test_loss)
        
        # Track results
        trial_result = {
            'config': config,
            'avg_test_loss': test_loss,
            'fold_test_losses': fold_test_loss,
            'fold_train_losses': fold_train_loss
        }
        all_results.append(trial_result)
        
        # 7. Update best configuration
        if test_loss < best_loss:
            best_loss = test_loss
            best_config = config
            print(f"New best! Loss: {best_loss:.4f} | Config: {config}")
    
    return best_config, best_loss

# Testing Functions

In [16]:
model = MLP(X.shape[1], [100,200])
print(model)

MLP(
  (layer): Sequential(
    (0): Linear(in_features=21, out_features=100, bias=True)
    (1): ReLU()
    (2): Dropout(p=0, inplace=False)
    (3): Linear(in_features=100, out_features=200, bias=True)
    (4): ReLU()
    (5): Dropout(p=0, inplace=False)
    (6): Linear(in_features=200, out_features=1, bias=True)
  )
)


In [17]:
criterion = nn.MSELoss()
optimizer_class = torch.optim.Adam
optimizer_kwargs = {'lr': 0.0001}
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [18]:
model = trainMLP(model, X, y, criterion, optimizer)

100%|██████████| 200/200 [01:19<00:00,  2.50it/s]


In [19]:
loss = evalMLP(model, X, y, criterion)

In [20]:
print(loss)

0.008307149163390763


In [21]:
train_loss, test_loss = crossvalidate(X, y, criterion, optimizer_class, optimizer_kwargs, hidden_layers=[100,200])


Fold 1/5


100%|██████████| 200/200 [00:50<00:00,  3.98it/s]


Fold 1 - Train loss: 0.0114, Test loss: 0.0306
Fold 2/5


 70%|███████   | 141/200 [00:47<00:19,  3.00it/s]

Early Stopped at epoch 142





Fold 2 - Train loss: 0.0154, Test loss: 0.0334
Fold 3/5


 72%|███████▎  | 145/200 [00:42<00:16,  3.39it/s]


Early Stopped at epoch 146
Fold 3 - Train loss: 0.0130, Test loss: 0.0316
Fold 4/5


 79%|███████▉  | 158/200 [00:59<00:15,  2.67it/s]


Early Stopped at epoch 159
Fold 4 - Train loss: 0.0142, Test loss: 0.0341
Fold 5/5


 98%|█████████▊| 197/200 [00:57<00:00,  3.43it/s]

Early Stopped at epoch 198
Fold 5 - Train loss: 0.0120, Test loss: 0.0316





In [22]:
print(train_loss)

[0.011359043847393207, 0.015398503876187734, 0.012962180367351105, 0.014221516618919041, 0.012033879446486631]


In [23]:
print(test_loss)

[0.03062584287277898, 0.03340053318866661, 0.03159881535233283, 0.03405488088574945, 0.031592662123088935]


In [24]:
param_dict = {
    'lr': (1e-5, 1e-1),               # Log-uniform 
    'weight_decay': (1e-7, 1e-3),     # Log-uniform 
    'num_layers': (1,5),              # Integer uniform
    'num_neurons': (50, 200),         # Integer uniform for each layer
    'dropout_prob': (0.0, 0.5)        # Uniform 
}
sample_config(param_dict, random_state=122)

{'lr': 4.245876440324545e-05,
 'weight_decay': 6.439322405138398e-05,
 'num_layers': 5,
 'dropout_prob': 0.4182060713012252,
 'hidden_layers': [158, 65, 112, 71, 81]}

# RandomSearch

In [None]:
param_dict = {
    'lr': (1e-5, 1e-1),               # Log-uniform 
    'weight_decay': (1e-7, 1e-3),     # Log-uniform 
    'num_layers': (1,5),              # Integer uniform
    'num_neurons': (32, 512),         # Integer uniform for each layer
    'dropout_prob': (0.0, 0.5)        # Uniform 
}

# Run random search
best_config, best_loss = random_search(
    X, y,
    param_dict=param_dict,
    criterion=nn.MSELoss(),
    optimizer_class=torch.optim.AdamW,
    num_trials=512,
    random_state=123
)


Trial 1/3
Fold 1/5


 17%|█▋        | 34/200 [00:06<00:33,  4.89it/s]


Early Stopped at epoch 35
Fold 1 - Train loss: 0.1403, Test loss: 0.1449
Fold 2/5


 12%|█▏        | 23/200 [00:05<00:39,  4.51it/s]


Early Stopped at epoch 24
Fold 2 - Train loss: 0.1509, Test loss: 0.1465
Fold 3/5


  8%|▊         | 16/200 [00:03<00:37,  4.84it/s]


Early Stopped at epoch 17
Fold 3 - Train loss: 0.1771, Test loss: 0.1834
Fold 4/5


 14%|█▍        | 29/200 [00:06<00:36,  4.73it/s]


Early Stopped at epoch 30
Fold 4 - Train loss: 0.1642, Test loss: 0.1654
Fold 5/5


 23%|██▎       | 46/200 [00:12<00:41,  3.69it/s]


Early Stopped at epoch 47
Fold 5 - Train loss: 0.1184, Test loss: 0.1192
New best! Loss: 0.1519 | Config: {'lr': 0.049712909978071915, 'weight_decay': 0.0010000000000000002, 'num_layers': 1, 'dropout_prob': 0.34544242751343085, 'hidden_layers': [67]}

Trial 2/3
Fold 1/5


 10%|█         | 21/200 [00:04<00:42,  4.26it/s]


Early Stopped at epoch 22
Fold 1 - Train loss: 0.0827, Test loss: 0.0859
Fold 2/5


 10%|▉         | 19/200 [00:05<00:51,  3.49it/s]


Early Stopped at epoch 20
Fold 2 - Train loss: 0.0826, Test loss: 0.0843
Fold 3/5


  9%|▉         | 18/200 [00:08<01:24,  2.15it/s]


Early Stopped at epoch 19
Fold 3 - Train loss: 0.0950, Test loss: 0.0979
Fold 4/5


 24%|██▍       | 49/200 [00:14<00:43,  3.49it/s]


Early Stopped at epoch 50
Fold 4 - Train loss: 0.0768, Test loss: 0.0811
Fold 5/5


 10%|█         | 20/200 [00:07<01:11,  2.52it/s]


Early Stopped at epoch 21
Fold 5 - Train loss: 0.0965, Test loss: 0.1015
New best! Loss: 0.0901 | Config: {'lr': 0.012766295887411357, 'weight_decay': 0.0010000000000000002, 'num_layers': 1, 'dropout_prob': 0.29516228086591206, 'hidden_layers': [166]}

Trial 3/3
Fold 1/5


  8%|▊         | 17/200 [00:10<01:53,  1.61it/s]

Early Stopped at epoch 18





Fold 1 - Train loss: 0.2449, Test loss: 0.2456
Fold 2/5


  8%|▊         | 17/200 [00:09<01:46,  1.73it/s]


Early Stopped at epoch 18
Fold 2 - Train loss: 0.3681, Test loss: 0.3705
Fold 3/5


  9%|▉         | 18/200 [00:07<01:13,  2.48it/s]


Early Stopped at epoch 19
Fold 3 - Train loss: 0.2105, Test loss: 0.2255
Fold 4/5


 14%|█▍        | 28/200 [00:13<01:25,  2.01it/s]


Early Stopped at epoch 29
Fold 4 - Train loss: 0.2040, Test loss: 0.2058
Fold 5/5


 10%|█         | 21/200 [00:07<01:05,  2.71it/s]

Early Stopped at epoch 22





Fold 5 - Train loss: 0.1632, Test loss: 0.1692
