# Imports

In [1]:
# Data processing
import pandas as pd
# Preprocessing modules
import absenteeism_at_work_preprocessor
import students_dropout_and_academic_success_preprocessor
import loan_preprocessor
# Sci-kit learn
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
# Other utilities
import random
import warnings

# Common functionalities

## Helper function to keep input data consistent across folds
To prevent data leakage, preprocessing is done while performing cross validation. Because not all categories are present in each fold, one hot encoding can result in a different number of columns. To keep the columns consistent between the train and test data in one fold, the intersection of the columns is used.

In [22]:
def take_common_columns(X_train, X_test):
    common_columns = list(set(X_train.columns) & set(X_test.columns))
    return X_train[common_columns], X_test[common_columns]

## Scoring function for comparison table
This function is the central comparison utility function. It takes a dictionary of networks and returns a summary for each network with various metrics.

In [23]:
def compare_networks(networks, X_test, y_test):
    results = []
    
    for network_name, network in networks.items():
        y_pred = network.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted")
        
        results.append({
            "Network": network_name,
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1
        })

    return pd.DataFrame(results)

## K-fold network validation framework
The following function defines the framework for comparing multiple neural network tuners. In a k-fold validation loop, networks are generated with the strategies provided as input paramters. In order to keep this function dataset agnostic, a preprocessor must be provided. The preprocessor takes a subset of the data, cleans it and splits it into X (features) and y (target).

In [24]:
def compare_tuning_algorithms(dataset, preprocessor, network_generators, n_folds, shuffle_train_test):
    metrics_table = pd.DataFrame()

    folds = KFold(n_splits=n_folds, shuffle=shuffle_train_test, random_state=seed)

    for fold, (train_idx, test_idx) in enumerate(folds.split(dataset)):
        X_train, y_train = preprocessor(dataset.iloc[train_idx])
        X_test, y_test = preprocessor(dataset.iloc[test_idx])
        X_train, X_test = take_common_columns(X_train, X_test)
        
        networks = {
            name: generator(X_train, y_train, seed=seed, parameters=parameters) 
            for name, generator in network_generators.items() 
        }
        
        fold_comparison = compare_networks(networks, X_test, y_test)
        metrics_table = pd.concat([metrics_table, fold_comparison], axis=0)

        print("Fold %s" % fold)
        print(fold_comparison)

    return metrics_table.groupby("Network").mean()   

## Benchmark neural network

In [25]:
def fit_benchmark_neural_network(X_train, y_train, seed, parameters=None):
    return MLPClassifier(random_state=seed).fit(X_train, y_train)

## Benchmark support vector classifier

In [26]:
def fit_benchmark_support_vector_classifier(X_train, y_train, seed, parameters=None):
    return SVC(random_state=seed).fit(X_train, y_train)

# Random Search

### Creates the network

Creates an MLP network with specified layers, nodes, activation function, learning rate, and early stopping parameters.

In [27]:
def create_network(layers, nodes, activation, learning_rate=0.01, early_stopping=True, validation_fraction=0.1, n_iter_no_change=10):
    
    hidden_layer_sizes = tuple(nodes)
    model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, 
                          max_iter=100, learning_rate_init=learning_rate, 
                          early_stopping=early_stopping, validation_fraction=validation_fraction, 
                          n_iter_no_change=n_iter_no_change)
    return model

### Train and evaluates the model

Trains the MLPClassifier model and evaluates its performance on the train set.


In [28]:
def train_and_evaluate(model, X_train, y_train):
    """
    Trains the MLPClassifier model and evaluates its performance on the train set.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    accuracy = accuracy_score(y_train, y_pred)
    return accuracy

## Custom random search algorithm

random_configuration: Generates a random configuration for the neural network.

tune_custom_random_neural_network: Tunes a custom random neural network based on specified parameters.

In [29]:
def random_configuration(max_layers, max_nodes, activation_functions):

    # Randomly select the number of layers within the allowed range
    layers = random.randint(1, max_layers)
    # Randomly select an activation function from the provided list
    activation = random.choice(activation_functions)
    # Generate a random number of nodes for each layer
    nodes = [random.randint(1, max_nodes) for _ in range(layers)]
    return layers, nodes, activation

def tune_custom_random_neural_network(X_train, y_train, parameters, seed, iterations=20):
    
    random.seed(seed)
    best_performance = None
    # Extracting maximum permissible values for layers, nodes, and activation functions
    max_layers, max_nodes, activation_functions = parameters['max_layers'], parameters['max_nodes'], parameters["activation_functions"]

    for _ in range(iterations):
        layers, nodes, activation = random_configuration(max_layers, max_nodes, activation_functions)
        model = create_network(layers, nodes, activation)
        performance = train_and_evaluate(model, X_train, y_train)
        
        # Update the best performance and configuration if this model is better
        if best_performance is None or performance > best_performance:
            best_performance = performance
            best_layers = layers
            best_nodes = nodes
            best_activation = activation

    return create_network(best_layers, best_nodes, best_activation).fit(X_train, y_train)

# Local Search

### Creates the network

Creates an MLP network with specified layers, nodes, activation function, learning rate, and early stopping parameters.

In [30]:
def create_networkh(layers, nodes, activation, learning_rate=0.01, early_stopping=True, validation_fraction=0.1, n_iter_no_change=10):
    
    hidden_layer_sizes = tuple([nodes] * layers)
    model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, 
                          max_iter=100, learning_rate_init=learning_rate, 
                          early_stopping=early_stopping, validation_fraction=validation_fraction, 
                          n_iter_no_change=n_iter_no_change)
    return model

### Train and evaluates the model

Trains the MLPClassifier model and evaluates its performance on the train set.

In [31]:
def train_and_evaluate(model, X_train, y_train):

    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    accuracy = accuracy_score(y_train, y_pred)
    return accuracy

## Custom local search algorithm

tune_custom_local_search_network: Performs hill climbing to find a better neural network configuration.


In [32]:
def tune_custom_local_search_network(X_train, y_train, parameters, seed):

    max_layers, max_nodes, activation_functions = parameters["max_layers"], parameters["max_nodes"], parameters["activation_functions"]
    #Initializing the current nodes with starting values
    current_layers, current_nodes, current_activation = 1, 1, "relu"

    best_performance = None
 
    while True:
        neighbors = []
    
        # Generating neighbors by varying one parameter at a time
        # Increase layer count if below max
        if current_layers < max_layers:
            neighbors.append((current_layers + 1, current_nodes, current_activation))
        
        # Increase node count if below max
        if current_nodes < max_nodes:
            neighbors.append((current_layers, current_nodes + 1, current_activation))
        
        # Change activation function to each alternative
        for activation in activation_functions:
            if activation != current_activation:
                neighbors.append((current_layers, current_nodes, activation))

        # Evaluating each neighboring configuration
        # if no configuration is better then best_neighbor is set/stays None
        best_neighbor = None
        for neighbor in neighbors:
            layers, nodes, activation = neighbor
            #creating network
            model = create_networkh(layers, nodes, activation)
            #evaluating its performance
            performance = train_and_evaluate(model, X_train, y_train)

            if best_performance is None or performance > best_performance:
                best_performance = performance
                best_neighbor = neighbor

        # Check if no improvement
        if best_neighbor is None:
            return create_networkh(current_layers, current_nodes, current_activation).fit(X_train, y_train)

        current_layers, current_nodes, current_activation = best_neighbor

# Experiment parameters

In [33]:
disable_warnings = True
n_folds = 5
seed = 0
shuffle_train_test = True
network_generators = {
    "Benchmark neural network": fit_benchmark_neural_network,
    "Benchmark support vector classifier": fit_benchmark_support_vector_classifier,
    "Random search tuned neural network": tune_custom_random_neural_network,
    "Local search tuned neural network": tune_custom_local_search_network
}
parameters = {
    "max_layers": 5,
    "max_nodes": 50,
    "activation_functions": ["relu", "tanh", "logistic"]
}

In [34]:
if disable_warnings:
    warnings.filterwarnings("ignore")

# Absenteeism at work

## Data loading

In [35]:
absenteeism_at_work = pd.read_csv("../../data/absenteeism-at-work/data.csv", delimiter=";", index_col="ID")

## Network generator comparison

In [36]:
compare_tuning_algorithms(
    dataset=absenteeism_at_work, 
    preprocessor=absenteeism_at_work_preprocessor.preprocess, 
    network_generators=network_generators, 
    n_folds=n_folds, 
    shuffle_train_test=shuffle_train_test
)

Fold 0
                               Network  Accuracy  Precision    Recall  \
0             Benchmark neural network  0.591837   0.594404  0.591837   
1  Benchmark support vector classifier  0.625850   0.440252  0.625850   
2   Random search tuned neural network  0.571429   0.584053  0.571429   
3    Local search tuned neural network  0.163265   0.026656  0.163265   

   F1 Score  
0  0.580772  
1  0.510948  
2  0.560814  
3  0.045829  
Fold 1
                               Network  Accuracy  Precision    Recall  \
0             Benchmark neural network  0.623288   0.597194  0.623288   
1  Benchmark support vector classifier  0.554795   0.357401  0.554795   
2   Random search tuned neural network  0.719178   0.671305  0.719178   
3    Local search tuned neural network  0.554795   0.437665  0.554795   

   F1 Score  
0  0.593481  
1  0.422548  
2  0.687127  
3  0.470254  
Fold 2
                               Network  Accuracy  Precision    Recall  \
0             Benchmark neural net

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Network,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Benchmark neural network,0.580863,0.57088,0.580863,0.559935
Benchmark support vector classifier,0.545588,0.350055,0.545588,0.413866
Local search tuned neural network,0.297666,0.16081,0.297666,0.189775
Random search tuned neural network,0.670283,0.652164,0.670283,0.646356


# Students' dropout and academic success

## Data loading

In [37]:
students_dropout_and_academic_success = pd.read_csv("../../data/predict-students-dropout-and-academic-success/data.csv", delimiter=";")

## Network generator comparison

In [38]:
compare_tuning_algorithms(
    dataset=students_dropout_and_academic_success, 
    preprocessor=students_dropout_and_academic_success_preprocessor.preprocess, 
    network_generators=network_generators, 
    n_folds=n_folds, 
    shuffle_train_test=shuffle_train_test
)

Fold 0
                               Network  Accuracy  Precision    Recall  \
0             Benchmark neural network  0.605650   0.622648  0.605650   
1  Benchmark support vector classifier  0.508475   0.258546  0.508475   
2   Random search tuned neural network  0.749153   0.731968  0.749153   
3    Local search tuned neural network  0.508475   0.258546  0.508475   

   F1 Score  
0  0.527361  
1  0.342792  
2  0.716770  
3  0.342792  
Fold 1
                               Network  Accuracy  Precision    Recall  \
0             Benchmark neural network  0.361582   0.546253  0.361582   
1  Benchmark support vector classifier  0.480226   0.230617  0.480226   
2   Random search tuned neural network  0.707345   0.679048  0.707345   
3    Local search tuned neural network  0.480226   0.230617  0.480226   

   F1 Score  
0  0.226252  
1  0.311597  
2  0.646448  
3  0.311597  
Fold 2
                               Network  Accuracy  Precision    Recall  \
0             Benchmark neural net

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Network,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Benchmark neural network,0.480307,0.58823,0.480307,0.399212
Benchmark support vector classifier,0.49638,0.269398,0.49638,0.33728
Local search tuned neural network,0.49638,0.269398,0.49638,0.33728
Random search tuned neural network,0.663612,0.601756,0.663612,0.595828


# Loan

## Data loading

In [39]:
loan = pd.read_csv("../../data/kaggle-competitions/loan/loan-10k.lrn.csv", index_col="ID")

## Network generator comparison

In [40]:
compare_tuning_algorithms(
    dataset=loan, 
    preprocessor=loan_preprocessor.preprocess, 
    network_generators=network_generators, 
    n_folds=n_folds, 
    shuffle_train_test=shuffle_train_test
)

Fold 0
                               Network  Accuracy  Precision  Recall  F1 Score
0             Benchmark neural network    0.2205   0.349758  0.2205  0.183426
1  Benchmark support vector classifier    0.3510   0.282374  0.3510  0.287180
2   Random search tuned neural network    0.3310   0.274137  0.3310  0.215967
3    Local search tuned neural network    0.3180   0.101124  0.3180  0.153451
Fold 1
                               Network  Accuracy  Precision  Recall  F1 Score
0             Benchmark neural network    0.3085   0.318028  0.3085  0.178850
1  Benchmark support vector classifier    0.3240   0.282401  0.3240  0.257294
2   Random search tuned neural network    0.3075   0.275321  0.3075  0.260002
3    Local search tuned neural network    0.2845   0.080940  0.2845  0.126026
Fold 2
                               Network  Accuracy  Precision  Recall  F1 Score
0             Benchmark neural network    0.2885   0.347955  0.2885  0.199530
1  Benchmark support vector classifier    0

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Network,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Benchmark neural network,0.2649,0.341667,0.2649,0.180759
Benchmark support vector classifier,0.3287,0.277229,0.3287,0.26287
Local search tuned neural network,0.2989,0.089526,0.2989,0.137733
Random search tuned neural network,0.3154,0.278272,0.3154,0.237265
