# Imports

In [1]:
# Data processing
import pandas as pd
# Preprocessing modules
import absenteeism_at_work_preprocessor
import students_dropout_and_academic_success_preprocessor
import loan_preprocessor
# Sci-kit learn
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
# Other utilities
import random
import warnings

# Common functionalities

## Helper function to keep input data consistent across folds
To prevent data leakage, preprocessing is done while performing cross validation. Because not all categories are present in each fold, one hot encoding can result in a different number of columns. To keep the columns consistent between the train and test data in one fold, the intersection of the columns is used.

In [None]:
def take_common_columns(X_train, X_test):
    common_columns = list(set(X_train.columns) & set(X_test.columns))
    return X_train[common_columns], X_test[common_columns]

## Scoring function for comparison table
This function is the central comparison utility function. It takes a dictionary of networks and returns a summary for each network with various metrics.

In [2]:
def compare_networks(networks, X_test, y_test):
    results = []
    
    for network_name, network in networks.items():
        y_pred = network.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted")
        
        results.append({
            "Network": network_name,
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1
        })

    return pd.DataFrame(results)

## K-fold network validation framework
The following function defines the framework for comparing multiple neural network tuners. In a k-fold validation loop, networks are generated with the strategies provided as input paramters. In order to keep this function dataset agnostic, a preprocessor must be provided. The preprocessor takes a subset of the data, cleans it and splits it into X (features) and y (target).

In [3]:
def compare_tuning_algorithms(dataset, preprocessor, network_generators, n_folds, shuffle_train_test):
    metrics_table = pd.DataFrame()

    folds = KFold(n_splits=n_folds, shuffle=shuffle_train_test, random_state=seed)

    for fold, (train_idx, test_idx) in enumerate(folds.split(dataset)):
        X_train, y_train = preprocessor(dataset.iloc[train_idx])
        X_test, y_test = preprocessor(dataset.iloc[test_idx])
        X_train, X_test = take_common_columns(X_train, X_test)
        
        networks = {
            name: generator(X_train, y_train, seed=seed, parameters=parameters) 
            for name, generator in network_generators.items() 
        }
        
        fold_comparison = compare_networks(networks, X_test, y_test)
        metrics_table = pd.concat([metrics_table, fold_comparison], axis=0)

        print("Fold %s" % fold)
        print(fold_comparison)

    return metrics_table.groupby("Network").mean()   

## Benchmark neural network

In [4]:
def fit_benchmark_neural_network(X_train, y_train, seed, parameters=None):
    return MLPClassifier(random_state=seed).fit(X_train, y_train)

## Benchmark support vector classifier

In [5]:
def fit_benchmark_support_vector_classifier(X_train, y_train, seed, parameters=None):
    return SVC(random_state=seed).fit(X_train, y_train)

## Custom random search algorith

In [6]:
def create_network(layers, nodes, activation, learning_rate=0.01, early_stopping=True, validation_fraction=0.1, n_iter_no_change=10):
    """
    Creates an MLP network with specified layers, nodes, activation function, learning rate, and early stopping parameters.
    """
    hidden_layer_sizes = tuple(nodes)
    model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, 
                          max_iter=100, learning_rate_init=learning_rate, 
                          early_stopping=early_stopping, validation_fraction=validation_fraction, 
                          n_iter_no_change=n_iter_no_change)
    return model

def train_and_evaluate(model, X_train, y_train):
    """
    Trains the MLPClassifier model and evaluates its performance on the test set.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    accuracy = accuracy_score(y_train, y_pred)
    return accuracy

def random_configuration(max_layers, max_nodes, activation_functions):
    """
    Generates a random configuration for the neural network.
    """
    layers = random.randint(1, max_layers)
    activation = random.choice(activation_functions)
    nodes = [random.randint(1, max_nodes) for _ in range(layers)]
    return layers, nodes, activation

def tune_custom_random_neural_network(X_train, y_train, parameters, seed, iterations=20):
    """
    Tunes a custom random neural network based on specified parameters, seed, scoring method, and number of folds.
    """
    random.seed(seed)
    best_performance = None
    max_layers, max_nodes, activation_functions = parameters['max_layers'], parameters['max_nodes'], parameters["activation_functions"]

    for _ in range(iterations):
        layers, nodes, activation = random_configuration(max_layers, max_nodes, activation_functions)
        model = create_network(layers, nodes, activation)
        performance = train_and_evaluate(model, X_train, y_train)

        if best_performance is None or performance > best_performance:
            best_performance = performance
            best_layers = layers
            best_nodes = nodes
            best_activation = activation

    return create_network(best_layers, best_nodes, best_activation).fit(X_train, y_train)

## Custom local search algorithm

In [7]:
def create_networkh(layers, nodes, activation, learning_rate=0.01, early_stopping=True, validation_fraction=0.1, n_iter_no_change=10):
    """
    Creates an MLP network with specified layers, nodes, activation function, learning rate, and early stopping parameters.
    """
    hidden_layer_sizes = tuple([nodes] * layers)
    model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, 
                          max_iter=100, learning_rate_init=learning_rate, 
                          early_stopping=early_stopping, validation_fraction=validation_fraction, 
                          n_iter_no_change=n_iter_no_change)
    return model

def train_and_evaluate(model, X_train, y_train):
    """
    Trains the MLPClassifier model and evaluates its performance on the test set.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    accuracy = accuracy_score(y_train, y_pred)
    return accuracy

def tune_custom_local_search_network(X_train, y_train, parameters, seed):
    """
    Performs hill climbing to find a better neural network configuration.
    """
    max_layers, max_nodes, activation_functions = parameters["max_layers"], parameters["max_nodes"], parameters["activation_functions"]
    current_layers, current_nodes, current_activation = 1, 1, "relu"
    best_performance = None
    best_config = None

    while True:
        neighbors = []

        # Generating neighbors by varying one parameter at a time
        if current_layers < max_layers:
            neighbors.append((current_layers + 1, current_nodes, current_activation))
        if current_nodes < max_nodes:
            neighbors.append((current_layers, current_nodes + 1, current_activation))
        for activation in activation_functions:
            if activation != current_activation:
                neighbors.append((current_layers, current_nodes, activation))

        # Evaluating neighbors
        best_neighbor = None
        for neighbor in neighbors:
            layers, nodes, activation = neighbor
            model = create_networkh(layers, nodes, activation)
            performance = train_and_evaluate(model, X_train, y_train)

            if best_performance is None or performance > best_performance:
                best_performance = performance
                best_neighbor = neighbor

        # Check if no improvement
        if best_neighbor is None:
            return create_networkh(current_layers, current_nodes, current_activation).fit(X_train, y_train)

        current_layers, current_nodes, current_activation = best_neighbor

# Experiment parameters

In [8]:
disable_warnings = True
n_folds = 3
seed = 0
shuffle_train_test = True
network_generators = {
    "Benchmark neural network": fit_benchmark_neural_network,
    "Benchmark support vector classifier": fit_benchmark_support_vector_classifier,
    "Random search tuned neural network": tune_custom_random_neural_network,
    "Local search tuned neural network": tune_custom_local_search_network
}
parameters = {
    "max_layers": 5,
    "max_nodes": 50,
    "activation_functions": ["relu", "tanh", "logistic"]
}

In [9]:
if disable_warnings:
    warnings.filterwarnings("ignore")

# Absenteeism at work

## Data loading

In [10]:
absenteeism_at_work = pd.read_csv("../../data/absenteeism-at-work/data.csv", delimiter=";", index_col="ID")

## Network generator comparison

In [11]:
compare_tuning_algorithms(
    dataset=absenteeism_at_work, 
    preprocessor=absenteeism_at_work_preprocessor.preprocess, 
    network_generators=network_generators, 
    n_folds=n_folds, 
    shuffle_train_test=shuffle_train_test
)

Fold 0
                               Network  Accuracy  Precision    Recall  \
0             Benchmark neural network  0.583673   0.593848  0.583673   
1  Benchmark support vector classifier  0.555102   0.334260  0.555102   
2   Random search tuned neural network  0.673469   0.692202  0.673469   
3    Local search tuned neural network  0.224490   0.060874  0.224490   

   F1 Score  
0  0.568714  
1  0.413446  
2  0.637256  
3  0.092632  
Fold 1
                               Network  Accuracy  Precision    Recall  \
0             Benchmark neural network  0.597561   0.593072  0.597561   
1  Benchmark support vector classifier  0.548780   0.329476  0.548780   
2   Random search tuned neural network  0.504065   0.488363  0.504065   
3    Local search tuned neural network  0.199187   0.085167  0.199187   

   F1 Score  
0  0.583461  
1  0.408462  
2  0.465311  
3  0.118415  
Fold 2
                               Network  Accuracy  Precision    Recall  \
0             Benchmark neural net

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Network,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Benchmark neural network,0.584802,0.583306,0.584802,0.563523
Benchmark support vector classifier,0.531917,0.314161,0.531917,0.391579
Local search tuned neural network,0.206266,0.062908,0.206266,0.093697
Random search tuned neural network,0.614734,0.578595,0.614734,0.565588


# Students' dropout and academic success

## Data loading

In [12]:
students_dropout_and_academic_success = pd.read_csv("../../data/predict-students-dropout-and-academic-success/data.csv", delimiter=";")

## Network generator comparison

In [13]:
compare_tuning_algorithms(
    dataset=students_dropout_and_academic_success, 
    preprocessor=students_dropout_and_academic_success_preprocessor.preprocess, 
    network_generators=network_generators, 
    n_folds=n_folds, 
    shuffle_train_test=shuffle_train_test
)

Fold 0
                               Network  Accuracy  Precision    Recall  \
0             Benchmark neural network  0.652203   0.614776  0.652203   
1  Benchmark support vector classifier  0.497627   0.247633  0.497627   
2   Random search tuned neural network  0.711864   0.674774  0.711864   
3    Local search tuned neural network  0.497627   0.247633  0.497627   

   F1 Score  
0  0.580487  
1  0.330700  
2  0.664435  
3  0.330700  
Fold 1
                               Network  Accuracy  Precision    Recall  \
0             Benchmark neural network  0.717966   0.680439  0.717966   
1  Benchmark support vector classifier  0.496949   0.246958  0.496949   
2   Random search tuned neural network  0.658983   0.625404  0.658983   
3    Local search tuned neural network  0.496949   0.246958  0.496949   

   F1 Score  
0  0.645964  
1  0.329949  
2  0.600170  
3  0.329949  
Fold 2
                               Network  Accuracy  Precision    Recall  \
0             Benchmark neural net

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Network,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Benchmark neural network,0.689423,0.665,0.689423,0.618491
Benchmark support vector classifier,0.497288,0.28726,0.497288,0.342684
Local search tuned neural network,0.499323,0.249332,0.499323,0.332586
Random search tuned neural network,0.693494,0.653042,0.693494,0.636854


# Loan

## Data loading

In [14]:
loan = pd.read_csv("../../data/kaggle-competitions/loan/loan-10k.lrn.csv", index_col="ID")

## Network generator comparison

In [15]:
compare_tuning_algorithms(
    dataset=loan, 
    preprocessor=loan_preprocessor.preprocess, 
    network_generators=network_generators, 
    n_folds=n_folds, 
    shuffle_train_test=shuffle_train_test
)

Fold 0
                               Network  Accuracy  Precision    Recall  \
0             Benchmark neural network  0.236053   0.288442  0.236053   
1  Benchmark support vector classifier  0.335633   0.272581  0.335633   
2   Random search tuned neural network  0.289742   0.142957  0.289742   
3    Local search tuned neural network  0.304139   0.092501  0.304139   

   F1 Score  
0  0.182031  
1  0.270512  
2  0.190173  
3  0.141857  
Fold 1
                               Network  Accuracy  Precision    Recall  \
0             Benchmark neural network  0.273927   0.327749  0.273927   
1  Benchmark support vector classifier  0.322232   0.271679  0.322232   
2   Random search tuned neural network  0.327033   0.305081  0.327033   
3    Local search tuned neural network  0.276928   0.076689  0.276928   

   F1 Score  
0  0.197324  
1  0.260445  
2  0.245297  
3  0.120115  
Fold 2
                               Network  Accuracy  Precision    Recall  \
0             Benchmark neural net

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score
Network,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Benchmark neural network,0.258902,0.319997,0.258902,0.195382
Benchmark support vector classifier,0.326399,0.278644,0.326399,0.259962
Local search tuned neural network,0.288198,0.083193,0.288198,0.129078
Random search tuned neural network,0.311102,0.255876,0.311102,0.241298
