# MLP robustness

This notebook investigates the robustness of MLP models (both the `sklearn` and `pytorch` implementations) on our data, mainly `Complex_protein_embeddings`, which is showing good results.

The same model is trained several times, and the variance of the results across runs is evaluated.

In [12]:
import sys,os
sys.path.append(os.path.abspath('../../src/ml'))

from load_matrix import load_df
from model_trainer import MLModel
from evaluator import Evaluator
from collector import ResultsCollector
from visualizer import DashboardVisualizer
import pandas as pd

In [28]:
# Load the 'gene_expression' data; this frame is handed to every MLModel trial via df=df.
df=load_df('gene_expression')

## sklearn

In [33]:
# Shared settings for the repeated-training experiment.
number_of_trials = 10            # how many times the same configuration is retrained
norm, version = 'none', '2.10'   # passed to MLModel as normalization= / version=
mlp_classifiers = dict()         # trial index -> MLModel instance for that trial

# Echo the class-level DEFAULT_SAVE flag as the cell output (not interested in saving the model).
MLModel.DEFAULT_SAVE

False

In [31]:
# Start from an empty registry so that re-running this cell (for example with a
# smaller number_of_trials) cannot leave models from an earlier run behind to be
# scored together with the new ones by the evaluation cell.
mlp_classifiers = {}

# Retrain the same configuration number_of_trials times; only the trial index differs.
# NOTE(review): the training log reports a fixed "random state: 42", so the spread
# between trials presumably comes from the model fitting itself -- confirm in MLModel.
for i in range(number_of_trials):
    model = MLModel(
        dataset_name='gene_expression',
        df=df,
        model_type='sklearn_mlp',
        version=version,
        normalization=norm,
    )
    model.train_evaluate()
    # Keep the whole MLModel wrapper: the evaluation cell reads y_test / y_pred /
    # y_proba from it.
    mlp_classifiers[i] = model

-- [sklearn_mlp_gene_expression] Initialized MLModel with model_type='SKLEARN_MLP', dataset_name='GENE_EXPRESSION' --
-- [sklearn_mlp_gene_expression] split ratio: 0.2
-- [sklearn_mlp_gene_expression] random state: 42
-- [sklearn_mlp_gene_expression] CACHE_DIR is: .cache/ --
--------------------------------------------------------------------------------
-- [sklearn_mlp_gene_expression] Training SKLEARN_MLP model on dataset 'gene_expression' --
--------------------------------------------------------------------------------

-- MLP Hyperparameters --
  hidden_layer_sizes: [(50,), (100,), (100, 50)]
  activation: ['relu']
  solver: ['adam', 'sgd']
  learning_rate_init: [0.001, 0.01, 0.1]



-- Best Parameters --
  activation: relu
  hidden_layer_sizes: (100, 50)
  learning_rate_init: 0.001
  solver: sgd


-- [sklearn_mlp_gene_expression] best model parameters: {'activation': 'relu', 'hidden_layer_sizes': (100, 50), 'learning_rate_init': 0.001, 'solver': 'sgd'} --
-----------------------

## pytorch

In [None]:
# mlp_classifiers={}
# number_of_trials=10
# norm='none'
# version='2.10'
# MLModel.DEFAULT_SAVE #--not interested in saving the model

False

In [26]:
# for i in range(number_of_trials):
#     model=MLModel(dataset_name='Complex_protein_embeddings',df=df,model_type='pytorch_mlp',version=version,normalization=norm)
#     model.train_evaluate()
#     mlp_classifiers[i]=model

## evaluation

In [32]:
# Score every stored trial: one metrics row per model goes into `metrics_list`,
# and the raw labels / predictions / probabilities are handed to the collector.
collector = ResultsCollector()
metrics_list = []

for i, ml_model in mlp_classifiers.items():
    print(f'-- Evaluating model{i}: {version} {norm} --')

    # Pull the per-trial test labels, predictions and probabilities off the model.
    y_test = ml_model.y_test
    y_pred = ml_model.y_pred
    y_proba = ml_model.y_proba

    metrics = Evaluator(y_test, y_pred, y_proba).compute_metrics()

    # Identifying columns first, then whatever metrics the evaluator returned.
    metrics_list.append({
        "model": ml_model.model_type,
        "input": ml_model.dataset_name,
        "normalization": norm,
        "version": version,
        **metrics,
    })

    collector.add(ml_model.model_type, ml_model.dataset_name, y_test, y_pred, y_proba)

# One row per evaluated trial.
results = pd.DataFrame(metrics_list)

-- Evaluating model0: 2.10 none --
-- Evaluating model1: 2.10 none --


-- Evaluating model2: 2.10 none --
-- Evaluating model3: 2.10 none --
-- Evaluating model4: 2.10 none --
-- Evaluating model5: 2.10 none --
-- Evaluating model6: 2.10 none --
-- Evaluating model7: 2.10 none --
-- Evaluating model8: 2.10 none --
-- Evaluating model9: 2.10 none --


In [34]:
# Per-trial metrics table built by the evaluation cell (one row per evaluated model).
results

Unnamed: 0,model,input,normalization,version,balanced_accuracy,precision,recall,f1,mcc,auroc,auprc,brier
0,sklearn_mlp,gene_expression,none,2.1,0.928571,0.962963,1.0,0.981132,0.908514,1.0,1.0,0.020148
1,sklearn_mlp,gene_expression,none,2.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.014521
2,sklearn_mlp,gene_expression,none,2.1,0.785714,0.896552,1.0,0.945455,0.715762,0.901099,0.954977,0.090989
3,sklearn_mlp,gene_expression,none,2.1,0.837912,0.925926,0.961538,0.943396,0.716328,0.906593,0.96699,0.087382
4,sklearn_mlp,gene_expression,none,2.1,0.980769,1.0,0.961538,0.980392,0.917249,1.0,1.0,0.030303
5,sklearn_mlp,gene_expression,none,2.1,0.961538,1.0,0.923077,0.96,0.847319,0.989011,0.997096,0.059941
6,sklearn_mlp,gene_expression,none,2.1,0.980769,1.0,0.961538,0.980392,0.917249,1.0,1.0,0.030303
7,sklearn_mlp,gene_expression,none,2.1,0.818681,0.923077,0.923077,0.923077,0.637363,0.967033,0.991445,0.084155
8,sklearn_mlp,gene_expression,none,2.1,0.909341,0.961538,0.961538,0.961538,0.818681,0.994505,0.998575,0.04723
9,sklearn_mlp,gene_expression,none,2.1,0.89011,0.96,0.923077,0.941176,0.744282,0.923077,0.960173,0.090909


In [17]:
# Hyperparameter candidates declared on MLModel for the sklearn MLP.
MLModel.SKLEARN_MLP_HYPERPARAMS

{'hidden_layer_sizes': [(50,), (100,), (100, 50)],
 'activation': ['relu'],
 'solver': ['adam', 'sgd'],
 'learning_rate_init': [0.001, 0.01, 0.1]}

In [27]:
# Hyperparameter candidates declared on MLModel for the PyTorch MLP.
MLModel.PYTORCH_MLP_HYPERPARAMS

{'hidden_layer_sizes': [(50,), (100,), (100, 50)],
 'activation': ['relu', 'leaky_relu'],
 'solver': ['adam', 'sgd'],
 'learning_rate_init': [0.001, 0.01, 0.1],
 'batch_size': [16]}