This notebook aims to train all models on all datasets.

In [None]:
import os
# Switch the working directory to the parent of the current one before the
# project-local imports below (presumably this notebook lives in a sub-folder
# of the project root — confirm).
# NOTE(review): not idempotent — every re-run of this cell without a kernel
# restart moves the working directory up one more level.
os.chdir(os.path.dirname(os.getcwd()))

from load_matrix import load_df
from training_utils import MLModel

def prepare_df(df):
    """Index a loaded frame by its 'label' column and keep only the trailing columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'label' column.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by 'label' with the first three of the remaining
        columns dropped (presumably non-feature metadata columns — confirm
        against what load_df returns).

    Raises
    ------
    KeyError
        If `df` has no 'label' column.
    """
    return df.set_index('label').iloc[:, 3:]

Experimenting with different constants and parameters for the models

In [2]:
# Override the class-wide DEFAULT_SPLIT_RATIO before any MLModel is created;
# the initialisation logs further down report "split ratio: 0.3".
# NOTE(review): presumably this is the held-out test fraction — confirm in training_utils.
MLModel.set_global_variable('DEFAULT_SPLIT_RATIO', 0.3)

## Generalization to train for all datasets and models

In [None]:
def train_all(
    datasets:list=[
        'gene_expression',
        'RGCN_sample_embeddings',
        'Complex_sample_embeddings',
        'concatenated_sample_embeddings',
        'RGCN_protein_embeddings',
        'Complex_protein_embeddings',
        'concatenated_protein_embeddings',
    ],
    model_types=MLModel.MODELS,
):
    """Train and evaluate every requested model type on every requested dataset.

    Parameters
    ----------
    datasets : list of str
        Dataset names passed to `load_df`. The default list is only read,
        never mutated.
    model_types : iterable of str
        Model types passed to `MLModel(model_type=...)`; defaults to
        `MLModel.MODELS`.

    Returns
    -------
    dict
        Maps '<model>_<dataset>' to the `MLModel` instance on which
        `train_evaluate()` has been called.
    """
    trained_MLModels = {}

    for dataset in datasets:
        # Load and prepare each dataset once, then reuse it for every model type.
        df = prepare_df(load_df(dataset))
        # Iterate over the `model_types` argument: the previous loop referenced an
        # undefined name (`models`), so the parameter was ignored and the function
        # only worked if a global `models` happened to exist in the kernel.
        for model in model_types:
            run_name = f'{model}_{dataset}'
            ml_model = MLModel(model_type=model, df=df, dataset_name=run_name)
            ml_model.train_evaluate()
            trained_MLModels[run_name] = ml_model
    return trained_MLModels

In [None]:
# Run train_all with its default datasets and model types; the result is a dict
# keyed by '<model>_<dataset>' holding the corresponding MLModel instances.
ALL_MODELS=train_all()

-- Initialized MLModel with model_type='SVM', dataset_name='SVM_GENE_EXPRESSION' --
-- split ratio: 0.3
-- random state: 42
--------------------------------------------------------------------------------
-- Training SVM model on dataset 'svm_gene_expression' --
--------------------------------------------------------------------------------

-- SVM Hyperparameters --
{'C': [0.1, 1, 10], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf']}



-- Best Parameters --
{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


-- best model parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'} --
--------------------------------------------------------------------------------
-- predicting SVM model on dataset 'svm_gene_expression' --
--------------------------------------------------------------------------------
-- Predictions made on test set --
-- Initialized MLModel with model_type='XGBOOST', dataset_name='XGBOOST_GENE_EXPRESSION' --
-- split ratio: 0.3
-- random state: 42
---------

The cells below still go through each model on the gene expression dataset, to show the process of training one model at a time on a single dataset.

## Gene expression dataset:

In [6]:
# Load the gene-expression data, index it by 'label' and drop the first three
# remaining columns (see prepare_df). `df` is reused by the following cells.
df=prepare_df(load_df('gene_expression'))

# Build, train and evaluate an SVM on that frame. The value of train_evaluate()
# is displayed as the cell output.
# NOTE(review): the displayed tuple looks like (true labels, predicted labels,
# predicted probabilities) — confirm against training_utils.
svm_gex=MLModel(model_type='svm', df=df,dataset_name='svm_gene_expression')
svm_gex.train_evaluate()

-- Initialized MLModel with model_type='SVM', dataset_name='SVM_GENE_EXPRESSION' --
-- split ratio: 0.3
-- random state: 42
--------------------------------------------------------------------------------
-- Training SVM model on dataset 'svm_gene_expression' --
--------------------------------------------------------------------------------

-- SVM Hyperparameters --
{'C': [0.1, 1, 10], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf']}



-- Best Parameters --
{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


-- best model parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'} --
--------------------------------------------------------------------------------
-- predicting SVM model on dataset 'svm_gene_expression' --
--------------------------------------------------------------------------------
-- Predictions made on test set --


(array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
        0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
        0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1]),
 array([0.8601056 , 0.97352314, 0.96268682, 0.97207285, 0.99998585,
        0.99999743, 0.03188881, 0.94029305, 0.99161448, 0.47347599,
        0.98266576, 0.99999882, 0.01768346, 0.97846755, 0.91158256,
        0.97354565, 0.9775124 , 0.98380876, 0.08846772, 0.15821683,
        0.99600484, 0.91512376, 0.25635128, 0.09353043, 0.98269152,
        0.09087359, 0.96918257, 0.97190346, 0.94390318, 0.10575014,
        0.99654882, 0.97689126, 0.96141841, 0.98037672, 0.29569448,
        0.99999952, 0.67420122, 0.33721608, 0.98773903, 0.13358725,
        0.99193194, 0.96308908, 0.95894933, 0.94894598, 0.96224116,
        0.97730671, 0.99998675, 0.94424105

In [7]:
# Same procedure with an XGBoost model; relies on `df` created in the
# gene-expression loading cell above.
xgboost_gex=MLModel(model_type='xgboost', df=df,dataset_name='xgboost_gene_expression')
xgboost_gex.train_evaluate()

-- Initialized MLModel with model_type='XGBOOST', dataset_name='XGBOOST_GENE_EXPRESSION' --
-- split ratio: 0.3
-- random state: 42
--------------------------------------------------------------------------------
-- Training XGBOOST model on dataset 'xgboost_gene_expression' --
--------------------------------------------------------------------------------

-- XGBoost Hyperparameters --
{   'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200]}



-- Best Parameters --
{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}


-- best model parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200} --
--------------------------------------------------------------------------------
-- predicting XGBOOST model on dataset 'xgboost_gene_expression' --
--------------------------------------------------------------------------------
-- Predictions made on test set --


(array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
        0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
        1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1]),
 array([0.9508815 , 0.94228107, 0.95510554, 0.92286396, 0.95444006,
        0.91965187, 0.1783064 , 0.95187634, 0.95664155, 0.38541287,
        0.88830286, 0.95405406, 0.20738281, 0.9517222 , 0.940617  ,
        0.9556913 , 0.9558435 , 0.9525777 , 0.6744385 , 0.40748125,
        0.95342803, 0.8916246 , 0.9555806 , 0.32341167, 0.91538525,
        0.31467322, 0.95209074, 0.9546635 , 0.94811803, 0.5652626 ,
        0.9501477 , 0.9157928 , 0.9511113 , 0.9385763 , 0.6873158 ,
        0.9558859 , 0.80047673, 0.24086325, 0.9501561 , 0.6252285 ,
        0.9468857 , 0.94882977, 0.9530658 , 0.9367032 , 0.89716655,
        0.8965393 , 0.9546563 , 0.8950229 

In [8]:
# Same procedure with an MLP model; relies on `df` created in the
# gene-expression loading cell above.
# NOTE(review): in the captured output every prediction is 1 and every
# probability is 1.0, i.e. the fitted model outputs a single class — worth
# investigating before these results are interpreted.
mlp_gex=MLModel(model_type='mlp', df=df,dataset_name='mlp_gene_expression')
mlp_gex.train_evaluate()

-- Initialized MLModel with model_type='MLP', dataset_name='MLP_GENE_EXPRESSION' --
-- split ratio: 0.3
-- random state: 42
--------------------------------------------------------------------------------
-- Training MLP model on dataset 'mlp_gene_expression' --
--------------------------------------------------------------------------------

-- MLP Hyperparameters --
{   'activation': ['relu', 'tanh'],
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'solver': ['adam', 'sgd']}



-- Best Parameters --
{   'activation': 'relu',
    'hidden_layer_sizes': (50,),
    'learning_rate_init': 0.1,
    'solver': 'adam'}


-- best model parameters: {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate_init': 0.1, 'solver': 'adam'} --
--------------------------------------------------------------------------------
-- predicting MLP model on dataset 'mlp_gene_expression' --
---------------------------------------------------------

(array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
        0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))

## Preliminary results and analysis

In [None]:
# results_df=svm_gex.compile_results(ALL_MODELS) # -- suggestion - maybe ask joelle

# -- dataframe that has accuracy and roc_auc as columns and model+dataset as index
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score

# Collect one [accuracy, roc_auc] record per trained model and build the frame in a
# single call. Enlarging an initially empty DataFrame with .loc inside the loop
# re-allocates on every insertion and may leave the metric columns as object dtype.
# NOTE(review): roc_auc is computed from the hard class predictions (y_pred). ROC AUC
# is normally computed from scores/probabilities; if MLModel stores the predicted
# probabilities, pass those to roc_auc_score instead — confirm the attribute name.
metric_rows = {}
for model_name, ml_model in ALL_MODELS.items():
    y_true = ml_model.y_test
    y_pred = ml_model.y_pred
    metric_rows[model_name] = [
        accuracy_score(y_true, y_pred),
        roc_auc_score(y_true, y_pred),
    ]

results_df = pd.DataFrame.from_dict(
    metric_rows, orient='index', columns=['accuracy', 'roc_auc']
)
# Last expression of the cell so the summary table is actually displayed.
results_df