This notebook aims to train all models on all datasets.

In [2]:
# Move the working directory one level up before importing the project modules
# below -- presumably so that they resolve from the parent folder; confirm.
# NOTE(review): the path is relative to the *current* directory, so re-running
# this cell climbs one more level each time. Run it once per kernel session.
import os
os.chdir(os.path.dirname(os.getcwd()))

# Project-local helpers: load_df reads a named dataset, MLModel wraps training.
from load_matrix import load_df
from training_utils import MLModel

def prepare_df(df):
    """Index a loaded matrix by its 'label' column and drop two leading columns.

    The frame is re-indexed on ``'label'``; of the columns that remain, the
    first two (by position) are discarded and everything after them is kept.
    The input frame is not modified.

    NOTE(review): the slice is positional, so which two columns get dropped
    depends entirely on the column order produced by ``load_df`` -- confirm
    that this matches every dataset/version passed through here.
    """
    return df.set_index('label').iloc[:, 2:]

Experimenting with different constants and parameters for the models

In [2]:
# Set DEFAULT_SPLIT_RATIO on MLModel before any model is instantiated; the
# training logs further down report "split ratio: 0.3".
# NOTE(review): presumably this is the held-out test fraction -- confirm in training_utils.
MLModel.set_global_variable('DEFAULT_SPLIT_RATIO', 0.3)

## Generalized training for all datasets and models

In [6]:
def train_all(datasets: "list | None" = None, model_types=None):
    """Train and evaluate every requested model type on every requested dataset.

    Parameters
    ----------
    datasets : list of str, optional
        Dataset names to pass to ``load_df``. When omitted, the seven default
        datasets listed in the body are used.
    model_types : iterable of str, optional
        Model identifiers to pass to ``MLModel``. When omitted,
        ``MLModel.MODELS`` is used.

    Returns
    -------
    dict
        Maps ``'{model}_{dataset}'`` to the ``MLModel`` instance on which
        ``train_evaluate()`` has been called.
    """
    # Resolve defaults at call time: this avoids a shared mutable default list
    # and picks up the current MLModel.MODELS rather than the value it had
    # when this cell was executed.
    if datasets is None:
        datasets = [
            'gene_expression',
            'RGCN_sample_embeddings',
            'Complex_sample_embeddings',
            'concatenated_sample_embeddings',
            'RGCN_protein_embeddings',
            'Complex_protein_embeddings',
            'concatenated_protein_embeddings',
        ]
    if model_types is None:
        model_types = MLModel.MODELS

    trained_MLModels = {}
    for dataset in datasets:
        # Load and prepare each dataset once, then reuse it for every model type.
        df = prepare_df(load_df(dataset))
        for model in model_types:
            run_name = f'{model}_{dataset}'
            ml_model = MLModel(model_type=model, df=df, dataset_name=run_name)
            ml_model.train_evaluate()
            trained_MLModels[run_name] = ml_model
    return trained_MLModels

In [None]:
# Train every model type on every default dataset (see train_all). The result
# is a dict keyed by '{model}_{dataset}' and is read again by the results cell.
ALL_MODELS=train_all()

-- Initialized MLModel with model_type='SVM', dataset_name='SVM_GENE_EXPRESSION' --
-- split ratio: 0.3
-- random state: 42
--------------------------------------------------------------------------------
-- Training SVM model on dataset 'svm_gene_expression' --
--------------------------------------------------------------------------------

-- SVM Hyperparameters --
{'C': [0.1, 1, 10], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf']}



-- Best Parameters --
{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


-- best model parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'} --
--------------------------------------------------------------------------------
-- predicting SVM model on dataset 'svm_gene_expression' --
--------------------------------------------------------------------------------
-- Predictions made on test set --
-- Initialized MLModel with model_type='XGBOOST', dataset_name='XGBOOST_GENE_EXPRESSION' --
-- split ratio: 0.3
-- random state: 42
---------

We will still go through each model on the gene expression dataset to show the process of training one model at a time for a single dataset.

## Gene expression dataset:

In [6]:
# Load the gene-expression matrix and prepare it; `df` is reused by the
# XGBoost and MLP cells that follow.
df = prepare_df(
    load_df('gene_expression')
)

# Fit and evaluate an SVM on it; the value returned by train_evaluate() is
# shown as the cell output.
svm_gex = MLModel(
    model_type='svm',
    df=df,
    dataset_name='svm_gene_expression',
)
svm_gex.train_evaluate()

-- Initialized MLModel with model_type='SVM', dataset_name='SVM_GENE_EXPRESSION' --
-- split ratio: 0.3
-- random state: 42
--------------------------------------------------------------------------------
-- Training SVM model on dataset 'svm_gene_expression' --
--------------------------------------------------------------------------------

-- SVM Hyperparameters --
{'C': [0.1, 1, 10], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf']}



-- Best Parameters --
{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


-- best model parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'} --
--------------------------------------------------------------------------------
-- predicting SVM model on dataset 'svm_gene_expression' --
--------------------------------------------------------------------------------
-- Predictions made on test set --


(array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
        0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
        0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1]),
 array([0.8601056 , 0.97352314, 0.96268682, 0.97207285, 0.99998585,
        0.99999743, 0.03188881, 0.94029305, 0.99161448, 0.47347599,
        0.98266576, 0.99999882, 0.01768346, 0.97846755, 0.91158256,
        0.97354565, 0.9775124 , 0.98380876, 0.08846772, 0.15821683,
        0.99600484, 0.91512376, 0.25635128, 0.09353043, 0.98269152,
        0.09087359, 0.96918257, 0.97190346, 0.94390318, 0.10575014,
        0.99654882, 0.97689126, 0.96141841, 0.98037672, 0.29569448,
        0.99999952, 0.67420122, 0.33721608, 0.98773903, 0.13358725,
        0.99193194, 0.96308908, 0.95894933, 0.94894598, 0.96224116,
        0.97730671, 0.99998675, 0.94424105

In [7]:
# Fit and evaluate XGBoost on the gene-expression frame `df` prepared in the
# SVM cell; the value returned by train_evaluate() is shown as the cell output.
xgboost_gex = MLModel(
    model_type='xgboost',
    df=df,
    dataset_name='xgboost_gene_expression',
)
xgboost_gex.train_evaluate()

-- Initialized MLModel with model_type='XGBOOST', dataset_name='XGBOOST_GENE_EXPRESSION' --
-- split ratio: 0.3
-- random state: 42
--------------------------------------------------------------------------------
-- Training XGBOOST model on dataset 'xgboost_gene_expression' --
--------------------------------------------------------------------------------

-- XGBoost Hyperparameters --
{   'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200]}



-- Best Parameters --
{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}


-- best model parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200} --
--------------------------------------------------------------------------------
-- predicting XGBOOST model on dataset 'xgboost_gene_expression' --
--------------------------------------------------------------------------------
-- Predictions made on test set --


(array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
        0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
        1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1]),
 array([0.9508815 , 0.94228107, 0.95510554, 0.92286396, 0.95444006,
        0.91965187, 0.1783064 , 0.95187634, 0.95664155, 0.38541287,
        0.88830286, 0.95405406, 0.20738281, 0.9517222 , 0.940617  ,
        0.9556913 , 0.9558435 , 0.9525777 , 0.6744385 , 0.40748125,
        0.95342803, 0.8916246 , 0.9555806 , 0.32341167, 0.91538525,
        0.31467322, 0.95209074, 0.9546635 , 0.94811803, 0.5652626 ,
        0.9501477 , 0.9157928 , 0.9511113 , 0.9385763 , 0.6873158 ,
        0.9558859 , 0.80047673, 0.24086325, 0.9501561 , 0.6252285 ,
        0.9468857 , 0.94882977, 0.9530658 , 0.9367032 , 0.89716655,
        0.8965393 , 0.9546563 , 0.8950229 

In [8]:
# Fit and evaluate an MLP on the gene-expression frame `df` prepared in the
# SVM cell; the value returned by train_evaluate() is shown as the cell output.
# NOTE(review): the recorded output has every test sample predicted as class 1
# with score 1.0 -- the network looks collapsed; check feature scaling and the
# selected learning_rate_init before trusting these results.
mlp_gex = MLModel(
    model_type='mlp',
    df=df,
    dataset_name='mlp_gene_expression',
)
mlp_gex.train_evaluate()

-- Initialized MLModel with model_type='MLP', dataset_name='MLP_GENE_EXPRESSION' --
-- split ratio: 0.3
-- random state: 42
--------------------------------------------------------------------------------
-- Training MLP model on dataset 'mlp_gene_expression' --
--------------------------------------------------------------------------------

-- MLP Hyperparameters --
{   'activation': ['relu', 'tanh'],
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'solver': ['adam', 'sgd']}



-- Best Parameters --
{   'activation': 'relu',
    'hidden_layer_sizes': (50,),
    'learning_rate_init': 0.1,
    'solver': 'adam'}


-- best model parameters: {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate_init': 0.1, 'solver': 'adam'} --
--------------------------------------------------------------------------------
-- predicting MLP model on dataset 'mlp_gene_expression' --
---------------------------------------------------------

(array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
        0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))

## Preliminary results and analysis

In [None]:
# TODO: possibly replace this cell with a shared helper, e.g.
# results_df=svm_gex.compile_results(ALL_MODELS) # -- suggestion - maybe ask joelle

# Build a dataframe with accuracy and roc_auc as columns and the
# '{model}_{dataset}' keys of ALL_MODELS as index.
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score

metric_rows = []
for ml_model in ALL_MODELS.values():
    y_true = ml_model.y_test
    y_pred = ml_model.y_pred
    metric_rows.append({
        'accuracy': accuracy_score(y_true, y_pred),
        # NOTE(review): ROC AUC is computed from hard class predictions here.
        # It is normally computed from scores/probabilities; if MLModel exposes
        # the predicted probabilities, pass those instead -- confirm attribute name.
        'roc_auc': roc_auc_score(y_true, y_pred),
    })

# Construct the frame once from the collected rows instead of enlarging an
# empty frame row by row; an empty ALL_MODELS still yields the two columns.
results_df = pd.DataFrame(
    metric_rows,
    index=list(ALL_MODELS),
    columns=['accuracy', 'roc_auc'],
)

Testing the embedding folder versions v2.10 and v2.11.

In [5]:
# Display the RGCN protein embeddings loaded from folder version 'v2.10'.
# NOTE(review): only v2.10 is loaded here although the note above also names
# v2.11, and the displayed emb_* values are on the order of 1e6-1e8 -- confirm
# that this scale is expected before feeding these features to the models.
load_df('RGCN_protein_embeddings',folder_version='v2.10')

Unnamed: 0,label,hasAge,hasGender,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,...,emb_91,emb_92,emb_93,emb_94,emb_95,emb_96,emb_97,emb_98,emb_99,disease_status
0,Sample_GSM1317896,42,F,1.041564e+08,-1.856555e+07,-7.171535e+07,5.491731e+06,-4.688113e+07,-2.219227e+07,-8.172785e+07,...,1.413883e+07,2.491150e+07,5.089528e+07,-1.249001e+06,-1.710872e+07,-2.895010e+07,7.386023e+07,-1.465597e+07,1.211688e+08,0
1,Sample_GSM1317897,40,F,1.031635e+08,-1.825880e+07,-7.059662e+07,5.366919e+06,-4.653230e+07,-2.225576e+07,-8.143072e+07,...,1.363377e+07,2.455231e+07,5.155139e+07,-9.796111e+05,-1.663286e+07,-2.913977e+07,7.317417e+07,-1.402071e+07,1.208207e+08,0
2,Sample_GSM1317898,66,M,1.047366e+08,-1.869426e+07,-7.171226e+07,5.661578e+06,-4.684151e+07,-2.189953e+07,-8.179352e+07,...,1.417070e+07,2.504259e+07,5.123181e+07,-1.810621e+06,-1.724271e+07,-2.924297e+07,7.386671e+07,-1.497433e+07,1.217964e+08,0
3,Sample_GSM1317899,24,M,1.034176e+08,-1.831647e+07,-7.098888e+07,5.409559e+06,-4.638779e+07,-2.182975e+07,-8.111944e+07,...,1.398713e+07,2.467164e+07,5.069539e+07,-1.382218e+06,-1.679584e+07,-2.884257e+07,7.309873e+07,-1.420960e+07,1.203769e+08,0
4,Sample_GSM1317900,70,F,1.058105e+08,-1.897021e+07,-7.234077e+07,5.964116e+06,-4.710452e+07,-2.181876e+07,-8.238839e+07,...,1.447247e+07,2.537208e+07,5.138284e+07,-1.925043e+06,-1.762980e+07,-2.949406e+07,7.455681e+07,-1.557661e+07,1.227097e+08,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,Sample_GSM1318054,24,M,1.076453e+08,-1.946107e+07,-7.356761e+07,6.299761e+06,-4.769971e+07,-2.169028e+07,-8.321721e+07,...,1.496724e+07,2.596743e+07,5.154868e+07,-2.545830e+06,-1.830986e+07,-2.980980e+07,7.561600e+07,-1.664725e+07,1.242212e+08,0
159,Sample_GSM1318055,25,F,1.063361e+08,-1.909217e+07,-7.285957e+07,5.913768e+06,-4.731963e+07,-2.169396e+07,-8.249121e+07,...,1.468669e+07,2.561875e+07,5.109485e+07,-2.257103e+06,-1.780786e+07,-2.946089e+07,7.479548e+07,-1.586912e+07,1.229256e+08,0
160,Sample_GSM1318056,24,F,1.068225e+08,-1.920245e+07,-7.298084e+07,6.140278e+06,-4.731799e+07,-2.151655e+07,-8.265454e+07,...,1.480805e+07,2.572119e+07,5.128251e+07,-2.430390e+06,-1.797269e+07,-2.964812e+07,7.497479e+07,-1.613756e+07,1.233735e+08,0
161,Sample_GSM1318057,27,F,1.065718e+08,-1.918149e+07,-7.284724e+07,6.108736e+06,-4.730587e+07,-2.168676e+07,-8.267803e+07,...,1.474727e+07,2.563000e+07,5.138622e+07,-2.250409e+06,-1.790370e+07,-2.957862e+07,7.495265e+07,-1.602392e+07,1.232748e+08,0
