This notebook aims to train all models on all datasets.

In [None]:
import os
os.chdir(os.path.dirname(os.getcwd()))

from load_matrix import load_df
from training_utils import MLModel

def prepare_df(df):
    """Index a loaded frame by its ``'label'`` column and keep only the feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'label'`` column.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by ``'label'`` with the first three remaining
        columns dropped.
    """
    indexed = df.set_index('label')
    # Skip the first three non-label columns.
    # NOTE(review): presumably metadata rather than features - confirm against load_df.
    return indexed.iloc[:, 3:]

Experimenting with different constants and parameters for the models

In [3]:
# Set MLModel's 'DEFAULT_SPLIT_RATIO' global to 0.3 for the models created below.
# NOTE(review): presumably the held-out (test) fraction - confirm in training_utils.
MLModel.set_global_variable('DEFAULT_SPLIT_RATIO', 0.3)

## Generalized training across all datasets and models

In [4]:
from joblib import Parallel, delayed,dump
import os

dump_dir='dump/'

def train_one(dataset, model):
    """Train and evaluate one model type on one dataset, then dump it to disk.

    Parameters
    ----------
    dataset : str
        Dataset name handed to ``load_df``.
    model : str
        Model type handed to ``MLModel`` as ``model_type``.

    Returns
    -------
    tuple of (str, str)
        ``(run_name, joblib_path)`` where ``run_name`` is ``'<model>_<dataset>'``
        and ``joblib_path`` is the file the trained ``MLModel`` was written to.
        Returning a pair lets callers collect the results with ``dict(...)``.
    """
    pid = os.getpid()
    run_name = f'{model}_{dataset}'
    joblib_path = os.path.join(dump_dir, f'{run_name}_MLmodel.joblib')

    # Create the output directory before training so a missing folder cannot
    # make dump() fail after the work is already done. exist_ok=True keeps
    # this safe when several workers reach this line at the same time.
    os.makedirs(dump_dir, exist_ok=True)

    print(f"[PID {pid}] Training model: {model} on dataset: {dataset}")
    df = prepare_df(load_df(dataset))
    ml_model = MLModel(model_type=model, df=df, dataset_name=run_name)
    ml_model.train_evaluate()

    dump(ml_model, joblib_path)
    print(f'[PID {pid}] Model saved to {joblib_path}')

    return run_name, joblib_path

def train_all(datasets:list=['gene_expression', 'RGCN_sample_embeddings', 'Complex_sample_embeddings', 'concatenated_sample_embeddings', 'RGCN_protein_embeddings', 'Complex_protein_embeddings', 'concatenated_protein_embeddings'],model_types=MLModel.MODELS, n_jobs=8):
    """Run ``train_one`` for every (dataset, model) combination in parallel.

    Parameters
    ----------
    datasets : list
        Dataset names to train on. The default list is only read, never mutated.
    model_types : iterable
        Model types to train; defaults to ``MLModel.MODELS``.
    n_jobs : int
        Number of joblib workers (previously hard-coded to 8).

    Returns
    -------
    dict
        Built from the ``(key, value)`` pairs returned by the jobs. Jobs that
        returned ``None`` are left out, so the dict may be empty.
    """
    jobs = [(dataset, model) for dataset in datasets for model in model_types]
    results = Parallel(n_jobs=n_jobs)(
        delayed(train_one)(dataset, model)
        for dataset, model in jobs
    )
    # dict() needs (key, value) pairs; a None result would raise TypeError
    # after all the training has already finished, so skip those entries.
    return dict(result for result in results if result is not None)


In [None]:
# First group of datasets: defined here but not passed to train_all in this cell.
l1 = [
    'RGCN_sample_embeddings',
    'Complex_sample_embeddings',
    'RGCN_protein_embeddings',
    'Complex_protein_embeddings',
]
# Second group of datasets: the ones trained by this cell.
l2 = [
    'gene_expression',
    'concatenated_sample_embeddings',
    'concatenated_protein_embeddings',
]
train_all(datasets=l2)

Models that did not work:

- svm on RGCN_protein_embeddings

In [20]:
# Load the RGCN protein embeddings and reduce them to the label-indexed feature frame.
df = prepare_df(load_df('RGCN_protein_embeddings'))

In [21]:
# Manual retry of the combination listed above as not working (svm on
# RGCN_protein_embeddings), run outside the parallel pool.
# dataset_name carries the '<model>_' prefix so the saved object is named the
# same way as the models produced by train_one.
model = MLModel(model_type='svm', df=df, dataset_name='svm_RGCN_protein_embeddings')
model.train_evaluate()
dump(model, os.path.join(dump_dir, 'svm_RGCN_protein_embeddings_MLmodel.joblib'))

-- Initialized MLModel with model_type='SVM', dataset_name='RGCN_PROTEIN_EMBEDDINGS' --
-- split ratio: 0.3
-- random state: 42
--------------------------------------------------------------------------------
-- Training SVM model on dataset 'RGCN_protein_embeddings' --
--------------------------------------------------------------------------------

-- SVM Hyperparameters --
{'C': [0.1, 1, 10], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf']}




KeyboardInterrupt: 

In [13]:
import joblib
# Reload one dumped model to check that a saved file can be read back.
# NOTE(review): joblib.load is pickle-based and can execute code from the file -
# only load dumps produced by this project.
model_from_file_test=joblib.load('dump/svm_Complex_sample_embeddings_MLmodel.joblib')

In [None]:
# Load every dumped MLModel found in dump_dir into ALL_MODELS, keyed by the
# file name without its '_MLmodel.joblib' suffix (i.e. '<model>_<dataset>').
MODEL_SUFFIX = '_MLmodel.joblib'
ALL_MODELS = {}
# os.listdir returns entries in arbitrary order; sorting keeps the dict (and
# any table built from it) in the same order on every run and machine.
for joblib_file in sorted(os.listdir(dump_dir)):
    if not joblib_file.endswith(MODEL_SUFFIX):
        continue
    model_path = os.path.join(dump_dir, joblib_file)
    print(f'-- loading model from {model_path}')
    # NOTE(review): joblib.load is pickle-based - only load files this project wrote.
    model_loaded = joblib.load(model_path)
    print(f'-- model {model_loaded.dataset_name} loaded')
    # Slice the suffix off the end; str.replace would also remove the same
    # text if it appeared elsewhere in the file name.
    ALL_MODELS[joblib_file[:-len(MODEL_SUFFIX)]] = model_loaded

-- oading model from dump/mlp_Complex_protein_embeddings_MLmodel.joblib
-- model mlp_Complex_protein_embeddings loaded
-- oading model from dump/mlp_Complex_sample_embeddings_MLmodel.joblib
-- model mlp_Complex_sample_embeddings loaded
-- oading model from dump/mlp_RGCN_protein_embeddings_MLmodel.joblib
-- model mlp_RGCN_protein_embeddings loaded
-- oading model from dump/mlp_RGCN_sample_embeddings_MLmodel.joblib
-- model mlp_RGCN_sample_embeddings loaded
-- oading model from dump/svm_Complex_protein_embeddings_MLmodel.joblib
-- model svm_Complex_protein_embeddings loaded
-- oading model from dump/svm_Complex_sample_embeddings_MLmodel.joblib
-- model svm_Complex_sample_embeddings loaded
-- oading model from dump/svm_RGCN_sample_embeddings_MLmodel.joblib
-- model svm_RGCN_sample_embeddings loaded
-- oading model from dump/xgboost_Complex_protein_embeddings_MLmodel.joblib
-- model xgboost_Complex_protein_embeddings loaded
-- oading model from dump/xgboost_Complex_sample_embeddings_MLmode

In [19]:
# results_df=svm_gex.compile_results(ALL_MODELS) # -- suggestion - maybe ask joelle

# -- dataframe that has accuracy and roc_auc as columns and model+dataset as index
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score
# Collect one [accuracy, roc_auc] record per loaded model, then build the
# summary frame once instead of enlarging it row by row through .loc.
metric_columns = ['accuracy', 'roc_auc']
metric_records = {}

for model_name, ml_model in ALL_MODELS.items():
    y_true = ml_model.y_test
    y_pred = ml_model.y_pred
    accuracy = accuracy_score(y_true, y_pred)
    # NOTE(review): roc_auc_score is fed y_pred here. If y_pred holds hard class
    # labels (TODO confirm), this is not a ranking-based AUC; pass probabilities
    # or decision scores instead if MLModel exposes them.
    roc_auc = roc_auc_score(y_true, y_pred)
    metric_records[model_name] = [accuracy, roc_auc]

# orient='index' makes each dict key a row label; columns names the list entries.
results_df = pd.DataFrame.from_dict(metric_records, orient='index', columns=metric_columns)

results_df

Unnamed: 0,accuracy,roc_auc
mlp_Complex_protein_embeddings,0.939394,0.888889
mlp_Complex_sample_embeddings,0.818182,0.666667
mlp_RGCN_protein_embeddings,0.727273,0.5
mlp_RGCN_sample_embeddings,0.727273,0.5
svm_Complex_protein_embeddings,0.909091,0.833333
svm_Complex_sample_embeddings,0.727273,0.5
svm_RGCN_sample_embeddings,0.727273,0.5
xgboost_Complex_protein_embeddings,0.969697,0.944444
xgboost_Complex_sample_embeddings,0.818182,0.666667
xgboost_RGCN_protein_embeddings,0.878788,0.8125
