In [1]:
import sys,os
sys.path.append(os.path.abspath('../../src/ml'))

from load_matrix import load_df
from model_trainer import MLModel

## Documentation

In [2]:
# Print the MLModel class docstring (constructor signature and parameters).
help(MLModel)

Help on class MLModel in module model_trainer:

class MLModel(builtins.object)
 |  MLModel(
 |      df,
 |      model_type,
 |      dataset_name,
 |      version='2.10',
 |      normalization='robust',
 |      hyperparameters=None,
 |      split_ratio=None,
 |      random_state=None,
 |      kfold=None,
 |      save_model=None
 |  )
 |
 |  Machine learning utility class for training and evaluating models with
 |  automatic hyperparameter tuning via GridSearchCV
 |
 |  Parameters
 |  ----------
 |  df : pandas.DataFrame
 |      Input dataframe containing features and a `disease_status` column.
 |  model_type : str
 |      Type of model: {'svm', 'xgboost', 'mlp', 'all'}.
 |  dataset_name : str
 |      Name of the dataset (used for saving models)
 |  version : str, optional
 |      Version identifier for the dataset of the format 'v2.x'
 |  normalization : str, optional
 |      nromalization type to perform on gene exp data when creating teh protein embeddings (saved for logging the steps

In [3]:
# Print the load_df docstring, including the accepted `key` values.
help(load_df)

Help on function load_df in module load_matrix:

load_df(key: str, folder_version: str = 'v2.9', normalization: str = 'robust') -> pandas.core.frame.DataFrame
    Generic interface to load expression data or Knowledge Graph embeddings.

    Parameters
    ----------
    key : str
        Type of data to load:

        - 'gene_expression' : Gene expression matrix (5000 probes) with sample metadata
        - 'RGCN_sample_embeddings' : Sample embeddings from RGCN model (128 dims)
        - 'Complex_sample_embeddings' : Sample embeddings from ComplEx model (128 dims)
        - 'concatenated_sample_embeddings' : Both RGCN + ComplEx concatenated (256 dims)
        - 'RGCN_protein_embeddings' : Protein embeddings from RGCN, weighted by gene expression
        - 'Complex_protein_embeddings' : Protein embeddings from ComplEx, weighted by gene expression
        - 'concatenated_protein_embeddings' : Both RGCN + ComplEx protein embeddings (256 dims)

        GNN models from results/embeddings/ (1

In [5]:
# -- Load the 'weighted_RGCN_protein_embeddings' matrix with min-max
# -- normalization from folder version v2.11. `df` is the input passed to the
# -- MLModel training examples further down.
df = load_df(
    'weighted_RGCN_protein_embeddings',
    folder_version='v2.11',
    normalization='minmax',
)
df.head()

Unnamed: 0_level_0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_91,emb_92,emb_93,emb_94,emb_95,emb_96,emb_97,emb_98,emb_99,disease_status
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sample_GSM1317896,-0.02983,0.008707,-0.018898,-0.000626,0.000153,0.138497,0.036395,-0.044243,-0.077607,0.003112,...,-0.004067,0.011081,0.027488,0.015967,0.056857,-0.056766,0.06086,-0.057555,0.004139,0
Sample_GSM1317897,-0.028333,0.009941,-0.01747,-0.002338,-0.000275,0.136319,0.034355,-0.047815,-0.073828,0.003645,...,-0.004305,0.009518,0.028369,0.016272,0.057786,-0.056868,0.059604,-0.058087,0.004715,0
Sample_GSM1317898,-0.028327,0.008403,-0.017324,-0.001915,0.001199,0.134352,0.034921,-0.046322,-0.072257,0.006336,...,-0.000583,0.011752,0.027627,0.016888,0.061091,-0.058142,0.059826,-0.058576,0.007605,0
Sample_GSM1317899,-0.02949,0.008151,-0.018023,-0.001672,0.0029,0.134482,0.035343,-0.047886,-0.072153,0.004762,...,0.000411,0.010267,0.028333,0.017289,0.058419,-0.05733,0.059888,-0.057331,0.007183,0
Sample_GSM1317900,-0.029308,0.008142,-0.019081,-0.001448,-0.001712,0.136601,0.03396,-0.045991,-0.080931,0.004638,...,-0.003442,0.009692,0.026978,0.017358,0.057294,-0.058938,0.060991,-0.056957,0.003391,0


In [4]:
# NOTE(review): repeats the earlier help(load_df) cell — consider keeping only one.
help(load_df)

Help on function load_df in module load_matrix:

load_df(key: str, folder_version: str = 'v2.9', normalization: str = 'robust') -> pandas.core.frame.DataFrame
    Generic interface to load expression data or Knowledge Graph embeddings.

    Parameters
    ----------
    key : str
        Type of data to load:

        - 'gene_expression' : Gene expression matrix (5000 probes) with sample metadata
        - 'RGCN_sample_embeddings' : Sample embeddings from RGCN model (128 dims)
        - 'Complex_sample_embeddings' : Sample embeddings from ComplEx model (128 dims)
        - 'concatenated_sample_embeddings' : Both RGCN + ComplEx concatenated (256 dims)
        - 'RGCN_protein_embeddings' : Protein embeddings from RGCN, weighted by gene expression
        - 'Complex_protein_embeddings' : Protein embeddings from ComplEx, weighted by gene expression
        - 'concatenated_protein_embeddings' : Both RGCN + ComplEx protein embeddings (256 dims)

    folder_version : str, optional
        Ver

## some global variables

In [5]:
# Model types exposed on the MLModel class.
"models available to test:", MLModel.AVAILABLE_MODELS

('models available to test:',
 {'pytorch_mlp', 'random_forest', 'sklearn_mlp', 'svm', 'xgboost'})

In [6]:
# Class-level default for the train/test split ratio.
"split ratio used by default:", MLModel.DEFAULT_SPLIT_RATIO

('split ratio used by default:', 0.2)

In [7]:
# Class-level default random seed.
"random state:", MLModel.DEFAULT_RANDOM_STATE

('random state:', 42)

To change any of the global variables, use the setter shown below.

In [8]:
# -- Override a class-level default through the setter. The value is changed on
# -- MLModel itself, so it applies to all subsequent trainings that rely on it.
MLModel.set_global_variable('DEFAULT_SPLIT_RATIO',0.4)
"changed split ratio:", MLModel.DEFAULT_SPLIT_RATIO

('changed split ratio:', 0.4)

This step is important to perform on a global scale: if we want to change the split ratio or the k-fold setting, we will most likely want that change reflected across all models and datasets trained. This way we ensure the consistency and reproducibility of our experiments.

As for individual model training, we can override these global variables by passing them as parameters to the `MLModel` class during instantiation without affecting the global settings.

list of global variables:

In [9]:
# Show every class-level setting, including the default hyperparameter grids.
MLModel.get_global_variables()

{'CACHE_DIR': '.cache/',
 'SYSOUT_FILE': None,
 'DEFAULT_SAVE': False,
 'DEFAULT_LOGGING': False,
 'DEFAULT_KFOLD': 3,
 'DEFAULT_SPLIT_RATIO': 0.4,
 'DEFAULT_RANDOM_STATE': 42,
 'DEFAULT_SCORING': 'accuracy',
 'SVM_HYPERPARAMS': {'C': [0.1, 1],
  'kernel': ['linear', 'rbf'],
  'gamma': ['scale', 'auto'],
  'class_weight': ['balanced', {0: 2, 1: 1}, {0: 3, 1: 1}]},
 'XGBOOST_HYPERPARAMS': {'n_estimators': [50, 100, 200],
  'max_depth': [3, 5, 7],
  'learning_rate': [0.01, 0.1, 0.2],
  'subsample': [0.6, 0.8, 1.0],
  'scale_pos_weight': [0.28346456692913385, 0.4, 0.2]},
 'PYTORCH_MLP_HYPERPARAMS': {'hidden_layer_sizes': [(50,), (100,)],
  'max_iter': [40],
  'activation': ['relu'],
  'solver': ['adam', 'adamW'],
  'learning_rate_init': [0.001, 0.0001],
  'batch_size': [16],
  'dropout_rate': [0.0, 0.1]},
 'SKLEARN_MLP_HYPERPARAMS': {'hidden_layer_sizes': [(50,), (100,), (100, 50)],
  'activation': ['relu'],
  'solver': ['adam', 'sgd'],
  'learning_rate_init': [0.001, 0.01, 0.1]}}

In [10]:
# The default SVM hyperparameter grid on its own.
MLModel.SVM_HYPERPARAMS

{'C': [0.1, 1],
 'kernel': ['linear', 'rbf'],
 'gamma': ['scale', 'auto'],
 'class_weight': ['balanced', {0: 2, 1: 1}, {0: 3, 1: 1}]}

# training example

First example of training: training and testing an SVM on the gene expression dataset.

In [6]:
# -- Minimal parameters: version and normalization fall back to the defaults
# -- ('2.10' / 'robust'), which also end up in the SYSOUT_FILE name.
# NOTE(review): `df` was loaded with normalization='minmax', folder_version='v2.11'
# from the 'weighted_RGCN_protein_embeddings' key, so the dataset_name / version /
# normalization recorded here do not describe it — confirm which side is intended.
# NOTE(review): the recorded output shows split ratio 0.2 although an earlier cell
# sets DEFAULT_SPLIT_RATIO to 0.4; the cells appear to have been run out of order —
# re-run top to bottom to confirm.
svm_gex=MLModel(model_type='svm', df=df,dataset_name='svm_gene_expression')

-- [svm_svm_gene_expression] Initialized MLModel with model_type='SVM', dataset_name='SVM_GENE_EXPRESSION' --
-- [svm_svm_gene_expression] split ratio: 0.2
-- [svm_svm_gene_expression] random state: 42
-- [svm_svm_gene_expression] CACHE_DIR is: .cache/ --
-- [svm_svm_gene_expression] setting SYSOUT_FILE to: svm_svm_gene_expression_2.10_robust_training_utils.log --


In [7]:
# -- str() view: tree-style summary of the model settings
print(svm_gex)
# -- repr() view: displayed because it is the cell's last expression
svm_gex

MLModel
├─ normalization: robust
├─ model_type: svm
├─ dataset_name: svm_gene_expression
├─ version: 2.10
├─ split_ratio: 0.2
├─ random_state: 42
├─ best_model:
    None
└─ save_model: False


MLModel(
    model_type=svm,
    dataset_name=svm_gene_expression,
    version=2.10,
    normalization=robust,
    split_ratio=0.2,
    kfold=3,
    random_state=42,
    best_model=None
    sysout_file=svm_svm_gene_expression_2.10_robust_training_utils.log,
    cache_dir=.cache/,
    logging=False,
    save_model=False,
)

In [8]:
# Train (hyperparameter search) and predict on the test set in a single call;
# the call returns the three values unpacked here.
y_test,y_pred,y_proba = svm_gex.train_evaluate()

--------------------------------------------------------------------------------
-- [svm_svm_gene_expression] Training SVM model on dataset 'svm_gene_expression' --
--------------------------------------------------------------------------------

-- SVM Hyperparameters --
  C: [0.1, 1]
  kernel: ['linear', 'rbf']
  gamma: ['scale', 'auto']
  class_weight: ['balanced', {0: 2, 1: 1}, {0: 3, 1: 1}]





-- Best Parameters --
  C: 0.1
  class_weight:
    0: 2
    1: 1
  gamma: scale
  kernel: linear


-- [svm_svm_gene_expression] best model parameters: {'C': 0.1, 'class_weight': {0: 2, 1: 1}, 'gamma': 'scale', 'kernel': 'linear'} --
--------------------------------------------------------------------------------
-- [svm_svm_gene_expression] predicting SVM model on dataset 'svm_gene_expression' --
--------------------------------------------------------------------------------
-- Predictions made on test set --


In [9]:
# Same str() and repr() views after training: best_model is now populated.
print(svm_gex)
svm_gex

MLModel
├─ normalization: robust
├─ model_type: svm
├─ dataset_name: svm_gene_expression
├─ version: 2.10
├─ split_ratio: 0.2
├─ random_state: 42
├─ best_model:
    SVC(
      ├─ C: 0.1
      ├─ break_ties: False
      ├─ cache_size: 200
      ├─ class_weight: {0: 2, 1: 1}
      ├─ coef0: 0.0
      ├─ decision_function_shape: ovr
      ├─ degree: 3
      ├─ gamma: scale
      ├─ kernel: linear
      ├─ max_iter: 10000
      ├─ probability: True
      ├─ random_state: 42
      ├─ shrinking: True
      ├─ tol: 0.001
      ├─ verbose: False
    )
└─ save_model: False


MLModel(
    model_type=svm,
    dataset_name=svm_gene_expression,
    version=2.10,
    normalization=robust,
    split_ratio=0.2,
    kfold=3,
    random_state=42,
    best_model=SVC(C=0.1, class_weight={0: 2, 1: 1}, kernel='linear', max_iter=10000,
    probability=True, random_state=42)
    sysout_file=svm_svm_gene_expression_2.10_robust_training_utils.log,
    cache_dir=.cache/,
    logging=False,
    save_model=False,
)

Second example, showing the options we are able to set when training.

In [15]:
# -- Per-instance overrides: a custom split ratio, random seed and
# -- hyperparameter grid are passed straight to the constructor.
svm_gex_2 = MLModel(
    df=df,
    model_type='svm',
    dataset_name='svm_gene_expression_2',
    hyperparameters={'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    split_ratio=0.3,
    random_state=123,
)

# -- Training and evaluation run as two separate steps here.
svm_gex_2.train()
svm_gex_2.evaluate()

print(svm_gex_2)

-- [svm_svm_gene_expression_2] Initialized MLModel with model_type='SVM', dataset_name='SVM_GENE_EXPRESSION_2' --
-- [svm_svm_gene_expression_2] split ratio: 0.3
-- [svm_svm_gene_expression_2] random state: 123
-- [svm_svm_gene_expression_2] CACHE_DIR is: .cache/ --
--------------------------------------------------------------------------------
-- [svm_svm_gene_expression_2] Training SVM model on dataset 'svm_gene_expression_2' --
--------------------------------------------------------------------------------

-- SVM Hyperparameters --
  C: [0.1, 1, 10]
  kernel: ['linear', 'rbf']



-- Best Parameters --
  C: 0.1
  kernel: linear


-- [svm_svm_gene_expression_2] best model parameters: {'C': 0.1, 'kernel': 'linear'} --
--------------------------------------------------------------------------------
-- [svm_svm_gene_expression_2] predicting SVM model on dataset 'svm_gene_expression_2' --
--------------------------------------------------------------------------------
-- Predictions ma

How to access the results after training (everything is available from the `MLModel` object):

In [16]:
# Grid-search object kept on the model after training
# (estimator, param grid, scoring, cv, ...).
svm_gex.grid_search_model

0,1,2
,estimator,SVC(max_iter=...ndom_state=42)
,param_grid,"{'C': [0.1, 1], 'class_weight': ['balanced', {0: 2, 1: 1}, ...], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf']}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,C,0.1
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [17]:
# Best estimator selected for the second model.
svm_gex_2.best_model

0,1,2
,C,0.1
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [18]:
# Confirm that the test labels, predictions and probabilities are stored on
# the trained model object.
for attr_name in ('y_test', 'y_pred', 'y_proba'):
    if getattr(svm_gex, attr_name) is not None:
        print(f'{attr_name} attribute exists')

y_test attribute exists
y_pred attribute exists
y_proba attribute exists


In [19]:
# Load the ComplEx sample embeddings. No folder_version / normalization is passed,
# so the defaults in load_df's signature ('v2.9', 'robust') apply.
# NOTE(review): the other load in this notebook uses folder_version='v2.11' —
# confirm that mixing versions is intended.
df_sample=load_df('Complex_sample_embeddings')

 Complex embeddings detected in /mnt/c/Users/rayan/Documents/saclay/courses/deep-learning-and-artifical-intelligence/project/2526-m2geniomhe-GNN-sepsis/models/executions/GSE54514_enriched_ontology_degfilterv2.9/outputmodel_Complex_entity_embeddings.npy, splitting real+imag.
