In [1]:
import os
import sys

# Put ../src/ml on the import path so the two local modules below resolve
# (relative to the notebook's working directory).
sys.path.append(os.path.abspath('../src/ml'))

from load_matrix import load_df
from model_trainer import MLModel

## doc

In [2]:
# Print the class docstring and constructor signature for reference
help(MLModel)

Help on class MLModel in module model_trainer:

class MLModel(builtins.object)
 |  MLModel(
 |      df,
 |      model_type,
 |      dataset_name,
 |      version='2.10',
 |      hyperparameters=None,
 |      split_ratio=None,
 |      random_state=None,
 |      kfold=None,
 |      save_model=None
 |  )
 |
 |  Machine learning utility class for training and evaluating models with
 |  automatic hyperparameter tuning via GridSearchCV
 |
 |  Parameters
 |  ----------
 |  df : pandas.DataFrame
 |      Input dataframe containing features and a `disease_status` column.
 |  model_type : str
 |      Type of model: {'svm', 'xgboost', 'mlp', 'all'}.
 |  dataset_name : str
 |      Name of the dataset (used for saving models)
 |  version : str, optional
 |      Version identifier for the dataset of the format 'v2.x'
 |  hyperparameters : dict, optional
 |      Custom hyperparameters for the selected model.
 |  split_ratio : float, optional
 |      Fraction of data to use for testing (default is clas

In [3]:
# -- prep gene expression: load the matrix and preview the first rows
df=load_df('gene_expression')
df.head()

Unnamed: 0_level_0,ILMN_2055271,ILMN_2383229,ILMN_1806310,ILMN_1653355,ILMN_2359168,ILMN_1735045,ILMN_1680754,ILMN_1755321,ILMN_1698554,ILMN_1814092,...,ILMN_1685547,ILMN_2348512,ILMN_1743643,ILMN_1656676,ILMN_2371169,ILMN_1701875,ILMN_1786396,ILMN_1653618,ILMN_2137536,disease_status
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sample_GSM1317896,7.478478,7.73289,7.346,7.541464,7.080655,7.190928,7.043338,7.282018,7.467964,7.253072,...,7.085921,7.725375,7.403544,9.052076,10.826351,11.510887,7.748866,7.265069,7.52005,0
Sample_GSM1317897,7.194329,7.091616,7.127377,7.13905,7.076957,7.101918,7.574589,7.151289,7.164247,7.159713,...,7.336513,7.182846,7.689322,10.352952,11.409938,12.015053,7.99669,7.173827,7.24176,0
Sample_GSM1317898,7.118431,7.008995,7.147905,7.251543,7.031998,7.064355,7.070183,7.30601,7.2779,7.067508,...,7.122465,7.274913,7.49892,9.43999,10.910384,11.717704,8.072896,7.261295,7.426808,0
Sample_GSM1317899,7.28612,7.12294,7.106208,7.241961,7.050389,7.096496,7.110168,7.278481,7.289622,7.22406,...,7.104158,7.215575,7.570657,9.394814,10.858485,12.1528,7.79662,7.399535,7.567951,0
Sample_GSM1317900,7.225378,7.035687,7.268438,7.289307,7.099056,7.138676,7.375244,7.279981,7.543007,7.231372,...,7.223078,7.218124,7.576488,9.480348,11.233749,12.124186,7.987269,7.235961,7.218149,0


## some global variables

In [4]:
# Model identifiers the class exposes.
# NOTE(review): the class docstring lists {'svm', 'xgboost', 'mlp', 'all'} for model_type, which does not match this set — confirm which is current
"models available to test:", MLModel.AVAILABLE_MODELS

('models available to test:',
 {'pytorch_mlp', 'random_forest', 'sklearn_mlp', 'svm', 'xgboost'})

In [5]:
# Class-level default for the test fraction; the constructor also accepts a per-instance split_ratio
"split ratio used by default:", MLModel.DEFAULT_SPLIT_RATIO

('split ratio used by default:', 0.2)

In [6]:
# Class-level default seed; the constructor also accepts a per-instance random_state
"random state:", MLModel.DEFAULT_RANDOM_STATE

('random state:', 42)

To change any of the global variables, use this setter:

In [4]:
# -- to change any of the global variables, use this setter (changes it for all the trainings)
# NOTE: this mutates class-level state, so models created later in this kernel use 0.4 unless split_ratio is passed explicitly
MLModel.set_global_variable('DEFAULT_SPLIT_RATIO',0.4)
"changed split ratio:", MLModel.DEFAULT_SPLIT_RATIO

('changed split ratio:', 0.4)

This step is important to perform on a global scale: if we want to change the split ratio or the k-fold setting, we would most likely want the change to be reflected across all models and datasets trained. This way we ensure consistency and reproducibility of our experiments.

As for individual model training, we can override these global variables by passing them as parameters to the `MLModel` class during instantiation without affecting the global settings.

list of global variables:

In [8]:
# Dump every class-level default, including the per-model hyperparameter grids.
# NOTE(review): 'random_forest' is listed in AVAILABLE_MODELS but no random-forest grid appears in this output — confirm
MLModel.get_global_variables()

{'CACHE_DIR': '.cache/',
 'SYSOUT_FILE': None,
 'DEFAULT_SAVE': False,
 'DEFAULT_LOGGING': False,
 'DEFAULT_KFOLD': 3,
 'DEFAULT_SPLIT_RATIO': 0.4,
 'DEFAULT_RANDOM_STATE': 42,
 'DEFAULT_SCORING': 'accuracy',
 'SVM_HYPERPARAMS': {'C': [0.1, 1],
  'kernel': ['linear', 'rbf'],
  'gamma': ['scale', 'auto'],
  'class_weight': ['balanced', {0: 2, 1: 1}, {0: 3, 1: 1}]},
 'XGBOOST_HYPERPARAMS': {'n_estimators': [50, 100, 200],
  'max_depth': [3, 5, 7],
  'learning_rate': [0.01, 0.1, 0.2],
  'subsample': [0.6, 0.8, 1.0],
  'scale_pos_weight': [0.28346456692913385, 0.4, 0.2]},
 'PYTORCH_MLP_HYPERPARAMS': {'hidden_layer_sizes': [(50,), (100,), (100, 50)],
  'activation': ['relu', 'leaky_relu'],
  'solver': ['adam', 'sgd'],
  'learning_rate_init': [0.001, 0.01, 0.1],
  'batch_size': 16},
 'SKLEARN_MLP_HYPERPARAMS': {'hidden_layer_sizes': [(50,), (100,), (100, 50)],
  'activation': ['relu', 'tanh'],
  'solver': ['adam', 'sgd'],
  'learning_rate_init': [0.001, 0.01, 0.1]}}

In [9]:
# The default SVM grid is also readable directly as a class attribute
MLModel.SVM_HYPERPARAMS

{'C': [0.1, 1],
 'kernel': ['linear', 'rbf'],
 'gamma': ['scale', 'auto'],
 'class_weight': ['balanced', {0: 2, 1: 1}, {0: 3, 1: 1}]}

# training example

First example: training and testing an SVM on the gene expression dataset.

In [5]:
# -- minimal parameters; version falls back to its default ('2.10')
svm_gex = MLModel(
    df=df,
    model_type='svm',
    dataset_name='svm_gene_expression',
)

-- [svm_svm_gene_expression] Initialized MLModel with model_type='SVM', dataset_name='SVM_GENE_EXPRESSION' --
-- [svm_svm_gene_expression] split ratio: 0.4
-- [svm_svm_gene_expression] random state: 42
-- [svm_svm_gene_expression] CACHE_DIR is: .cache/ --
-- [svm_svm_gene_expression] setting SYSOUT_FILE to: svm_svm_gene_expression_2.10_training_utils.log --


In [9]:
# -- str representation (tree view)
print(svm_gex)
# -- repr representation
svm_gex

MLModel
├─ model_type: svm
├─ dataset_name: svm_gene_expression
├─ version: 2.10
├─ split_ratio: 0.4
├─ random_state: 42
├─ best_model:
    None
└─ save_model: False


MLModel(
    model_type=svm,
    dataset_name=svm_gene_expression,
    version=2.10,
    split_ratio=0.4,
    kfold=3,
    random_state=42,
    best_model=None
    sysout_file=svm_svm_gene_expression_2.10_training_utils.log,
    cache_dir=.cache/,
    logging=False,
    save_model=False,
)

In [10]:
# -- run the hyperparameter search and predict on the test split in one call
y_test,y_pred,y_proba = svm_gex.train_evaluate()

--------------------------------------------------------------------------------
-- [svm_svm_gene_expression] Training SVM model on dataset 'svm_gene_expression' --
--------------------------------------------------------------------------------

-- SVM Hyperparameters --
  C: [0.1, 1]
  kernel: ['linear', 'rbf']
  gamma: ['scale', 'auto']
  class_weight: ['balanced', {0: 2, 1: 1}, {0: 3, 1: 1}]





-- Best Parameters --
  C: 0.1
  class_weight: balanced
  gamma: scale
  kernel: linear


-- [svm_svm_gene_expression] best model parameters: {'C': 0.1, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear'} --
--------------------------------------------------------------------------------
-- [svm_svm_gene_expression] predicting SVM model on dataset 'svm_gene_expression' --
--------------------------------------------------------------------------------
-- Predictions made on test set --


In [12]:
# -- same str / repr views after training: best_model is now populated
print(svm_gex)
svm_gex

MLModel
├─ model_type: svm
├─ dataset_name: svm_gene_expression
├─ version: 2.10
├─ split_ratio: 0.4
├─ random_state: 42
├─ best_model:
    SVC(
      ├─ C: 0.1
      ├─ break_ties: False
      ├─ cache_size: 200
      ├─ class_weight: balanced
      ├─ coef0: 0.0
      ├─ decision_function_shape: ovr
      ├─ degree: 3
      ├─ gamma: scale
      ├─ kernel: linear
      ├─ max_iter: 10000
      ├─ probability: True
      ├─ random_state: None
      ├─ shrinking: True
      ├─ tol: 0.001
      ├─ verbose: False
    )
└─ save_model: False


MLModel(
    model_type=svm,
    dataset_name=svm_gene_expression,
    version=2.10,
    split_ratio=0.4,
    kfold=3,
    random_state=42,
    best_model=SVC(C=0.1, class_weight='balanced', kernel='linear', max_iter=10000,
    probability=True)
    sysout_file=svm_svm_gene_expression_2.10_training_utils.log,
    cache_dir=.cache/,
    logging=False,
    save_model=False,
)

Second example, showing the options we're able to pass when training.

In [13]:
# -- override the split ratio, seed and search grid for this instance
svm_gex_2 = MLModel(
    df=df,
    model_type='svm',
    dataset_name='svm_gene_expression_2',
    split_ratio=0.3,
    random_state=123,
    hyperparameters={'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
)

# -- train and evaluate run separately
svm_gex_2.train()
svm_gex_2.evaluate()

print(svm_gex_2)

-- [svm_svm_gene_expression_2] Initialized MLModel with model_type='SVM', dataset_name='SVM_GENE_EXPRESSION_2' --
-- [svm_svm_gene_expression_2] split ratio: 0.3
-- [svm_svm_gene_expression_2] random state: 123
-- [svm_svm_gene_expression_2] CACHE_DIR is: .cache/ --
--------------------------------------------------------------------------------
-- [svm_svm_gene_expression_2] Training SVM model on dataset 'svm_gene_expression_2' --
--------------------------------------------------------------------------------

-- SVM Hyperparameters --
  C: [0.1, 1, 10]
  kernel: ['linear', 'rbf']



-- Best Parameters --
  C: 0.1
  kernel: linear


-- [svm_svm_gene_expression_2] best model parameters: {'C': 0.1, 'kernel': 'linear'} --
--------------------------------------------------------------------------------
-- [svm_svm_gene_expression_2] predicting SVM model on dataset 'svm_gene_expression_2' --
--------------------------------------------------------------------------------
-- Predictions ma

To see how to access results after training (all from the MLModel object)

In [20]:
# -- the search object from training (estimator, param_grid, scoring, cv, ...)
svm_gex.grid_search_model

0,1,2
,estimator,SVC(max_iter=...bability=True)
,param_grid,"{'C': [0.1, 1], 'class_weight': ['balanced', {0: 2, 1: 1}, ...], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf']}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,C,0.1
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [22]:
# -- the best estimator selected for the second example
svm_gex_2.best_model

0,1,2
,C,0.1
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [None]:
# -- after training, the test labels, predictions and probabilities are kept on the object
for attr_name, message in (
    ('y_test', 'y_test attribute exists'),
    ('y_pred', 'y_pred attribute exists'),
    ('y_proba', 'y_proba attribute exists'),
):
    # getattr keeps the original lookup order and still raises if the attribute is missing
    if getattr(svm_gex, attr_name) is not None:
        print(message)

y_test attribute exists
y_pred attribute exists
y_proba attribute exists
