In [3]:
from load_matrix import load_df
from training_utils import MLModel

## Documentation

In [4]:
# Print the MLModel class documentation: constructor signature and parameters.
help(MLModel)

Help on class MLModel in module training_utils:

class MLModel(builtins.object)
 |  MLModel(
 |      df,
 |      model_type,
 |      dataset_name,
 |      hyperparameters=None,
 |      split_ratio=None,
 |      random_state=None,
 |      save_model=None
 |  )
 |
 |  Machine learning utility class for training and evaluating models with
 |  automatic hyperparameter tuning via GridSearchCV.
 |
 |  Parameters
 |  ----------
 |  df : pandas.DataFrame
 |      Input dataframe containing features and a `disease_status` column.
 |  model_type : str
 |      Type of model: {'svm', 'xgboost', 'mlp', 'all'}.
 |  dataset_name : str
 |      Name of the dataset (used for saving models).
 |  hyperparameters : dict, optional
 |      Custom hyperparameters for the selected model.
 |  split_ratio : float, optional
 |      Fraction of data to use for testing (default is class attribute).
 |  random_state : int, optional
 |      Seed for reproducibility.
 |  save_model : bool, optional
 |      Whether to s

## data prep

In [5]:
# -- prep gene expression
# Index the samples by their `label` value, then discard the three leading
# columns; the preview below shows the probe columns plus `disease_status`.
df = (
    load_df('gene_expression')
    .set_index('label', drop=False)
    .iloc[:, 3:]
)
df.head()

Unnamed: 0_level_0,ILMN_2055271,ILMN_2383229,ILMN_1806310,ILMN_1653355,ILMN_2359168,ILMN_1735045,ILMN_1680754,ILMN_1755321,ILMN_1698554,ILMN_1814092,...,ILMN_1685547,ILMN_2348512,ILMN_1743643,ILMN_1656676,ILMN_2371169,ILMN_1701875,ILMN_1786396,ILMN_1653618,ILMN_2137536,disease_status
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sample_GSM1317896,7.478478,7.73289,7.346,7.541464,7.080655,7.190928,7.043338,7.282018,7.467964,7.253072,...,7.085921,7.725375,7.403544,9.052076,10.826351,11.510887,7.748866,7.265069,7.52005,0
Sample_GSM1317897,7.194329,7.091616,7.127377,7.13905,7.076957,7.101918,7.574589,7.151289,7.164247,7.159713,...,7.336513,7.182846,7.689322,10.352952,11.409938,12.015053,7.99669,7.173827,7.24176,0
Sample_GSM1317898,7.118431,7.008995,7.147905,7.251543,7.031998,7.064355,7.070183,7.30601,7.2779,7.067508,...,7.122465,7.274913,7.49892,9.43999,10.910384,11.717704,8.072896,7.261295,7.426808,0
Sample_GSM1317899,7.28612,7.12294,7.106208,7.241961,7.050389,7.096496,7.110168,7.278481,7.289622,7.22406,...,7.104158,7.215575,7.570657,9.394814,10.858485,12.1528,7.79662,7.399535,7.567951,0
Sample_GSM1317900,7.225378,7.035687,7.268438,7.289307,7.099056,7.138676,7.375244,7.279981,7.543007,7.231372,...,7.223078,7.218124,7.576488,9.480348,11.233749,12.124186,7.987269,7.235961,7.218149,0


## some global variables

In [4]:
# Class-level set of the model types that MLModel can train.
"models available to test:", MLModel.MODELS

('models available to test:', {'mlp', 'svm', 'xgboost'})

In [5]:
# Class-level default for `split_ratio` (fraction of the data held out for testing).
"split ratio used by default:", MLModel.DEFAULT_SPLIT_RATIO

('split ratio used by default:', 0.2)

In [6]:
# Class-level default seed, reported as "random state" when a model is initialized.
"random state:", MLModel.DEFAULT_RANDOM_STATE

('random state:', 42)

To change any of the global variables, use the `set_global_variable` setter (the new value applies to all subsequent trainings).

In [8]:
# -- to change any of the global variables, use this setter (changes it for all the trainings)
# NOTE(review): the override is not reset afterwards, so models created later without an
# explicit split_ratio pick it up — the MLP run further down reports split ratio 0.4.
MLModel.set_global_variable('DEFAULT_SPLIT_RATIO',0.4)
"changed split ratio:", MLModel.DEFAULT_SPLIT_RATIO

('changed split ratio:', 0.4)

List of class attributes and methods (the UPPER_CASE names are the global variables):

In [5]:
# Every name exposed by MLModel except the double-underscore (dunder) ones:
# class-level constants as well as methods.
class_attr_methods = [
    member_name
    for member_name in dir(MLModel)
    if not member_name.startswith("__")
]
class_attr_methods

['CACHE_DIR',
 'DEFAULT_KFOLD',
 'DEFAULT_RANDOM_STATE',
 'DEFAULT_SAVE',
 'DEFAULT_SCORING',
 'DEFAULT_SPLIT_RATIO',
 'MLP_HYPERPARAMS',
 'MODELS',
 'SVM_HYPERPARAMS',
 'SYSOUT_FILE',
 'XGBOOST_HYPERPARAMS',
 '_define_model',
 '_pretty_print_dict',
 '_validate_hyperparameters',
 'evaluate',
 'initialize_logging',
 'pp',
 'set_global_variable',
 'train',
 'train_evaluate']

In [12]:
# Hyperparameter grid searched for SVM models when no custom `hyperparameters` are given.
MLModel.SVM_HYPERPARAMS

{'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}

# training example

First example of training and testing on the gene expression dataset with an SVM.

In [8]:
# Train and evaluate an SVM with only the required arguments; the split ratio,
# random state and hyperparameter grid fall back to the class-level defaults.
svm_gex=MLModel(model_type='svm', df=df,dataset_name='svm_gene_expression') #-- minimal parameters
svm_gex.train_evaluate()

-- Initialized MLModel with model_type='SVM', dataset_name='SVM_GENE_EXPRESSION' --
-- split ratio: 0.2
-- random state: 42
--------------------------------------------------------------------------------
-- Training SVM model on dataset 'svm_gene_expression' --
--------------------------------------------------------------------------------

-- SVM Hyperparameters --
{'C': [0.1, 1, 10], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf']}





-- Best Parameters --
{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


-- best model parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'} --
--------------------------------------------------------------------------------
-- predicting SVM model on dataset 'svm_gene_expression' --
--------------------------------------------------------------------------------
-- Predictions made on test set --


(array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
        0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
        0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1]),
 array([0.86840864, 0.97775017, 0.99226401, 0.97470238, 0.99999907,
        0.99999957, 0.01472093, 0.89910849, 0.99690299, 0.38468029,
        0.98056958, 0.99999992, 0.00630791, 0.99142857, 0.94043569,
        0.98376861, 0.99146575, 0.98846432, 0.02708513, 0.04328038,
        0.99999611, 0.76805226, 0.0427885 , 0.0675675 , 0.99196798,
        0.04790184, 0.97017932, 0.95357299, 0.96849681, 0.02769614,
        0.99999257, 0.99345595, 0.96296986]))

Second example, showing the options we are able to set when training.

In [10]:
# Second SVM: override the split ratio, the random state and the search grid
# (only `C` and `kernel` are searched here).
svm_gex_2 = MLModel(
    df=df,
    model_type='svm',
    dataset_name='svm_gene_expression_2',
    split_ratio=0.3,
    random_state=123,
    hyperparameters={'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
)
svm_gex_2.train()  # -- train and evaluate run separately
svm_gex_2.evaluate()

''

-- Initialized MLModel with model_type='SVM', dataset_name='SVM_GENE_EXPRESSION_2' --
-- split ratio: 0.3
-- random state: 123
--------------------------------------------------------------------------------
-- Training SVM model on dataset 'svm_gene_expression_2' --
--------------------------------------------------------------------------------

-- SVM Hyperparameters --
{'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}



-- Best Parameters --
{'C': 0.1, 'kernel': 'linear'}


-- best model parameters: {'C': 0.1, 'kernel': 'linear'} --
--------------------------------------------------------------------------------
-- predicting SVM model on dataset 'svm_gene_expression_2' --
--------------------------------------------------------------------------------
-- Predictions made on test set --


''

To see how to access results after training (all from the MLModel object)

In [11]:
# The grid-search object kept on the model after training; its display shows
# the search settings and the parameters of the selected estimator.
svm_gex.grid_search_model

0,1,2
,estimator,SVC(probability=True)
,param_grid,"{'C': [0.1, 1, ...], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf']}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,C,0.1
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [22]:
# Report which data/result attributes are populated (not None) on the model.
result_attributes = (
    'df', 'X', 'y',
    'X_train', 'X_test',
    'y_train', 'y_test',
    'y_pred', 'y_proba',
)
for attribute_name in result_attributes:
    if getattr(svm_gex, attribute_name) is not None:
        print(f'{attribute_name} attribute exists')



df attribute exists
X attribute exists
y attribute exists
X_train attribute exists
X_test attribute exists
y_train attribute exists
y_test attribute exists
y_pred attribute exists
y_proba attribute exists


Some preliminary results to check how the trained model performs on the test set.

In [23]:
# -- get some accuracies
from sklearn.metrics import accuracy_score, roc_auc_score

# True labels, hard predictions and predicted scores stored on the model.
y_true, y_pred, y_proba = svm_gex.y_test, svm_gex.y_pred, svm_gex.y_proba

accuracy = accuracy_score(y_true, y_pred)
# ROC-AUC is a ranking metric: score it on the predicted probabilities rather
# than on the thresholded labels (same convention as the MLP evaluation).
# NOTE(review): assumes y_proba holds the positive-class probability — confirm in MLModel.
roc_auc = roc_auc_score(y_true, y_proba)
print(f"Accuracy: {accuracy}, ROC-AUC: {roc_auc}")

Accuracy: 1.0, ROC-AUC: 1.0


Trying the same workflow with an MLP.

In [11]:
# MLP with only the required arguments; everything else uses the class-level defaults.
mlp_gex = MLModel(
    df=df,
    model_type='mlp',
    dataset_name='mlp_gene_expression',
)  # -- minimal parameters
mlp_gex.train_evaluate()

# Accuracy on the hard predictions, ROC-AUC on the predicted scores.
mlp_true = mlp_gex.y_test
accuracy = accuracy_score(mlp_true, mlp_gex.y_pred)
roc_auc = roc_auc_score(mlp_true, mlp_gex.y_proba)
print(f"Accuracy: {accuracy}, ROC-AUC: {roc_auc}")

-- Initialized MLModel with model_type='MLP', dataset_name='MLP_GENE_EXPRESSION' --
-- split ratio: 0.4
-- random state: 42
--------------------------------------------------------------------------------
-- Training MLP model on dataset 'mlp_gene_expression' --
--------------------------------------------------------------------------------

-- MLP Hyperparameters --
{   'activation': ['relu', 'tanh'],
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'solver': ['adam', 'sgd']}



-- Best Parameters --
{   'activation': 'relu',
    'hidden_layer_sizes': (50,),
    'learning_rate_init': 0.01,
    'solver': 'sgd'}


-- best model parameters: {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate_init': 0.01, 'solver': 'sgd'} --
--------------------------------------------------------------------------------
-- predicting MLP model on dataset 'mlp_gene_expression' --
---------------------------------------------------------

accuracy
recall
precision
f1-score
roc-auc 