# Model training 

## Import libraries

In [1]:
import pandas as pd 

## Read ML ready dataset

In [2]:
df = pd.read_csv('data/ml_ready_genomics.csv', index_col=False)
df.head()

Unnamed: 0,Clinical_Significance_Encoded,Chromosome_Encoded,Clinical_Review_Status_Encoded,Gene_Symbol_Encoded,POS_Percentile,IS_SNP,IS_INDEL
0,1,1,1,12,3e-06,0,1
1,0,1,1,12,6e-06,1,0
2,1,1,1,12,9e-06,1,0
3,1,1,1,12,1.2e-05,1,0
4,1,1,1,12,1.5e-05,1,0


## Split dataset into dependent and independent features

In [3]:
X = df.drop('Clinical_Significance_Encoded', axis=1)
y = df['Clinical_Significance_Encoded']
X, y

(         Chromosome_Encoded  Clinical_Review_Status_Encoded  \
 0                         1                               1   
 1                         1                               1   
 2                         1                               1   
 3                         1                               1   
 4                         1                               1   
 ...                     ...                             ...   
 3682181                  27                               1   
 3682182                  27                               1   
 3682183                  27                               1   
 3682184                  27                               1   
 3682185                  30                               1   
 
          Gene_Symbol_Encoded  POS_Percentile  IS_SNP  IS_INDEL  
 0                         12        0.000003       0         1  
 1                         12        0.000006       1         0  
 2                         12   

## Splitting dataset into train/test split

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test

(         Chromosome_Encoded  Clinical_Review_Status_Encoded  \
 2252313                  12                               1   
 1105919                   5                               1   
 540507                    2                               2   
 2724236                  16                               2   
 3154590                  19                               1   
 ...                     ...                             ...   
 2392564                  13                               1   
 200829                    1                               1   
 75500                     1                               1   
 2342151                  13                               1   
 738676                    3                               2   
 
          Gene_Symbol_Encoded  POS_Percentile  IS_SNP  IS_INDEL  
 2252313                   85        0.521106       1         0  
 1105919                  388        0.620232       1         0  
 540507                 36437   

In [5]:
y_train, y_test

(2252313    0
 1105919    1
 540507     2
 2724236    1
 3154590    1
           ..
 2392564    1
 200829     1
 75500      1
 2342151    0
 738676     2
 Name: Clinical_Significance_Encoded, Length: 2945748, dtype: int64,
 1597416    0
 2262377    0
 840549     0
 873582     1
 451629     1
           ..
 3472256    1
 1296667    1
 1229269    1
 1782198    1
 3565806    0
 Name: Clinical_Significance_Encoded, Length: 736438, dtype: int64)

## Train the RandomForestClassifier

In [6]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight='balanced', 
                                       n_jobs=-1)
rf_classifier.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Evaluate the model metrics

In [7]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
y_pred = rf_classifier.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.46      0.78      0.58    253225
           1       0.82      0.44      0.57    387994
           2       0.69      0.63      0.66     92775
           3       0.01      0.54      0.01       103
           4       0.42      0.92      0.58       372
           5       0.01      0.77      0.03        92
           6       0.92      0.99      0.95      1877

    accuracy                           0.58    736438
   macro avg       0.48      0.73      0.48    736438
weighted avg       0.68      0.58      0.59    736438

[[196609  35607  14384   4143    156   2272     54]
 [199416 171057  11826   3917    183   1553     42]
 [ 27111   2895  58751   2681    139   1141     57]
 [     5      1      8     56      0     33      0]
 [     3      0      1     13    344     11      0]
 [     0      4      2     14      1     71      0]
 [     1      0      2     14      0      5   1855]]


## Oversampling pipeline with SMOTE

In [8]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [9]:
rf_classifier_smote = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf_classifier_smote.fit(X_res, y_res)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Evaluate the model after SMOTE

In [10]:
y_pred = rf_classifier_smote.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.47      0.78      0.58    253225
           1       0.82      0.44      0.57    387994
           2       0.68      0.64      0.66     92775
           3       0.00      0.50      0.01       103
           4       0.43      0.93      0.59       372
           5       0.02      0.67      0.04        92
           6       0.93      0.99      0.96      1877

    accuracy                           0.58    736438
   macro avg       0.48      0.71      0.49    736438
weighted avg       0.68      0.58      0.59    736438



## Train the HistGradientBoosting

In [11]:
from sklearn.ensemble import HistGradientBoostingClassifier
hgb_classifier = HistGradientBoostingClassifier(max_iter=200, early_stopping=True)
hgb_classifier.fit(X_train, y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,max_iter,200
,max_leaf_nodes,31
,max_depth,
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255
,categorical_features,'from_dtype'


In [12]:
y_pred = hgb_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.17      0.26    253225
           1       0.61      0.91      0.73    387994
           2       0.75      0.60      0.67     92775
           3       0.04      0.03      0.03       103
           4       0.64      0.88      0.74       372
           5       0.04      0.10      0.06        92
           6       0.95      0.96      0.96      1877

    accuracy                           0.62    736438
   macro avg       0.51      0.52      0.49    736438
weighted avg       0.61      0.62      0.56    736438



### Hyper parameter tuning (HistGradientBoosting)

In [13]:
hgb_classifier_pt = HistGradientBoostingClassifier(max_iter=400,
    learning_rate=0.05,
    max_depth=10,
    class_weight='balanced',
    early_stopping=True,
    random_state=42)
hgb_classifier_pt.fit(X_train, y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.05
,max_iter,400
,max_leaf_nodes,31
,max_depth,10
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255
,categorical_features,'from_dtype'


In [14]:
y_pred = hgb_classifier_pt.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.47      0.78      0.58    253225
           1       0.82      0.44      0.57    387994
           2       0.67      0.64      0.66     92775
           3       0.01      0.75      0.01       103
           4       0.25      0.96      0.40       372
           5       0.01      0.74      0.03        92
           6       0.86      1.00      0.92      1877

    accuracy                           0.58    736438
   macro avg       0.44      0.76      0.45    736438
weighted avg       0.68      0.58      0.59    736438



## Train the LightGBM

In [15]:
import lightgbm as lgb
lgb_model = lgb.LGBMClassifier(
    n_estimators=1000,
    objective='multiclass',
    num_class=8,
    class_weight='balanced',
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1
)
lgb_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 547
[LightGBM] [Info] Number of data points in the train set: 2945748, number of used features: 6
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910








































































0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,1000
,subsample_for_bin,200000
,objective,'multiclass'
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [16]:
y_pred = lgb_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.40      0.47      0.43    253225
           1       0.62      0.45      0.52    387994
           2       0.27      0.06      0.09     92775
           3       0.00      0.24      0.00       103
           4       0.00      0.44      0.01       372
           5       0.00      0.29      0.00        92
           6       0.02      0.41      0.04      1877

    accuracy                           0.41    736438
   macro avg       0.19      0.34      0.16    736438
weighted avg       0.50      0.41      0.44    736438



### Hyper parameter tuning (LightGBM)

In [17]:
import lightgbm as lgb

lgb_model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=8,
    boosting_type='gbdt',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=10,
    num_leaves=64,
    is_unbalance=True,          
    min_child_samples=50,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
lgb_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 547
[LightGBM] [Info] Number of data points in the train set: 2945748, number of used features: 6
[LightGBM] [Info] Start training from score -1.067548
[LightGBM] [Info] Start training from score -0.640836
[LightGBM] [Info] Start training from score -2.071652
[LightGBM] [Info] Start training from score -8.870007
[LightGBM] [Info] Start training from score -7.590013
[LightGBM] [Info] Start training from score -8.979671
[LightGBM] [Info] Start training from score -5.972282




























0,1,2
,boosting_type,'gbdt'
,num_leaves,64
,max_depth,10
,learning_rate,0.05
,n_estimators,1000
,subsample_for_bin,200000
,objective,'multiclass'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [18]:
y_pred = lgb_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.48      0.40      0.44    253225
           1       0.64      0.70      0.67    387994
           2       0.51      0.46      0.48     92775
           3       0.07      0.11      0.08       103
           4       0.00      0.00      0.00       372
           5       0.03      0.08      0.04        92
           6       0.00      0.00      0.00      1877

    accuracy                           0.57    736438
   macro avg       0.25      0.25      0.24    736438
weighted avg       0.56      0.57      0.56    736438



## XGBoost Model

In [19]:
import xgboost as xgb
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',     # multiclass classification
    num_class=7,                   # total number of labels
    tree_method='hist',            # fast histogram-based method
    learning_rate=0.05,            # slower learning = better generalization
    n_estimators=1000,             # more boosting rounds
    max_depth=10,                  # control complexity
    subsample=0.8,                 # row sampling
    colsample_bytree=0.8,          # feature sampling
    min_child_weight=5,            # prevents overfitting small leaves
    gamma=1,                       # minimum gain to split
    reg_lambda=1.0,                # L2 regularization
    random_state=42,
    n_jobs=-1
)
xgb_model.fit(X_train, y_train)

0,1,2
,objective,'multi:softmax'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [20]:
import numpy as np
np.unique(y_train)

array([0, 1, 2, 3, 4, 5, 6])

In [21]:
np.unique(y_test)

array([0, 1, 2, 3, 4, 5, 6])

In [22]:
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.47      0.52    253225
           1       0.68      0.80      0.74    387994
           2       0.80      0.62      0.70     92775
           3       0.75      0.03      0.06       103
           4       0.96      0.90      0.93       372
           5       0.61      0.22      0.32        92
           6       0.96      0.99      0.98      1877

    accuracy                           0.66    736438
   macro avg       0.76      0.58      0.61    736438
weighted avg       0.66      0.66      0.66    736438

