# Model training 

## Import libraries

In [1]:
import os
from pathlib import Path

import joblib
import pandas as pd

## Read ML ready dataset

In [2]:
# Location of the ML-ready dataset. Set the GENOMICS_DATA_PATH environment
# variable to point at your own copy; the fallback is the path this notebook
# was originally run with, so existing runs behave exactly as before.
DATA_PATH = Path(
    os.environ.get(
        "GENOMICS_DATA_PATH",
        "/Users/vishalsaxena/Downloads/proj_data/ml_ready_genomics.csv",
    )
)

# index_col=False tells pandas not to use the first CSV column as the index.
df = pd.read_csv(DATA_PATH, index_col=False)
df.head()

Unnamed: 0,Clinical_Significance_Encoded,Chromosome_Encoded,Gene_Symbol_Encoded,IS_SNP,IS_INDEL
0,2,1,1,0,1
1,0,1,1,1,0
2,2,1,1,1,0
3,2,1,1,1,0
4,2,1,1,1,0


## Split dataset into dependent and independent features

In [3]:
# Separate the label column (y) from the remaining feature columns (X).
target_col = 'Clinical_Significance_Encoded'
y = df[target_col]
X = df.drop(columns=[target_col])
X, y

(         Chromosome_Encoded  Gene_Symbol_Encoded  IS_SNP  IS_INDEL
 0                         1                    1       0         1
 1                         1                    1       1         0
 2                         1                    1       1         0
 3                         1                    1       1         0
 4                         1                    1       1         0
 ...                     ...                  ...     ...       ...
 3661962                  27                18403       1         0
 3661963                  27                18403       1         0
 3661964                  27                18403       1         0
 3661965                  27                18403       1         0
 3661966                  30                 8675       0         1
 
 [3661967 rows x 4 columns],
 0          2
 1          0
 2          2
 3          2
 4          2
           ..
 3661962    2
 3661963    0
 3661964    2
 3661965    2
 3661966    0

## Splitting dataset into train/test split

In [4]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split. Stratifying on y keeps the class proportions the same
# in both partitions; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts
X_train, X_test

(         Chromosome_Encoded  Gene_Symbol_Encoded  IS_SNP  IS_INDEL
 3202158                  19                15603       1         0
 1996671                  11                 9801       1         0
 2828529                  16                13823       1         0
 997434                    5                 4833       1         0
 2666994                  16                13152       1         0
 ...                     ...                  ...     ...       ...
 2579373                  15                12775       1         0
 3035089                  17                14810       1         0
 634742                    2                 2980       0         1
 683192                    3                 3200       1         0
 2321672                  13                11663       1         0
 
 [2929573 rows x 4 columns],
          Chromosome_Encoded  Gene_Symbol_Encoded  IS_SNP  IS_INDEL
 2856261                  17                13964       1         0
 2867909         

In [5]:
# Display the label Series of the training and test partitions.
y_train, y_test

(3202158    2
 1996671    5
 2828529    1
 997434     0
 2666994    2
           ..
 2579373    0
 3035089    2
 634742     0
 683192     2
 2321672    0
 Name: Clinical_Significance_Encoded, Length: 2929573, dtype: int64,
 2856261    2
 2867909    2
 806260     2
 2780447    1
 63729      2
           ..
 1636912    2
 2705138    0
 205360     2
 1546048    1
 1208097    2
 Name: Clinical_Significance_Encoded, Length: 732394, dtype: int64)

## Train the RandomForestClassifier

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest. class_weight='balanced' re-weights classes to
# compensate for the label imbalance; n_jobs=-1 uses every available core.
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1,
)
rf_classifier = RandomForestClassifier(**rf_params)
rf_classifier.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Evaluate the model metrics

In [7]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Score the random forest on the held-out split.
y_pred = rf_classifier.predict(X_test)
print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

# Create the output directory first: joblib.dump does not create missing
# parent folders, and failing here would discard the freshly trained model.
Path("../models").mkdir(parents=True, exist_ok=True)
joblib.dump(rf_classifier, "../models/rft_model.joblib")

              precision    recall  f1-score   support

           0       0.41      0.27      0.32    251569
           1       0.41      0.36      0.38     92318
           2       0.60      0.51      0.55    386275
           3       0.00      0.43      0.00        94
           4       0.45      0.88      0.60       365
           5       0.00      0.45      0.00        84
           6       0.02      0.59      0.04      1689

    accuracy                           0.41    732394
   macro avg       0.27      0.50      0.27    732394
weighted avg       0.51      0.41      0.45    732394



['../models/rft_model.joblib']

## Oversampling pipeline with SMOTE

In [8]:
# from imblearn.over_sampling import SMOTE
# sm = SMOTE(random_state=42)
# X_res, y_res = sm.fit_resample(X_train, y_train)

In [9]:
# rf_classifier_smote = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
# rf_classifier_smote.fit(X_res, y_res)

### Evaluate the model after SMOTE

In [10]:
# y_pred = rf_classifier_smote.predict(X_test)
# print(classification_report(y_test, y_pred))

## Train the HistGradientBoosting

In [11]:
# from sklearn.ensemble import HistGradientBoostingClassifier
# hgb_classifier = HistGradientBoostingClassifier(max_iter=200, early_stopping=True)
# hgb_classifier.fit(X_train, y_train)

In [12]:
# y_pred = hgb_classifier.predict(X_test)
# print(classification_report(y_test, y_pred))

### Hyperparameter tuning (HistGradientBoosting)

In [13]:
# hgb_classifier_pt = HistGradientBoostingClassifier(max_iter=400,
#     learning_rate=0.05,
#     max_depth=10,
#     class_weight='balanced',
#     early_stopping=True,
#     random_state=42)
# hgb_classifier_pt.fit(X_train, y_train)

In [14]:
# y_pred = hgb_classifier_pt.predict(X_test)
# print(classification_report(y_test, y_pred))

## Train the LightGBM

In [15]:
# Disabled experiment: LightGBM multiclass baseline with balanced class weights.
# NOTE(review): num_class=8 below does not match the 7 distinct labels (0-6)
# reported by np.unique(y_train) later in this notebook - confirm before
# re-enabling this cell.
# import lightgbm as lgb
# lgb_model = lgb.LGBMClassifier(
#     n_estimators=1000,
#     objective='multiclass',
#     num_class=8,
#     class_weight='balanced',
#     learning_rate=0.05,
#     random_state=42,
#     n_jobs=-1
# )
# lgb_model.fit(X_train, y_train)

In [16]:
# y_pred = lgb_model.predict(X_test)
# print(classification_report(y_test, y_pred))

### Hyperparameter tuning (LightGBM)

In [17]:
# Disabled experiment: LightGBM with hand-picked hyperparameters.
# NOTE(review): num_class=8 below does not match the 7 distinct labels (0-6)
# reported by np.unique(y_train) later in this notebook - confirm before
# re-enabling this cell.
# import lightgbm as lgb

# lgb_model = lgb.LGBMClassifier(
#     objective='multiclass',
#     num_class=8,
#     boosting_type='gbdt',
#     n_estimators=1000,
#     learning_rate=0.05,
#     max_depth=10,
#     num_leaves=64,
#     is_unbalance=True,          
#     min_child_samples=50,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     random_state=42,
#     n_jobs=-1
# )
# lgb_model.fit(X_train, y_train)

In [18]:
# y_pred = lgb_model.predict(X_test)
# print(classification_report(y_test, y_pred))

## XGBoost Model

In [19]:
import xgboost as xgb
import joblib

# Gradient-boosted multiclass model.
# NOTE(review): unlike the random forest cell, no class weighting is applied
# here; the evaluation output below shows zero recall for classes 3-6.
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',         # multiclass classification
    num_class=int(y_train.nunique()),  # number of labels, taken from the data instead of hard-coded
    tree_method='hist',                # fast histogram-based method
    learning_rate=0.05,                # slower learning = better generalization
    n_estimators=1000,                 # more boosting rounds
    max_depth=10,                      # control complexity
    subsample=0.8,                     # row sampling
    colsample_bytree=0.8,              # feature sampling
    min_child_weight=5,                # prevents overfitting small leaves
    gamma=1,                           # minimum gain to split
    reg_lambda=1.0,                    # L2 regularization
    random_state=42,
    n_jobs=-1
)
xgb_model.fit(X_train, y_train)

# Create the output directory first: joblib.dump does not create missing
# parent folders, and failing here would throw away a long training run.
Path("../models").mkdir(parents=True, exist_ok=True)
joblib.dump(xgb_model, "../models/xgb_model.joblib")

['../models/xgb_model.joblib']

In [20]:
import numpy as np
# Distinct label values present in the training partition.
np.unique(y_train)

array([0, 1, 2, 3, 4, 5, 6])

In [21]:
# Distinct label values present in the test partition.
np.unique(y_test)

array([0, 1, 2, 3, 4, 5, 6])

In [22]:
from sklearn.metrics import classification_report

# Score the XGBoost model on the held-out split.
y_pred = xgb_model.predict(X_test)

# Some classes are never predicted, so their precision is undefined.
# zero_division=0 reports those entries as 0.0 explicitly (the same numbers as
# the default) without emitting an UndefinedMetricWarning per metric.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.46      0.04      0.07    251569
           1       0.55      0.31      0.40     92318
           2       0.56      0.95      0.71    386275
           3       0.00      0.00      0.00        94
           4       0.00      0.00      0.00       365
           5       0.00      0.00      0.00        84
           6       0.00      0.00      0.00      1689

    accuracy                           0.56    732394
   macro avg       0.22      0.19      0.17    732394
weighted avg       0.52      0.56      0.45    732394



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [23]:
# Display the predictions most recently stored in y_pred.
y_pred

array([2, 2, 2, ..., 2, 2, 2], shape=(732394,), dtype=int32)

In [30]:
# Attach predictions and ground truth to a separate frame built from the test
# features. X_test itself is left untouched so it can still be passed to
# predict(), and the cell no longer depends on columns added by an earlier,
# since-commented-out assignment. 'True' is a reserved word and cannot be
# used as a keyword argument name, hence the dict unpacking.
xgb_test_results = X_test.assign(**{'Pred': y_pred, 'True': y_test})

# Rows the model labelled as class 1.
xgb_test_results[xgb_test_results['Pred'] == 1]

Unnamed: 0,Chromosome_Encoded,Gene_Symbol_Encoded,IS_SNP,IS_INDEL,Pred,True
63729,1,295,0,1,1,2
1416607,7,6969,0,1,1,2
2353746,13,11734,0,1,1,1
986605,4,4805,0,1,1,2
271086,1,1724,0,1,1,0
...,...,...,...,...,...,...
2474342,14,12368,0,1,1,1
2423493,14,12131,0,1,1,1
3604585,23,18015,0,1,1,2
2955878,17,14457,0,1,1,1


In [25]:
# Predict on the training features, for comparison with y_train.
xgb_model.predict(X_train)

array([2, 2, 2, ..., 1, 2, 2], shape=(2929573,), dtype=int32)

In [26]:
# Display the training labels.
y_train

3202158    2
1996671    5
2828529    1
997434     0
2666994    2
          ..
2579373    0
3035089    2
634742     0
683192     2
2321672    0
Name: Clinical_Significance_Encoded, Length: 2929573, dtype: int64