# Model training 

## Import libraries

In [1]:
import pandas as pd 

## Read ML ready dataset

In [2]:
df = pd.read_csv('../data/ml_ready_genomics.csv', index_col=False)
df.head()

Unnamed: 0,Clinical_Significance_Encoded,Chromosome_Encoded,Clinical_Review_Status_Encoded,Gene_Symbol_Encoded,POS_Percentile,IS_SNP,IS_INDEL
0,1,1,1,0,3e-06,0,1
1,0,1,1,0,6e-06,1,0
2,1,1,1,0,9e-06,1,0
3,1,1,1,0,1.2e-05,1,0
4,1,1,1,0,1.5e-05,1,0


## Split dataset into dependent and independent features

In [3]:
X = df.drop('Clinical_Significance_Encoded', axis=1)
y = df['Clinical_Significance_Encoded']
X, y

(         Chromosome_Encoded  Clinical_Review_Status_Encoded  \
 0                         1                               1   
 1                         1                               1   
 2                         1                               1   
 3                         1                               1   
 4                         1                               1   
 ...                     ...                             ...   
 3682181                  27                               1   
 3682182                  27                               1   
 3682183                  27                               1   
 3682184                  27                               1   
 3682185                  30                               1   
 
          Gene_Symbol_Encoded  POS_Percentile  IS_SNP  IS_INDEL  
 0                          0        0.000003       0         1  
 1                          0        0.000006       1         0  
 2                          0   

## Splitting dataset into train/test split

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test

(         Chromosome_Encoded  Clinical_Review_Status_Encoded  \
 2252313                  12                               1   
 1105919                   5                               1   
 540507                    2                               2   
 2724236                  16                               2   
 3154590                  19                               1   
 ...                     ...                             ...   
 2392564                  13                               1   
 200829                    1                               1   
 75500                     1                               1   
 2342151                  13                               1   
 738676                    3                               2   
 
          Gene_Symbol_Encoded  POS_Percentile  IS_SNP  IS_INDEL  
 2252313                11261        0.521106       1         0  
 1105919                 5229        0.620232       1         0  
 540507                  2753   

In [5]:
y_train, y_test

(2252313    0
 1105919    1
 540507     2
 2724236    1
 3154590    1
           ..
 2392564    1
 200829     1
 75500      1
 2342151    0
 738676     2
 Name: Clinical_Significance_Encoded, Length: 2945748, dtype: int64,
 1597416    0
 2262377    0
 840549     0
 873582     1
 451629     1
           ..
 3472256    1
 1296667    1
 1229269    1
 1782198    1
 3565806    0
 Name: Clinical_Significance_Encoded, Length: 736438, dtype: int64)

## Train the RandomForestClassifier

In [6]:
# from sklearn.ensemble import RandomForestClassifier
# rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight='balanced', 
#                                        n_jobs=-1)
# rf_classifier.fit(X_train, y_train)

### Evaluate the model metrics

In [7]:
# from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
# y_pred = rf_classifier.predict(X_test)
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

## Oversampling pipeline with SMOTE

In [8]:
# from imblearn.over_sampling import SMOTE
# sm = SMOTE(random_state=42)
# X_res, y_res = sm.fit_resample(X_train, y_train)

In [9]:
# rf_classifier_smote = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
# rf_classifier_smote.fit(X_res, y_res)

### Evaluate the model after SMOTE

In [10]:
# y_pred = rf_classifier_smote.predict(X_test)
# print(classification_report(y_test, y_pred))

## Train the HistGradientBoosting

In [11]:
# from sklearn.ensemble import HistGradientBoostingClassifier
# hgb_classifier = HistGradientBoostingClassifier(max_iter=200, early_stopping=True)
# hgb_classifier.fit(X_train, y_train)

In [12]:
# y_pred = hgb_classifier.predict(X_test)
# print(classification_report(y_test, y_pred))

### Hyper parameter tuning (HistGradientBoosting)

In [13]:
# hgb_classifier_pt = HistGradientBoostingClassifier(max_iter=400,
#     learning_rate=0.05,
#     max_depth=10,
#     class_weight='balanced',
#     early_stopping=True,
#     random_state=42)
# hgb_classifier_pt.fit(X_train, y_train)

In [14]:
# y_pred = hgb_classifier_pt.predict(X_test)
# print(classification_report(y_test, y_pred))

## Train the LightGBM

In [15]:
# import lightgbm as lgb
# lgb_model = lgb.LGBMClassifier(
#     n_estimators=1000,
#     objective='multiclass',
#     num_class=8,
#     class_weight='balanced',
#     learning_rate=0.05,
#     random_state=42,
#     n_jobs=-1
# )
# lgb_model.fit(X_train, y_train)

In [16]:
# y_pred = lgb_model.predict(X_test)
# print(classification_report(y_test, y_pred))

### Hyper parameter tuning (LightGBM)

In [17]:
# import lightgbm as lgb

# lgb_model = lgb.LGBMClassifier(
#     objective='multiclass',
#     num_class=8,
#     boosting_type='gbdt',
#     n_estimators=1000,
#     learning_rate=0.05,
#     max_depth=10,
#     num_leaves=64,
#     is_unbalance=True,          
#     min_child_samples=50,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     random_state=42,
#     n_jobs=-1
# )
# lgb_model.fit(X_train, y_train)

In [18]:
# y_pred = lgb_model.predict(X_test)
# print(classification_report(y_test, y_pred))

## XGBoost Model

In [19]:
import xgboost as xgb
import joblib
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',     # multiclass classification
    num_class=7,                   # total number of labels
    tree_method='hist',            # fast histogram-based method
    learning_rate=0.05,            # slower learning = better generalization
    n_estimators=1000,             # more boosting rounds
    max_depth=10,                  # control complexity
    subsample=0.8,                 # row sampling
    colsample_bytree=0.8,          # feature sampling
    min_child_weight=5,            # prevents overfitting small leaves
    gamma=1,                       # minimum gain to split
    reg_lambda=1.0,                # L2 regularization
    random_state=42,
    n_jobs=-1
)
xgb_model.fit(X_train, y_train)
joblib.dump(xgb_model, "../models/xgb_model.joblib")

['../models/xgb_model.joblib']

In [20]:
import numpy as np
np.unique(y_train)

array([0, 1, 2, 3, 4, 5, 6])

In [21]:
np.unique(y_test)

array([0, 1, 2, 3, 4, 5, 6])

In [23]:
from sklearn.metrics import classification_report
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.27      0.37    253225
           1       0.63      0.88      0.73    387994
           2       0.78      0.62      0.69     92775
           3       1.00      0.01      0.02       103
           4       0.93      0.88      0.91       372
           5       0.62      0.11      0.19        92
           6       0.96      0.99      0.97      1877

    accuracy                           0.64    736438
   macro avg       0.78      0.54      0.55    736438
weighted avg       0.63      0.64      0.60    736438



In [24]:
y_pred

array([2, 1, 2, ..., 1, 1, 0], shape=(736438,), dtype=int32)

In [25]:
X_test['Prediction'] = pd.concat(X_test, y_pred)

Unnamed: 0,Chromosome_Encoded,Clinical_Review_Status_Encoded,Gene_Symbol_Encoded,POS_Percentile,IS_SNP,IS_INDEL
1597416,8,1,7879,0.621447,0,1
2262377,12,1,11319,0.581152,1,0
840549,3,1,3969,0.888870,0,1
873582,4,1,4167,0.082110,1,0
451629,2,1,2378,0.369994,1,0
...,...,...,...,...,...,...
3472256,22,1,17345,0.137391,1,0
1296667,6,1,6373,0.738271,1,0
1229269,6,1,6075,0.329434,0,0
1782198,9,1,8725,0.833941,1,0
