# Model training 

## Import libraries

In [None]:
import pandas as pd 
import joblib

## Read the ML-ready dataset

In [None]:
# Location of the preprocessed, model-ready genomics table.
DATA_PATH = '/Users/vishalsaxena/Downloads/proj_data/ml_ready_genomics.csv'

# index_col=False tells pandas not to use the first CSV column as the index.
df = pd.read_csv(DATA_PATH, index_col=False)

# Preview the first rows to sanity-check the load.
df.head()

## Split the dataset into features (X) and target (y)

In [None]:
# Name of the label column; every other column is used as a feature.
TARGET_COLUMN = 'Clinical_Significance_Encoded'

y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

# Show both pieces to confirm the split.
X, y

## Split the dataset into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split
# reproducible.
split_options = {
    'test_size': 0.25,
    'random_state': 42,
    'stratify': y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Show the two feature splits.
X_train, X_test

In [None]:
# Show the train and test target splits produced by train_test_split.
y_train, y_test

## Train the RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest settings: 100 trees capped at depth 10, class weights
# balanced by sklearn, a fixed seed, and all CPU cores (n_jobs=-1).
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1,
)
rf_classifier = RandomForestClassifier(**rf_params)

# Fit on the training split; the fitted estimator is the cell output.
rf_classifier.fit(X_train, y_train)

### Evaluate the model

In [None]:
import os

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Score the random forest on the held-out test split and print per-class
# precision / recall / F1.
y_pred = rf_classifier.predict(X_test)
print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

# Persist the fitted model. joblib.dump does not create missing parent
# directories, so make sure the target directory exists first; otherwise a
# fresh checkout without ../models fails with FileNotFoundError after the
# model has already been trained.
rf_model_path = "../models/rft_model.joblib"
os.makedirs(os.path.dirname(rf_model_path), exist_ok=True)
joblib.dump(rf_classifier, rf_model_path)

## Oversampling pipeline with SMOTE

In [None]:
# from imblearn.over_sampling import SMOTE
# sm = SMOTE(random_state=42)
# X_res, y_res = sm.fit_resample(X_train, y_train)

In [None]:
# rf_classifier_smote = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
# rf_classifier_smote.fit(X_res, y_res)

### Evaluate the model after SMOTE

In [None]:
# y_pred = rf_classifier_smote.predict(X_test)
# print(classification_report(y_test, y_pred))

## Train the HistGradientBoostingClassifier

In [None]:
# from sklearn.ensemble import HistGradientBoostingClassifier
# hgb_classifier = HistGradientBoostingClassifier(max_iter=200, early_stopping=True)
# hgb_classifier.fit(X_train, y_train)

In [None]:
# y_pred = hgb_classifier.predict(X_test)
# print(classification_report(y_test, y_pred))

### Hyperparameter tuning (HistGradientBoosting)

In [None]:
# hgb_classifier_pt = HistGradientBoostingClassifier(max_iter=400,
#     learning_rate=0.05,
#     max_depth=10,
#     class_weight='balanced',
#     early_stopping=True,
#     random_state=42)
# hgb_classifier_pt.fit(X_train, y_train)

In [None]:
# y_pred = hgb_classifier_pt.predict(X_test)
# print(classification_report(y_test, y_pred))

## Train the LightGBM classifier

In [None]:
# import lightgbm as lgb
# lgb_model = lgb.LGBMClassifier(
#     n_estimators=1000,
#     objective='multiclass',
#     num_class=8,
#     class_weight='balanced',
#     learning_rate=0.05,
#     random_state=42,
#     n_jobs=-1
# )
# lgb_model.fit(X_train, y_train)

In [None]:
# y_pred = lgb_model.predict(X_test)
# print(classification_report(y_test, y_pred))

### Hyperparameter tuning (LightGBM)

In [None]:
# import lightgbm as lgb

# lgb_model = lgb.LGBMClassifier(
#     objective='multiclass',
#     num_class=8,
#     boosting_type='gbdt',
#     n_estimators=1000,
#     learning_rate=0.05,
#     max_depth=10,
#     num_leaves=64,
#     is_unbalance=True,          
#     min_child_samples=50,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     random_state=42,
#     n_jobs=-1
# )
# lgb_model.fit(X_train, y_train)

In [None]:
# y_pred = lgb_model.predict(X_test)
# print(classification_report(y_test, y_pred))

## XGBoost Model

In [None]:
import os

import xgboost as xgb
import joblib

# Derive the number of classes from the training labels instead of
# hard-coding it: a fixed value silently goes stale when the label encoding
# changes (the commented-out LightGBM cells in this notebook assume 8
# classes, not 7).
# NOTE(review): XGBoost expects labels encoded as 0..num_class-1 - confirm
# against the np.unique(y_train) output in the following cell.
n_classes = int(y_train.nunique())

xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',     # multiclass classification
    num_class=n_classes,           # total number of labels, taken from y_train
    tree_method='hist',            # fast histogram-based method
    learning_rate=0.05,            # slower learning = better generalization
    n_estimators=1000,             # more boosting rounds
    max_depth=10,                  # control complexity
    subsample=0.8,                 # row sampling
    colsample_bytree=0.8,          # feature sampling
    min_child_weight=5,            # prevents overfitting small leaves
    gamma=1,                       # minimum gain to split
    reg_lambda=1.0,                # L2 regularization
    random_state=42,
    n_jobs=-1
)
xgb_model.fit(X_train, y_train)

# Persist the fitted model. joblib.dump does not create missing parent
# directories, so make sure the target directory exists before writing.
xgb_model_path = "../models/xgb_model.joblib"
os.makedirs(os.path.dirname(xgb_model_path), exist_ok=True)
joblib.dump(xgb_model, xgb_model_path)

In [None]:
import numpy as np
# List the distinct label values present in the training target.
np.unique(y_train)

In [None]:
# List the distinct label values present in the test target.
np.unique(y_test)

In [None]:
from sklearn.metrics import classification_report

# Predict the held-out test split with the XGBoost model, then print the
# per-class precision / recall / F1 summary.
y_pred = xgb_model.predict(X_test)
xgb_report = classification_report(y_test, y_pred)
print(xgb_report)

In [None]:
# Show the raw predictions from the most recent predict call.
y_pred

In [None]:
# Attach the predictions and true labels to a *copy* of the test features.
# X_test itself must keep only the feature columns the models were trained
# on, otherwise later predict(X_test) calls would see extra columns.
test_results = X_test.copy()
test_results['Pred'] = y_pred
test_results['True'] = y_test

# Show the test rows that the model assigned to class 1.
test_results[test_results['Pred'] == 1]

In [None]:
# Predict on the training split itself (presumably to compare against
# y_train, which the next cell displays).
xgb_model.predict(X_train)

In [None]:
# Show the training target values.
y_train