In [None]:
#Setup
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,recall_score
from sklearn.metrics import roc_auc_score

In [None]:
#For dealing with unbalanced data
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
# Aliased so it does not shadow sklearn's Pipeline imported in the setup cell;
# the imblearn pipeline is the one that accepts samplers and exposes fit_resample.
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

Tree-based models such as XGBoost and Random Forest can also handle unbalanced data through their class-weighting parameters: `scale_pos_weight` for XGBoost (set to 2 below, an arbitrary value) and `class_weight="balanced"` for Random Forest. However, weighting is typically less effective than a resampling strategy.

#### Build a simple pipeline that combines oversampling and undersampling

Remember to apply the resampling strategy after the train-test split, and only to the training set, to avoid biasing the evaluation.

In [1]:
## With SMOTE
# Step 'o': SMOTE oversampling with sampling_strategy=0.2, synthesising new
# samples from the 5 nearest neighbours.
# Step 'u': random undersampling with sampling_strategy=0.5.
over1 = SMOTE(random_state = 22, sampling_strategy=0.2, k_neighbors=5)
under1 = RandomUnderSampler(random_state = 22, sampling_strategy=0.5)
# Use the imblearn pipeline (not sklearn's): sklearn's Pipeline rejects samplers
# as intermediate steps and has no fit_resample method.
# The steps must reference over1/under1; bare `over`/`under` are never defined.
pipeline1 = ImbPipeline(steps = [('o', over1), ('u', under1)])

In [None]:
## With ROS
# Step 'o': random oversampling (duplicating existing samples) with
# sampling_strategy=0.2.
# Step 'u': random undersampling with sampling_strategy=0.5.
over2 = RandomOverSampler(random_state = 22, sampling_strategy=0.2)
under2 = RandomUnderSampler(random_state = 22, sampling_strategy=0.5)
# Use the imblearn pipeline (not sklearn's): sklearn's Pipeline rejects samplers
# as intermediate steps and has no fit_resample method.
# The steps must reference over2/under2; bare `over`/`under` are never defined.
pipeline2 = ImbPipeline(steps = [('o', over2), ('u', under2)])

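Train-test split, followed by resampling of the training set. The printed value is the class imbalance ratio (class 0 count / class 1 count) that remains after resampling; deliberately leaving some imbalance in place is useful for reducing bias in the predictions.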

In [None]:
# Hold out 30% of the data for testing, stratified on the target so both
# partitions keep the original class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_m,
    Y_m,
    test_size=0.3,
    random_state=22,
    stratify=Y_m,
)

# Resample the training partition only; the test partition is left untouched.
X_train, y_train = pipeline1.fit_resample(X_train, y_train)

# Remaining imbalance after resampling: samples labelled 0 per sample labelled 1.
n_class0 = len(y_train[y_train == 0])
n_class1 = len(y_train[y_train == 1])
print(n_class0 / n_class1)

#### XGBoost, with arbitrarily chosen hyperparameters that should be tuned

In [None]:
# Gradient-boosted trees with hand-picked (untuned) hyperparameters.
# scale_pos_weight=2 adds extra weight to the positive class on top of the
# resampling already applied to the training set.
xgboost_model = XGBClassifier(random_state = 22, gamma=0.1, learning_rate=0.01, 
                              max_depth=10, n_estimators=1000, scale_pos_weight=2, subsample = 0.75)
# perf_counter is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-fit.
start = time.perf_counter()
xgboost_model.fit(X_train, y_train)
end = time.perf_counter()
print(end-start, " seconds")
# Hard class predictions on the untouched test set.
y_pred = xgboost_model.predict(X_test)

In [None]:
# Confusion matrix from the hard class predictions.
print(confusion_matrix(y_test,y_pred))
# Do not use accuracy
# ROC AUC measures ranking quality, so it must be scored on continuous
# probabilities; feeding hard 0/1 labels collapses the curve to one threshold.
# Assumes binary 0/1 labels, so column 1 is the positive-class probability.
xgb_scores = xgboost_model.predict_proba(X_test)[:, 1]
print("ROC AUC: ", roc_auc_score(y_test, xgb_scores))  # useful for unbalanced data

#### Random Forest, with arbitrarily chosen hyperparameters that should be tuned

In [None]:
# Random forest with hand-picked (untuned) hyperparameters.
# n_jobs=-1 parallelizes tree building across all available cores;
# class_weight="balanced" reweights classes inversely to their frequency.
rf_model = RandomForestClassifier(criterion = "gini", max_depth = 15, min_samples_leaf = 1, n_jobs = -1, verbose = 1,
                                    random_state = 22,  n_estimators = 1000, class_weight =  "balanced")
# perf_counter is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-fit.
start = time.perf_counter()
rf_model.fit(X_train, y_train)
end = time.perf_counter()
print(end-start, " seconds")
# Hard class predictions on the untouched test set.
y_pred = rf_model.predict(X_test)

In [None]:
# Confusion matrix from the hard class predictions.
print(confusion_matrix(y_test,y_pred))
# Do not use accuracy
# ROC AUC measures ranking quality, so it must be scored on continuous
# probabilities; feeding hard 0/1 labels collapses the curve to one threshold.
# Assumes binary 0/1 labels, so column 1 is the positive-class probability.
rf_scores = rf_model.predict_proba(X_test)[:, 1]
print("ROC AUC: ", roc_auc_score(y_test, rf_scores))  # useful for unbalanced data