In [2]:
%load_ext autoreload
%autoreload 2

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample

from imblearn.under_sampling import RandomUnderSampler 
from imblearn.over_sampling import RandomOverSampler

from datetime import datetime
import numpy as np
import pandas as pd

import random

In [29]:
# custom stuff
from src.RandomForest import CustomRandomForestClassifier, StratifiedRandomForest, BalancedRandomForest, OverUnderRandomForest
from src.eval import eval_models

## Compare our custom RF algo

Just a quick sanity check: our custom algo has comparable performance to the sklearn implementation.

So now we know it's safe to start experimenting with adjustments to our custom implementation.

In [17]:
# Synthetic binary-classification dataset used for the sanity check and the
# variant comparison further down.
X, y = make_classification(
    # size and feature layout
    n_samples=10000,
    n_features=20,
    n_informative=15,
    n_redundant=2,
    # two classes; only one weight (0.9) is supplied for them
    n_classes=2,
    weights=[0.9],
    # fraction passed as flip_y (label noise)
    flip_y=0.05,
    # fixed seed so the generated data is reproducible
    random_state=42,
)
# Wrap the generated features in a DataFrame; y is kept as returned.
X = pd.DataFrame(X)


In [26]:
# Hyperparameters shared by every forest constructed in the cells below.
# NOTE(review): random_state is fixed here; how eval_models handles seeding
# across its n_trials runs is not visible in this notebook -- confirm the
# trials are not all identical.
rf_params = dict(
    max_depth=7,
    min_samples_leaf=200,
    n_estimators=100,
    max_features='sqrt',
    random_state=42,
)

# Sanity check: sklearn's forest against the custom implementation, both
# built from the same hyperparameters.
eval_models(
    models={
        'sklearn RF': RandomForestClassifier(**rf_params),
        'custom RF': CustomRandomForestClassifier(**rf_params),
    },
    X=X,
    y=y,
    n_trials=100,
)

100%|██████████| 2/2 [05:03<00:00, 151.74s/it]


Unnamed: 0,model,auc_train,auc_test,delta,n_models
0,custom RF,0.8815,0.8459,0.0356,100
0,sklearn RF,0.8682,0.8375,0.0307,100


## Test variants

In [None]:
# Compare the imbalance-handling variants, all built from the shared rf_params:
# sklearn's forest with class_weight="balanced" and the three custom forests.
eval_models(
    models={
        'class_weight balanced RF': RandomForestClassifier(class_weight="balanced", **rf_params),
        'Stratified RF': StratifiedRandomForest(**rf_params),
        'Balanced RF': BalancedRandomForest(**rf_params),
        'OverUnder RF': OverUnderRandomForest(**rf_params),
    },
    X=X,
    y=y,
    n_trials=100,
)

## With a 70% majority class instead of 90%

In [None]:
# Same experiment again, but with a milder class imbalance:
# weights=[0.7] here instead of the [0.9] used for the first dataset.
# NOTE(review): this rebinds the notebook-level X and y, so any earlier cell
# re-run after this one will silently use this dataset -- confirm intended.

X, y = make_classification(
    # size and feature layout
    n_samples=10000,
    n_features=20,
    n_informative=15,
    n_redundant=2,
    # two classes; only one weight (0.7) is supplied for them
    n_classes=2,
    weights=[0.7],
    # fraction passed as flip_y (label noise)
    flip_y=0.05,
    # fixed seed so the generated data is reproducible
    random_state=42,
)
# Wrap the generated features in a DataFrame; y is kept as returned.
X = pd.DataFrame(X)

# Evaluate the two baselines and every variant together on this dataset.
eval_models(
    models={
        'sklearn RF': RandomForestClassifier(**rf_params),
        'custom RF': CustomRandomForestClassifier(**rf_params),
        'class_weight balanced RF': RandomForestClassifier(class_weight="balanced", **rf_params),
        'Stratified RF': StratifiedRandomForest(**rf_params),
        'Balanced RF': BalancedRandomForest(**rf_params),
        'OverUnder RF': OverUnderRandomForest(**rf_params),
    },
    X=X,
    y=y,
    n_trials=100,
)