In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from itertools import product
import scipy.stats as ss
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import sklearn.metrics as mtr
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn import svm
import pickle
from ray import tune
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.schedulers import ASHAScheduler
from ray.air.config import RunConfig

In [4]:
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [4]:
with open('./datasets/soft/gen/xgb_folds.pickle', 'rb') as handle:
    cv_folds = pickle.load(handle)

In [5]:
train_folds = {key:df for key, df in cv_folds.items() if key != 0}
train_df = pd.concat(train_folds, axis=0)
X_train = train_df.drop('Dismissed', axis=1)
y_train = train_df['Dismissed']
test_df = cv_folds[0]
X_test = test_df.drop('Dismissed', axis=1)
y_test = test_df['Dismissed']

In [6]:
def calc_metrics(preds, y_test):
    cfm = mtr.confusion_matrix(y_test, preds)
    prec = cfm[1][1] / (cfm[0][1] + cfm[1][1])
    rec = cfm[1][1] / (cfm[1][0] + cfm[1][1])
    f1 = 2 * (prec * rec) / (prec + rec)
    return prec, rec, f1

model = XGBClassifier(use_label_encoder=False)
model.fit(X_train, y_train)
preds = model.predict(X_test)
print(mtr.confusion_matrix(y_test, preds))
print(calc_metrics(preds, y_test))

[[427   8]
 [ 17  84]]
(0.9130434782608695, 0.8316831683168316, 0.8704663212435233)


In [7]:
importances = model.feature_importances_

feature_importance = {}
features = X_train.columns.tolist()
for i, feature in enumerate(features):
    feature_importance[feature] = round(importances[i], 3)

feature_importance = dict(sorted(feature_importance.items(), key=lambda item: item[1], reverse=True))
print("Feature importance by model\n")
for key, val in feature_importance.items():
    print(f'{key}: {val}\n')

Feature importance by model

PaymentTypeId_9: 0.546999990940094

PaymentTypeId_22: 0.0689999982714653

LastLevelPeriod: 0.05700000002980232

MonthOnSalary: 0.04100000113248825

PosRate: 0.03099999949336052

CompGeoNum: 0.03099999949336052

WorkingPeriod: 0.026000000536441803

APM: 0.024000000208616257

Utilization: 0.023000000044703484

ProjRateCompar: 0.020999999716877937

PosRateCompar: 0.019999999552965164

IntProjTime: 0.01899999938905239

PosStrNum: 0.017000000923871994

CustDismRate: 0.017000000923871994

PosLevGeoNum: 0.013000000268220901

ProjRate: 0.013000000268220901

MeanHourVacation: 0.012000000104308128

MonthsOnProject: 0.009999999776482582

WageRate: 0.00800000037997961



In [8]:
# def get_cv_metrics_imps(cv_folds):
#     runs_metrics = []
#     feat_importances = []
#     feat_names = None
#     for i in range(10):
#         train_folds = {key:df for key, df in cv_folds.items() if key != i}
#         train_df = pd.concat(train_folds, axis=0)
#         X_train = train_df.drop('Dismissed', axis=1)
#         y_train = train_df['Dismissed']
#         test_df = cv_folds[i]
#         X_test = test_df.drop('Dismissed', axis=1)
#         y_test = test_df['Dismissed']
        
#         model = RandomForestClassifier()
#         model.fit(X_train, y_train)
#         preds = model.predict(X_test)
        
#         runs_metrics.append(calc_metrics(preds, y_test))
        
#         feat_names = X_train.columns.tolist()
#         feat_importances.append(model.feature_importances_)
#     runs_metrics = pd.DataFrame(runs_metrics, columns=['Prec', 'Rec', 'F1'])
#     feat_importances = pd.DataFrame(feat_importances, columns=feat_names)
#     return runs_metrics.mean(axis=0), feat_importances.mean(axis=0).sort_values(ascending=False)

# path = './datasets/soft/gen/'
# f_names = ['0m_cv_folds.pickle', '0m_lowcorr_cv_folds.pickle', '0m_lowcorr_imp_cv_folds.pickle']
# for f_name in f_names:
#     with open(f'{path}{f_name}', 'rb') as handle:
#         cv_folds = pickle.load(handle)
#     cv_metrics, imps = get_cv_metrics_imps(cv_folds)
#     print(cv_metrics, '\n')
#     print(imps, '\n**********************************\n')

In [6]:
def get_iqr_values_mask(col):
    Q1 = col.quantile(0.25)
    Q3 = col.quantile(0.75)
    IQR = Q3 - Q1

    filter_mask = (col >= Q1 - 1.5 * IQR) & (col <= Q3 + 1.5 *IQR)
    return filter_mask

def get_metrics_mean(cv_metrics):
    mask = get_iqr_values_mask(cv_metrics['Prec']) & get_iqr_values_mask(cv_metrics['Rec'])
    # print(mask)
    filt_metrics = cv_metrics[mask]
    return filt_metrics.mean()

def calc_metrics(preds, y_test):
    cfm = mtr.confusion_matrix(y_test, preds)
    prec = cfm[1][1] / (cfm[0][1] + cfm[1][1])
    rec = cfm[1][1] / (cfm[1][0] + cfm[1][1])
    f1 = 2 * (prec * rec) / (prec + rec)
    return prec, rec, f1

def get_cv_metrics(cv_folds, model_cls, model_hps={}):
    runs_metrics = []
    for i in range(10):
        train_folds = {key:df for key, df in cv_folds.items() if key != i}
        train_df = pd.concat(train_folds, axis=0)
        X_train = train_df.drop('Dismissed', axis=1)
        y_train = train_df['Dismissed']
        test_df = cv_folds[i]
        X_test = test_df.drop('Dismissed', axis=1)
        y_test = test_df['Dismissed']
        
        model = model_cls(**model_hps)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        
        runs_metrics.append(calc_metrics(preds, y_test))
    runs_metrics = pd.DataFrame(runs_metrics, columns=['Prec', 'Rec', 'F1'])
    return runs_metrics

In [9]:
path = './datasets/soft/gen/'
f_name = 'xgb_folds.pickle'

models_info = [(DecisionTreeClassifier, {}),
               (RandomForestClassifier, {}),
               (XGBClassifier, {'use_label_encoder': False,
                               'verbosity': 0,
                               'random_state': 10})
               # (svm.SVC, {'kernel': 'linear'}),
               # (svm.SVC, {'kernel': 'rbf'}),
               # (svm.SVC, {'kernel': 'poly'}),
               ]

with open(f'{path}{f_name}', 'rb') as handle:
    cv_folds = pickle.load(handle)
for cl, hps in models_info:
    cv_metrics = get_cv_metrics(cv_folds, cl, hps)
    print("Metrics std < 0.05:")
    display(cv_metrics.std(axis=0) < 0.05)
    metrics_means = get_metrics_mean(cv_metrics)
    print('Filtered means:')
    display(metrics_means)
    print('**********************************')

Metrics std < 0.05:


Prec    True
Rec     True
F1      True
dtype: bool

Filtered means:


Prec    0.741459
Rec     0.754767
F1      0.747872
dtype: float64

**********************************
Metrics std < 0.05:


Prec    True
Rec     True
F1      True
dtype: bool

Filtered means:


Prec    0.934359
Rec     0.737074
F1      0.823749
dtype: float64

**********************************
Metrics std < 0.05:


Prec    True
Rec     True
F1      True
dtype: bool

Filtered means:


Prec    0.923203
Rec     0.825195
F1      0.871316
dtype: float64

**********************************


In [11]:
def get_cv_results(cv_folds, model):
    runs_metrics = []
    for i in range(10):
        train_folds = {key:df for key, df in cv_folds.items() if key != i}
        train_df = pd.concat(train_folds, axis=0)
        X_train = train_df.drop('Dismissed', axis=1)
        y_train = train_df['Dismissed']
        test_df = cv_folds[i]
        X_test = test_df.drop('Dismissed', axis=1)
        y_test = test_df['Dismissed']
        
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        
        runs_metrics.append(calc_metrics(preds, y_test))
    runs_metrics = pd.DataFrame(runs_metrics, columns=['Prec', 'Rec', 'F1'])
    
    return runs_metrics

search_space = {
    "n_estimators": tune.loguniform(100, 10000),
    "max_depth": tune.randint(0, 5),
    "subsample": tune.quniform(0.25, 0.75, 0.01),
    "colsample_bytree": tune.quniform(0.05, 0.5, 0.01),
    "colsample_bylevel": tune.quniform(0.05, 0.5, 0.01),    
    "learning_rate": tune.quniform(-3.0, -1.0, 0.5) # pows of 10
}

params = [k for k in search_space.keys() if k != 'wandb']

def objective(config):
    config['n_estimators'] = int(config['n_estimators'])
    config['max_depth'] = int(config['max_depth']) + 2
    config['learning_rate'] = 10 ** config['learning_rate']
    
    xgb = XGBClassifier(
        random_state=RANDOMSTATE,
        booster='gbtree',
        scale_pos_weight=1,
        use_label_encoder=False,
        **config
    )
    cv_metrics = get_cv_results(cv_folds, xgb)
    metrics_means = get_metrics_mean(cv_metrics)
    rec = metrics_means['Rec']
    
    return {"rec": rec}

In [None]:
RANDOMSTATE = 10
NUM_SAMPLES = 10

algo = HyperOptSearch(random_state_seed=RANDOMSTATE)

scheduler = ASHAScheduler()

tuner = tune.Tuner(objective,
               param_space=search_space,
               tune_config=tune.TuneConfig(
                    num_samples=NUM_SAMPLES,
                    search_alg=algo,
                    scheduler=scheduler,
                    metric="rec",
                    mode="max"
               ),
                run_config=RunConfig(
                    verbose=1,
                    name="hyperopt_xgb",
                    local_dir="~/tune_results"
                ))

results = tuner.fit()

[2m[36m(pid=12812)[0m   from pandas import MultiIndex, Int64Index
[2m[36m(objective pid=12812)[0m   elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
[2m[36m(pid=12844)[0m   from pandas import MultiIndex, Int64Index
[2m[36m(pid=12853)[0m   from pandas import MultiIndex, Int64Index
[2m[36m(pid=12850)[0m   from pandas import MultiIndex, Int64Index
[2m[36m(pid=12847)[0m   from pandas import MultiIndex, Int64Index
[2m[36m(pid=12849)[0m   from pandas import MultiIndex, Int64Index
[2m[36m(objective pid=12850)[0m   elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
[2m[36m(objective pid=12844)[0m   elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
[2m[36m(objective pid=12853)[0m   elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
[2m[36m(objective pid=12849)[0m   elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
[2m[36m(objective pid=12847)[0m   elif isinstance(data.columns, (pd.Int64Index, pd.

In [None]:
results.get_best_result(metric="rec", mode="max").config

In [None]:
# path = './datasets/soft/gen/'
# f_names = ['xgb_folds.pickle']
# models_info = [(RandomForestClassifier, {}),
#                (XGBClassifier, {'use_label_encoder': False})]
#                # (svm.SVC, {'kernel': 'linear'}),
#                # (svm.SVC, {'kernel': 'rbf'}),
#                # (svm.SVC, {'kernel': 'poly'}),
#                # (DecisionTreeClassifier, {})]
# for f_name in f_names:
#     with open(f'{path}{f_name}', 'rb') as handle:
#         cv_folds = pickle.load(handle)
#     for cl, hps in models_info[1:]:
#         cv_metrics = get_cv_metrics(cv_folds, cl, hps)
#         print(cv_metrics,
#               '\n**********************************')
#     print('----------------------------------\n')