In [17]:
%matplotlib inline
%load_ext autoreload
%reload_ext autoreload
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from pandas.tseries.offsets import MonthEnd
warnings.filterwarnings("ignore")

In [None]:
from v2.data_transformers.SequenceAttritionTrsTransformerNew import SequenceAttritionTrsTransformerNew

# Month of the extract to load; it is interpolated into the data file name below.
current_month = '05'

input_paths = {
    'data': f'<path_to_file>full_till_{current_month}.2023.xlsx',
    'training': '<path_to_file>.xlsx'
}

# NOTE: the misspelled name is kept as-is because other code may refer to it.
outputh_paths = {
    'general':'<output_path>'
}

# Single switch for the transformer's `cleanup` option (previously defined but never passed on).
cleanup = False
transformer = SequenceAttritionTrsTransformerNew(path_to_data=input_paths['data'],
                                                 training_data_path=input_paths['training'],
                                                 predict=False,
                                                 cleanup=cleanup)

data = transformer.prepare_data_for_attrition_prediction(min_date='01.01.2018')

In [20]:
import unidecode
# Transliterate employee names to plain ASCII; `Employee` is one of the join keys used when
# the finance frame is merged into `data`.
# NOTE(review): `finance` is not created in any cell visible in this notebook - it must be
# loaded before this cell for a fresh "Run All" to work.
finance['Employee'] = finance['Employee'].map(unidecode.unidecode)

In [21]:
# Snap both date columns to the last day of their month so the two frames can be joined on it
# (original note: nTRS extracts are taken on varying days; FINANCE data is aligned the same way).
# `+ MonthEnd(0)` rolls a date forward to its month end and leaves a month-end date unchanged.
data['report_date_dt'] = pd.to_datetime(data['report_date_dt'], format='%Y-%m-%d') + MonthEnd(0)
finance['change_date'] = pd.to_datetime(finance['change_date'], format='%Y-%m-%d') + MonthEnd(0)

In [22]:
# Left join on employee name + month end: every row of `data` is kept, and rows without a
# matching finance record get NaN in the finance columns.
# NOTE(review): if `finance` holds more than one row per employee and month, this join
# duplicates `data` rows - confirm the key is unique on the finance side.
data = data.merge(finance, how='left', left_on=['employee', 'report_date_dt'], right_on=['Employee', 'change_date'])

In [23]:
# Expose `change_value` under the name `salary_raise`; rows without a value count as a raise of 0.
data = data.rename(columns={'change_value': 'salary_raise'})
data = data.assign(salary_raise=data['salary_raise'].fillna(0.0))

In [24]:
# The finance-side join keys are no longer needed once the merge is done.
data = data.drop(columns=['Employee', 'change_date', 'SAP ID'])

In [27]:
# Split the dataset for training, backtest and prediction
#backtest_date = '2023-04-28'
# Cut-off date: the next cell keeps only rows whose report date is strictly before it.
prediction_start_date = '2023-01-31'

In [28]:
# Keep the history strictly before the cut-off (rows dated exactly on the cut-off are excluded).
data = data.loc[data['report_date_dt'] < prediction_start_date]

In [29]:
# Sanity check: after the previous filter no row should be later than the cut-off,
# so this selection is expected to be empty.
data[data['report_date_dt'] > prediction_start_date]

Unnamed: 0,name,surname,grade,technology,office location,client,current project,sap id,line manager,employee,...,report_date_dt,start_date_dt,max_date,mapped_grade,left,office_location_prob_ratio,technology_prob_ratio,mapped_client_prob_ratio,has_training,salary_raise


In [30]:
data.report_date_dt.value_counts()

2022-10-31    1234
2022-11-30    1229
2022-09-30    1219
2022-12-31    1210
2022-07-31    1207
2022-08-31    1206
2022-06-30    1185
2022-05-31    1161
2022-04-30    1152
2022-02-28    1131
2022-01-31    1126
2022-03-31    1124
2021-12-31    1107
2021-11-30    1097
2021-10-31    1061
2021-09-30    1034
2021-08-31     987
2021-07-31     963
2021-06-30     929
2021-05-31     910
2021-04-30     864
2021-03-31     816
2021-02-28     803
2021-01-31     791
2020-11-30     778
2020-12-31     777
2020-10-31     767
2020-09-30     729
2020-07-31     709
2020-06-30     708
2020-05-31     707
2020-08-31     704
2020-04-30     684
2020-03-31     662
2020-02-29     652
2020-01-31     643
2018-09-30     632
2018-08-31     626
2018-07-31     621
2019-12-31     617
2019-11-30     614
2018-06-30     611
2018-10-31     599
2019-10-31     599
2018-05-31     597
2018-11-30     580
2019-09-30     572
2018-04-30     571
2018-12-31     569
2019-01-31     563
2019-02-28     545
2019-08-31     540
2018-03-31  

In [31]:
# Records of employees flagged as having left (`left == 1`).
# `.copy()` makes this an independent frame: later cells add columns to it, which must not be
# written to a slice of `data`.
terminated = data[data['left'] == 1].copy()
terminated.shape

(18476, 21)

In [32]:
terminated.columns

Index(['name', 'surname', 'grade', 'technology', 'office location', 'client',
       'current project', 'sap id', 'line manager', 'employee', 'unique_id',
       'report_date_dt', 'start_date_dt', 'max_date', 'mapped_grade', 'left',
       'office_location_prob_ratio', 'technology_prob_ratio',
       'mapped_client_prob_ratio', 'has_training', 'salary_raise'],
      dtype='object')

In [34]:
from dateutil import relativedelta


def calc_2(x, y):
    """Return the number of whole months from ``y`` to ``x``.

    ``relativedelta`` splits the difference into years, months and days, so its
    ``months`` attribute alone only covers the part below one year. The years are
    folded back in here so that e.g. a 14-month gap yields 14 rather than 2.
    """
    delta = relativedelta.relativedelta(x, y)
    return delta.years * 12 + delta.months


def calc_month_before_termination(data):
    """Add ``months_before_termination``: rounded months from ``report_date_dt`` to ``max_date``.

    A month is the mean Gregorian month (365.2425 / 12 = 30.436875 days), written as an
    explicit ``Timedelta`` because dividing by ``np.timedelta64(1, 'M')`` is rejected by
    pandas 2.x.

    The column is added to ``data`` in place and the same frame is returned.
    Raises ``ValueError`` if either date is missing (NaN cannot be cast to an integer).
    """
    mean_month = pd.Timedelta(days=30.436875)
    months = (data['max_date'] - data['report_date_dt']) / mean_month
    # Series.round() rounds halves to even, like the built-in round() used before.
    data['months_before_termination'] = months.round().astype('int64')
    return data


def calc_month_of_work(data):
    """Add ``months_of_work``: rounded months from ``start_date_dt`` to ``report_date_dt``.

    A month is the mean Gregorian month (365.2425 / 12 = 30.436875 days), written as an
    explicit ``Timedelta`` because dividing by ``np.timedelta64(1, 'M')`` is rejected by
    pandas 2.x.

    The column is added to ``data`` in place and the same frame is returned.
    Raises ``ValueError`` if either date is missing (NaN cannot be cast to an integer).
    """
    # TODO (from the original author): revisit this calculation.
    mean_month = pd.Timedelta(days=30.436875)
    months = (data['report_date_dt'] - data['start_date_dt']) / mean_month
    # Series.round() rounds halves to even, like the built-in round() used before.
    data['months_of_work'] = months.round().astype('int64')
    return data


def calculate_risk(x):
    """Binary risk label: 1 when ``x <= 1``, otherwise 0.

    The notebook applies it to ``months_before_termination``.
    """
    return 1 if x <= 1 else 0

def set_sample(x, y, intervals: int = 2):
    """Decide whether a row is kept when down-sampling.

    Args:
        x: risk label of the row (1 = high risk, 0 = low risk, NaN = mid-period).
        y: integer counter; the notebook passes ``months_of_work``.
        intervals: keep a low-risk row only when ``y`` is a multiple of this value.

    Returns:
        bool: True for every high-risk row and for every ``intervals``-th low-risk row;
        False for NaN and for any other label.
    """
    # NaN never compares equal to anything (not even itself), so it needs an explicit test.
    if pd.isna(x):  # those are mid-periods we shouldn't consider
        return False

    if x == 1:  # always take high risk periods
        return True

    if x == 0:  # low risk periods
        return bool(y % intervals == 0)

    # Any other label is not sampled (previously this fell through and returned None).
    return False

def is_on_bench(x, y):
    """Return 1 when the row counts as "on bench", otherwise 0.

    Args:
        x: project name.
        y: client name.

    The match is case-insensitive: 'bench' in either name, or 'internal' in the client name.
    """
    project, client = x.lower(), y.lower()
    benched = 'bench' in project or 'bench' in client or 'internal' in client
    return int(benched)

def avg_time_per_project(tenure, nth_project):
    """Return ``tenure`` divided by ``nth_project`` (result is in the unit of ``tenure``)."""
    per_project = tenure / nth_project
    return per_project


def set_prob_period(x):
    """Return True when ``x`` is below 4, otherwise False.

    The notebook applies it to ``months_of_work`` and removes the rows flagged True.
    """
    return True if x < 4 else False


def get_tenure(x, y):
    """Return the time from ``x`` to ``y`` in years.

    Args:
        x: start timestamp.
        y: end timestamp.

    The difference is first expressed in mean Gregorian months (30.436875 days each) and then
    divided by 12. The month length is an explicit ``Timedelta`` because dividing by
    ``np.timedelta64(1, 'M')`` is rejected by pandas 2.x.
    """
    months_of_empl = (y - x) / pd.Timedelta(days=30.436875)
    return months_of_empl / 12

def is_covid_employment(start_date):
    """Return 1 if ``start_date`` lies strictly between 20 Mar 2020 and 13 May 2022, else 0."""
    from datetime import datetime as _datetime

    window_opens = _datetime.strptime('20-03-2020', '%d-%m-%Y')
    window_closes = _datetime.strptime('13-05-2022', '%d-%m-%Y')
    inside_window = window_opens < start_date < window_closes
    return 1 if inside_window else 0

def get_time_to_event(date_column, event_column, frame=None):
    """Months elapsed since the latest row where ``event_column`` equals 1, per ``unique_id``.

    Args:
        date_column: name of the datetime column.
        event_column: name of the 0/1 event flag column.
        frame: frame to work on; defaults to the notebook-level ``risky`` frame.

    Each row with the flag set starts a new spell (running count of flagged rows); within a
    ``unique_id`` and spell, the result is the row's date minus the earliest date of that
    spell. Rows are expected to be ordered by ``unique_id`` and date.

    A month is the mean Gregorian month (30.436875 days), written as an explicit
    ``Timedelta`` because dividing by ``np.timedelta64(1, 'M')`` is rejected by pandas 2.x.
    """
    if frame is None:
        frame = risky  # notebook-level frame, as before
    spell_id = frame[event_column].eq(1).cumsum()
    spell_start = frame.groupby(['unique_id', spell_id])[date_column].transform("min")
    return (frame[date_column] - spell_start) / pd.Timedelta(days=30.436875)

def get_time_to_event_on_float(date_column, event_column, frame=None):
    """Months elapsed since the latest row where ``event_column`` is greater than 0, per ``unique_id``.

    Args:
        date_column: name of the datetime column.
        event_column: name of a numeric column; any value above 0 counts as an event.
        frame: frame to work on; defaults to the notebook-level ``risky`` frame.

    Each row with a positive value starts a new spell (running count of such rows); within a
    ``unique_id`` and spell, the result is the row's date minus the earliest date of that
    spell. Rows are expected to be ordered by ``unique_id`` and date.

    A month is the mean Gregorian month (30.436875 days), written as an explicit
    ``Timedelta`` because dividing by ``np.timedelta64(1, 'M')`` is rejected by pandas 2.x.
    """
    if frame is None:
        frame = risky  # notebook-level frame, as before
    spell_id = frame[event_column].gt(0).cumsum()
    spell_start = frame.groupby(['unique_id', spell_id])[date_column].transform("min")
    return (frame[date_column] - spell_start) / pd.Timedelta(days=30.436875)

In [35]:
# Tenure (in years) at each report date, computed row by row.
terminated['tenure'] = terminated.apply(
    lambda row: get_tenure(row['start_date_dt'], row['report_date_dt']), axis=1)
# Months left until termination and the binary target derived from it.
terminated = calc_month_before_termination(terminated)
terminated['high_risk'] = terminated['months_before_termination'].apply(calculate_risk)
terminated = calc_month_of_work(terminated)
print(terminated.shape)
# Remove the rows that set_prob_period flags as True; the flag itself is not kept as a column.
in_probation = terminated['months_of_work'].apply(set_prob_period)
terminated = terminated[in_probation.eq(False)]
print(terminated.shape)

(18476, 25)
(15737, 25)


In [36]:
risky = terminated.copy()

In [37]:
risky['high_risk'].value_counts()

0    13783
1     1954
Name: high_risk, dtype: int64

In [38]:
# `left` is constant here (only leavers remain); `diff` is dropped if it happens to exist.
risky = risky.drop(columns=['left', 'diff'], errors='ignore')

In [39]:
# Normalise both text columns to lower-case strings (missing values become the text 'nan').
for text_column in ('current project', 'client'):
    risky[text_column] = risky[text_column].astype(str).str.lower()
# 1 when the project/client names indicate bench or internal work, else 0.
risky['is_on_bench'] = risky.apply(
    lambda row: is_on_bench(row['current project'], row['client']), axis=1)

In [40]:
# Rows must be ordered per employee by date so that "previous row" means "previous report".
risky.sort_values(['unique_id', 'report_date_dt'], inplace=True)
# Flag rows whose grade differs from the same employee's previous row (any change of grade,
# not only upward moves). groupby().shift() keeps the result aligned on the index, so the
# assignment no longer depends on the positional order of a `.tolist()` result.
previous_grade = risky.groupby('unique_id')['grade'].shift(1)
risky['was_promoted'] = risky['grade'].ne(previous_grade).astype(int)
# The first row of every employee is always flagged 1.
risky.loc[risky.groupby('unique_id').cumcount() == 0, 'was_promoted'] = 1

In [41]:
# Flag rows whose project differs from the same employee's previous row. groupby().shift()
# keeps the result aligned on the index, so the assignment no longer depends on the
# positional order of a `.tolist()` result.
previous_project = risky.groupby('unique_id')['current project'].shift(1)
risky['project_changed'] = risky['current project'].ne(previous_project).astype(int)
# The first row of every employee has no predecessor and is never counted as a change.
risky.loc[risky.groupby('unique_id').cumcount() == 0, 'project_changed'] = 0

In [42]:
# Flag rows whose client differs from the same employee's previous row. groupby().shift()
# keeps the result aligned on the index, so the assignment no longer depends on the
# positional order of a `.tolist()` result.
previous_client = risky.groupby('unique_id')['client'].shift(1)
risky['account_changed'] = risky['client'].ne(previous_client).astype(int)
# The first row of every employee has no predecessor and is never counted as a change.
risky.loc[risky.groupby('unique_id').cumcount() == 0, 'account_changed'] = 0

In [44]:
risky['high_risk'].value_counts()

0    13783
1     1954
Name: high_risk, dtype: int64

In [45]:
# Months elapsed since the latest occurrence of each 0/1 event flag (flag column -> new feature).
event_to_feature = {
    'was_promoted': "time_since_promo_in_months",
    'project_changed': "time_since_project_change_in_months",
    'account_changed': "time_since_account_change_in_months",
    'is_on_bench': "time_since_bench_in_months",
    'has_training': "time_since_training_in_months",
}
for event_flag, feature_name in event_to_feature.items():
    risky[feature_name] = get_time_to_event(date_column='report_date_dt', event_column=event_flag)
# `salary_raise` is a numeric amount rather than a 0/1 flag, hence the "> 0" variant.
risky['time_since_last_salary_raise'] = get_time_to_event_on_float(
    date_column='report_date_dt', event_column='salary_raise')

In [46]:
# populate last salary raise: blank out the months without a raise (value 0) so that they
# can be filled from the most recent raise afterwards
risky['salary_raise_test'] = risky['salary_raise'].mask(risky['salary_raise'].eq(0.00))

In [47]:
# Seed each employee's first row with 0 so a forward fill cannot pull in the previous
# employee's value. Only a missing value is seeded: a raise recorded in the first row is kept
# (it used to be overwritten with 0).
first_rows = risky.groupby('unique_id').cumcount() == 0
risky.loc[first_rows, 'salary_raise_test'] = risky.loc[first_rows, 'salary_raise_test'].fillna(0)

In [48]:
# Carry the most recent raise forward within each employee only; a plain ffill() over the
# whole frame would leak one employee's value into the next whenever a first row is empty.
risky['last_salary_raise'] = risky.groupby('unique_id')['salary_raise_test'].ffill()

In [49]:
# The helper column has served its purpose once `last_salary_raise` exists.
risky = risky.drop(columns=['salary_raise_test'])

In [50]:
risky['last_salary_raise'].isnull().sum()

0

In [52]:
# Running count of bench rows per employee, in date order.
risky = risky.sort_values(['unique_id', 'report_date_dt'])
risky['bench_cumsum'] = risky.groupby('unique_id')['is_on_bench'].cumsum()

In [53]:
# Share of the tenure spent on bench: bench rows / 12 (tenure is in years), divided by tenure.
# Column arithmetic gives the same values as the former row-by-row apply, without a Python
# call per row.
risky['bench_to_tenure'] = (risky['bench_cumsum'] / 12) / risky['tenure']

In [54]:
# Ordinal number of the employee's current project: running count of project changes plus one.
risky['nth_project'] = risky.groupby(['unique_id'])[
                           'project_changed'].cumsum() + 1  # +1 because the cumsum counts changes, and the first project has none.

In [55]:
# The helper is a plain division, so it can take whole columns instead of being called per row.
risky['avg_time_per_project'] = avg_time_per_project(risky['tenure'], risky['nth_project'])

In [56]:
risky = risky.drop(columns=['is_on_bench', 'has_training'])
# Down-sample: set_sample decides per row (risk label + months of work) whether it is kept.
risky['sampling'] = risky.apply(
    lambda row: set_sample(row['high_risk'], row['months_of_work']), axis=1)
risky = risky[risky['sampling'].eq(True)]

In [58]:
# Columns that must not reach the model: identifiers, raw text, dates and intermediate helpers.
# Names that are absent from the frame are skipped (errors='ignore').
# NOTE(review): 'Jjob family' looks like a typo of another column name - confirm.
columns_to_remove = [
    # intermediate helpers and flags
    'months_before_termination', 'sampling', 'months_of_work', 'bench_cumsum', 'left', 'diff',
    'has_training', 'salary_raise', 'is_on_bench', 'was_promoted', 'project_changed',
    'account_changed',
    # dates
    'report_date', 'max_report_date_all', 'max_date', 'start_date_dt', 'report_date_dt',
    'start date',
    # raw descriptive columns
    'grade', 'initial_grade', 'mapped_source', 'technology', 'employee', 'Jjob family', 'source',
    'status', 'division', 'office location', 'contract type', 'client', 'current project',
    'new_job_family', 'name', 'surname', 'last_project', 'last_client', 'mapped_project',
    'mapped_client', 'line manager',
]
risky = risky.drop(columns=columns_to_remove, errors='ignore')


In [59]:
print(risky.shape)
print(risky.columns)
print(risky['high_risk'].value_counts())

(8969, 18)
Index(['sap id', 'unique_id', 'mapped_grade', 'office_location_prob_ratio',
       'technology_prob_ratio', 'mapped_client_prob_ratio', 'tenure',
       'high_risk', 'time_since_promo_in_months',
       'time_since_project_change_in_months',
       'time_since_account_change_in_months', 'time_since_bench_in_months',
       'time_since_training_in_months', 'time_since_last_salary_raise',
       'last_salary_raise', 'bench_to_tenure', 'nth_project',
       'avg_time_per_project'],
      dtype='object')
0    7015
1    1954
Name: high_risk, dtype: int64


In [65]:
time_lagged_columns = [
    'time_since_promo_in_months',
    'time_since_project_change_in_months',
    'time_since_account_change_in_months',
    'time_since_bench_in_months',
    'time_since_training_in_months',
    'time_since_last_salary_raise'
]

In [67]:
# Names of the window-flag columns created below (one entry per created column; the list
# used to receive the bare feature stem three times instead).
columns_based_on_lagged_features = []
# Half-open month windows [start, end): 0-3, 3-6 and 6-12 months since the event.
periods = [(0,3), (3,6), (6, 12)]

from tqdm import tqdm

for column in tqdm(time_lagged_columns):
    # 'time_since_promo_in_months' -> 'promo'
    new_col = column.replace('time_since_', '').replace('_in_months', '')
    for base_period, desired_period in periods:
        flag_column = f'{new_col}_{desired_period}_m'
        # 1 when the elapsed time falls inside the window; NaN compares False and yields 0.
        in_window = (risky[column] >= base_period) & (risky[column] < desired_period)
        risky[flag_column] = in_window.astype(int)
        columns_based_on_lagged_features.append(flag_column)


100%|██████████| 6/6 [00:00<00:00, 34.99it/s]


In [68]:
changed_risky = risky.copy(deep=True)

In [69]:
# CORR
# The continuous "time since" columns are replaced by their window flags.
risky = risky.drop(columns=time_lagged_columns)

In [77]:
# VIF to verify m-coli (multicollinearity); switched off by default.
verify_coli = False
if verify_coli:
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    target = risky['high_risk']
    X = risky.drop(columns=['high_risk', 'unique_id'])  # remove unique_id ONLY for VIF
    assert 'high_risk' not in X.columns

    # VIF dataframe: one row per feature
    vif_data = pd.DataFrame({
        "feature": X.columns,
        "VIF": [variance_inflation_factor(X.values, position)
                for position in range(len(X.columns))],
    })

    print(vif_data)

In [78]:
# remove L7 grade from classification
# `mapped_grade` is an integer column, so comparing it with the string '7' never matched and
# nothing was removed; comparing the text form handles both integer and string grades.
risky = risky[risky['mapped_grade'].astype(str) != '7']

In [53]:
# errors='ignore' keeps the cell re-runnable (the column is already gone on a second run),
# in line with the other column drops in this notebook.
risky = risky.drop(columns=['office_location_prob_ratio'], errors='ignore')

In [79]:
risky.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8969 entries, 32896 to 26218
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   sap id                      1237 non-null   object 
 1   unique_id                   8969 non-null   object 
 2   mapped_grade                8969 non-null   int64  
 3   office_location_prob_ratio  8969 non-null   float64
 4   technology_prob_ratio       8969 non-null   float64
 5   mapped_client_prob_ratio    8969 non-null   float64
 6   tenure                      8969 non-null   float64
 7   high_risk                   8969 non-null   int64  
 8   last_salary_raise           8969 non-null   float64
 9   bench_to_tenure             8969 non-null   float64
 10  nth_project                 8969 non-null   int64  
 11  avg_time_per_project        8969 non-null   float64
 12  promo_3_m                   8969 non-null   int64  
 13  promo_6_m                   

In [80]:
# Remove the remaining identifier columns and raw tenure; absent columns are skipped.
risky = risky.drop(columns=['line manager', 'sap id', 'tenure'], errors='ignore')

In [56]:
# Columns for the (currently commented-out) seaborn pairplot.
# NOTE(review): the `time_since_*` columns are dropped from `risky` in an earlier cell, while
# `changed_risky` (the copy taken before that drop) still has them - confirm which frame the
# plot is meant to use.
col_to_pairplot = [
    'time_since_last_salary_raise',
    'time_since_promo_in_months',
    # 'time_since_account_change_in_months',
    # 'time_since_bench_in_months',
    # 'time_since_training_in_months',
    # 'bench_to_tenure',
    # 'avg_time_per_project',
    # 'nth_project',
    'high_risk'
]

In [57]:
import seaborn as sns
#sns.pairplot(risky[col_to_pairplot], hue='high_risk')

In [58]:
#sns.boxplot()

In [81]:
from sklearn.model_selection import GroupShuffleSplit

# Split by employee: all rows of one `unique_id` land on the same side, so no employee
# appears in both train and test. Only the first of the generated splits is used.
splitter = GroupShuffleSplit(test_size=.20, n_splits=5, random_state=7)
split = splitter.split(risky, groups=risky['unique_id'])
train_inds, test_inds = next(split)

# Independent copies: the following cells drop columns from these frames in place.
X_train = risky.iloc[train_inds].copy()
X_test = risky.iloc[test_inds].copy()

In [83]:
X_train.columns

Index(['unique_id', 'mapped_grade', 'office_location_prob_ratio',
       'technology_prob_ratio', 'mapped_client_prob_ratio', 'tenure',
       'high_risk', 'last_salary_raise', 'bench_to_tenure', 'nth_project',
       'avg_time_per_project', 'promo_3_m', 'promo_6_m', 'promo_12_m',
       'project_change_3_m', 'project_change_6_m', 'project_change_12_m',
       'account_change_3_m', 'account_change_6_m', 'account_change_12_m',
       'bench_3_m', 'bench_6_m', 'bench_12_m', 'training_3_m', 'training_6_m',
       'training_12_m', 'last_salary_raise_3_m', 'last_salary_raise_6_m',
       'last_salary_raise_12_m'],
      dtype='object')

In [84]:
# X_train = risky.groupby('unique_id').sample(frac=.8)
# print(risky.shape)
# x_train_indexes = X_train.index
# print(x_train_indexes.shape)
# X_test = risky.loc[~risky.index.isin(x_train_indexes)]
# print(X_test.shape)

# print(unique_id.ngroups)
# print(X_test.groupby('unique_id').ngroups)

In [85]:
# The employee identifier was only needed for the grouped split; it is not a feature.
X_train = X_train.drop(columns=['unique_id'], errors='ignore')
X_test = X_test.drop(columns=['unique_id'], errors='ignore')

In [86]:
# corr = X_train.corr()
# sns.heatmap(corr)

In [87]:
# Move the target out of the feature frames: pop() returns the column and removes it in place.
y_train = X_train.pop('high_risk')
y_test = X_test.pop('high_risk')

for feature_frame in (X_train, X_test):
    assert 'high_risk' not in feature_frame.columns

In [65]:
print(X_train.shape)
print(y_train.shape)
print('--test--')
print(y_test.shape)
print(X_test.shape)

(7171, 25)
(7171,)
--test--
(1798,)
(1798, 25)


In [66]:
from sklearn.datasets import make_classification
# Synthetic, heavily imbalanced two-feature data set (99% of samples in the first class).
# NOTE(review): `A` and `a` are not used by any other cell visible in this notebook - this
# looks like a leftover experiment; confirm and remove.
A, a = make_classification(n_samples=10000, n_features=2, n_redundant=0,
 n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)

In [68]:
from collections import Counter
# Class balance of the target in the train and test parts.
counter_train, counter_test = Counter(y_train), Counter(y_test)
print(counter_train, counter_test, sep='\n')

Counter({0: 5613, 1: 1558})
Counter({0: 1402, 1: 396})


In [69]:
# from numpy import where
# from matplotlib import pyplot
#
# for label, _ in counter_train.items():
#     row_ix = where(y_train == label)[0]
#     pyplot.scatter(X_train[row_ix, 0], X_train[row_ix, 1], label=str(label))
# pyplot.legend()
# pyplot.show()

In [70]:
# Optional outlier-detection experiment with a SUOD ensemble; switched off by default.
outlier_detection = False

if outlier_detection:
    from pyod.models.knn import KNN
    from pyod.models.lof import LOF
    from pyod.models.copod import COPOD
    from pyod.models.iforest import IForest

    from pyod.utils.example import visualize
    from pyod.utils.data import evaluate_print

    from pyod.models.suod import SUOD

    # Label printed next to the scores; the fitted detector is the SUOD ensemble below
    # (it used to be reported as 'KNN').
    clf_name = 'SUOD'

    # initialized a group of outlier detectors for acceleration
    detector_list = [LOF(n_neighbors=15), LOF(n_neighbors=20),
                     LOF(n_neighbors=25), LOF(n_neighbors=35),
                     COPOD(), IForest(n_estimators=100),
                     IForest(n_estimators=200)]

    # decide the number of parallel process, and the combination method
    # then clf can be used as any outlier detection model
    clf = SUOD(base_estimators=detector_list, n_jobs=2, combination='average',
               verbose=False)

    clf.fit(X_train)
    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # it is possible to get the prediction confidence as well
    #y_test_pred, y_test_pred_confidence = clf.predict(X_test, return_confidence=True)  # outlier labels (0 or 1) and confidence in the range of [0,1]

    # evaluate and print the results
    # NOTE(review): the outlier scores are evaluated against the attrition labels here -
    # confirm that this is the intended comparison.
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    #visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
    #      y_test_pred, show_figure=True, save_figure=False)

In [72]:
# FEATURE SELECTION
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
#X_train_fs = SelectKBest(score_func=chi2, k='all').fit_transform(X_train,y_train)

# Univariate scoring of every feature against the target (f_classif), fitted on train only.
bestfeatures = SelectKBest(score_func=f_classif, k=20)
fit = bestfeatures.fit(X_train, y_train)
# One row per feature: its name ('Specs') and its score ('Score').
featureScores = pd.DataFrame({'Specs': X_train.columns, 'Score': fit.scores_})
print(featureScores.nlargest(20, 'Score'))  # print the 20 best-scoring features

                     Specs      Score
6     avg_time_per_project  64.303836
19            training_3_m  34.306661
9               promo_12_m  20.594446
21           training_12_m  19.403919
7                promo_3_m  17.749316
22   last_salary_raise_3_m  17.230134
16               bench_3_m  16.233155
15     account_change_12_m  11.889392
24  last_salary_raise_12_m  11.856418
13      account_change_3_m  11.045296
12     project_change_12_m   9.415622
18              bench_12_m   8.375247
10      project_change_3_m   7.400209
3        last_salary_raise   6.871663
23   last_salary_raise_6_m   4.163264
0             mapped_grade   3.945869
20            training_6_m   3.415049
5              nth_project   3.166826
8                promo_6_m   1.048696
11      project_change_6_m   0.650967


In [108]:
# Names of the five highest-scoring features, best first.
ranked_scores = featureScores.sort_values('Score', ascending=False)
top_rated_features = ranked_scores['Specs'].head(5).tolist()
top_rated_features

['avg_time_per_project',
 'training_3_m',
 'promo_12_m',
 'training_12_m',
 'promo_3_m']

In [109]:
# Restrict both feature frames to the selected columns (same columns, same order).
X_train, X_test = X_train[top_rated_features], X_test[top_rated_features]

In [110]:
from lazypredict.Supervised import LazyClassifier
# Quick baseline: fit a range of off-the-shelf classifiers and score them on the test part.
# NOTE(review): judging by the table displayed in the next cell, `predictions` holds the
# per-model metrics rather than per-row predictions - confirm against the lazypredict docs.
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

100%|██████████| 29/29 [00:07<00:00,  3.82it/s]


In [111]:
predictions

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.76,0.67,0.67,0.77,0.18
LGBMClassifier,0.75,0.65,0.65,0.75,0.11
RandomForestClassifier,0.72,0.64,0.64,0.73,0.52
ExtraTreeClassifier,0.73,0.64,0.64,0.74,0.01
KNeighborsClassifier,0.74,0.64,0.64,0.74,0.07
ExtraTreesClassifier,0.73,0.64,0.64,0.74,0.42
BaggingClassifier,0.72,0.63,0.63,0.73,0.06
DecisionTreeClassifier,0.72,0.63,0.63,0.73,0.02
QuadraticDiscriminantAnalysis,0.54,0.55,0.55,0.58,0.01
NearestCentroid,0.51,0.55,0.55,0.55,0.01


In [112]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from numpy import mean, std


# Hyper-parameter grids. Only the XGBoost grid is searched below; the LightGBM grid and
# estimator are defined but not used in this cell.
xgb_params = dict(
    gamma=[0, 0.1, 0.2, 0.4],
    learning_rate=[0.01, 0.03],
    max_depth=[5, 6, 7, 8],
    n_estimators=[800],
    reg_alpha=[0, 0.1, 0.2],
    reg_lambda=[0, 0.1, 0.2],
)

lgbm_params = dict(
    boosting_type=['gbdt', 'goss', 'rf', 'dart'],
    learning_rate=[0.001, 0.01, 0.1],
    n_estimators=[50],
    max_depth=[10, 15, 20, 25],
    num_leaves=[30, 35, 40, 50],
)

xgb = XGBClassifier(random_state=42)
lgbm = LGBMClassifier(random_state=42)

# Exhaustive search over the XGBoost grid, selecting on recall with 3-fold cross-validation.
grid = GridSearchCV(xgb, xgb_params, scoring='recall', cv=3, n_jobs=-1, verbose=2)
result = grid.fit(X_train, y_train)

best_classifier, best_params = result.best_estimator_, result.best_params_
print(f'Best params: {best_params}')

Fitting 3 folds for each of 288 candidates, totalling 864 fits
Best params: {'gamma': 0, 'learning_rate': 0.03, 'max_depth': 8, 'n_estimators': 800, 'reg_alpha': 0.1, 'reg_lambda': 0}


In [81]:
# OVER SAMPLING ON TRAINING DATASET
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Optional rebalancing of the TRAINING data only: SMOTE over-sampling of the minority class
# followed by random under-sampling of the majority class. Switched off by default.
use_smote = False

if use_smote:

    counter_train = Counter(y_train)
    print(counter_train)
    print('-' * 20)

    # Both resamplers are seeded so the resampled training set is the same on every run.
    over = SMOTE(sampling_strategy=0.4, random_state=42)
    under = RandomUnderSampler(sampling_strategy=0.7, random_state=42)

    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)

    X_train_smote, y_train_smote = pipeline.fit_resample(X_train, y_train)
    counter_train = Counter(y_train_smote)
    print(counter_train)

    # NOTE: this replaces the training data; re-running the cell resamples the already
    # resampled data.
    X_train = X_train_smote
    y_train = y_train_smote

In [None]:
print(best_classifier)
model = best_classifier
# model_with_best_params = LGBMClassifier(learning_rate=0.1, boosting_type='gbdt', max_depth=20, n_estimators=1000, num_leaves=35,random_state=42)

#steps = [('model', model_with_best_params)]
#pipeline = Pipeline(steps=steps)
from sklearn.model_selection import cross_validate

# Repeated stratified CV on the training data. All four metrics come from ONE pass over the
# folds; four separate cross_val_score calls refit the model four times per fold.
cv = RepeatedStratifiedKFold(n_splits=7, n_repeats=3, random_state=42)
cv_scores = cross_validate(model, X_train, y_train, cv=cv, n_jobs=-1,
                           scoring=['recall', 'roc_auc', 'precision', 'accuracy'])
recalls = cv_scores['test_recall']
roc_curve = cv_scores['test_roc_auc']
precisions = cv_scores['test_precision']
accuracy = cv_scores['test_accuracy']
print(f'Accuracy: {mean(accuracy)} with std: {std(accuracy)}')
print(f'Precision: {mean(precisions)} with std: {std(precisions)}')
print(f'Recalls: {mean(recalls)} with std: {std(recalls)}')
print(f'ROC: {mean(roc_curve)} with std: {std(roc_curve)}')
# model_with_best_params_2 = LGBMClassifier(learning_rate=0.1, boosting_type='gbdt', max_depth=20, n_estimators=1400, num_leaves=35,random_state=42)
# Final fit on the full training part, then evaluation on the held-out test part.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)
print(classification_report(y_test, y_pred))

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.03, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=8, max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=800, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=42, ...)


In [85]:
# KAPPA
from sklearn.metrics import cohen_kappa_score
print(cohen_kappa_score(y_test, y_pred))

0.21456454200113928


In [None]:
# BRIER SCORE
from sklearn.metrics import brier_score_loss
brier_score_loss(y_test, y_pred_prob[:, 1])

In [None]:
# NOTE(review): `aaa` is not defined in any visible cell, so this cell raises NameError and
# stops a "Run All" here - presumably a deliberate stop marker; confirm and remove.
aaa

In [None]:
# Persist the training data and the fitted model under artifacts/<random uuid>/.
save_artifacts = True
base_output_path = 'artifacts'
data_output = 'data'
model_output = 'models'

import uuid
import joblib
import os

artifacts_id = str(uuid.uuid4())
print(f'{artifacts_id}')
data_path = f'{base_output_path}/{artifacts_id}/{data_output}'
model_path = f'{base_output_path}/{artifacts_id}/{model_output}'
if save_artifacts:
    # Create each directory independently: previously the model directory was only created
    # when the data directory was missing, so an existing data directory broke the model dump.
    os.makedirs(data_path, exist_ok=True)
    os.makedirs(model_path, exist_ok=True)
    X_train.to_csv(f'{data_path}/training_data.csv', sep=';', index=False)
    with open(f'{model_path}/model.joblib', 'wb') as file:
        joblib.dump(model, file)

In [None]:
X_train.columns

In [None]:
base_df = X_test.reset_index()
base_df

In [None]:
from explainerdashboard import ClassifierExplainer, ExplainerDashboard


# Interactive explanation dashboard for the fitted model on the test part.
# NOTE: the name `explainer` is bound again by the SHAP and LIME cells further down.
explainer = ClassifierExplainer(model, X_test, y_test,
                                labels=['low risk', 'high risk'], # defaults to ['0', '1', etc]
                                )

db = ExplainerDashboard(explainer,
                        title="KIT Explainer", # defaults to "Model Explainer"
                        shap_interaction=False, # you can switch off tabs with bools
                        whatif=True,
                        decision_trees=False
                        )
# NOTE(review): presumably this call blocks the cell while the dashboard server is running on
# port 8050 - confirm before using "Run All".
db.run(port=8050)

In [None]:
import shap
shap.initjs()
# NOTE: this rebinds `explainer`, which previously held the dashboard's ClassifierExplainer.
explainer = shap.TreeExplainer(model)
shap_values = explainer(X_test)
shap.plots.beeswarm(shap_values) # use this plot, it is great

In [None]:
# shap.force_plot(explainer.expected_value, shap_values[0, :], X_test.iloc[0, :])

In [None]:
feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
feat_importances.sort_values(ascending=False).head()

## Explainer - LIME

In [None]:
import lime
from lime import lime_tabular

# LIME explainer built on the training features.
# NOTE: this rebinds `explainer` once more; `prepare_contributors_for_prediction` further down
# reads this global, so this cell has to be the last one to assign it.
explainer = lime_tabular.LimeTabularExplainer(
    training_data=np.array(X_train),
    feature_names=X_train.columns,
    class_names=['low risk', 'high risk'],
    mode='classification'
)

In [None]:
import re

exp = explainer.explain_instance(
        data_row=X_test.iloc[126],
        predict_fn=model.predict_proba
    )
individual_contributors = exp.as_list()
print(individual_contributors)
# individual_contributors = dict((re.sub('[^a-zA-Z_]', '', key), val) for (key, val) in individual_contributors)
# temp_dict = {50: individual_contributors}
# contrib_single_df = pd.DataFrame(temp_dict).transpose().reset_index()

In [None]:
model.predict_proba(X_test)

In [None]:
exp.show_in_notebook(show_table=True, show_all=False, show_predicted_value=True)

In [None]:
y_test.iloc[126]

In [None]:
import re

def prepare_contributors_for_prediction(pred_df: pd.DataFrame, classifier, lime_explainer=None,
                                        verbose: bool = True):
    """Explain every row of ``pred_df`` with LIME and return one single-row frame per row.

    Args:
        pred_df: feature rows to explain.
        classifier: fitted model; its ``predict_proba`` is handed to LIME.
        lime_explainer: explainer offering ``explain_instance``; defaults to the
            notebook-level ``explainer``.
        verbose: print the raw LIME output of every row (the previous behaviour).

    Returns:
        list of DataFrames, each with an ``index`` column (row position in ``pred_df``) and
        one column per contributing feature holding its LIME weight.

    LIME reports each contribution under a rule text that contains the feature name, e.g.
    ``'promo_12_m <= 0.00'``. The rule is mapped back to the column of ``pred_df`` it mentions.
    The earlier approach stripped every character except letters and underscores, which made
    names differing only in digits (``promo_3_m`` / ``promo_12_m``) collide and silently
    dropped one of them.
    """
    if lime_explainer is None:
        lime_explainer = explainer  # notebook-level LIME explainer, as before

    # Longest names first, so a name that is a prefix of another cannot win by accident.
    known_features = sorted((str(col) for col in pred_df.columns), key=len, reverse=True)

    def _feature_of(rule: str) -> str:
        """Return the feature name mentioned in a LIME rule text."""
        for feature in known_features:
            if feature in rule:
                return feature
        # No known column mentioned: fall back to the former letters-only clean-up.
        return re.sub('[^a-zA-Z_]', '', rule)

    all_contributors = []
    n_predictions = pred_df.shape[0]
    print(f'Processing {n_predictions} predictions')
    for i in range(n_predictions):
        exp = lime_explainer.explain_instance(
            data_row=pred_df.iloc[i],
            predict_fn=classifier.predict_proba
        )
        individual_contributors = exp.as_list()
        if verbose:
            print(individual_contributors)
        individual_contributors = {_feature_of(rule): weight for rule, weight in individual_contributors}
        temp_dict = {i: individual_contributors}
        contrib_single_df = pd.DataFrame(temp_dict).transpose().reset_index()
        all_contributors.append(contrib_single_df)
    print(f'Processed successfully {len(all_contributors)} predictions')
    return all_contributors

# One row per test-set prediction, one column per contributing feature.
per_row_contributors = prepare_contributors_for_prediction(X_test, model)
contributors = pd.concat(per_row_contributors, axis=0, ignore_index=True)
contributors

In [None]:
contributors

In [None]:
# Fit quality on the training data itself (to compare against the test report).
# The fitted estimator is `model`; at this point `pipeline` is either undefined or the
# resampling pipeline, neither of which can predict.
y_pred_train = model.predict(X_train)
print(classification_report(y_train, y_pred_train))

In [None]:
# NOTE(review): `ggg` is not defined in any visible cell, so this cell raises NameError and
# stops a "Run All" here - presumably a deliberate stop marker; confirm and remove.
ggg

In [None]:
from sklearn.model_selection import cross_validate

# Cross-validation run on the TEST part: the model is re-fitted on test folds, so these
# numbers do not describe the model trained on the training data.
# All four metrics come from one pass over the folds instead of four separate passes.
cv_scores = cross_validate(model, X_test, y_test, cv=cv, n_jobs=-1,
                           scoring=['recall', 'roc_auc', 'precision', 'accuracy'])
recalls = cv_scores['test_recall']
roc_curve = cv_scores['test_roc_auc']
precisions = cv_scores['test_precision']
accuracy = cv_scores['test_accuracy']
print(f'Accuracy: {mean(accuracy)} with std: {std(accuracy)}')
print(f'Precision: {mean(precisions)} with std: {std(precisions)}')
print(f'Recalls: {mean(recalls)} with std: {std(recalls)}')
print(f'ROC: {mean(roc_curve)} with std: {std(roc_curve)}')

In [None]:
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(model, X_test, y_test)

In [None]:
# Alternative classifiers, each tuned with a grid search (recall) and then evaluated.
dt_new = DecisionTreeClassifier(class_weight='balanced')
rm_new = RandomForestClassifier(class_weight='balanced')
knn = KNeighborsClassifier()
xgb = XGBClassifier()#scale_pos_weight=scale_pos_weight)


# Grids keyed by display name. Only the names listed in the loop below are searched.
models_params_grid = {
    'KNN':
        {'n_neighbors': [3, 7, 9, 11, 13, 15, 17, 19],
         'weights': ['uniform', 'distance'],
         'p': [1, 2]
         },
    'DecisionTree': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 4, 6, 8, 10, 12],
        'max_leaf_nodes': [2, 4, 6, 8, 10, 12]
    },
    'Random Forest': {
        'bootstrap': [True, False],
        'max_depth': [10, 20, 30, 40],
        # 'auto' was an alias of 'sqrt' for classifiers and is no longer accepted by
        # current scikit-learn, so only 'sqrt' is listed.
        'max_features': ['sqrt'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [200, 400, 600]
    },
    'Random Survival Forest' :{
        'n_estimators': [200, 400, 600, 800, 1000],
        'min_samples_split': [10,20,30, 40],
        'min_samples_leaf': [5,10,15,20]
    },
    'Xgboost': {
        'gamma': [0, 0.1, 0.2, 0.4],
        'learning_rate': [0.01, 0.03],
        'max_depth': [5, 6, 7, 8],
        'n_estimators': [150, 300],
        'reg_alpha': [0, 0.1, 0.2],
        'reg_lambda': [0, 0.1, 0.2]
    }
}

# The loop variable is deliberately NOT called `model`, so the fitted estimator that other
# cells use under that name is not overwritten.
for candidate, model_name in zip([knn, dt_new, rm_new],
                                 ['KNN', 'DecisionTree', 'Random Forest']):
    print(f'****')
    print(f'{model_name} started...')

    grid = GridSearchCV(estimator=candidate, param_grid=models_params_grid.get(model_name), scoring='recall', cv=3,
                        n_jobs=-1, verbose=1)
    result = grid.fit(X_train, y_train)
    best_classifier = result.best_estimator_
    print(best_classifier)
    steps = [('model', best_classifier)]
    pipeline = Pipeline(steps=steps)
    cv = RepeatedStratifiedKFold(n_splits=7, n_repeats=3, random_state=42)
    recalls = cross_val_score(pipeline, X_train, y_train, scoring='recall', cv=cv, n_jobs=-1)
    roc_curve = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
    precisions = cross_val_score(pipeline, X_train, y_train, scoring='precision', cv=cv, n_jobs=-1)
    # Report the cross-validated metrics (they used to be computed and then discarded).
    print(f'Precision: {mean(precisions)} with std: {std(precisions)}')
    print(f'Recalls: {mean(recalls)} with std: {std(recalls)}')
    print(f'ROC: {mean(roc_curve)} with std: {std(roc_curve)}')
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(classification_report(y_test, y_pred))
    #print(roc_curve)
    probablities = pipeline.predict_proba(X_test)[:, 1]
    # NOTE: `df` is overwritten on every pass, so only the last model's frame survives the loop.
    df = pd.DataFrame({'y_pred_class': y_pred,
                       'y_test': y_test,
                       'proba': probablities})

In [None]:
from sklearn.metrics import precision_recall_curve, PrecisionRecallDisplay, RocCurveDisplay

In [None]:
# Hard 0/1 labels of the last tuned classifier (kept under the original name).
predictions = best_classifier.predict(X_test)
# Precision-recall and ROC curves need a continuous score to sweep thresholds over; with hard
# labels they collapse to a single operating point. Use the positive-class probability.
positive_scores = best_classifier.predict_proba(X_test)[:, 1]


PrecisionRecallDisplay.from_predictions(y_test, positive_scores)

precision, recall, _ = precision_recall_curve(y_test, positive_scores)
disp = PrecisionRecallDisplay(precision=precision, recall=recall)
roc = RocCurveDisplay.from_predictions(y_test, positive_scores)

In [None]:
# Optionally store the last tuned classifier as models/model_<random uuid>.joblib.
persist = False

if persist:
    import joblib, os, uuid
    model_id = str(uuid.uuid4())
    model_dir = 'models'
    # The target directory may not exist yet; open() does not create it.
    os.makedirs(model_dir, exist_ok=True)
    file_name = f'{model_dir}/model_{model_id}.joblib'
    with open(file_name, 'wb') as file:
        joblib.dump(best_classifier, file)

In [None]:
# errors='ignore' keeps the cell re-runnable: on a second run the column is already gone.
risky.drop(['unique_id'], axis=1, inplace=True, errors='ignore')

In [None]:
#sns.pairplot(risky, hue='high_risk')
risky.columns

In [None]:
# NOTE(review): an earlier cell drops 'tenure' from `risky`, so on a fresh top-to-bottom run
# this lookup raises KeyError; `changed_risky` (the copy taken before that drop) still has the
# column - confirm which frame is intended.
risky['tenure'].hist()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every column of `risky` (all columns must be numeric).
vif_data = pd.DataFrame({
    "feature": risky.columns,
    # calculating VIF for each feature
    "VIF": [variance_inflation_factor(risky.values, position)
            for position in range(len(risky.columns))],
})

print(vif_data)