In [1]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%load_ext autoreload

In [None]:
from v2.data_transformers.AttritionTrsTransformer import AttritionTrsTransformer

# Input CSV and output location (placeholders to be filled in before running).
path = '<path_to_file>.csv'
# NOTE(review): 'outputh_path' looks like a typo for 'output_path'; the name is
# kept as-is because it is not used in this cell and other cells may reference it.
outputh_path = '<path_to_file>'

attrition_transformer = AttritionTrsTransformer(path_to_data=path, cleanup=False)
attr_data = attrition_transformer.prepare_data_for_attrition_prediction(min_date='01.01.2018')

# Identifying columns are kept here (cleanup=False) because a later cell reads
# attr_data['employee'] to build the employee lookup table before dropping them.
cleanup = False
if cleanup:
    attr_data.drop(columns=['name', 'surname', 'employee'], inplace=True)

attr_data.drop(columns=['unique_id', 'mapped_last_project'], inplace=True)
# `columns=` is required: rename() with a bare mapping targets the row index
# (axis 0), which silently left the 'sub-contract' column name untouched.
attr_data.rename(columns={'sub-contract': 'sub_contract'}, inplace=True)

In [None]:
# Lookup table from attr_data's row labels to the employee they describe;
# built before the identifying columns are removed from the feature frame.
idx = attr_data.index
emp_mapper = attr_data[['employee', 'stay/left', 'client']].rename(
    columns={'stay/left': 'churned'}
)
emp_mapper.insert(0, 'id', idx)

# Identifying / non-feature columns must not reach the model.
attr_data.drop(columns=['name', 'surname', 'employee', 'client'], inplace=True)
emp_mapper.head()

In [4]:
# Names of all columns still stored with the generic 'object' dtype.
cat_columns = [name for name, dtype in attr_data.dtypes.items() if dtype == 'object']
cat_columns

[]

In [5]:
# Show the value distribution of every object-typed column, separated by a rule.
for cat_col in cat_columns:
    print('*********', attr_data[cat_col].value_counts(), sep='\n')


In [6]:
# Remove features identified as multicollinear by the VIF analysis.
attr_data.drop(columns=['mapped_last_grade', 'other'], inplace=True)

In [None]:
# VIF check for multicollinearity among the remaining features.
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

target = attr_data['stay/left']
X = attr_data.drop(['stay/left'], axis=1)
assert 'stay/left' not in X.columns

# variance_inflation_factor regresses each column on the others using the
# matrix exactly as given; it does not add an intercept. Without a constant
# column the reported VIFs are uncentered and typically inflated, so add one
# explicitly. `X` itself is left unchanged.
vif_design = add_constant(X.astype(float), has_constant='add')
vif_matrix = vif_design.values

# Report VIF for the real features only (the constant's own VIF is meaningless).
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(vif_matrix, vif_design.columns.get_loc(feature))
        for feature in X.columns
    ],
})

print(vif_data)

In [8]:
from sksurv.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sksurv.ensemble import RandomSurvivalForest
from sksurv.datasets import load_gbsg2
import numpy as np
from sklearn import set_config
from sklearn.model_selection import train_test_split

In [9]:
# Survival target: event flag and observed tenure. Take an explicit copy because
# later cells add/drop columns on `y`; mutating a slice of attr_data triggers
# SettingWithCopyWarning and makes it unclear which frame is being modified.
y = attr_data[['stay/left', 'tenure_in_years']].copy()
# Feature matrix: everything except the two target columns.
X = attr_data.drop(columns=['stay/left', 'tenure_in_years'])
# X, y = load_gbsg2()
#
# grade_str = X.loc[:, "tgrade"].astype(object).values[:, np.newaxis]
# grade_num = OrdinalEncoder(categories=[["I", "II", "III"]]).fit_transform(grade_str)
#
# X_no_grade = X.drop("tgrade", axis=1)
# Xt = OneHotEncoder().fit_transform(X_no_grade)
# Xt.loc[:, "tgrade"] = grade_num

In [None]:
# Boolean event indicator derived from the 0/1 'stay/left' flag. assign() returns
# a new frame, so nothing is written into a slice of attr_data
# (avoids SettingWithCopyWarning).
y = y.assign(cens=y['stay/left'].map({0: False, 1: True}))

In [None]:
# The raw flag is superseded by the boolean 'cens' column.
y.drop(columns=['stay/left'], inplace=True)

In [12]:
# Fix the column order (event flag first, then time) and switch to a plain array.
y = y.loc[:, ['cens', 'tenure_in_years']].to_numpy()

In [13]:
# One (event, time) tuple per row, as needed to build a structured array.
aux = list(map(tuple, y))

In [14]:
# Structured array with a boolean field 'stay/left' and a float64 field 'tenure'.
y = np.array(
    aux,
    dtype=np.dtype([('stay/left', '?'), ('tenure', '<f8')]),
)

In [15]:
# Seed shared by the split, the forest and the permutation importance.
random_state = 20

# Shuffled 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=random_state,
)

In [16]:
# Hyper-parameters collected in one place so they are easy to inspect and tune.
rsf_params = dict(
    n_estimators=1000,
    min_samples_split=10,
    min_samples_leaf=15,
    n_jobs=-1,
    random_state=random_state,
)
rsf = RandomSurvivalForest(**rsf_params)
rsf.fit(X_train, y_train)

In [17]:
# Hold-out score of the fitted forest
# (presumably the concordance index, per scikit-survival's score() — TODO confirm).
rsf.score(X_test, y_test)

0.8811265544989028

In [None]:
# Inspect the hold-out feature frame; its row labels are the ids used to look up
# employees in emp_mapper.
X_test

In [19]:
# Number of hold-out employees whose survival curves will be plotted.
samples = 6

# Seed the draw with the notebook-wide random_state: an unseeded sample() picks
# different employees on every run, so the plot could not be reproduced.
pred_sample = X_test.sample(samples, random_state=random_state)
idx_range = list(range(samples))

In [22]:

# Position within the sample -> original row label of that sampled employee.
idx = pred_sample.index
mapping = dict(zip(idx_range, idx))

print(mapping)

# name = emp_mapper.loc[60819]
# name

{0: 29061, 1: 60914, 2: 59788, 3: 49565, 4: 60664, 5: 60901}


In [23]:
# Predicted survival function for each sampled employee; return_array=True asks
# for array output instead of callables. The plotting cell iterates over `surv`
# and draws each entry against the model's time grid.
surv = rsf.predict_survival_function(pred_sample, return_array=True)

In [None]:
# Time grid of the predicted curves. Newer scikit-survival releases expose it as
# `unique_times_` (the old `event_times_` name was deprecated), so prefer the new
# attribute and fall back to the old one on older installs.
time_grid = rsf.unique_times_ if hasattr(rsf, 'unique_times_') else rsf.event_times_

fig, ax = plt.subplots()
# Pair each curve with its row label straight from pred_sample (the frame `surv`
# was predicted from), so labels cannot go stale if the sampling cell is re-run.
for row_label, curve in zip(pred_sample.index, surv):
    name = emp_mapper.loc[row_label, 'employee']
    ax.step(time_grid, curve, where="post", label=str(name))
ax.set_ylabel("Survival probability")
ax.set_xlabel("Time in years")
# NOTE(review): the legend shows employee identifiers; clear the output before sharing.
ax.legend()
ax.grid(True)

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance on the hold-out set, 15 shuffles per feature.
result = permutation_importance(
    rsf,
    X_test,
    y_test,
    n_repeats=15,
    random_state=random_state,
)

# One row per feature, most important first.
pd.DataFrame(
    {
        "importances_mean": result["importances_mean"],
        "importances_std": result["importances_std"],
    },
    index=X_test.columns,
).sort_values(by="importances_mean", ascending=False)