In [None]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%load_ext autoreload

In [None]:
from v2.data_transformers.AttritionTrsTransformer import AttritionTrsTransformer

# Input CSV handed to the transformer.
path = '<path_to_file>.csv'
# NOTE(review): the name looks like a typo of "output_path" and is not used in this
# notebook; it is kept unchanged in case something else refers to it.
outputh_path = '<path_to_file>'

# Single switch: forwarded to the transformer AND used below to decide whether
# the identifying columns still have to be dropped here. Previously the
# constructor received a hard-coded False, so changing this flag desynchronised
# the two places.
cleanup = False

attrition_transformer = AttritionTrsTransformer(path_to_data=path, cleanup=cleanup)
attr_data = attrition_transformer.prepare_data_for_attrition_prediction(min_date='01.01.2018')

# Drop the identifying columns when cleanup is off
# (presumably the transformer removes them itself when cleanup=True -- TODO confirm).
if not cleanup:
    attr_data.drop(['name', 'surname', 'employee'], axis=1, inplace=True)

# Identifier / project-mapping columns are not used by the analysis below.
attr_data.drop(['unique_id', 'mapped_last_project'], axis=1, inplace=True)

# rename() with a bare mapping targets the *index* labels, so the original call
# silently did nothing; `columns=` is required to rename the column.
attr_data.rename(columns={'sub-contract': 'sub_contract'}, inplace=True)

In [None]:
# Display the column labels of the prepared attr_data frame.
attr_data.columns

In [None]:
# Collect the names of all columns stored with the generic 'object' dtype;
# these are treated as the categorical columns in the cells that follow.
cat_columns = []
for column_name, column_dtype in attr_data.dtypes.items():
    if column_dtype == 'object':
        cat_columns.append(column_name)
cat_columns

In [None]:
# Print the frequency table of every categorical column,
# each one preceded by a separator line.
for categorical_name in cat_columns:
    print('*********')
    level_counts = attr_data[categorical_name].value_counts()
    print(level_counts)


In [None]:
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

# Inputs for the survival analysis:
#   durations      -- time-to-event values taken from 'tenure_in_years'
#   event_observed -- event flag taken from 'stay/left'
#                     (per the original note: 1 = churned, 0 = censored)
durations = attr_data['tenure_in_years']
event_observed = attr_data['stay/left']

# Build the Kaplan-Meier estimator and fit it on the whole population in one step.
km = KaplanMeierFitter().fit(
    durations,
    event_observed=event_observed,
    label='Kaplan Meier Estimate',
)

# Draw the estimated survival curve with the at-risk counts table underneath.
km.plot_survival_function(at_risk_counts=True)
plt.tight_layout()

In [None]:
# Kaplan-Meier survival curves per contract type, all drawn on one shared axes.
# Relies on `durations` and `event_observed` defined in the overall KM cell.
km_contract = KaplanMeierFitter()

groups = attr_data['contract type']  # cohort membership comes from this column
contract_cohorts = ['Permanent', 'Contract', 'Sub-contract']

fig, ax = plt.subplots()
for cohort_label in contract_cohorts:
    in_cohort = (groups == cohort_label)
    if not in_cohort.any():
        # An empty cohort has nothing to estimate; skip it instead of
        # handing an empty selection to the fitter.
        print('Skipping cohort with no rows:', cohort_label)
        continue
    # Refit the same estimator on this cohort and overlay its curve.
    km_contract.fit(durations[in_cohort], event_observed[in_cohort], label=cohort_label)
    km_contract.plot(ax=ax)

ax.set_title('Kaplan-Meier estimate by contract type')
ax.set_xlabel('tenure_in_years')
ax.set_ylabel('Estimated survival probability');


In [None]:
# Office location
# Kaplan-Meier survival curves per office location, all drawn on one shared axes.
# Relies on `durations` and `event_observed` defined in the overall KM cell.

km_office = KaplanMeierFitter()

groups = attr_data['office location']  # cohort membership comes from this column
office_cohorts = ['Lodz', 'Poznan', 'Warszawa', 'Remote', 'Krakow']

fig, ax = plt.subplots()
for cohort_label in office_cohorts:
    in_cohort = (groups == cohort_label)
    if not in_cohort.any():
        # An empty cohort has nothing to estimate; skip it instead of
        # handing an empty selection to the fitter.
        print('Skipping cohort with no rows:', cohort_label)
        continue
    # Refit the same estimator on this cohort and overlay its curve.
    km_office.fit(durations[in_cohort], event_observed[in_cohort], label=cohort_label)
    km_office.plot(ax=ax)

ax.set_title('Kaplan-Meier estimate by office location')
ax.set_xlabel('tenure_in_years')
ax.set_ylabel('Estimated survival probability');

In [None]:
# Display the fitted estimator's conditional_time_to_event_ table.
# NOTE(review): per the lifelines docs this is the estimated median remaining
# time until the event, given survival up to each time point -- confirm against
# the installed lifelines version.
km.conditional_time_to_event_

In [None]:
from lifelines.plotting import plot_lifetimes


In [None]:
# Display the median survival time reported by the fitted Kaplan-Meier estimator `km`.
km.median_survival_time_

In [None]:
from lifelines.utils import median_survival_times

# Derive the median survival time at the lower and upper confidence bounds
# of the overall Kaplan-Meier fit, then display the result.
km_confidence_band = km.confidence_interval_
median_ci = median_survival_times(km_confidence_band)
median_ci

In [None]:
import pandas as pd

# One-hot encode the categorical columns, dropping the first level of each
# to avoid perfectly collinear dummy columns.
# The dummy dtype is pinned explicitly: the default depends on the pandas
# version (bool from pandas 2.0 on), and bool columns mixed with float columns
# turn `DataFrame.values` into an object array, which the numeric routines used
# later (statsmodels VIF) may reject.
attr_data = pd.get_dummies(attr_data, drop_first=True, dtype=int)
attr_data.head()

In [None]:
#attr_data.columns

In [None]:
from lifelines import CoxPHFitter

In [None]:
# Remove the columns singled out as multicollinear by the VIF check.
multicollinear_columns = ['mapped_last_grade', 'other']
attr_data.drop(columns=multicollinear_columns, inplace=True)


In [None]:
# Variance inflation factors (VIF) to check the model inputs for multicollinearity.
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Split the event flag from the remaining columns; both names are reused by
# the Cox model cell.
target = attr_data['stay/left']
X = attr_data.drop(['stay/left'], axis=1)
assert 'stay/left' not in X.columns

# Build the design matrix once, as floats. Previously `X.values` was rebuilt
# for every feature, and with boolean dummy columns mixed with float columns it
# is an object array rather than a numeric one.
# NOTE(review): X still contains the duration column 'tenure_in_years' and no
# intercept column is added -- confirm both are intended for this check.
design_matrix = X.to_numpy(dtype=float)

# VIF dataframe: one row per feature.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(design_matrix, i)
            for i in range(design_matrix.shape[1])],
})

print(vif_data)

In [None]:
# Fit a Cox proportional hazards model on the features plus the event flag.
# `X` (features, including the duration column) and `target` (event flag)
# come from the VIF cell.
cph = CoxPHFitter()

# Assemble the model input under its own name instead of overwriting `X`:
# the former `X = pd.concat([X, target], ...)` appended another 'stay/left'
# column every time the cell was re-run, which breaks the fit.
cox_input = pd.concat([X, target], axis=1)

cph.fit(cox_input, duration_col='tenure_in_years', event_col='stay/left')
cph.print_summary()

In [None]:
# Build a small, reproducible set of rows whose event flag is 0
# (event not observed) to feed into the fitted Cox model.
SAMPLE_SIZE = 10
SAMPLE_SEED = 42  # fixed seed so a re-run selects the same rows

# Filter first, then sample: sampling 10 rows and filtering afterwards could
# leave anywhere from 0 to 10 rows, differently on every run.
not_left = attr_data[attr_data['stay/left'] == 0]
small_test = (
    not_left
    .sample(n=min(SAMPLE_SIZE, len(not_left)), random_state=SAMPLE_SEED)
    # Remove the event flag without mutating a filtered frame in place.
    .drop(columns=['stay/left'])
)
small_test

In [None]:
# Predict a survival function for each row of small_test with the fitted
# Cox model and plot the resulting curves.
predicted_survival = cph.predict_survival_function(small_test)
predicted_survival.plot()

In [None]:
# Show 10 randomly chosen rows of attr_data; no random_state is set,
# so the rows shown differ on every run.
attr_data.sample(10)