In [75]:
%matplotlib inline
%load_ext autoreload
%reload_ext autoreload
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
import joblib, uuid

warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [76]:
from v2.data_transformers.SurvivalAttritionTrsTransformer import SurvivalAttritionTrsTransformer

input_paths = {
    'data': '<path_to_file>.csv',
    'training': '<path_to_file>.csv',
}

outputh_paths = {
    'general':'<output_file>'
}

cleanup = False
transformer = SurvivalAttritionTrsTransformer(path_to_data=input_paths['data'],
                                                 training_data_path=input_paths['training'],
                                                 predict=False,
                                                 cleanup=False)

data = transformer.prepare_data_for_attrition_prediction(min_date='01.01.2018')

2023-03-01 21:16:26,855 - v2.data_transformers.BaseTrsTransformer - INFO - Loading data from Z:/REPORTS/TRS wyciągi/Basic/ddc/final_all.csv
2023-03-01 21:16:33,914 - v2.data_transformers.BaseTrsTransformer - INFO - Employee column being generated...
2023-03-01 21:16:33,927 - v2.data_transformers.BaseTrsTransformer - INFO - Decoding polish characters...
2023-03-01 21:16:34,121 - v2.data_transformers.BaseTrsTransformer - INFO - Adding unique identifier...
2023-03-01 21:16:34,185 - v2.data_transformers.BaseTrsTransformer - INFO - Cleaning technology...
2023-03-01 21:16:34,193 - v2.data_transformers.BaseTrsTransformer - INFO - Data processed: shape (60955, 24), columns Index(['no', 'name', 'surname', 'status', 'division', 'grade', 'technology',
       'position', 'start date', 'end date', 'office location',
       'contract type', 'client', 'current project', 'report_date',
       'project status', 'owner', 'source', 'traffic source',
       'brand awareness source', 'job family', 'skype,,

In [80]:
from dateutil import relativedelta


def calc_2(x, y):
    delta = relativedelta.relativedelta(x, y)
    return delta.months


def calc_month_before_termination(data):
    data['months_before_termination'] = ((data['max_date'] - data['report_date_dt']) / np.timedelta64(1, 'M'))
    data['months_before_termination'] = data['months_before_termination'].apply(lambda x: round(x))
    return data


def calc_month_of_work(data):
    # todo fix this calculation
    data['months_of_work'] = ((data['report_date_dt'] - data['start_date_dt']) / np.timedelta64(1, 'M'))
    data['months_of_work'] = data['months_of_work'].apply(lambda x: round(x))
    return data


def calculate_risk(x):
    if x <= 2:
        return 1
    else:
        return 0

def set_sample(x, y, intervals: int = 2):
    if x == 1:  # always take high risk periods
        return True

    #     if x == np.nan: # those are mid-periods we shouldn't consider
    #         return False

    if x == 0:  # low risk periods
        if y % intervals == 0:
            return True
        else:
            return False

def is_on_bench(x, y):
    # x = project, y = client
    x = x.lower()
    y = y.lower()
    if 'bench' in x or 'bench' in y:
        return 1
    elif 'internal' in y:
        return 1
    else:
        return 0

def avg_time_per_project(tenure, nth_project):
    return tenure / nth_project


def set_prob_period(x):
    if x < 3:
        return True
    else:
        return False


def get_tenure(x, y):
    months_of_empl = (y - x) / np.timedelta64(1, 'M')
    return months_of_empl / 12

def is_covid_employment(start_date):
    import datetime
    covid_start = datetime.datetime.strptime('20-03-2020', '%d-%m-%Y')
    covid_end = datetime.datetime.strptime('13-05-2022', '%d-%m-%Y')
    if covid_start < start_date < covid_end:
        return 1
    else:
        return 0

def get_time_to_event(date_column, event_column):
    return (risky[date_column] - risky.groupby(['unique_id', risky[event_column].eq(1).cumsum()])[date_column].transform("min"))/np.timedelta64(1, 'M')

In [81]:
data = calc_month_before_termination(data)
data = calc_month_of_work(data)

In [82]:
risky = data.copy()

In [83]:
risky.drop(['sampling', 'months_of_work', 'diff'], inplace=True, axis=1, errors='ignore')

In [84]:
risky['current project'].astype(str)
risky['client'].astype(str)
risky['current project'].fillna('No data', inplace=True)
risky['client'].fillna('No data', inplace=True)
risky['is_on_bench'] = risky.apply(lambda x: is_on_bench(x['current project'], x['client']), axis=1)

In [85]:
risky['tenure'] = risky.apply(lambda x: get_tenure(x['start_date_dt'], x['report_date_dt']), axis=1)

In [86]:
risky['is_covid_employment'] = risky['start_date_dt'].apply(is_covid_employment)

In [87]:
risky.sort_values(['unique_id', 'report_date_dt'], inplace=True)
risky['was_promoted'] = risky.groupby('unique_id').apply(
    lambda group: group['grade'] != group['grade'].shift(1)).tolist()
risky['was_promoted'] = risky['was_promoted'].map({True: 1, False: 0})
risky.loc[risky.groupby('unique_id').head(1).index, 'was_promoted'] = 1

In [88]:
risky['project_changed'] = risky.groupby('unique_id').apply(
    lambda group: group['current project'] != group['current project'].shift(1)).tolist()
risky['project_changed'] = risky['project_changed'].map({True: 1, False: 0})
risky.loc[risky.groupby('unique_id').head(1).index, 'project_changed'] = 0

In [89]:
risky['account_changed'] = risky.groupby('unique_id').apply(
    lambda group: group['client'] != group['client'].shift(1)).tolist()
risky['account_changed'] = risky['account_changed'].map({True: 1, False: 0})
risky.loc[risky.groupby('unique_id').head(1).index, 'account_changed'] = 0

In [90]:
risky["time_since_promo_in_months"] = get_time_to_event(date_column='report_date_dt', event_column='was_promoted')
risky["time_since_project_change_in_months"] = get_time_to_event(date_column='report_date_dt', event_column='project_changed')
risky["time_since_account_change_in_months"] = get_time_to_event(date_column='report_date_dt', event_column='account_changed')
risky["time_since_bench_in_months"] = get_time_to_event(date_column='report_date_dt', event_column='is_on_bench')
risky["time_since_training_in_months"] = get_time_to_event(date_column='report_date_dt', event_column='has_training')

In [91]:
risky.sort_values(['unique_id', 'report_date_dt'], inplace=True)
risky['bench_cumsum'] = risky.groupby(['unique_id'])['is_on_bench'].cumsum()

In [92]:
risky['bench_to_tenure'] = risky.apply(lambda x: (x['bench_cumsum'] / 12) / x['tenure'], axis=1)

In [93]:
risky['nth_project'] = risky.groupby(['unique_id'])[
                           'project_changed'].cumsum() + 1  # +1 beacuse it is reflecting the changes.

In [94]:
risky['avg_time_per_project'] = risky.apply(lambda x : avg_time_per_project(x['tenure'], x['nth_project']), axis=1)

In [95]:
risky.drop(['months_before_termination', 'months_of_work', 'grade', 'mapped_source', 'report_date',
            'bench_cumsum',
            'diff', 'initial_grade', 'max_report_date_all', 'max_date', 'technology', 'start_date_dt',
            'report_date_dt', 'Jjob family', 'source', 'status', 'division', 'technology', 'start date',
            'office location', 'contract type', 'client', 'current project', 'new_job_family', 'name', 'surname', 'grade', 'last_project', 'last_client', 'mapped_project', 'mapped_client', 'has_training'
            'is_on_bench', 'was_promoted', 'project_changed', 'account_changed'], axis=1, inplace=True,
           errors='ignore')
risky.drop(['has_training', 'is_on_bench'], axis=1, inplace=True,
           errors='ignore')

In [96]:
print(risky.shape)
print(risky.columns)

(34479, 20)
Index(['employee', 'unique_id', 'mapped_grade', 'contract type_Contract',
       'contract type_Permanent', 'contract type_Sub-contract', 'left',
       'office_location_prob_ratio', 'technology_prob_ratio',
       'mapped_client_prob_ratio', 'tenure', 'is_covid_employment',
       'time_since_promo_in_months', 'time_since_project_change_in_months',
       'time_since_account_change_in_months', 'time_since_bench_in_months',
       'time_since_training_in_months', 'bench_to_tenure', 'nth_project',
       'avg_time_per_project'],
      dtype='object')


In [98]:
data = risky.drop_duplicates(subset=['unique_id'], keep='last')

In [None]:
idx = data.index
emp_mapper = pd.DataFrame({'id': idx,
                           'employee': data['employee'],
                           'churned': data['left'],
                           'tenure': data['tenure']})

data.drop(['employee', 'unique_id'], axis=1, inplace=True)

emp_mapper.head()

In [None]:
# VIF to verify m-coli
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF dataframe
vif_data = pd.DataFrame()
target = data['left']
X = data.drop(['left'], axis=1)
assert 'left' not in X.columns
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]

print(vif_data)

In [101]:
col_to_pairplot = [
    'time_since_promo_in_months',
    'time_since_account_change_in_months',
    'time_since_bench_in_months',
    'time_since_training_in_months',
    'bench_to_tenure',
    'avg_time_per_project',
    'nth_project',
    'left'
]

In [102]:
# import seaborn as sns
# sns.pairplot(risky[col_to_pairplot], hue='left')

In [103]:
from sksurv.ensemble import RandomSurvivalForest
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [104]:
y = data[['left','tenure']]
X = data.drop(['left', 'tenure'], axis=1)

In [105]:
y['cens'] = y['left'].map({0: False, 1: True})
y = y[['cens', 'tenure']].to_numpy()

In [106]:
aux = [(e1,e2) for e1,e2 in y]

In [107]:
y = np.array(aux, dtype=[('left', '?'), ('tenure', '<f8')])

# MODEL TRAINING

In [108]:
random_state = 20

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=random_state, shuffle=True)

In [109]:
rsf = RandomSurvivalForest(n_estimators=1000,
                           min_samples_split=10,
                           min_samples_leaf=15,
                           n_jobs=-1,
                           random_state=random_state)
rsf.fit(X_train, y_train)

persist = True

if persist:
    model_id = str(uuid.uuid4())
    print(f'Attached id to model: {model_id}')
    model_dir = 'models'
    file_name = f'{model_dir}/surv_model_{model_id}.joblib'
    with open(file_name, 'wb') as file:
        joblib.dump(rsf, file)

Attached id to model: ff8a154a-7e90-40dc-90e5-c07d8da9344e


In [110]:
rsf.score(X_test, y_test)

0.9232945489321007

In [111]:
samples = 6

pred_sample = X_test[X_test.index == 33372]
idx_range = list(range(0,samples))

In [113]:
idx = pred_sample.index
mapping ={}

for key, value in zip(idx_range, idx):
    mapping[key] = value

print(mapping)

{}


In [None]:
surv = rsf.predict_survival_function(pred_sample, return_array=True)

In [None]:
# predictions = rsf.predict(X_test)
# predictions

In [None]:
surv

In [None]:
for i, s in enumerate(surv):
    name = emp_mapper.loc[mapping[i], 'employee']
    plt.step(rsf.event_times_, s, where="post", label=str(name))
plt.ylabel("Survival probability")
plt.xlabel("Time in years")
plt.legend()
plt.grid(True)

In [None]:
rsf.event_times_

In [None]:
from sklearn.inspection import permutation_importance

result = permutation_importance(
    rsf, X_test, y_test, n_repeats=15, random_state=random_state
)

In [None]:
pd.DataFrame(
    {k: result[k] for k in ("importances_mean", "importances_std",)},
    index=X_test.columns
).sort_values(by="importances_mean", ascending=False)