# Setup

In [None]:
import pandas as pd
import numpy as np
from plotnine import * # ggplot in python
import matplotlib.pyplot as plt
from matplotlib import pyplot
import random
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
random.seed(2021)
%matplotlib inline
import datetime as dt
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.linear_model import Lasso,LassoCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
import shap

# Data Preparation

In [None]:
# Pull every cell of the worksheet as a list of rows.
# NOTE(review): `sheet` is not defined in the visible cells - presumably a
# worksheet object created earlier; confirm it exists on a fresh kernel.
data_value = sheet.get_all_values()

# Build the frame, promote the first row to the header, then discard that row.
df = pd.DataFrame(data=data_value)
df.columns = df.iloc[0]
df = df.iloc[1:]
# Empty strings become missing values.
df = df.replace({'': None})
# Drop the first column of the sheet.
df = df.drop(columns=df.columns[[0]])
# Apply the shortened column names used by the rest of the notebook.
names = [
    'Fund Account ID', 'Start Date', 'End Date', 'Plan Status',
    'Plan Lifetime (Months)', 'First-Time Depositor Subscriber (Yes / No)',
    'Category Name', 'Medium Bucket', 'Monthly Plan Deposit Amount',
    'Plan Donation Rate', 'User Country',
]
df.columns = names

df.head()

In [None]:
len(df)

In [None]:
len(df['Fund Account ID'].unique())

In [None]:
# Rows per fund account: an account with more than one row has more than one plan record.
df1 = df.groupby(by=["Fund Account ID"]).count()
# Compute each count once and reuse it in the three messages below.
n_unique_ids = len(df['Fund Account ID'].unique())
n_repeat_ids = len(df1[df1['Start Date'] > 1])
print('Total number of unique fund id = {}'.format(n_unique_ids))
print('Total number of unique duplicate fund id = {}'.format(n_repeat_ids))
print('Percentage of fund id cancelled and resubscribed = {}%'.format(round(n_repeat_ids/n_unique_ids*100,3)))

In [None]:
# Parse the start date and derive the join year
df['Start Date'] = pd.to_datetime(df['Start Date'])
df['Start Year'] = df['Start Date'].dt.year

In [None]:
# Transform country into united states and others
def country(x):
  """Return True when the country value is exactly 'United States', else False."""
  return bool(x == 'United States')

# Flag each row by whether its country is the United States.
df['Inside United States'] = df['User Country'].apply(country)

In [None]:
# Whether people sign up on a referral promotion day
range1 = pd.date_range(start = '2020-01-07', end = '2020-05-03')
range2 = pd.date_range(start = '2020-05-22', end = '2020-06-19')
others = ['2020-08-14', '2020-09-18', '2020-09-28', '2020-10-31', '2020-11-19']
dates_list = [dt.datetime.strptime(date, "%Y-%m-%d").date() for date in others]
# Every promotion day as a plain calendar date, so membership is a single set
# lookup that does not depend on how Timestamp compares with datetime.date.
_promo_dates = set(range1.date) | set(range2.date) | set(dates_list)

def promo_day(x):
  """Return True when ``x`` falls on a promotion day.

  Parameters
  ----------
  x : timestamp-like
      A sign-up time (anything ``pd.Timestamp`` accepts). Only its calendar
      date is used, so a time-of-day component does not affect the result.

  Returns
  -------
  bool
      True if the date lies in ``range1``, ``range2`` or ``dates_list``;
      False otherwise, including for missing values (None / NaT).
  """
  if pd.isna(x):
    return False
  return pd.Timestamp(x).date() in _promo_dates

# Flag each row by whether its start date is a promotion day.
df['Signup On Promo Day'] = df['Start Date'].apply(promo_day)

In [None]:
# Clean numerical data: strip the currency / percent formatting and cast to float
num1 = df['Monthly Plan Deposit Amount'].map(
    lambda raw: float(raw.replace(',', '').replace('$', ''))
)
df['Monthly Plan Deposit Amount'] = num1
# The donation rate is stored as a fraction (percent value divided by 100)
num3 = df['Plan Donation Rate'].map(lambda raw: float(raw.replace('%', '')) / 100)
df['Plan Donation Rate'] = num3

In [None]:
# # Y need normalization in linear regression
# # numerical data
# # Monthly Plan Deposit Amount
# print(skew(df['Monthly Plan Deposit Amount']))
# print(skew(boxcox1p(df['Monthly Plan Deposit Amount'], boxcox_normmax(df['Monthly Plan Deposit Amount'] + 1))))
# df['Monthly Plan Deposit Amount'] = boxcox1p(df['Monthly Plan Deposit Amount'], boxcox_normmax(df['Monthly Plan Deposit Amount'] + 1))

# # Plan Donation Rate
# print(skew(df['Plan Donation Rate']))
# print(skew(boxcox1p(df['Plan Donation Rate'], boxcox_normmax(df['Plan Donation Rate'] + 1))))
# df['Plan Donation Rate'] = boxcox1p(df['Plan Donation Rate'], boxcox_normmax(df['Plan Donation Rate'] + 1))

In [None]:
# Drop the columns not used as model features: the per-user Fund Account ID, the raw dates, the raw country and the plan lifetime
drop_list = ['Fund Account ID', 'Start Date', 'End Date', 'User Country', 'Plan Lifetime (Months)']
glm_df = df.drop(columns = drop_list)
glm_df.head()

# EDA

In [None]:
# Distribution of the plan donation rate
connections = glm_df['Plan Donation Rate']
# Only the last expression of a cell is displayed, so print the summary explicitly.
print(connections.describe())
# Pass the series by keyword: positional data arguments are deprecated in seaborn.
sns.boxplot(x=connections)
# Rows whose donation rate exceeds 10
glm_df[glm_df['Plan Donation Rate'] > 10]

In [None]:
# Only the last expression of a cell is displayed, so print the summary explicitly.
print(glm_df['Monthly Plan Deposit Amount'].describe())
# Pass the series by keyword: positional data arguments are deprecated in seaborn.
sns.boxplot(x=glm_df['Monthly Plan Deposit Amount'])
#glm_df[glm_df['Plan Donation Rate'] > 10]

In [None]:
# Class balance of the target; the series is passed by keyword (positional data is deprecated in seaborn).
sns.countplot(x=df['Plan Status'])

In [None]:
# Counts of first-time depositors; the series is passed by keyword (positional data is deprecated in seaborn).
sns.countplot(x=glm_df['First-Time Depositor Subscriber (Yes / No)'])

In [None]:
# Frequency of each category; Series.value_counts replaces the deprecated top-level pd.value_counts.
glm_df['Category Name'].value_counts().plot.bar()

In [None]:
# Frequency of each medium bucket; Series.value_counts replaces the deprecated top-level pd.value_counts.
glm_df['Medium Bucket'].value_counts().plot.bar()

In [None]:
# Sign-ups per start year; the series is passed by keyword (positional data is deprecated in seaborn).
sns.countplot(x=glm_df['Start Year'])

In [None]:
# US vs non-US users; the series is passed by keyword (positional data is deprecated in seaborn).
sns.countplot(x=glm_df['Inside United States'])

## NA Processing

In [None]:
glm_df.info()

In [None]:
# Percentage of missing values per column
total_rows = glm_df.shape[0]
for col in glm_df.columns:
    na_pct = (glm_df[col].isna().sum() / total_rows) * 100
    print("NA in '%s': %.2f"%(col, na_pct) + '%')

In [None]:
glm_df['Medium Bucket'].unique()

In [None]:
# 'Medium Bucket' is an important variable that we want to keep, so its missing values are
# replaced with the placeholder string 'unknown'.
# NOTE(review): fillna is applied to the whole frame, so missing values in every column
# (not only 'Medium Bucket') become 'unknown'.
glm_df = glm_df.fillna('unknown')

## Label Encoder

Convert string columns to numbers (label encoding and dummy variables)

* For 'Automated Lending & Deposit Plans Plan Status' and 'User Details First-Time Depositor Subscriber (Yes / No)', dummying is equivalent to label encoding.
* 'Subscription Loan Channel Automated Plan Autoloan Channel Nickname' and 'MG Referral Source: Last Non-Direct Click Medium Bucket' will have 8 and 10 dummy variables respectively

In [None]:
def label_encode(Series):
    """Label-encode a column and report which integer stands for which label.

    Parameters
    ----------
    Series : array-like
        The labels to encode.

    Returns
    -------
    tuple
        ``(labeled, label_mapping)``: the integer codes for every element of
        ``Series``, and a dict mapping each integer code to its original label.
    """
    encoder = LabelEncoder()
    encoder.fit(list(np.unique(Series)))
    codes = encoder.transform(Series)
    known_labels = encoder.classes_
    code_to_label = dict(zip(encoder.transform(known_labels), known_labels))
    return codes, code_to_label

In [None]:
# Columns are assigned with glm_df[col] = codes (not .loc[:, col] = codes) so the
# encoded column takes the integer dtype of the codes instead of keeping the
# object dtype of the original string column.

#  Plan Status
status_codes, map_status = label_encode(glm_df['Plan Status'])
glm_df['Plan Status'] = status_codes
print('Plan Status: \n\t', map_status)

#  First-Time Depositor Subscriber (Yes / No)
user, map_user = label_encode(glm_df['First-Time Depositor Subscriber (Yes / No)'])
glm_df['First-Time Depositor Subscriber (Yes / No)'] = user
print('First-Time Depositor Subscriber (Yes / No): \n\t', map_user)

#  Inside United States
inside_us_codes, map_inside_us = label_encode(glm_df['Inside United States'])
glm_df['Inside United States'] = inside_us_codes
print('Inside United States: \n\t', map_inside_us)

#  Signup On Promo Day
promo_codes, map_promo = label_encode(glm_df['Signup On Promo Day'])
glm_df['Signup On Promo Day'] = promo_codes
print('Signup On Promo Day: \n\t', map_promo)

# `ALDPPS` / `map_ALDPPS` stay available for later cells, bound to the promo-day encoding.
ALDPPS, map_ALDPPS = promo_codes, map_promo

# Category Name: one dummy per category, with 'unknown' as the dropped reference level
dummy_Category = pd.get_dummies(glm_df['Category Name'], prefix = 'Category_')
dummy_Category = dummy_Category.drop(columns = ['Category__unknown'])

# Medium Bucket: one dummy per bucket, with 'unknown' as the dropped reference level
dummy_Medium = pd.get_dummies(glm_df['Medium Bucket'], prefix = 'Medium_')
dummy_Medium = dummy_Medium.drop(columns = ['Medium__unknown'])

# Start Year: the first year is the dropped reference level
dummy_startYear = pd.get_dummies(glm_df['Start Year'], prefix = 'startYear_', drop_first=True)

# Combine: replace the three categorical columns with their dummy columns
glm_df = pd.concat([glm_df.drop(columns = ['Category Name', 'Medium Bucket', 'Start Year']),
                   dummy_Category,
                   dummy_Medium,
                   dummy_startYear], axis = 1)
glm_df.head()

In [None]:
glm_df.shape

# Correlation

In [None]:
def corrFilter(x: pd.DataFrame, bound: float):
    """List the column pairs of ``x`` whose correlation is at least ``bound`` in magnitude.

    Parameters
    ----------
    x : pd.DataFrame
        Frame whose pairwise column correlations are computed.
    bound : float
        Threshold; a pair is kept when its correlation is >= bound or <= -bound.
        Correlations exactly equal to 1 (the diagonal) are excluded.

    Returns
    -------
    pd.Series
        Correlations indexed by (column, column) pairs, sorted ascending.
        Repeated values are dropped, which removes the mirrored (b, a) entry of
        each (a, b) pair; one NaN entry can remain for the filtered-out cells.
    """
    # Use the frame that was passed in, not a notebook-level global.
    xCorr = x.corr()
    xFiltered = xCorr[((xCorr >= bound) | (xCorr <= -bound)) & (xCorr !=1.000)]
    xFlattened = xFiltered.unstack().sort_values().drop_duplicates()
    return xFlattened

corrFilter(glm_df, .3)

In [None]:
# print again
# NOTE(review): `map_ALDPPS` is last assigned from the 'Signup On Promo Day' encoding,
# so all three `map_ALDPPS` lines below print that same mapping - the 'Plan Status'
# line does not show the Plan Status mapping.
print('Plan Status: \n\t', map_ALDPPS)
print('First-Time Depositor Subscriber (Yes / No): \n\t', map_user)
print('Inside United States: \n\t', map_ALDPPS)
print('Signup On Promo Day: \n\t', map_ALDPPS)

In [None]:
# #Feature importance
# X = glm_df.drop(['Plan Status'], axis = 1)  ## X usually means our input variables (or independent variables)
# y = glm_df['Plan Status']
# # logistic regression for feature importance
# # define the model
# model = LogisticRegression()
# # fit the model
# model.fit(X, y)
# sorted_idx = model.coef_[0].argsort()
# feature_importance_ = pd.DataFrame({'Importance' : model.coef_[0][sorted_idx]}, index = X.columns[sorted_idx])
# feature_importance_.nlargest(10, 'Importance').plot(kind='barh')
# plt.xlabel("Logistic Regression Feature Importance")

# Training / Test Split

In [None]:
# Hold out 30% of the rows, stratified on the target.
# The target is selected by name rather than by position, so the split does not
# depend on 'Plan Status' being the first column of glm_df.
x_train, x_test, y_train, y_test = train_test_split(
    glm_df.drop(columns = ['Plan Status']),
    glm_df['Plan Status'],
    test_size = 0.3,
    stratify = glm_df['Plan Status'],
    random_state = 500
)

In [None]:
#do scaling
# Work on copies so the unscaled splits remain available.
x_train_scale, x_test_scale = x_train.copy(), x_test.copy()
y_train_scale, y_test_scale = y_train.copy(), y_test.copy()

# numerical features to standardize
num_cols = ['Monthly Plan Deposit Amount','Plan Donation Rate']

# The mean / std are learned from the training column only and then applied
# to both the training and the test column.
for i in num_cols:
    scale = StandardScaler()
    x_train_scale[i] = scale.fit_transform(x_train_scale[[i]])
    x_test_scale[i] = scale.transform(x_test_scale[[i]])


In [None]:
print('Shape of x_train: {}'.format(x_train.shape))
print('Shape of x_test: {}'.format(x_test.shape))

In [None]:
# Class counts of the training target before any oversampling is applied.
print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {}".format(sum(y_train==0)))

In [None]:
# deal with data imbalance: oversample the minority class of the training split

smt = SMOTE(random_state=0)
# fit_resample is the supported name; fit_sample was removed from imbalanced-learn.
x_train_SMOTE, y_train_SMOTE = smt.fit_resample(x_train, y_train)

In [None]:
# Shapes of the oversampled training features and target.
print('Shape of x_train after SMOTE: {}'.format(x_train_SMOTE.shape))
print('Shape of y_train after SMOTE: {}'.format(y_train_SMOTE.shape))

In [None]:
print("After OverSampling, counts of label '1': {}".format(sum(y_train_SMOTE==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_SMOTE==0)))

In [None]:
#cross validation - fine tuning model
#training and validation loss plot

# Logistic Regression

Model 1 - Sklearn


In [None]:
# Raw regression and test on test set
LR = LogisticRegression(random_state=2021, max_iter=1000)
fit1 = LR.fit(x_train_scale, y_train_scale)
pred1 = fit1.predict(x_test_scale)

print(classification_report(y_test_scale, pred1))

In [None]:
# Regression using Cross Validation and test on train set
LR = LogisticRegressionCV(cv=5, random_state=2021, max_iter=1000)
fit2 = LR.fit(x_train_scale, y_train_scale)
pred2 = fit2.predict(x_train_scale)

print(classification_report(y_train_scale, pred2))

In [None]:
# Regression using Cross Validation and test on test set

LR = LogisticRegressionCV(cv=5, random_state=2021, max_iter=1000)
fit3 = LR.fit(x_train_scale, y_train_scale)
pred3 = fit3.predict(x_test_scale)

print(classification_report(y_test_scale, pred3))

In [None]:
# Details for cross validation

scores = cross_val_score(fit3, x_train_scale, y_train_scale, cv=5, scoring= 'accuracy')
print('Cross-Validation Accuracy Scores', scores)

In [None]:
#Check for overfitting: compare accuracy on the training split with accuracy on the test split
# accuracy_score expects (y_true, y_pred) in that order.
train_as = metrics.accuracy_score(y_train_scale, pred2)
test_as = metrics.accuracy_score(y_test_scale, pred3)
print(f"Accuracy score for test data : {test_as}")
print(f"Accuracy score for train data : {train_as}")

In [None]:
# bring in interaction terms (all degree-2 products and squares of the training features)
poly = preprocessing.PolynomialFeatures(2, interaction_only=False, include_bias=False)
df_array = poly.fit_transform(x_train_scale)

# Name every generated column after the inputs it multiplies, e.g. 'a^1_x_b^1' or 'a^2'.
target_feature_names = [
    '_x_'.join(
        '{}^{}'.format(col, exp)
        for col, exp in zip(x_train_scale.columns, powers)
        if exp != 0
    )
    for powers in poly.powers_
]
result_train = pd.DataFrame(df_array, columns=target_feature_names)

In [None]:
# bring in interaction terms for the test set
# Reuse `poly`, the transformer fitted on the training features, and only
# transform the test features instead of refitting on them.
df_array = poly.transform(x_test_scale)

# Name every generated column after the inputs it multiplies, e.g. 'a^1_x_b^1' or 'a^2'.
target_feature_names = [
    '_x_'.join(
        '{}^{}'.format(col, exp)
        for col, exp in zip(x_test_scale.columns, powers)
        if exp != 0
    )
    for powers in poly.powers_
]
result_test = pd.DataFrame(df_array, columns=target_feature_names)

In [None]:
len(result_train.columns)

In [None]:
def roc_auc_plot(fpr, tpr, roc_auc, model_name='Logistic Regression', save_path='Log_ROC'):
    """Plot a ROC curve with the chance diagonal, save it, and show it.

    Parameters
    ----------
    fpr, tpr : array-like
        False / true positive rates, as returned by ``roc_curve``.
    roc_auc : float
        Area under the curve, shown in the legend.
    model_name : str, optional
        Name shown in the legend. Defaults to 'Logistic Regression'.
    save_path : str or None, optional
        File the figure is saved to; pass None to skip saving.
        Defaults to 'Log_ROC'.
    """
    plt.figure()
    plt.plot(fpr, tpr, label='%s (area = %0.2f)' % (model_name, roc_auc))
    # Dashed red diagonal = performance of a random classifier
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()

In [None]:
def logistic(X_train, y_train, X_test, y_test):
    """Fit a 5-fold cross-validated logistic regression and report how it does on the test data.

    Prints the confusion matrix, the classification report, the ROC curve / AUC
    and the ten most negative coefficients.

    Parameters
    ----------
    X_train, y_train : training features (DataFrame) and target.
    X_test, y_test : evaluation features (DataFrame) and target.

    Returns
    -------
    pd.DataFrame
        The non-zero coefficients with columns 'columns' and 'coef',
        sorted ascending by coefficient.
    """
    logreg = LogisticRegressionCV(cv=5, random_state=2021, max_iter=10000)
    fit = logreg.fit(X_train, y_train)
    y_pred = fit.predict(X_test)
    # Probability of the positive class (second column)
    y_prob = fit.predict_proba(X_test)[:,1]
    roc_auc = roc_auc_score(y_test, y_prob)
    fpr, tpr, _ = roc_curve(y_test, y_prob)

    print("=============Confusion Matrix=============")
    print(confusion_matrix(y_test, y_pred))
    print('\n')
    plot_confusion_matrix(fit, X_test, y_test, cmap=plt.cm.Blues)
    plt.show()
    print("=============Key Metrics==================")
    print(classification_report(y_test, y_pred))
    print("=============ROC AUC======================")
    roc_auc_plot(fpr, tpr, roc_auc)
    print('AUC:', roc_auc)

    print("=============Coefficients=================")
    table = pd.DataFrame({'columns': X_test.columns, 'coef': fit.coef_.tolist()[0]})
    # Keep only the terms with a non-zero coefficient
    remain_terms = table[table['coef'] != 0]
    # Named `ranked` so the builtin `sorted` is not shadowed
    ranked = remain_terms.sort_values(by=['coef'])
    print(ranked.head(10))
    return ranked

In [None]:
interaction_model = logistic(result_train,y_train_scale, result_test, y_test_scale)

In [None]:
pure_model = logistic(x_train_scale,y_train_scale, x_test_scale, y_test_scale)
print('')
print(pure_model)

Model 2 - statsmodels

In [None]:
import statsmodels.api as sm
# Fit a statsmodels logit on the scaled training data and show its summary table.
# NOTE(review): no constant column is added to x_train_scale here, so this model
# presumably has no intercept - confirm that is intended.
lr = sm.Logit(y_train_scale, x_train_scale).fit(maxiter=1000)
lr.summary()

# RandomForest

In [None]:
# Use train and test data without scaling
# Random forest without tuning parameters
rf = RandomForestClassifier(random_state = 2021)
fit5 = rf.fit(x_train, y_train)
pred5 = fit5.predict(x_test)
print(classification_report(y_test, pred5))


Tuning Parameters

In [None]:
# Tuning parameter by random search and grid search
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 42)
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' meant sqrt(n_features) for classifiers and is no longer accepted by
# recent scikit-learn releases, so only the equivalent 'sqrt' is listed.
max_features = ['sqrt']
# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune (seeded so the search is reproducible)
rf = RandomForestClassifier(random_state = 2021)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train, y_train)

In [None]:
rf_random.best_params_

In [None]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100],
    'max_features': ['sqrt'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [9, 10, 11],
    'n_estimators': [350, 400, 450]
}
# Create a base model (seeded so the grid search is reproducible)
rf = RandomForestClassifier(random_state = 2021)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(x_train, y_train)
grid_search.best_params_

In [None]:
# Fit in the search result to RF model
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix

def random_forest(x_train, y_train, x_test, y_test, random_state = 2021):
    """Fit the tuned random forest and report how it does on the evaluation data.

    Prints the confusion matrix, the classification report and the ROC curve / AUC.

    Parameters
    ----------
    x_train, y_train : training features and target.
    x_test, y_test : evaluation features and target.
    random_state : int, optional
        Seed for the forest so repeated runs give the same model. Defaults to 2021.
    """
    # Hyperparameters are hard-coded here (presumably taken from the grid search result).
    rfc = RandomForestClassifier(bootstrap = True, n_estimators = 400, min_samples_split = 9,
                                 min_samples_leaf = 3, max_features = 'sqrt', max_depth = 90,
                                 random_state = random_state)
    fit = rfc.fit(x_train, y_train)
    y_pred = fit.predict(x_test)
    # Probability of the positive class (second column)
    y_prob = fit.predict_proba(x_test)[:,1]
    roc_auc = roc_auc_score(y_test, y_prob)
    fpr, tpr, _ = roc_curve(y_test, y_prob)

    print("=============Confusion Matrix=============")
    print(confusion_matrix(y_test, y_pred))
    print('\n')
    # NOTE(review): assumes encoded class 0 is 'active' and class 1 is 'cancelled' -
    # confirm against the printed Plan Status mapping.
    labels = ['active', 'cancelled']
    plot_confusion_matrix(fit, x_test, y_test, display_labels= labels, cmap=plt.cm.Blues)
    plt.show()
    print("=============Key Metrics==================")
    print(classification_report(y_test, y_pred))
    print("=============ROC AUC======================")
    roc_auc_plot(fpr, tpr, roc_auc)
    print('AUC:', roc_auc)

In [None]:
random_forest(x_train, y_train, x_test, y_test)

Feature Importance

In [None]:
#Method 1: rf built-in feature importance
#https://mljar.com/blog/feature-importance-in-random-forest/

# Seeded so the importance ranking is the same on every run.
rfc = RandomForestClassifier(bootstrap = True, n_estimators = 400, min_samples_split = 9,
                             min_samples_leaf = 3, max_features = 'sqrt', max_depth = 90,
                             random_state = 2021)
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)
# Probability of the positive class (second column)
y_prob = rfc.predict_proba(x_test)[:,1]
roc_auc = roc_auc_score(y_test, y_prob)
fpr, tpr, thresholds = roc_curve(y_test, y_prob)


# Print the features from most to least important
importances = rfc.feature_importances_
x_columns = x_train.columns
indices = np.argsort(importances)[::-1]
for f in range(x_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, x_columns[indices[f]], importances[indices[f]]))

print("=============Confusion Matrix=============")
print(confusion_matrix(y_test, y_pred))
print('\n')
print("=============Key Metrics==================")
print(classification_report(y_test, y_pred))
print("=============ROC AUC======================")
roc_auc_plot(fpr, tpr, roc_auc)
print('AUC:', roc_auc)

In [None]:
plt.figure(figsize=(10,10))
sorted_idx = rfc.feature_importances_.argsort()
plt.barh(x_test.columns[sorted_idx], rfc.feature_importances_[sorted_idx])
plt.xlabel("Random Forest Feature Importance")
plt.show()

In [None]:
#Method 2: Drop column feature importance
from sklearn.base import clone 

def imp_df(column_names, importances):
    """Build a feature-importance table sorted from most to least important.

    Parameters
    ----------
    column_names : sequence
        Feature names.
    importances : sequence
        One importance value per feature, in the same order.

    Returns
    -------
    pd.DataFrame
        Indexed by 'Feature', with a single 'Importance' column in descending order.
    """
    table = pd.DataFrame({'Feature': column_names, 'Importance': importances})
    table = table.set_index('Feature')
    return table.sort_values('Importance', ascending=False)
    
def drop_col_feat_imp(model, X_train, y_train, random_state = 2021):
    """Drop-column feature importance.

    A clone of ``model`` is trained and scored on all of ``X_train``; then, for
    each column, another clone is trained and scored without that column. A
    feature's importance is the benchmark score minus the score without it.
    Every score is computed on the training data itself.

    Parameters
    ----------
    model : estimator
        Estimator to clone; clones share its specification.
    X_train : pd.DataFrame
        Training features.
    y_train : array-like
        Training target.
    random_state : int, optional
        Seed set on every clone so the fits are comparable. Defaults to 2021.

    Returns
    -------
    pd.DataFrame
        Table produced by ``imp_df``: one row per feature, sorted by importance.
    """
    def _fit_and_score(features):
        # Train a fresh, identically seeded clone and score it on the same rows.
        candidate = clone(model)
        candidate.random_state = random_state
        candidate.fit(features, y_train)
        return candidate.score(features, y_train)

    benchmark_score = _fit_and_score(X_train)
    importances = [
        benchmark_score - _fit_and_score(X_train.drop(col, axis = 1))
        for col in X_train.columns
    ]
    return imp_df(X_train.columns, importances)

In [None]:
feature_importance = drop_col_feat_imp(rfc, x_train, y_train, random_state = 2021)
feature_importance

In [None]:
plt.figure(figsize=(10,10))
plt.barh(feature_importance.index, feature_importance.Importance)
plt.gca().invert_yaxis()
plt.xlabel("Random Forest Drop Column Feature Importance")
plt.show()

SHAP

In [None]:
# Refit the tuned forest (seeded, so the SHAP values are reproducible) and explain the training rows.
rfc = RandomForestClassifier(bootstrap = True, n_estimators = 400, min_samples_split = 9,
                             min_samples_leaf = 3, max_features = 'sqrt', max_depth = 90,
                             random_state = 2021)
fit = rfc.fit(x_train, y_train)
shap_values = shap.TreeExplainer(fit).shap_values(x_train)

In [None]:
shap.summary_plot(shap_values[1], x_train)

In [None]:
def ABS_SHAP(df_shap,df):
    """Bar chart of the mean absolute SHAP value per feature, coloured by direction.

    A bar is red when a feature's SHAP values correlate positively with the
    feature's own values, and blue otherwise (including when the correlation is
    undefined, which is treated as 0).

    Parameters
    ----------
    df_shap : array-like
        SHAP values with one column per column of ``df``, rows in the same order.
    df : pd.DataFrame
        The feature values the SHAP values were computed for.
    """
    feature_list = df.columns
    shap_v = pd.DataFrame(df_shap)
    shap_v.columns = feature_list
    # drop=True discards the old index regardless of its name, so the rows line
    # up positionally with shap_v.
    df_v = df.reset_index(drop=True)

    # Correlation between each feature and its SHAP values decides the bar colour
    corr_list = [np.corrcoef(shap_v[i], df_v[i])[1][0] for i in feature_list]
    corr_df = pd.concat([pd.Series(feature_list),pd.Series(corr_list)],axis=1).fillna(0)
    # Column 1 is the feature, column 2 is the correlation coefficient
    corr_df.columns  = ['Variable','Corr']
    corr_df['Sign'] = np.where(corr_df['Corr']>0,'red','blue')

    # Mean absolute SHAP value per feature, smallest first so the largest bar is on top
    shap_abs = np.abs(shap_v)
    k = pd.DataFrame(shap_abs.mean()).reset_index()
    k.columns = ['Variable','SHAP_abs']
    k2 = k.merge(corr_df,left_on = 'Variable',right_on='Variable',how='inner')
    k2 = k2.sort_values(by='SHAP_abs',ascending = True)
    colorlist = k2['Sign']
    ax = k2.plot.barh(x='Variable',y='SHAP_abs',color = colorlist, figsize=(5,6),legend=False)
    ax.set_xlabel("SHAP Value (Red = Positive Impact)")
    
ABS_SHAP(shap_values[1],x_train) 

In [None]:
def make_shap_waterfall_plot(shap_values, features, num_display=20):
    """Plot each feature's share of the total absolute SHAP value and the cumulative share.

    Parameters
    ----------
    shap_values : array-like
        SHAP values with one column per column of ``features``.
    features : pd.DataFrame
        Feature frame; only its column names are used.
    num_display : int, optional
        Maximum number of top features to draw. Defaults to 20.
    """
    abs_shap = np.abs(shap_values)
    # Share (in percent) of the total absolute SHAP mass carried by each feature
    feature_ratio = (abs_shap.sum(0) / abs_shap.sum()) * 100

    # Order everything from the largest share to the smallest
    ranked_columns = features.columns[np.argsort(feature_ratio)[::-1]]
    ranked_ratio = np.sort(feature_ratio)[::-1]
    running_total = np.cumsum(ranked_ratio)

    # Keep only the top `num_display` features
    ranked_columns = ranked_columns[:num_display]
    ranked_ratio = ranked_ratio[:num_display]
    running_total = running_total[:num_display]

    # Grow the figure when more than 20 features are drawn
    extra_height = 0
    if num_display >= 20 and len(ranked_columns) >= 20:
        extra_height = (len(ranked_columns) - 20) * 0.4

    fig, ax1 = plt.subplots(figsize=(8, 8 + extra_height))
    # Reversed so the most important feature is drawn at the top
    ax1.plot(running_total[::-1], ranked_columns[::-1], c='blue', marker='o')
    ax2 = ax1.twiny()
    ax2.barh(ranked_columns[::-1], ranked_ratio[::-1], alpha=0.6)

    ax1.grid(True)
    ax2.grid(False)
    ax1.set_xticks(np.arange(0, round(running_total.max(), -1)+1, 10))
    ax2.set_xticks(np.arange(0, round(ranked_ratio.max(), -1)+1, 10))
    ax1.set_xlabel('Cumulative Ratio')
    ax2.set_xlabel('Composition Ratio')
    ax1.tick_params(axis="y", labelsize=13)
    plt.ylim(-1, len(ranked_columns))
    
# Shap waterfall plot
make_shap_waterfall_plot(shap_values[1], x_train)

In [None]:
# Let's have a look on the top features and their dependence plots
top_features = ['Plan Donation Rate', 'Category__Women', 'startYear__2021', 'Monthly Plan Deposit Amount', 'Medium__Email']

# Shap Dependence Plot for top features
for feature in top_features:
    shap.dependence_plot(feature, shap_values[1], x_train, interaction_index=None)

In [None]:
#XGboost