In [22]:
from xverse.transformer import MonotonicBinning

In [23]:
import pandas as pd
import seaborn as sns
import numpy as np
import sklearn

# Seed NumPy's global random number generator so that code drawing from it is
# repeatable from one run to the next.
np.random.seed(1)
from sklearn.pipeline import make_pipeline

# Read the modelling dataset from disk and discard the "Unnamed: 0" column
# (presumably a row index written out when the CSV was saved -- confirm).
df = pd.read_csv('dfsmote.csv').drop(columns=["Unnamed: 0"])

In [24]:
# Split the frame into the feature matrix (every column except the target)
# and the target vector ("Outcome_G").
y = df["Outcome_G"]
X = df.drop(columns=["Outcome_G"])

In [25]:
# Fit a monotonic binner on the features and target to learn the bin edges
# that the weight-of-evidence transformation will be based on.
clf = MonotonicBinning()
clf.fit(X, y)

# Keep the learned edges (a dict keyed by feature name) for reuse, then show them.
output_bins = clf.bins
print(output_bins)

{'Term': array([  3.,  36.,  58., 180.]), 'Age1': array([1802., 3583., 4711., 8103.]), 'MUDI': array([0., 2., 6.]), 'Avg Transactions': array([  0.,  18.,  32., 187.]), 'Avg Bal': array([     0.,    435.,   1543., 135211.]), 'Avg Turnover': array([     0.,   2000.,   3416., 149227.]), 'Time with Bank Years': array([ 0.,  7., 10., 32.]), 'Total Salary': array([  0.,  28.,  43., 685.]), 'Multiple Customers': array([0, 1]), 'Single Purpose': array([0, 1])}


In [26]:
# Build a new binner directly from the stored bin edges and use it to replace
# every feature value with the interval it falls into.
clf = MonotonicBinning(custom_binning=output_bins)
out_X = clf.transform(X)

# Preview the first five binned rows.
out_X.head(5)

Unnamed: 0,Term,Age1,MUDI,Avg Transactions,Avg Bal,Avg Turnover,Time with Bank Years,Total Salary,Multiple Customers,Single Purpose
0,"(58.0, 180.0]","(4711.0, 8103.0]","(2.0, 6.0]","(-0.001, 18.0]","(1543.0, 135211.0]","(-0.001, 2000.0]","(7.0, 10.0]","(43.0, 685.0]",0,0
1,"(2.999, 36.0]","(3583.0, 4711.0]","(2.0, 6.0]","(18.0, 32.0]","(-0.001, 435.0]","(2000.0, 3416.0]","(7.0, 10.0]","(28.0, 43.0]",0,0
2,"(58.0, 180.0]","(1801.999, 3583.0]","(-0.001, 2.0]","(-0.001, 18.0]","(1543.0, 135211.0]","(-0.001, 2000.0]","(-0.001, 7.0]","(-0.001, 28.0]",0,0
3,"(2.999, 36.0]","(1801.999, 3583.0]","(-0.001, 2.0]","(32.0, 187.0]","(435.0, 1543.0]","(2000.0, 3416.0]","(-0.001, 7.0]","(-0.001, 28.0]",0,0
4,"(58.0, 180.0]","(1801.999, 3583.0]","(-0.001, 2.0]","(-0.001, 18.0]","(-0.001, 435.0]","(2000.0, 3416.0]","(7.0, 10.0]","(43.0, 685.0]",0,0


In [27]:
# Fit a weight-of-evidence encoder on the features and target, then display
# the per-bin table it produced (counts, event rates, WOE, information value).
# NOTE(review): the encoder is fitted on every row of X/y, while the train/test
# split later in this notebook is made on the already-encoded data, so test-row
# targets influence the encoding. Consider splitting first -- confirm intent.
from xverse.transformer import WOE

clf = WOE()
clf.fit(X, y)

clf.woe_df

Unnamed: 0,Variable_Name,Category,Count,Event,Non_Event,Event_Rate,Non_Event_Rate,Event_Distribution,Non_Event_Distribution,WOE,Information_Value
0,Age1,"(1801.999, 3583.0]",2986,2003,983,0.670797,0.329203,0.335511,0.329313,0.018645,0.00096
1,Age1,"(3583.0, 4711.0]",2987,2008,979,0.672246,0.327754,0.336348,0.327973,0.025216,0.00096
2,Age1,"(4711.0, 8103.0]",2982,1959,1023,0.656942,0.343058,0.328141,0.342714,-0.043453,0.00096
3,Avg Bal,"(-0.001, 435.0]",2989,1890,1099,0.632319,0.367681,0.316583,0.368174,-0.150971,0.023472
4,Avg Bal,"(435.0, 1543.0]",2982,2125,857,0.712609,0.287391,0.355946,0.287102,0.214942,0.023472
5,Avg Bal,"(1543.0, 135211.0]",2984,1955,1029,0.655161,0.344839,0.327471,0.344724,-0.051344,0.023472
6,Avg Transactions,"(-0.001, 18.0]",3129,1677,1452,0.535954,0.464046,0.280905,0.486432,-0.549083,0.188628
7,Avg Transactions,"(18.0, 32.0]",2845,2038,807,0.716344,0.283656,0.341374,0.270352,0.233253,0.188628
8,Avg Transactions,"(32.0, 187.0]",2981,2255,726,0.756458,0.243542,0.377722,0.243216,0.440208,0.188628
9,Avg Turnover,"(-0.001, 2000.0]",2987,1401,1586,0.469032,0.530968,0.234673,0.531323,-0.817176,0.404061


In [28]:
# Replace each feature value with the weight of evidence of its bin.
# `clf` must be the fitted WOE encoder here; the same name is bound to
# MonotonicBinning instances earlier in the notebook, so cell order matters.
woe_data = clf.transform(X)

In [29]:
# Display the WOE-encoded feature frame (one column per original feature).
woe_data

Unnamed: 0,Term,Age1,MUDI,Avg Transactions,Avg Bal,Avg Turnover,Time with Bank Years,Total Salary,Multiple Customers,Single Purpose
0,-0.205258,-0.043453,-0.812415,-0.549083,-0.051344,-0.817176,0.177524,0.671806,-0.146465,-1.175177
1,0.703098,0.025216,-0.812415,0.233253,-0.150971,0.335199,0.177524,0.186513,-0.146465,-1.175177
2,-0.205258,0.018645,0.414014,-0.549083,-0.051344,-0.817176,-0.366564,-0.674894,-0.146465,-1.175177
3,0.703098,0.018645,0.414014,0.440208,0.214942,0.335199,-0.366564,-0.674894,-0.146465,-1.175177
4,-0.205258,0.018645,0.414014,-0.549083,-0.150971,0.335199,0.177524,0.671806,-0.146465,-1.175177
...,...,...,...,...,...,...,...,...,...,...
8950,-0.205258,0.018645,0.414014,0.233253,0.214942,0.659555,-0.366564,0.186513,-0.146465,2.117002
8951,0.703098,-0.043453,0.414014,0.440208,-0.150971,0.335199,0.245500,0.671806,0.743678,2.117002
8952,0.703098,0.018645,0.414014,0.440208,-0.150971,0.335199,-0.366564,0.186513,-0.146465,2.117002
8953,-0.205258,0.018645,-0.812415,0.440208,0.214942,0.335199,0.177524,-0.674894,-0.146465,-1.175177


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the WOE-encoded rows for evaluation.
# random_state is pinned so the split is identical on every run and does not
# depend on how much of NumPy's global random stream earlier cells consumed
# (which varies with cell execution order).
X_train, X_test, y_train, y_test = train_test_split(
    woe_data,
    y,
    test_size=0.2,
    random_state=1,
)

In [16]:
from sklearn.linear_model import LogisticRegression

# Unregularised logistic regression with no intercept term, fitted on the
# training portion of the WOE-encoded data. max_iter is raised well above the
# library default to give the solver room to converge.
# NOTE(review): newer scikit-learn releases deprecate the string "none" for
# `penalty` in favour of None -- confirm against the installed version.
logr = LogisticRegression(
    penalty="none",
    fit_intercept=False,
    max_iter=10000,
)

# The value returned by fit() is kept under a second name.
lrmod = logr.fit(X_train, y_train)

In [17]:
from sklearn.metrics import confusion_matrix

# Hard class predictions for the held-out rows.
y_pred_lr = logr.predict(X_test)

# Cross-tabulate the true labels against the predicted labels and display it.
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)
cnf_matrix

array([[509,  80],
       [265, 937]])

In [18]:
import sklearn.metrics as metrics

# Print the headline classification scores for the held-out predictions,
# one "<label> <value>" line per metric.
for label, score_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
    ("F1Score:", metrics.f1_score),
):
    print(label, score_fn(y_test, y_pred_lr))

Accuracy: 0.8073701842546064
Precision: 0.9213372664700098
Recall: 0.7795341098169717
F1Score: 0.8445245606128887


In [20]:
import matplotlib.pyplot as plt

# Predicted probability for the class in column 1 of predict_proba
# (i.e. logr.classes_[1]) on the held-out rows.
y_pred_proba = logr.predict_proba(X_test)[:, 1]

# ROC curve points and the area under that curve.
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

# Draw on an explicit Axes so the figure can be labelled and stands on its own.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data, auc="+str(auc))
# Diagonal reference line: the ROC of a classifier that guesses at random.
ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="chance")
ax.set(
    xlabel="False positive rate",
    ylabel="True positive rate",
    title="ROC curve (logistic regression, test set)",
)
ax.legend(loc=4)
plt.show()

  plt.show()


In [21]:
# Display the ROC AUC computed from the test-set predicted probabilities.
auc

0.8915990327383054

In [30]:
# Rebuild the feature frame from the raw data (all columns except the target).
X = df.drop(columns=["Outcome_G"])

# List each fitted coefficient next to the training column it belongs to,
# rounded to six decimal places.
print("Model coefficients:\n")
for i, column_name in enumerate(X_train.columns):
    print(column_name, "=", logr.coef_[0][i].round(6))

Model coefficients:

Term = 0.844198
Age1 = 2.660239
MUDI = 0.448021
Avg Transactions = 0.509083
Avg Bal = 0.332859
Avg Turnover = 0.373567
Time with Bank Years = 0.933824
Total Salary = 0.182516
Multiple Customers = 1.119728
Single Purpose = 0.795117
