In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()

from scipy import stats
# Compatibility shim: some older statsmodels releases look up scipy.stats.chisqprob,
# which recent SciPy versions no longer ship. Re-create it as the chi-squared
# survival function (the distribution is `chi2`; `chi12` does not exist and
# would raise AttributeError as soon as the shim is called).
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

In [2]:
# Load the data used to fit the model; the path is relative to the working directory.
raw_data = pd.read_csv('Bank-data.csv')

In [3]:
# Inspect the frame exactly as loaded (the notebook renders a cell's last expression).
raw_data

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,3,4.120,0.0,0.0,0.0,0.0,686.0,yes
4,4,4.856,0.0,1.0,0.0,0.0,157.0,no
...,...,...,...,...,...,...,...,...
513,513,1.334,0.0,1.0,0.0,0.0,204.0,no
514,514,0.861,0.0,0.0,2.0,1.0,806.0,yes
515,515,0.879,0.0,0.0,0.0,0.0,290.0,no
516,516,0.877,0.0,0.0,5.0,1.0,473.0,yes


In [4]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
# NOTE: not idempotent - re-running this cell once the column is gone raises KeyError.
raw_data = raw_data.drop(columns=['Unnamed: 0'])

In [5]:
# Confirm that the 'Unnamed: 0' column is no longer present.
raw_data

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,4.120,0.0,0.0,0.0,0.0,686.0,yes
4,4.856,0.0,1.0,0.0,0.0,157.0,no
...,...,...,...,...,...,...,...
513,1.334,0.0,1.0,0.0,0.0,204.0,no
514,0.861,0.0,0.0,2.0,1.0,806.0,yes
515,0.879,0.0,0.0,0.0,0.0,290.0,no
516,0.877,0.0,0.0,5.0,1.0,473.0,yes


In [6]:
# Build a separate frame so raw_data keeps the original 'yes'/'no' labels,
# with the target encoded as 1 (yes) / 0 (no).
# Any label outside the mapping becomes NaN.
data = raw_data.assign(y=raw_data['y'].map({'yes': 1, 'no': 0}))
data

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.334,0.0,1.0,0.0,0.0,117.0,0
1,0.767,0.0,0.0,2.0,1.0,274.0,1
2,4.858,0.0,1.0,0.0,0.0,167.0,0
3,4.120,0.0,0.0,0.0,0.0,686.0,1
4,4.856,0.0,1.0,0.0,0.0,157.0,0
...,...,...,...,...,...,...,...
513,1.334,0.0,1.0,0.0,0.0,204.0,0
514,0.861,0.0,0.0,2.0,1.0,806.0,1
515,0.879,0.0,0.0,0.0,0.0,290.0,0
516,0.877,0.0,0.0,5.0,1.0,473.0,1


In [7]:
# List the column names available for modelling.
data.columns.values

array(['interest_rate', 'credit', 'march', 'may', 'previous', 'duration',
       'y'], dtype=object)

In [8]:
# Explanatory variables (six columns) ...
x1 = data[
    ['interest_rate', 'credit', 'march', 'may', 'previous', 'duration']
]
# ... and the 0/1 target.
y = data['y']

## Creating the Logistic Regression

In [9]:
# sm.Logit does not add an intercept by itself, so prepend a constant column.
x = sm.add_constant(x1)
# Set up the logit model (target first, design matrix second) and fit it.
reg_log = sm.Logit(endog=y, exog=x)
results_log = reg_log.fit()
# Show the regression summary table.
results_log.summary()


Optimization terminated successfully.
         Current function value: 0.335942
         Iterations 7


0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,511.0
Method:,MLE,Df Model:,6.0
Date:,"Fri, 28 Feb 2020",Pseudo R-squ.:,0.5153
Time:,22:55:16,Log-Likelihood:,-174.02
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,7.579e-77

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1385,0.339,-0.408,0.683,-0.804,0.527
interest_rate,-0.7802,0.092,-8.471,0.000,-0.961,-0.600
credit,2.4028,1.090,2.205,0.027,0.267,4.538
march,-1.8097,0.332,-5.459,0.000,-2.459,-1.160
may,0.1946,0.229,0.849,0.396,-0.255,0.644
previous,1.2746,0.583,2.186,0.029,0.132,2.417
duration,0.0070,0.001,9.386,0.000,0.006,0.008


In [10]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Build a 2x2 confusion matrix and the accuracy of a fitted binary model.

    Parameters
    ----------
    data : data frame or array
        Inputs formatted in the same way as the data the model was fitted on
        (e.g. const, var1, var2, ...), without the actual values.
        Column order is very important.
    actual_values : data frame column or array
        Observed outcomes; for a logistic regression this is a single
        column of 0s and 1s.
    model : fitted model object
        Object exposing ``predict(data)`` that returns values in [0, 1],
        e.g. ``results_log`` in this notebook.
    threshold : float, optional
        Cut-off applied to the predicted values: predictions greater than or
        equal to ``threshold`` are counted as 1, the rest as 0. Must lie
        strictly between 0 and 1. Defaults to 0.5, the value that used to
        be hard-coded.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        ``cm[i, j]`` is the number of observations whose actual class is
        ``i`` and whose predicted class is ``j``.
    accuracy : float
        Sum of the main diagonal of ``cm`` divided by the sum of all cells.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    # Predict the values using the fitted model.
    pred_values = model.predict(data)

    # Actual values are 0/1, so they are always split at 0.5; predictions are
    # split at the requested threshold. The last bin is closed on the right,
    # so values equal to 1 land in the second bin.
    actual_bins = np.array([0, 0.5, 1])
    pred_bins = np.array([0, threshold, 1])

    # 2-D histogram of (actual, predicted) pairs is the confusion matrix.
    cm = np.histogram2d(actual_values, pred_values, bins=[actual_bins, pred_bins])[0]

    # Correct predictions sit on the main diagonal.
    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()

    # Return the confusion matrix and the accuracy.
    return cm, accuracy

In [11]:
# In-sample check: confusion matrix and accuracy on the data the model was fitted on.
confusion_matrix(x,y,results_log)

(array([[220.,  39.],
        [ 31., 228.]]),
 0.8648648648648649)

In [12]:
# Wrap the model's own prediction table in a labelled frame:
# rows are the actual classes, columns the predicted classes.
cm_df = pd.DataFrame(
    results_log.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,220.0,39.0
Actual 1,31.0,228.0


In [54]:
# Load the testing file; the path is relative to the working directory.
test_data = pd.read_csv('Bank-data-testing.csv')

In [55]:
# Encode the test target as 1 (yes) / 0 (no) and keep it aside,
# then strip both the 'Unnamed: 0' column and the target from the features.
# NOTE: not idempotent - re-running this cell after 'y' has been dropped raises KeyError.
test_actul = test_data['y'].map({'yes': 1, 'no': 0})
test_data = test_data.drop(columns=['Unnamed: 0', 'y'])

In [63]:
# Prepend the constant column so the test frame has the same layout as the
# design matrix `x` the model was fitted on (const first, then the features).
test_data = sm.add_constant(test_data)
test_data

Unnamed: 0,const,interest_rate,credit,march,may,previous,duration
0,1.0,1.313,0.0,1.0,0.0,0.0,487.0
1,1.0,4.961,0.0,0.0,0.0,0.0,132.0
2,1.0,4.856,0.0,1.0,0.0,0.0,92.0
3,1.0,4.120,0.0,0.0,0.0,0.0,1468.0
4,1.0,4.963,0.0,0.0,0.0,0.0,36.0
...,...,...,...,...,...,...,...
217,1.0,4.963,0.0,0.0,0.0,0.0,458.0
218,1.0,1.264,0.0,1.0,1.0,0.0,397.0
219,1.0,1.281,0.0,1.0,0.0,0.0,34.0
220,1.0,0.739,0.0,0.0,2.0,0.0,233.0


In [64]:
# Inspect the encoded target series for the test set.
test_actul

0      0
1      0
2      0
3      1
4      0
      ..
217    1
218    1
219    0
220    0
221    1
Name: y, Length: 222, dtype: int64

In [67]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Build a 2x2 confusion matrix and the accuracy of a fitted binary model.

    Parameters
    ----------
    data : data frame or array
        Inputs formatted in the same way as the data the model was fitted on
        (e.g. const, var1, var2, ...), without the actual values.
        Column order is very important.
    actual_values : data frame column or array
        Observed outcomes; for a logistic regression this is a single
        column of 0s and 1s.
    model : fitted model object
        Object exposing ``predict(data)`` that returns values in [0, 1],
        e.g. ``results_log`` in this notebook.
    threshold : float, optional
        Cut-off applied to the predicted values: predictions greater than or
        equal to ``threshold`` are counted as 1, the rest as 0. Must lie
        strictly between 0 and 1. Defaults to 0.5, the value that used to
        be hard-coded.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        ``cm[i, j]`` is the number of observations whose actual class is
        ``i`` and whose predicted class is ``j``.
    accuracy : float
        Sum of the main diagonal of ``cm`` divided by the sum of all cells.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    # NOTE: a function with this name is already defined earlier in the
    # notebook; this later definition is the one in effect from here on.
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    # Predict the values using the fitted model.
    pred_values = model.predict(data)

    # Actual values are 0/1, so they are always split at 0.5; predictions are
    # split at the requested threshold. The last bin is closed on the right,
    # so values equal to 1 land in the second bin.
    actual_bins = np.array([0, 0.5, 1])
    pred_bins = np.array([0, threshold, 1])

    # 2-D histogram of (actual, predicted) pairs is the confusion matrix.
    cm = np.histogram2d(actual_values, pred_values, bins=[actual_bins, pred_bins])[0]

    # Correct predictions sit on the main diagonal.
    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()

    # Return the confusion matrix and the accuracy.
    return cm, accuracy

In [69]:
# Out-of-sample check on the test set; `cm` holds the (matrix, accuracy)
# tuple returned by confusion_matrix, not just the matrix.
cm = confusion_matrix(test_data,test_actul,results_log)
cm

(array([[94., 17.],
        [12., 99.]]),
 0.8693693693693694)