# Binary Predictors in a Logistic Regression

Using the same code as in the previous exercise, find the odds of 'duration'. 

What do they tell you?

## Import the relevant libraries

In [None]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

## Load the data

Load the ‘Bank_data.csv’ dataset.

In [None]:
# Read the CSV into a DataFrame; the bare name on the last line echoes it as the cell output.
# NOTE(review): the instruction text spells the file 'Bank_data.csv' (underscore) while this
# call reads 'Bank-data.csv' (hyphen) - confirm which spelling matches the file on disk.
raw_data = pd.read_csv('Bank-data.csv')
raw_data

In [None]:
# Work on a copy so the frame we loaded (raw_data) stays untouched, and
# discard the 'Unnamed: 0' index column that comes with the CSV.
data = raw_data.copy().drop(columns=['Unnamed: 0'])

# Encode the target numerically: 'yes' becomes 1 and 'no' becomes 0.
data['y'] = data['y'].map({'yes': 1, 'no': 0})
data

### Declare the dependent and independent variables

Use 'duration' as the independent variable.

In [None]:
# Predictor: the 'duration' column (no constant added yet).
x1 = data['duration']
# Target: the 0/1 encoded 'y' column.
y = data['y']

### Simple Logistic Regression

Run the regression.

In [None]:
# Add the constant (intercept) column to the single predictor.
x = sm.add_constant(x1)

# Define the logit model of y on x, fit it, and show the summary table.
reg_log = sm.Logit(endog=y, exog=x)
results_log = reg_log.fit()
results_log.summary()

In [None]:
# Scatter of the raw predictor (duration, without the constant) against the 0/1 outcome.
axis_label_size = 20
plt.scatter(x=x1, y=y, color='C0')

# Label both axes so the plot is readable on its own.
plt.xlabel('Duration', fontsize=axis_label_size)
plt.ylabel('Subscription', fontsize=axis_label_size)
plt.show()

### Find the odds of duration

In [None]:
import numpy as np

# Exponentiating a logit coefficient turns it into an odds multiplier.
# NOTE(review): 0.0051 is presumably the 'duration' coefficient copied by hand from
# the regression summary above - confirm, or read it from the fitted results instead.
duration_coefficient = 0.0051
np.exp(duration_coefficient)

### Expand the model and Declare the independent variable(s)

In [None]:
# Keep the predictor column names in one list so they can be reused
# wherever the same set of columns has to be selected.
estimators = [
    'interest_rate',
    'march',
    'credit',
    'previous',
    'duration',
]

# Target and predictor frame for the expanded model.
y = data['y']
X1 = data[estimators]

In [None]:
# Add the constant (intercept) column to the expanded set of predictors.
X = sm.add_constant(X1)

# Define the logit model of y on X, fit it, and show the summary2() report.
reg_logit = sm.Logit(endog=y, exog=X)
results_logit = reg_logit.fit()
results_logit.summary2()

### Accuracy

In [None]:
# Print floats with two decimals from here on (this changes NumPy's global print options).
two_decimals = "{0:0.2f}".format
np.set_printoptions(formatter={'float': two_decimals})

# Predicted values from the fitted model.
results_logit.predict()

In [None]:
# The observed 0/1 outcomes as a NumPy array - presumably shown so they can be
# compared by eye with the predicted values from the previous cell.
np.array(data['y'])

In [None]:
# Raw prediction table of the fitted model; the next cell labels its rows as
# actual outcomes and its columns as predicted outcomes.
results_logit.pred_table()

In [None]:
# Wrap the 2x2 prediction table in a DataFrame with readable labels:
# rows are the actual outcomes, columns the predicted ones.
cm_df = pd.DataFrame(
    results_logit.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

### Testing the model and assessing its accuracy

In [None]:
# Load the testing data set and keep the loaded frame (raw_data2) untouched.
raw_data2 = pd.read_csv("Bank_testing.csv")

# Work on a copy with the 'Unnamed: 0' index column removed.
data_test = raw_data2.copy().drop(columns=['Unnamed: 0'])

In [None]:
# Encode the test target the same way as the training target: 'yes' -> 1, 'no' -> 0.
data_test['y'] = data_test['y'].map(dict(yes=1, no=0))
data_test

In [None]:
# Select the same predictor columns as the training model, in the same order.
X1_test = data_test[estimators]
# Observed outcomes of the test set.
y_test = data_test['y']
# Test predictors with the constant (intercept) column added.
X_list = sm.add_constant(X1_test)

In [None]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Return the 2x2 confusion matrix and the accuracy of a fitted model.

    Parameters
    ----------
    data : data frame or array
        Inputs formatted in the same way as the data the model was fitted
        on, without the actual values (e.g. const, var1, var2, ...).
        Column order is very important, because ``data`` is handed
        straight to ``model.predict``.
    actual_values : data frame or array
        The actual outcomes that belong to ``data``. For a logistic
        regression this is a single column of 0s and 1s.
    model : LogitResults object
        The fitted model (e.g. ``results_log`` in this course). Only its
        ``predict(data)`` method is used.
    threshold : float, optional
        Cut-off applied to the predicted values: predictions below it
        count as class 0, predictions at or above it count as class 1.
        Must lie strictly between 0 and 1. Defaults to 0.5.

    Returns
    -------
    cm : numpy.ndarray
        2x2 array of counts; rows are the actual classes (0, 1) and
        columns are the predicted classes (0, 1).
    accuracy : float
        Share of observations on the main diagonal of ``cm``.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    # Predict the values using the fitted model.
    pred_values = model.predict(data)

    # The actual outcomes are 0/1, so they are always split at 0.5; only the
    # predicted values are split at the caller's threshold.
    actual_edges = np.array([0, 0.5, 1])
    predicted_edges = np.array([0, threshold, 1])

    # A 2D histogram over (actual, predicted) with two bins per axis is
    # exactly the confusion matrix.
    cm = np.histogram2d(
        actual_values, pred_values, bins=[actual_edges, predicted_edges]
    )[0]

    # Correct predictions sit on the main diagonal.
    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()
    return cm, accuracy

In [None]:
# Score the fitted model on the test set (X_list, y_test) rather than on the
# training data (X, y): the model has already seen the training data, so only
# the test set shows how well it predicts new observations.
confusion_matrix(X_list, y_test, results_logit)