# Logistic regression with Binary predictors

In [10]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

### Prepare vars

In [11]:
raw_data = pd.read_csv('./data/binarypredictors.csv')
data = raw_data.copy()
# transform categorical data to binary
data['Admitted'] = data['Admitted'].map({'Yes': 1, 'No': 0})
data['Gender'] = data['Gender'].map({'Female': 1, 'Male': 0}) # Male is the baseline or reference group
data = data.drop(['SAT'], axis=1)
data

Unnamed: 0,Admitted,Gender
0,0,0
1,1,1
2,1,1
3,0,0
4,0,0
...,...,...
163,1,1
164,1,0
165,0,0
166,0,0


### Logistic regression

In [12]:
y = data['Admitted']
x1 = data['Gender']
x = sm.add_constant(x1)
reg_log = sm.Logit(y,x)
results_log = reg_log.fit()
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.572260
         Iterations 5


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,166.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 17 May 2023",Pseudo R-squ.:,0.1659
Time:,22:43:35,Log-Likelihood:,-96.14
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,6.283e-10

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.6436,0.222,-2.901,0.004,-1.078,-0.209
Gender,2.0786,0.363,5.727,0.000,1.367,2.790


### Interpretation

log(odds) = -064 + 2.08 * Gender

log(odds2) = -064 + 2.08 * Gender2

log(odds1) = -064 + 2.08 * Gender1

substracting the above two equations => log(odds2) - log(odds1) = 2.08(Gender2-Gender1)

since log(a)-log(b)=log(a/b)=> log(odds2/odds1) = 2.08(Gender2-Gender1)

log(oddsfemale/oddsmale) = 2.08(1-0) = 2.08

take exp both sides => oddsfemale/oddsmale = exp(2.08) => oddsfemale = 7.99*oddsmale

=> female are 7.99 (e^bk) times more likely than male to succeed

### Model accuracy

In [13]:
# Compute model accuracy based on predictions using confusion matrix
cm_df = pd.DataFrame(results_log.pred_table())
cm_df.columns = ['Predicted 0', 'Predicted 1']
cm_df = cm_df.rename(index={0: 'Actual 0', 1: 'Actual 1'})
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,59.0,15.0
Actual 1,31.0,63.0


In [14]:
# Accuracy = (59 + 63) / (59 + 63 + 15 + 31) = 73% accuracy
cm = np.array(cm_df)
accuracy = (cm[0,0]+cm[1,1])/cm.sum()
accuracy

0.7261904761904762

### Testing the model

In [15]:
# 90-10 split
test = pd.read_csv('./data/binarypredictorstest.csv')
test['Admitted'] = test['Admitted'].map({'Yes': 1, 'No': 0})
test['Gender'] = test['Gender'].map({'Female': 1, 'Male': 0})
test

Unnamed: 0,SAT,Admitted,Gender
0,1323,0,0
1,1725,1,1
2,1762,1,1
3,1777,1,0
4,1665,0,0
5,1556,1,1
6,1731,1,1
7,1809,1,1
8,1930,1,1
9,1708,1,0


In [16]:
# Make test data same format as training data
test_actual = test['Admitted']
test_data = test.drop(['Admitted'], axis=1)
test_data = test_data.drop(['SAT'], axis=1)
test_data = sm.add_constant(test_data)
# test_data = test_data[x.columns.values] if needed to fix the columns order so they match
test_data

Unnamed: 0,const,Gender
0,1.0,0
1,1.0,1
2,1.0,1
3,1.0,0
4,1.0,0
5,1.0,1
6,1.0,1
7,1.0,1
8,1.0,1
9,1.0,0


In [18]:
cm_df = pd.DataFrame(cm)
cm_df.columns = ['Predicted 0', 'Predicted 1']
cm_df = cm_df.rename(index={0: 'Actual 0', 1: 'Actual 1'})
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,59.0,15.0
Actual 1,31.0,63.0


In [22]:
# Manually compute confusion matrix
def confusion_matrix(data, actual_values, model):
    pred_values = model.predict(data)
    bins=np.array([0,0.5,1])
    cm=np.histogram2d(actual_values, pred_values, bins=bins)[0]
    accuracy = (cm[0,0]+cm[1,1])/cm.sum()
    missclassification_rate = 1-accuracy
    return accuracy, missclassification_rate
accurary,missclassification_rate = confusion_matrix(test_data, test_actual, results_log)
print('Accuracy: ' + str(accuracy))
print('Missclassification: ' + str(missclassification_rate))

Accuracy: 0.7261904761904762
Missclassification: 0.21052631578947367
