In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
sns.set()

In [2]:
raw_data = pd.read_csv('2.02. Binary predictors.csv')
raw_data

Unnamed: 0,SAT,Admitted,Gender
0,1363,No,Male
1,1792,Yes,Female
2,1954,Yes,Female
3,1653,No,Male
4,1593,No,Male
...,...,...,...
163,1722,Yes,Female
164,1750,Yes,Male
165,1555,No,Male
166,1524,No,Male


In [3]:
data = raw_data.copy()
data['Gender'] = data['Gender'].map({'Male':0, 'Female':1})
data['Admitted'] = data['Admitted'].map({'Yes':1,'No':0})
data

Unnamed: 0,SAT,Admitted,Gender
0,1363,0,0
1,1792,1,1
2,1954,1,1
3,1653,0,0
4,1593,0,0
...,...,...,...
163,1722,1,1
164,1750,1,0
165,1555,0,0
166,1524,0,0


In [4]:
y = data['Admitted']
x1 = data['Gender']

In [5]:
x = sm.add_constant(x1)
reg = sm.Logit(y, x).fit()
reg.summary()

Optimization terminated successfully.
         Current function value: 0.572260
         Iterations 5


  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,166.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 18 May 2020",Pseudo R-squ.:,0.1659
Time:,02:30:30,Log-Likelihood:,-96.14
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,6.283e-10

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.6436,0.222,-2.901,0.004,-1.078,-0.209
Gender,2.0786,0.363,5.727,0.000,1.367,2.790


**Logit model**
The logit model for Gender - Admittance regression is:

$log(odds)$ = -0.6436 + 2.08 * Gender

Lets consider two odds:

$log(odds{\substack{1}}) =-0.6436 + 2.08 * Gender{\substack{1}}$

$log(odds{\substack{2}}) =-0.6436 + 2.08 * Gender{\substack{2}}$

Now lets subtract both:

$log(odds{\substack{2}}/odds{\substack{1}}) = 2.08(Gender{\substack{2}} - Gender{\substack{1}})$

- now lets assume Gender2 = female(1) and Gender1 = male(0)

$log(odds{\substack{female}}/odds{\substack{male}}) = 2.08(1 - 0)$

$log(odds{\substack{female}}/odds{\substack{male}}) = 2.08$

exponenting both sides implies:

$e^{log(odds{\substack{female}}/odds{\substack{male}})} = e^{2.08}$

$\frac{odds\substack{female}}{odds\substack{male}} = 7.99$

Therefore $odds\substack{female} = 7.99 * odds\substack{male}$

***
***
Now lets add SAT into independent variable and analyze the summary of regression:
***

In [6]:
y = data['Admitted']
x1 = data[['Gender', 'SAT']]

x = sm.add_constant(x1)
reg = sm.Logit(y, x).fit()
reg.summary()

Optimization terminated successfully.
         Current function value: 0.120117
         Iterations 10


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,165.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 18 May 2020",Pseudo R-squ.:,0.8249
Time:,02:30:30,Log-Likelihood:,-20.18
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,5.1180000000000006e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-68.3489,16.454,-4.154,0.000,-100.598,-36.100
Gender,1.9449,0.846,2.299,0.022,0.287,3.603
SAT,0.0406,0.010,4.129,0.000,0.021,0.060


 From the above summary what we got is:
 - higher log-likelihood, meaning it is better one. SAT was an outstanding predicator
 - the new coefficient for Gender is 1.9449

In [7]:
np.exp(1.9449)

6.992932526814459

**The above shows that: the same SAT score, a female has 7 times higher odds to get admitted.**
From the above dataset we can dedict that _This particular university(degreee) it is much easier for females to enter._

## Accuracy
Lets check the accuracy of our model:

*sm.LogitResults.predict():* returns the values predicted by our model 

In [8]:
reg.predict()

array([2.24098643e-06, 9.98264069e-01, 9.99997581e-01, 2.25470272e-01,
       2.48392751e-02, 9.92249420e-01, 9.96544212e-01, 9.99963261e-01,
       9.99971204e-01, 1.48031753e-02, 9.99875812e-01, 9.99951185e-01,
       7.60867651e-01, 2.33384671e-06, 5.96283811e-01, 9.99834996e-01,
       1.14446654e-01, 1.18626448e-01, 5.05147726e-01, 9.99865308e-01,
       9.99999366e-01, 9.99997048e-01, 1.71939595e-04, 5.61635704e-03,
       9.68663798e-01, 9.99644611e-01, 4.84851641e-01, 9.91962775e-01,
       9.99828160e-01, 9.94609023e-01, 1.15028367e-04, 8.32585363e-01,
       2.47449367e-01, 9.99998840e-01, 9.98847293e-01, 9.99372736e-01,
       3.12716933e-01, 9.99932453e-01, 2.32639633e-01, 5.29744519e-05,
       1.95739604e-02, 4.54521689e-01, 9.99956956e-01, 2.97763113e-06,
       9.94178832e-01, 1.77714430e-05, 9.93914956e-01, 2.29360536e-04,
       3.30501192e-04, 6.89914934e-03, 4.24966754e-03, 9.99999657e-01,
       9.23952460e-01, 2.28569785e-02, 9.99994550e-01, 5.47478329e-06,
      

In [9]:
np.set_printoptions(formatter={'float': lambda x: "{0:0.2f}".format(x)})
reg.predict()

array([0.00, 1.00, 1.00, 0.23, 0.02, 0.99, 1.00, 1.00, 1.00, 0.01, 1.00,
       1.00, 0.76, 0.00, 0.60, 1.00, 0.11, 0.12, 0.51, 1.00, 1.00, 1.00,
       0.00, 0.01, 0.97, 1.00, 0.48, 0.99, 1.00, 0.99, 0.00, 0.83, 0.25,
       1.00, 1.00, 1.00, 0.31, 1.00, 0.23, 0.00, 0.02, 0.45, 1.00, 0.00,
       0.99, 0.00, 0.99, 0.00, 0.00, 0.01, 0.00, 1.00, 0.92, 0.02, 1.00,
       0.00, 0.37, 0.98, 0.12, 1.00, 0.00, 0.78, 1.00, 1.00, 0.98, 0.00,
       0.00, 0.00, 1.00, 0.00, 0.78, 0.12, 0.00, 0.99, 1.00, 1.00, 0.00,
       0.30, 1.00, 1.00, 0.00, 1.00, 1.00, 0.85, 1.00, 1.00, 0.00, 1.00,
       1.00, 0.89, 0.83, 0.00, 0.98, 0.97, 0.00, 1.00, 1.00, 0.03, 0.99,
       0.96, 1.00, 0.00, 1.00, 0.01, 0.01, 1.00, 1.00, 1.00, 0.00, 0.00,
       0.02, 0.33, 0.00, 1.00, 0.09, 0.00, 0.97, 0.00, 0.75, 1.00, 1.00,
       0.01, 0.01, 0.00, 1.00, 0.00, 0.99, 0.57, 0.54, 0.87, 0.83, 0.00,
       1.00, 0.00, 0.00, 0.00, 1.00, 0.04, 0.00, 0.01, 1.00, 0.99, 0.52,
       1.00, 1.00, 0.05, 0.00, 0.00, 0.00, 0.68, 1.

The above predicted values by the model are probabilities, in the model these are the values of $\pi$.

Here the values less than 0.5 are less than 50% chance of admissions and same way values greater than 0.5 are greather than 50% chance.

The below are the actual values:

In [10]:
np.array(data['Admitted'])

array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0], dtype=int64)

**If the 80% of the predicted values coincide with the actual values, we say the model has 80% accuracy.**

Lets summaries in table by: _using pred_table_

In [11]:
reg.pred_table() #the values below are bit confusion, lets create a new tables in understanding way.

array([[69.00, 5.00],
       [4.00, 90.00]])

In [12]:
cm_df = pd.DataFrame(reg.pred_table())
cm_df.columns = ['Predicted 0', 'Predicted 1']
cm_df = cm_df.rename(index={0:'Actual 0', 1:'Actual 1'})
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,69.0,5.0
Actual 1,4.0,90.0


The above matrix is called confusion matrix, it shows how confused our model is:

**The model did its job well**
- For 69 Observations the model predicted 0 and the true value was 0
- For 90 Observations the model predicted 1 and the true values was 1

**The model got confused**
- For 4 Observations the model predicted 0 and the true value was 1
- For 5 Observations the model predicted 1 and the true value was 0

Overall the model made an accurate prediction in 159/168 cases =>
159/168 = 0.946 = **94.6% accuracy**

In [13]:
cm = np.array(cm_df)
accuracy = ((cm[0,0]+cm[1,1])/cm.sum())*100
accuracy

94.64285714285714

## Checking for Overfit
Lets split the data as train and test in a ratio of 70:30

In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x1, y, test_size=0.3, random_state=365)

In [15]:
#Applying Logit regression on Train data:
X_Train = sm.add_constant(x_train)
reg_train = sm.Logit(y_train, X_Train).fit()
reg_train.summary()

Optimization terminated successfully.
         Current function value: 0.108751
         Iterations 10


  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,117.0
Model:,Logit,Df Residuals:,114.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 18 May 2020",Pseudo R-squ.:,0.843
Time:,02:30:32,Log-Likelihood:,-12.724
converged:,True,LL-Null:,-81.06
Covariance Type:,nonrobust,LLR p-value:,2.1e-30

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-69.0770,21.132,-3.269,0.001,-110.495,-27.659
Gender,2.3388,1.089,2.147,0.032,0.204,4.474
SAT,0.0407,0.013,3.240,0.001,0.016,0.065


In [16]:
def confusion_matrix(data, actual_values, model):
    pred_values = model.predict(data)
    bins = np.array([0,0.5,1])
    cm = np.histogram2d(actual_values, pred_values, bins = bins)[0]
    accuracy = (cm[0,0]+cm[1,1])/cm.sum()
    return cm, accuracy

In [17]:
cm = confusion_matrix(X_Train, y_train, reg_train)
accuracy_train = cm[1]
cm

(array([[55.00, 2.00],
        [4.00, 56.00]]), 0.9487179487179487)

Lets find the accuracy for test data:

In [18]:
#Applying Logit regression on Test data:
X_Test = sm.add_constant(x_test)
reg_test = sm.Logit(y_test, X_Test).fit()
reg_test.summary()

Optimization terminated successfully.
         Current function value: 0.119181
         Iterations 10


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,51.0
Model:,Logit,Df Residuals:,48.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 18 May 2020",Pseudo R-squ.:,0.8128
Time:,02:30:32,Log-Likelihood:,-6.0782
converged:,True,LL-Null:,-32.462
Covariance Type:,nonrobust,LLR p-value:,3.48e-12

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-79.1481,33.146,-2.388,0.017,-144.113,-14.183
Gender,0.8311,1.490,0.558,0.577,-2.090,3.752
SAT,0.0479,0.020,2.383,0.017,0.009,0.087


In [19]:
cm_test = confusion_matrix(X_Test, y_test, reg_test)
accuracy_test = cm_test[1]
cm_test

(array([[15.00, 2.00],
        [2.00, 32.00]]), 0.9215686274509803)

In [20]:
#converting into table
cm_test_df = pd.DataFrame(cm_test[0])
cm_test_df.columns = ['Predicted 0','Predicted 1']
cm_test_df = cm_test_df.rename(index={0: 'Actual 0',1:'Actual 1'})
cm_test_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,15.0,2.0
Actual 1,2.0,32.0


In [21]:
print ('Missclassification rate: '+str((cm_test[0][0,1]+cm_test[0][1,0])/(cm_test[0][0,0]+cm_test[0][1,1])))

Missclassification rate: 0.0851063829787234
