In [1]:
import numpy as np
import pandas as pd
import scipy
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
sns.set()

In [2]:
# Load the training set and recode both text columns as 0/1 indicators.
# Gender: Female -> 1, Male -> 0, so Male is the baseline category.
raw_data = pd.read_csv('binary_predictors.csv').assign(
    Admitted=lambda frame: frame['Admitted'].map({'Yes': 1, 'No': 0}),
    Gender=lambda frame: frame['Gender'].map({'Female': 1, 'Male': 0}),
)
raw_data.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,0,0
1,1792,1,1
2,1954,1,1
3,1653,0,0
4,1593,0,0


In [3]:
# Logistic regression with a single binary predictor: Admitted ~ const + Gender.
y, x1 = raw_data['Admitted'], raw_data['Gender']
x = sm.add_constant(x1)
reg_log = sm.Logit(endog=y, exog=x)
results_log = reg_log.fit()
# How to read the summary below:
#   * the model is significant (see the LLR p-value)
#   * log(odds) = -0.6436 + 2.0786 * Gender
#   * exp(2.0786) ~= 7.99, i.e. odds(female) = 7.99 * odds(male)
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.572260
         Iterations 5


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,166.0
Method:,MLE,Df Model:,1.0
Date:,"Sat, 13 Sep 2025",Pseudo R-squ.:,0.1659
Time:,17:29:38,Log-Likelihood:,-96.14
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,6.283e-10

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.6436,0.222,-2.901,0.004,-1.078,-0.209
Gender,2.0786,0.363,5.727,0.000,1.367,2.790


In [4]:
# Logistic regression with two predictors: Admitted ~ const + SAT + Gender.
# `y` (the Admitted column) is reused from the gender-only model cell.
x1 = raw_data.loc[:, ['SAT', 'Gender']]
x = sm.add_constant(x1)
reg_log = sm.Logit(endog=y, exog=x)
results_log = reg_log.fit()
# Adding SAT increases the log-likelihood relative to the gender-only model.
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.120117
         Iterations 10


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,165.0
Method:,MLE,Df Model:,2.0
Date:,"Sat, 13 Sep 2025",Pseudo R-squ.:,0.8249
Time:,17:29:38,Log-Likelihood:,-20.18
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,5.1180000000000006e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-68.3489,16.454,-4.154,0.000,-100.598,-36.100
SAT,0.0406,0.010,4.129,0.000,0.021,0.060
Gender,1.9449,0.846,2.299,0.022,0.287,3.603


In [5]:
# Show floats with two decimals in every NumPy array repr from here on.
# This is a global print option, so it also affects arrays shown in later cells.
np.set_printoptions(formatter={'float': '{0:0.2f}'.format})

# pred_table() cross-tabulates actual vs. predicted classes on the training
# data (rows = actual class, columns = predicted class), classifying the
# fitted probabilities from results_log.predict() at its default 0.5 cut-off.
# It is called once and labelled directly, instead of evaluating predict(),
# the actual labels and pred_table() as bare expressions whose values were
# never displayed or stored.
cm_df = pd.DataFrame(
    results_log.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df  # confusion matrix

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,69.0,5.0
Actual 1,4.0,90.0


In [6]:
# Training accuracy: correctly classified observations (the diagonal of the
# confusion matrix) divided by the total number of observations.
cm = cm_df.to_numpy(copy=True)
accuracy_train = np.trace(cm) / cm.sum()
accuracy_train

np.float64(0.9464285714285714)

In [7]:
# Load the hold-out set and apply the same 0/1 encoding used for the training data.
test = pd.read_csv('test_dataset.csv')
test['Admitted'] = test['Admitted'].map({'Yes': 1, 'No': 0})
test['Gender'] = test['Gender'].map({'Female': 1, 'Male': 0})

test_actual = test['Admitted']
test_data = test.drop(['Admitted'], axis=1)  # predictors only (SAT and Gender)
# has_constant='add' always inserts the intercept column; the default
# ('skip') silently omits it when a predictor happens to be constant in the
# test set, which would leave the design matrix one column short.
test_data = sm.add_constant(test_data, has_constant='add')
# The model was fitted without a formula, so predict() consumes the columns
# positionally. Reorder explicitly to the order used at fit time instead of
# relying on the CSV layout; a missing column raises KeyError here.
test_data = test_data[results_log.model.exog_names]

In [8]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Compute the 2x2 confusion matrix and accuracy of a binary classifier.

    Parameters
    ----------
    data : array-like or DataFrame
        Design matrix, passed unchanged to ``model.predict``.
    actual_values : array-like
        Observed class labels (0 or 1), one per row of ``data``.
    model : object
        Any object exposing ``predict(data)`` that returns one probability
        in [0, 1] per observation.
    threshold : float, default 0.5
        Probability cut-off. A prediction ``>= threshold`` counts as class 1,
        anything below it as class 0. The default reproduces the previous
        hard-coded behaviour.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        ``cm[i, j]`` is the number of observations whose actual class is ``i``
        and whose predicted class is ``j``.
    accuracy : float
        ``(cm[0, 0] + cm[1, 1]) / cm.sum()``; ``nan`` when there is nothing
        to count (e.g. empty input).

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError(
            'threshold must be strictly between 0 and 1, got {!r}'.format(threshold)
        )

    actual = np.asarray(actual_values, dtype=float)
    pred_vals = np.asarray(model.predict(data), dtype=float)

    # Nothing to classify: return an all-zero matrix instead of dividing 0 by 0.
    if actual.size == 0 and pred_vals.size == 0:
        return np.zeros((2, 2)), np.nan

    # Actual labels are split at 0.5 (0 -> first bin, 1 -> second bin);
    # predicted probabilities are split at the requested threshold.
    # The last bin is closed on the right, so a value of exactly 1 is counted.
    actual_edges = np.array([0, 0.5, 1])
    pred_edges = np.array([0, threshold, 1])
    cm = np.histogram2d(actual, pred_vals, bins=[actual_edges, pred_edges])[0]

    total = cm.sum()
    accuracy = (cm[0, 0] + cm[1, 1]) / total if total else np.nan
    return cm, accuracy

In [9]:
# Score the fitted model on the hold-out data. Note that `cm` here is the
# (confusion matrix, accuracy) pair returned by confusion_matrix(), not a
# bare matrix.
cm = confusion_matrix(
    data=test_data,
    actual_values=test_actual,
    model=results_log,
)
cm

(array([[5.00, 1.00],
        [1.00, 12.00]]),
 np.float64(0.8947368421052632))

In [10]:
# The complement of accuracy is the misclassification rate: 1 - accuracy.