In [1]:
import warnings
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score
import scipy
import seaborn as sns
import matplotlib.pyplot as plt

# Problem 5

In [2]:
default = pd.read_csv('Default.csv', na_values=['?'])
default.dropna(axis=0, inplace=True)
default.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


In [3]:
#@title 5.a
X = default.loc[:, ['income', 'balance']]
y = pd.get_dummies(default['default'])['Yes']

default_logit = LogisticRegression(fit_intercept=True)
default_logit.fit(X, y)

default_logit.intercept_, default_logit.coef_

(array([-11.54046792]), array([[2.08089741e-05, 5.64710265e-03]]))

In [4]:
#@title 5.b
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(sm.add_constant(X), y, test_size=0.3, random_state=1)
def_logit = sm.Logit(y_train, X_train).fit()
def_logit.summary()

Optimization terminated successfully.
         Current function value: 0.078788
         Iterations 10


0,1,2,3
Dep. Variable:,Yes,No. Observations:,7000.0
Model:,Logit,Df Residuals:,6997.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 03 Jul 2025",Pseudo R-squ.:,0.4758
Time:,16:56:19,Log-Likelihood:,-551.52
converged:,True,LL-Null:,-1052.0
Covariance Type:,nonrobust,LLR p-value:,4.2489999999999996e-218

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-11.8581,0.528,-22.477,0.000,-12.892,-10.824
income,2.223e-05,6e-06,3.707,0.000,1.05e-05,3.4e-05
balance,0.0059,0.000,21.078,0.000,0.005,0.006


In [5]:
from sklearn.metrics import accuracy_score

predictions = def_logit.predict(X_test)
predictions[predictions >= 0.5] = 1
predictions[predictions < 0.5] = 0


print(f'Misclassification Error: {1 - accuracy_score(y_test, predictions)}')

Misclassification Error: 0.024666666666666615


In [6]:
#@title 5.c
miss_class_err = []
for i in range(3):
    X_train, X_test, y_train, y_test = train_test_split(sm.add_constant(X), y, test_size=0.7)
    def_logit = LogisticRegression(fit_intercept=True).fit(X_train, y_train)

    predictions = def_logit.predict(X_test)
    predictions[predictions >= 0.5] = 1
    predictions[predictions < 0.5] = 0

    error_rate = 1 - accuracy_score(y_test, predictions)
    miss_class_err.append(error_rate)
    print(f'Misclassification Error - {i+1}: {error_rate}')

Misclassification Error - 1: 0.028714285714285692
Misclassification Error - 2: 0.025428571428571467
Misclassification Error - 3: 0.0268571428571428


In [7]:
np.array(miss_class_err).mean()

np.float64(0.026999999999999986)

In [8]:
#@title 5.d
X = default.loc[:, ['income', 'balance', 'student']]
X = pd.get_dummies(X, columns=['student'])[['income', 'balance', 'student_Yes']]
y = pd.get_dummies(default['default'])['Yes']
# Including a dummy variable for student
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
def_logit2 = LogisticRegression(fit_intercept=True).fit(X_train, y_train)

predictions = def_logit2.predict(X_test)
predictions[predictions >= 0.5] = 1
predictions[predictions < 0.5] = 0
error_rate = 1 - accuracy_score(y_test, predictions)
print(f'Model with student dummy var. : {error_rate}')

Model with student dummy var. : 0.024333333333333318


In [9]:
# Not including a dummy variable for student
X_train, X_test, y_train, y_test = train_test_split(X.drop(['student_Yes'], axis=1),
                                                    y, test_size=0.3, random_state=1)

def_logit3 = LogisticRegression(fit_intercept=True).fit(X_train, y_train)
predictions = def_logit3.predict(X_test)
predictions[predictions >= 0.5] = 1
predictions[predictions < 0.5] = 0

error_rate = 1 - accuracy_score(y_test, predictions)
print(f'Model with student dummy var. : {error_rate}')

Model with student dummy var. : 0.024666666666666615


# Question 6

In [10]:
#@title 6.a
X = default.loc[:, ['income', 'balance']]
y = pd.get_dummies(default['default'])['Yes']

logit = sm.Logit(y, sm.add_constant(X)).fit(disp=False)
logit.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-11.5405,0.435,-26.544,0.000,-12.393,-10.688
income,2.081e-05,4.99e-06,4.174,0.000,1.1e-05,3.06e-05
balance,0.0056,0.000,24.835,0.000,0.005,0.006


In [11]:
#@title 6.b
def boot_fn(data, indices):
    X = sm.add_constant(data[['balance', 'income']].loc[indices])
    y = pd.get_dummies(data['default'].loc[indices])['Yes']

    logit = sm.Logit(y, X).fit(disp=False)
    return [logit.params['balance'], logit.params['income']]
np.random.seed(0)
indices = np.random.choice(default.index, size=len(default), replace=True)
boot_fn(default, indices)

[np.float64(0.005799490373833468), np.float64(2.1728365149007354e-05)]

In [12]:
#@title 6.c
def boot(data, func, num_samples=1000):
    coef_income = []
    coef_balance = []
    for i in range(num_samples):
        indices = np.random.choice(default.index, size=len(default), replace=True)
        results = func(data, indices)
        coef_income.append(results[0])
        coef_balance.append(results[1])

    return np.array(coef_balance).std(), np.array(coef_income).std()

np.random.seed(0)
boot(default, boot_fn, 1000)

(np.float64(4.713197793939103e-06), np.float64(0.00023616384027116337))

## 6.d
Estimated bootstrap standard errors are comparable to those obtained in section 6.b.