In [15]:
from ISLP import load_data
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the "Default" dataset from the ISLP package
default = load_data("Default")

# Convert the "student" column to binary (0 or 1)
default['student'] = default['student'].apply(lambda x: 1 if x == 'Yes' else 0)

# Set a random seed
np.random.seed(1)

# Exercise-5:a
# Fit a logistic regression model that uses income and balance to predict default
X = default[['income', 'balance']]
y = default['default']
model = LogisticRegression()
model.fit(X, y)
# Print the model coefficients and intercept
print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)

# Exercise-5:b
# Split the sample set into a training set and a validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=1)

# Fit a logistic regression model using only the training observations
model = LogisticRegression()
model.fit(X_train, y_train)

# Obtain a prediction of default status for each individual in the validation set
probs = model.predict_proba(X_val)[:, 1]
pred_glm = np.where(probs > 0.5, "Yes", "No")

# Compute the validation set error
validation_error = 1 - accuracy_score(y_val, pred_glm)
print("Validation Set Error:", validation_error)

# Exercise-5:c
# Repeat the process in Exercise-5:b three times using different splits
validation_errors = []
for _ in range(3):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    probs = model.predict_proba(X_val)[:, 1]
    pred_glm = np.where(probs > 0.5, "Yes", "No")
    validation_error = 1 - accuracy_score(y_val, pred_glm)
    validation_errors.append(validation_error)

print("Validation Set Errors (3 splits):", validation_errors)

# Exercise-5:d
# Logistic regression model with income, balance, and the binary "student" variable
X = default[['income', 'balance', 'student']]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=1)
model = LogisticRegression()
model.fit(X_train, y_train)
probs = model.predict_proba(X_val)[:, 1]
pred_glm = np.where(probs > 0.5, "Yes", "No")
validation_error = 1 - accuracy_score(y_val, pred_glm)
print("Validation Set Error (with student variable):", validation_error)


Model Coefficients: [[2.08091984e-05 5.64710797e-03]]
Model Intercept: [-11.54047812]
Validation Set Error: 0.032200000000000006
Validation Set Errors (3 splits): [0.032200000000000006, 0.024399999999999977, 0.03539999999999999]
Validation Set Error (with student variable): 0.032200000000000006


In [8]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from ISLP import load_data

# Load the "Default" dataset
default = load_data("Default")

# Convert 'default' column to binary (0 and 1)
default['default'] = (default['default'] == 'Yes').astype(int)

# Set a random seed
np.random.seed(1)

# Exercise-6:a
# Fit a logistic regression model using sm.GLM() to determine estimated standard errors
model = sm.GLM(default['default'], default[['income', 'balance']], family=sm.families.Binomial())
results = model.fit()
estimated_std_errors = results.bse
print("(a) Estimated standard errors using sm.GLM():")
print("Income:", estimated_std_errors['income'])
print("Balance:", estimated_std_errors['balance'])

# Exercise-6:b
def boot_fn(data, indices):
    sampled_data = data.iloc[indices]
    model = sm.GLM(sampled_data['default'], sampled_data[['income', 'balance']], family=sm.families.Binomial())
    results = model.fit()
    return results.params

# Exercise-6:c
n_bootstrap = 1000
n_samples = len(default)

bootstrap_results = []
for _ in range(n_bootstrap):
    indices = np.random.choice(n_samples, n_samples, replace=True)
    params = boot_fn(default, indices)
    bootstrap_results.append(params)

bootstrap_results = np.array(bootstrap_results)

# Calculate standard errors using the bootstrap percentile method
boot_std_error_income = np.std(bootstrap_results[:, 0])
boot_std_error_balance = np.std(bootstrap_results[:, 1])

print("\n(c) Estimated standard errors using Bootstrap (percentile method):")
print("Income:", boot_std_error_income)
print("Balance:", boot_std_error_balance)

# Exercise-6:d
# Comment on the estimated standard errors
print("\n(d) Estimated standard errors: Using sm.GLM() provides traditional standard errors based on model assumptions.")
print("Bootstrap offers a resampling-based approach, yielding more robust estimates, helpful when assumptions aren't perfectly met.")
print("In this case, 'Income' has a smaller standard error compared to 'Balance,' indicating more certainty in its coefficient estimate.")


(a) Estimated standard errors using sm.GLM():
Income: 3.6997599254501696e-06
Balance: 7.032108165333494e-05

(c) Estimated standard errors using Bootstrap (percentile method):
Income: 3.970711336125356e-06
Balance: 6.751985526499156e-05

(d) Estimated standard errors: Using sm.GLM() provides traditional standard errors based on model assumptions.
Bootstrap offers a resampling-based approach, yielding more robust estimates, helpful when assumptions aren't perfectly met.
In this case, 'Income' has a smaller standard error compared to 'Balance,' indicating more certainty in its coefficient estimate.


In [17]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from ISLP import load_data

weekly = load_data('Weekly')

# Convert 'Direction' column to binary (0 and 1)
weekly['Direction'] = (weekly['Direction'] == 'Up').astype(int)

# Exercise-7:a
# Fit a logistic regression model using Lag1 and Lag2 to predict Direction
model_a = sm.GLM(weekly['Direction'], weekly[['Lag1', 'Lag2']], family=sm.families.Binomial())
results_a = model_a.fit()

# Exercise-7:b
# Fit a logistic regression model using all but the first observation to predict Direction
model_b = sm.GLM(weekly['Direction'][1:], weekly[['Lag1', 'Lag2']][1:], family=sm.families.Binomial())
results_b = model_b.fit()

# Exercise-7:c
# Use the model from (b) to predict the direction of the first observation
posterior_prob_c = results_b.predict(weekly[['Lag1', 'Lag2']].iloc[0:1])
prediction_c = (posterior_prob_c > 0.5).astype(int)  # 1 if probability > 0.5, 0 otherwise
correct_classification = (prediction_c.iloc[0] == weekly['Direction'].iloc[0])
print("First Observation Classification Correct:", correct_classification)

# Exercise-7:d
n = len(weekly)
error_vector = np.zeros(n)

for i in range(n):
    # i
    model_d = sm.GLM(weekly['Direction'].iloc[0:n].drop(i), weekly[['Lag1', 'Lag2']].iloc[0:n].drop(i), family=sm.families.Binomial())
    results_d = model_d.fit()
    
    # ii
    posterior_prob_d = results_d.predict(weekly[['Lag1', 'Lag2']].iloc[i:i+1])
    
    # iii
    prediction_d = (posterior_prob_d > 0.5).astype(int)
    
    # iv
    error_vector[i] = int(prediction_d.iloc[0] != weekly['Direction'].iloc[i])

# Print the error vector indicating errors in predicting the direction for each observation
print("Error Vector for Each Observation:")
print(error_vector)

# Exercise-7:e
loocv_error = np.mean(error_vector)
# Print the LOOCV estimate for the test error and the classification result for the first observation
print("LOOCV Estimate for Test Error:", loocv_error)
# Comment on the results
print("\nComment: The LOOCV estimate for test error is approximately 0.465, indicating that the logistic regression model using Lag1 and Lag2 has an error rate of 46.5%. This suggests that the model's predictive accuracy may not be very high, and it should be further evaluated for its effectiveness.")


First Observation Classification Correct: False
Error Vector for Each Observation:
[1. 1. 0. ... 0. 0. 1.]
LOOCV Estimate for Test Error: 0.46464646464646464

Comment: The LOOCV estimate for test error is approximately 0.465, indicating that the logistic regression model using Lag1 and Lag2 has an error rate of 46.5%. This suggests that the model's predictive accuracy may not be very high, and it should be further evaluated for its effectiveness.
