# 第五章 重抽样方法

In [1]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import (
    mean_squared_error,
    log_loss,
)
from sklearn.linear_model import (
    LogisticRegression,
    LogisticRegressionCV
)
import pandas as pd
import numpy as np

## 应用

### defualt数据集

In Chapter 4, we used logistic regression to predict the probability of `default` using `income` and `balance` on the `Default` data set. We will now estimate the test error of this logistic regression model using thevalidation set approach. Do not forget to set a random seed beforebeginning your analysis.

(a) Fit a logistic regression model that uses `income` and `balance` to predict `default`.


In [2]:
np.random.seed(seed=10)
default_df = pd.read_csv("datasets/Default.csv")

default_X_train, default_X_test, default_y_train, default_y_test = train_test_split(
    default_df[['income', 'balance']],
    default_df['default'].apply(lambda x: True if x == "Yes" else False),
    test_size=0.2)

default_lr_raw = LogisticRegression().fit(default_X_train, default_y_train)

default_lr_raw_train_error = np.sum(np.abs(
    default_lr_raw.predict(default_X_train) >= 0.5 -
    default_y_train
)) / default_y_train.count()

default_lr_raw_test_error = np.sum(np.abs(
    default_lr_raw.predict(default_X_test) >= 0.5 - 
    default_y_test
)) / default_y_test.count()


print(f"""\
test error: {default_lr_raw_test_error}
train_error: {default_lr_raw_train_error}
""")

test error: 0.0345
train_error: 0.033375



(b) Using the validation set approach, estimate the test error of this model. In order to do this, you must perform the following steps:

i. Split the sample set into a training set and a validation set.

ii. Fit a multiple logistic regression model using only the training observations.

iii. Obtain a prediction of default status for each individual in the validation set by computing the posterior probability of default for that individual, and classifying the individual to the default category if the posterior probability is greaterthan 0.5.

iv. Compute the validation set error, which is the fraction ofthe observations in the validation set that are misclassified.

(c) Repeat the process in (b) three times, using three different splits of the observations into a training set and a validation set. Comment on the results obtained.



In [12]:
np.random.seed(seed=20)
loo = LeaveOneOut()

res = []

default_X_train.index = list(range(len(default_X_train.index)))

for train_index, validation_index in loo.split(default_X_train):
    X_train, X_validation = default_X_train.iloc[train_index, :], default_X_train.iloc[validation_index, :]
    y_train, y_validation = default_y_train.iloc[train_index], default_y_train.iloc[validation_index]
    lr_model_temp = LogisticRegression().fit(X_train, y_train)
    train_error = np.sum(np.abs(lr_model_temp.predict(X_train) >= 0.5 - y_train)) / y_train.count()
    validation_error = np.sum(np.abs(lr_model_temp.predict(X_validation) >= 0.5 - y_validation))
    test_error = np.sum(np.abs(lr_model_temp.predict(default_X_test) >= 0.5 - default_y_test)) / default_y_test.count()
    res.append((train_error, validation_error, test_error))

res_df = pd.DataFrame(res, columns=['train_error', 'validation_error', 'test_error'])


In [13]:
print(f"""
validation_error: {res_df['validation_error'].mean()}
train_error: {res_df['train_error'].mean()}
test_error: {res_df['test_error'].mean()}
""")


validation_error: 0.0335
train_error: 0.03337503125390675
test_error: 0.03449999999999999



(d) Now consider a logistic regression model that predicts the probability of default using income,balance, and a dummy variable for student. Estimate the test error for this model using the validation set approach. Comment on whether or not including adummy variable for student leads to a reduction in the test error rate.

In [20]:
np.random.seed(seed=10)
default_df = pd.read_csv("datasets/Default.csv")
default_df['student_encode'] = default_df.student.apply(lambda x: True if x == "Yes" else False)

default_X_train, default_X_test, default_y_train, default_y_test = train_test_split(
    default_df[['income', 'balance', "student_encode"]],
    default_df['default'].apply(lambda x: True if x == "Yes" else False),
    test_size=0.2)

default_lr_raw = LogisticRegression().fit(default_X_train, default_y_train)

default_lr_raw_train_error = np.sum(np.abs(
    default_lr_raw.predict(default_X_train) >= 0.5 -
    default_y_train
)) / default_y_train.count()

default_lr_raw_test_error = np.sum(np.abs(
    default_lr_raw.predict(default_X_test) >= 0.5 - 
    default_y_test
)) / default_y_test.count()


print(f"""\
test error: {default_lr_raw_test_error}
train_error: {default_lr_raw_train_error}
""")

test error: 0.0345
train_error: 0.033375

