#### In HW3 you did an exercise on performing a classification on the **HeartData.csv** data file. We recently learned how to apply the LOOCV and K-Fold CV to regression problems. In this homework we would like to apply the LOOCV and K-Fold CV to the logistic regression, LDA and QDA models used in HW3. Using 10-fold CV and LOOCV fit the models and report the classification accuracy for the 3 models (logistic regression, LDA and QDA).For this question use num as the response variable and all the other variables as features.

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from ISLP.models import sklearn_sm
from ISLP.models import (ModelSpec as MS,summarize)
from sklearn.discriminant_analysis import \
     (LinearDiscriminantAnalysis as LDA,
      QuadraticDiscriminantAnalysis as QDA)
from sklearn.model_selection import cross_validate, LeaveOneOut, KFold, ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

# = = = = = Running LOOCV = = = = =

In [2]:
Data = pd.read_csv('HeartData-1.csv')
X_LR = MS(Data.columns.drop(['num'])).fit_transform(Data) # with intercept
X = Data.drop(['num'], axis = 1) # without intercept
y = Data['num']

In [3]:
def split_data_leave_one_out(X, y, i):
    # training data size 296
    X_train = X.drop(i)
    y_train = y.drop(i)

    # test data size 1
    X_test = X.iloc[i:i+1]
    y_test = y.iloc[i]
    return X_train, y_train, X_test, y_test

In [4]:
print('~ ~ ~ ~ ~ 1. Logistic Regression Model ~ ~ ~ ~ ~')
MSE_LR = np.zeros(len(Data))
for i in range(len(Data)):
    X_train, y_train, X_test, y_test = split_data_leave_one_out(X_LR, y, i)
    # fit model
    lrm = sm.GLM(y_train, X_train, family=sm.families.Binomial()).fit()
    # predict the left one sample
    pred = lrm.predict(exog=X_test)
    # calculate MSE
    MSE_LR[i] = mean_squared_error([y_test], [pred])
print(np.mean(MSE_LR))

~ ~ ~ ~ ~ 1. Logistic Regression Model ~ ~ ~ ~ ~
0.12403120021860257


In [5]:
print('~ ~ ~ ~ ~ 2. LDA Model ~ ~ ~ ~ ~')
lda = LDA(store_covariance=True)
MSE_LDA = np.zeros(len(Data))
for i in range(len(Data)):
    X_train, y_train, X_test, y_test = split_data_leave_one_out(X, y, i)
    # fit model
    lda.fit(X_train, y_train)
    # predict the left one sample
    pred = lda.predict(X_test)
    # calculate MSE
    MSE_LDA[i] = mean_squared_error([y_test], [pred])
print(np.mean(MSE_LDA))

~ ~ ~ ~ ~ 2. LDA Model ~ ~ ~ ~ ~
0.16498316498316498


In [6]:
print('~ ~ ~ ~ ~ 3. QDA Model ~ ~ ~ ~ ~')
qda = QDA(store_covariance=True)
MSE_QDA = np.zeros(len(Data))

for i in range(len(Data)):
    X_train, y_train, X_test, y_test = split_data_leave_one_out(X, y, i)
    # fit model
    qda.fit(X_train, y_train)
    # predict the left one sample value
    pred = qda.predict(X_test)
    # calculate MSE
    MSE_QDA[i] = mean_squared_error([y_test], [pred])
print(np.mean(MSE_QDA))

~ ~ ~ ~ ~ 3. QDA Model ~ ~ ~ ~ ~
0.1750841750841751


# = = = = = Running 10-Fold = = = = =

In [7]:
def split_into_10_Fold_chunks(X, y, train_index, test_index):
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]
    return X_train, y_train, X_test, y_test

In [8]:
kf = KFold(n_splits=10, shuffle=True, random_state=0)

MSE_LR = np.zeros(len(Data))
MSE_LDA = np.zeros(len(Data))
MSE_QDA = np.zeros(len(Data))

for i, (train_index, test_index) in enumerate(kf.split(Data)):
    # print(f"Fold {i}:")
    # print(f"  Train: index={train_index}")
    # print(f"  Test:  index={test_index}")
    X_train_LR, y_train, X_test_LR, y_test = split_data_leave_one_out(X_LR, y, i)
    X_train = X_train_LR.drop(['intercept'], axis = 1)
    X_test = X_test_LR.drop(['intercept'], axis = 1)
    
    # fit model
    lrm = sm.GLM(y_train, X_train_LR, family=sm.families.Binomial()).fit()
    lda.fit(X_train, y_train)
    qda.fit(X_train, y_train)
    
    # predict the values
    pred_LR = lrm.predict(exog=X_test_LR)
    pred_LDA = lda.predict(X_test)
    pred_QDA = qda.predict(X_test)
    
    # calculate MSE
    MSE_LR[i] = mean_squared_error([y_test], [pred_LR])
    MSE_LDA[i] = mean_squared_error([y_test], [pred_LDA])
    MSE_QDA[i] = mean_squared_error([y_test], [pred_QDA])
    
print("Logistic Regression CV error:", np.mean(MSE_LR))
print("LDA CV error:", np.mean(MSE_LDA))
print("QDA CV error:", np.mean(MSE_QDA))

Logistic Regression CV error: 0.0009922927494102863
LDA CV error: 0.0
QDA CV error: 0.0
