## 10-fold cross validation vs. leave-one-out

In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

%matplotlib inline

### read dataset

In [2]:
# Read the raw CSV text; the `with` block closes the file on exit,
# so no explicit close() call is needed.
with open('Transfusion.txt', 'r') as fr:
    dataset = fr.readlines()

# Drop the header row.
dataset = dataset[1:]

# Parse every non-blank line into a list of floats (one row per sample).
# Blank lines (e.g. a trailing newline at end of file) are skipped instead of
# crashing float('').
data = [
    [float(value) for value in line.strip().split(",")]
    for line in dataset
    if line.strip()
]

In [3]:
# Convert the parsed rows (list of float lists) into a NumPy array so that
# columns can be sliced in the following cells.
data = np.array(data)

In [4]:
# Features are every column except the last; the last column is the label.
X, y = data[:, :-1], data[:, -1]

In [5]:
# Standardize each feature column to zero mean and unit variance.
# A constant column has zero standard deviation and dividing by it would
# produce NaN, so such columns are only centered (divisor replaced by 1).
X = (X - X.mean(0)) / np.where(X.std(0) == 0, 1.0, X.std(0))

In [6]:
# Separate the samples by label: 1 -> positive class, 0 -> negative class.
X_posit, X_neget = X[y == 1], X[y == 0]

In [7]:
### stratified random sampling ###

# Draw an independent random ordering for each class (positives first, then
# negatives, so the global RNG is consumed in the same order as before).
index_posit = np.random.permutation(len(X_posit))
index_neget = np.random.permutation(len(X_neget))

# Reorder each class by its random index.
X_posit_shuffle, X_neget_shuffle = X_posit[index_posit], X_neget[index_neget]

In [8]:
# Shuffle both index arrays again, in place. The order of these two calls
# determines how the global NumPy RNG stream is consumed.
np.random.shuffle(index_posit)
np.random.shuffle(index_neget)

In [9]:
# Re-apply the (re-shuffled) index arrays to obtain the shuffled samples.
X_posit_shuffle, X_neget_shuffle = X_posit[index_posit], X_neget[index_neget]

In [8]:
# Hold out everything after the first 120 positives / 380 negatives as test data.
train_posit, test_posit = X_posit_shuffle[:120], X_posit_shuffle[120:]
train_neget, test_neget = X_neget_shuffle[:380], X_neget_shuffle[380:]

In [9]:
def tenFoldSplit(X_p, X_n, num_p, num_n):
    '''
    Split positive and negative samples into 10 stratified folds.

    Folds 0-8 each take `num_p` positive samples followed by `num_n` negative
    samples; the last fold takes whatever remains of each class.

    Input:
        X_p: sequence of positive samples.
        X_n: sequence of negative samples.
        num_p: number of positive samples per fold.
        num_n: number of negative samples per fold.
    Return:
        train_folds: list of 10 folds, each a list of samples (positives first).
        y: list of 10 label lists (1 = positive, 0 = negative); every label
           list has exactly the same length as its fold.
    '''

    num_p, num_n = int(num_p), int(num_n)
    X_p, X_n = list(X_p), list(X_n)

    train_folds = []
    y = []
    for i in range(10):
        # The last fold is open-ended so that no leftover sample is dropped.
        stop_p = (i + 1) * num_p if i < 9 else None
        stop_n = (i + 1) * num_n if i < 9 else None
        fold_p = X_p[i * num_p:stop_p]
        fold_n = X_n[i * num_n:stop_n]

        train_folds.append(fold_p + fold_n)
        # Build the labels from the actual slice sizes: the last fold (or any
        # fold when data runs short) may not hold num_p / num_n samples.
        y.append([1] * len(fold_p) + [0] * len(fold_n))

    return train_folds, y

In [10]:
# Training folds: 12 positives + 38 negatives per fold.
train_fold, train_y = tenFoldSplit(
    X_p=train_posit.tolist(),
    X_n=train_neget.tolist(),
    num_p=12,
    num_n=38,
)
# Test folds: 6 positives + 19 negatives per fold.
test_fold, test_y = tenFoldSplit(
    X_p=test_posit.tolist(),
    X_n=test_neget.tolist(),
    num_p=6,
    num_n=19,
)

### Define logistic regression function (the Newton update is shown below for reference; the implementation uses fixed-step gradient descent)

### $$\beta^{t+1}=\beta^t-\big[ \frac{\partial^2\mathcal{l}(\beta)}{\partial\beta\partial\beta^T} \big]^{-1}\frac{\partial\mathcal{l}(\beta)}{\partial\beta}$$

In [11]:
def Log_Reg_fun(X, y):
    '''
    Fit logistic-regression coefficients on a training set.

    The optimizer is plain gradient descent on the negative log-likelihood
    with a fixed step size (0.001) and a fixed number of iterations (150).
    No Hessian is used, so this is not a Newton update. The model has no
    intercept term: the score is X.dot(Beta.T).

    Input:
        X: array-like with shape [m, n], one sample per row.
        y: array-like with m labels in {0, 1}; shape [m] or [m, 1].
    Return:
        Beta: numpy array with shape [1, n].
    '''

    X = np.asarray(X, dtype=float)
    # Force a column vector: a 1-D y would broadcast against the (m, 1)
    # probabilities into an (m, m) matrix and corrupt the gradient.
    y = np.asarray(y, dtype=float).reshape(-1, 1)

    n = X.shape[1]
    lr = 0.001
    # initialization #
    Beta = np.ones((1, n)) * 0.1

    for _ in range(150):
        z = X.dot(Beta.T)
        # Sigmoid written as exp(-log(1 + exp(-z))): unlike
        # exp(z) / (1 + exp(z)) it cannot overflow to inf / inf = NaN.
        p1 = np.exp(-np.logaddexp(0, -z))
        # Gradient of the negative log-likelihood with respect to Beta.
        first_order = -np.sum(X * (y - p1), 0, keepdims=True)

        # update #
        Beta -= first_order * lr

    return Beta

In [12]:
### define testing function ###
def testing(X, y, Beta):
    '''
    return error number of input X.
    '''
    
    predicts = (X.dot(Beta.T) >= 0)
#     print('pre: ', predicts)
    error_num = np.sum(predicts != y)
    
    return error_num

In [13]:
def tenFoldCrossValidation(folds, y):
    '''
    Return the total number of validation errors of k-fold cross validation.

    Each fold is held out once; a model is fitted with Log_Reg_fun on the
    remaining folds and evaluated with testing on the held-out fold. The
    number of folds and features is taken from the input, and folds may have
    different sizes.

    Input:
        folds: list of folds, each with shape [fold_size, n_features]
               (e.g. [10, 50, 4] overall).
        y: list of label lists, one per fold (e.g. [10, 50] overall).
    Output:
        error_nums: sum of the misclassification counts over all held-out folds.
    Raises:
        ValueError: if folds and y differ in length or fewer than two folds
            are given.
    '''

    num_folds = len(folds)
    if num_folds != len(y):
        raise ValueError('folds and y must contain the same number of folds')
    if num_folds < 2:
        raise ValueError('cross validation needs at least two folds')

    # Convert once up front; labels become column vectors for Log_Reg_fun.
    fold_X = [np.asarray(fold, dtype=float) for fold in folds]
    fold_y = [np.asarray(labels).reshape(-1, 1) for labels in y]

    tenFolderror_num = 0
    for i in range(num_folds):
        # Concatenate (rather than stack) the training folds so that folds of
        # unequal size, such as a remainder fold, are handled.
        train_X = np.concatenate(fold_X[:i] + fold_X[i + 1:], axis=0)
        train_y = np.concatenate(fold_y[:i] + fold_y[i + 1:], axis=0)

        beta = Log_Reg_fun(train_X, train_y)
        tenFolderror_num += testing(fold_X[i], fold_y[i], beta)

    return tenFolderror_num

In [14]:
### define LeaveOneOut validation function ###
def LOO(X, y):
    """
    Return the error count of leave-one-out validation.

    Every sample is held out once; a model is fitted with Log_Reg_fun on all
    other samples and the held-out sample is scored with testing.

    Input:
        X: array-like with shape [m, n_features].
        y: array-like with m labels in {0, 1}.
    Return:
        loo_error_nums: number of held-out samples that were misclassified.
    Raises:
        ValueError: if X and y hold a different number of samples.
    """
    samples = np.asarray(X, dtype=float)
    labels = np.asarray(y).reshape(-1, 1)
    if len(samples) != len(labels):
        raise ValueError('X and y must contain the same number of samples')

    loo_error_nums = 0
    # Iterate over every sample actually provided instead of a fixed count.
    for i in range(len(samples)):
        train_X = np.delete(samples, i, axis=0)
        train_y = np.delete(labels, i, axis=0)
        beta = Log_Reg_fun(train_X, train_y)
        # Keep the held-out sample 2-D (shape [1, n_features]).
        loo_error_nums += testing(samples[i:i + 1], labels[i:i + 1], beta)

    return loo_error_nums

In [15]:
# Cut the training set into five leave-one-out rounds. Each round takes the
# next 24 positives followed by the next 76 negatives; the fifth round is
# open-ended and takes whatever is left of each class.
(LOO_train_X_r1, LOO_train_X_r2, LOO_train_X_r3,
 LOO_train_X_r4, LOO_train_X_r5) = (
    train_posit[posit_part].tolist() + train_neget[neget_part].tolist()
    for posit_part, neget_part in zip(
        (slice(0, 24), slice(24, 48), slice(48, 72), slice(72, 96), slice(96, None)),
        (slice(0, 76), slice(76, 152), slice(152, 228), slice(228, 304), slice(304, None)),
    )
)

# The label list is built from the first round and shared (same list object)
# by all five rounds.
LOO_train_y_r1 = [1] * len(train_posit[:24]) + [0] * len(train_neget[:76])
LOO_train_y_r2 = LOO_train_y_r3 = LOO_train_y_r4 = LOO_train_y_r5 = LOO_train_y_r1

In [16]:
if __name__ == "__main__":

    # Leave-one-out validation, run separately on each of the five rounds.
    (loo_error_num_r1, loo_error_num_r2, loo_error_num_r3,
     loo_error_num_r4, loo_error_num_r5) = (
        LOO(round_X, round_y)
        for round_X, round_y in (
            (LOO_train_X_r1, LOO_train_y_r1),
            (LOO_train_X_r2, LOO_train_y_r2),
            (LOO_train_X_r3, LOO_train_y_r3),
            (LOO_train_X_r4, LOO_train_y_r4),
            (LOO_train_X_r5, LOO_train_y_r5),
        )
    )
    loo_error_num = sum((
        loo_error_num_r1,
        loo_error_num_r2,
        loo_error_num_r3,
        loo_error_num_r4,
        loo_error_num_r5,
    ))

    print(loo_error_num_r1, loo_error_num_r2, loo_error_num_r3, loo_error_num_r4, loo_error_num_r5)
    print('LeaveOneOut_errorNum: {}/500 '.format(loo_error_num), '= ', round(loo_error_num / 500, 4))

    # 10-fold cross validation on the training folds.
    tenfold_error_num = tenFoldCrossValidation(train_fold, train_y)
    print(tenfold_error_num)
    print('tenFoldCrossValidation_errorNum: {}/500'.format(tenfold_error_num), '= ', round(tenfold_error_num / 500, 4))

39 53 44 41 44
LeaveOneOut_errorNum: 221/500  =  0.442
205
tenFoldCrossValidation_errorNum: 205/500 =  0.41
