In [32]:
import numpy as np
import pandas as pd
from wine_datacleaning_with_LDA import *
import itertools
from wine_datacleaning_with_LRegression import LogisticRegression
from accuracy import *


# split data set into k-fold
def k_fold_split(df, k):
    """Shuffle the rows of ``df`` and partition them into ``k`` folds.

    Rows are shuffled by position rather than by index label, so a frame
    whose index contains duplicate labels is split correctly (a label-based
    ``reindex`` raises on duplicate labels).

    Args:
        df: DataFrame to split.
        k: Number of folds; must be a positive integer.

    Returns:
        A list of ``k`` DataFrames that together contain every row of ``df``
        exactly once. Fold sizes differ by at most one row, with the larger
        folds first.

    Raises:
        ValueError: If ``k`` is not positive (raised by ``np.array_split``).
    """
    # Permute row positions, then split the positions; each chunk of
    # positions selects the rows of one fold.
    shuffled_positions = np.random.permutation(len(df))
    return [df.iloc[fold] for fold in np.array_split(shuffled_positions, k)]

# perform logistic regression training and validation given the training_set, validation_set
def validation_LRegression(training_set, validation_set, y):
    """Train logistic regression on one split and score it on the held-out fold.

    Args:
        training_set: DataFrame of training rows, including the label column.
        validation_set: DataFrame of held-out rows with the same columns.
        y: Name of the label column; every other column is used as a feature.

    Returns:
        The validation accuracy as returned by ``accuracy``.
    """
    training_set_x = np.array(training_set.loc[:, training_set.columns != y])
    # Labels are shaped as a column vector (one single-element row per sample).
    training_set_y = np.array([[yi] for yi in training_set[y]])
    validation_set_x = np.array(validation_set.loc[:, validation_set.columns != y])
    validation_set_y = np.array([[yi] for yi in validation_set[y]])
    LRModel = LogisticRegression()
    # NOTE(review): the lambda and 3000 are presumably a decaying learning-rate
    # schedule and an iteration cap -- confirm against LogisticRegression.fit.
    LRModel.fit(training_set_x, training_set_y, lambda n: 0.001/(n+1), 3000)
    print('---------------------Logistic Regression Training set accuracy---------------------')
    print('Training set data: ', len(training_set.index))
    prediction = LRModel.predict(validation_set_x)
    # accuracy() yields a single score (validation_LDA consumes it the same
    # way and prints a scalar), so it must not be unpacked into three values.
    validation_accuracy = accuracy(prediction, validation_set_y)
    print('---------------------Logistic Regression Validation set accuracy-------------------')
    print('Validation set data: ', len(validation_set.index))
    print('Validation set accuracy: ', validation_accuracy)
    return validation_accuracy

# perform LDA training and validation given the training_set, validation_set
def validation_LDA(training_set, validation_set, y):
    """Fit an LDA classifier on ``training_set`` and score it on ``validation_set``.

    Args:
        training_set: DataFrame of training rows, including the label column.
        validation_set: DataFrame of held-out rows with the same columns.
        y: Name of the label column; every other column is used as a feature.

    Returns:
        The validation accuracy as returned by ``accuracy``.
    """
    def features_and_labels(frame):
        # Every column except ``y`` is a feature; labels become a column vector.
        features = np.array(frame.loc[:, frame.columns != y])
        labels = np.array([[label] for label in frame[y]])
        return features, labels

    train_x, train_y = features_and_labels(training_set)
    held_out_x, held_out_y = features_and_labels(validation_set)

    classifier = LDAClassifier()
    classifier.fit(train_x, train_y)
    print('----------------------------LDA Training set accuracy------------------------------')
    print('Training set data: ', len(training_set.index))

    validation_accuracy = accuracy(classifier.predict(held_out_x), held_out_y)
    print('----------------------------LDA Validation set accuracy----------------------------')
    print('Validation set data: ', len(validation_set.index))
    print('Validation set accuracy: ', validation_accuracy)
    return validation_accuracy
    
def k_fold_validation_LRegression(df, y, k):
    """Run k-fold cross-validation of logistic regression and report the mean accuracy.

    Each fold in turn is held out for validation while the remaining folds
    are concatenated into the training set.

    Args:
        df: DataFrame holding the features and the label column.
        y: Name of the label column.
        k: Number of folds; must be at least 2 so a training set remains.

    Returns:
        The validation accuracy averaged over the ``k`` folds.

    Raises:
        ValueError: If ``k`` is smaller than 2.
    """
    if k < 2:
        # With a single fold nothing is left to train on; fail with a clear
        # message instead of an obscure error from pd.concat.
        raise ValueError('k must be at least 2 for k-fold validation, got %r' % (k,))
    folds = k_fold_split(df, k)
    # Named so it does not shadow the imported accuracy() helper.
    total_accuracy = 0.
    for i, validation_set in enumerate(folds):
        training_set = pd.concat(folds[:i] + folds[i + 1:])
        total_accuracy += validation_LRegression(training_set, validation_set, y)
    mean_accuracy = total_accuracy / k
    print('----------------------------Logistic Regression %d-fold Validation-----------------'%(k))
    print('Fold: ', k)
    print('Accuracy: ', mean_accuracy)
    print('Error rate: ', 1-mean_accuracy)
    return mean_accuracy
        
        
def k_fold_validation_LDA(df, y, k):
    """Run k-fold cross-validation of the LDA classifier and report the mean accuracy.

    Each fold in turn is held out for validation while the remaining folds
    are concatenated into the training set.

    Args:
        df: DataFrame holding the features and the label column.
        y: Name of the label column.
        k: Number of folds; must be at least 2 so a training set remains.

    Returns:
        The validation accuracy averaged over the ``k`` folds.

    Raises:
        ValueError: If ``k`` is smaller than 2.
    """
    if k < 2:
        # With a single fold nothing is left to train on; fail with a clear
        # message instead of an obscure error from pd.concat.
        raise ValueError('k must be at least 2 for k-fold validation, got %r' % (k,))
    folds = k_fold_split(df, k)
    # Named so it does not shadow the imported accuracy() helper.
    total_accuracy = 0.
    for i, validation_set in enumerate(folds):
        training_set = pd.concat(folds[:i] + folds[i + 1:])
        total_accuracy += validation_LDA(training_set, validation_set, y)
    mean_accuracy = total_accuracy / k
    print('----------------------------LDA %d-fold Validation---------------------------------'%(k))
    print('Fold: ', k)
    print('Accuracy: ', mean_accuracy)
    print('Error rate: ', 1-mean_accuracy)
    return mean_accuracy



In [33]:
# Load the cleaned red-wine quality data and cross-validate LDA with 5 folds,
# using the 'quality' column as the label.
# Alternative runs kept for reference:
#   k_fold_validation_LRegression(df, 'quality', 5)
#   k_fold_validation_LDA(df, 'quality', 10)
df = pd.read_csv(filepath_or_buffer='./winequality-red-cleaned.csv')
k_fold_validation_LDA(df=df, y='quality', k=5)
print(df)

----------------------------LDA Training set accuracy------------------------------
Training set data:  1279
243
----------------------------LDA Validation set accuracy----------------------------
Validation set data:  320
Validation set accuracy:  0.759375
----------------------------LDA Training set accuracy------------------------------
Training set data:  1279
242
----------------------------LDA Validation set accuracy----------------------------
Validation set data:  320
Validation set accuracy:  0.75625
----------------------------LDA Training set accuracy------------------------------
Training set data:  1279
240
----------------------------LDA Validation set accuracy----------------------------
Validation set data:  320
Validation set accuracy:  0.75
----------------------------LDA Training set accuracy------------------------------
Training set data:  1279
225
----------------------------LDA Validation set accuracy----------------------------
Validation set data:  320
Validati