In [201]:
#step1
import math

def euclideanDistance(a, b):
    """Return the Euclidean (L2) distance between vectors ``a`` and ``b``.

    Positions ``0 .. len(a) - 1`` are read from both inputs, so ``b`` must be
    indexable at every one of them.
    """
    squaredTotal = 0.0
    for position in range(len(a)):
        gap = a[position] - b[position]
        squaredTotal += math.pow(gap, 2)
    return math.sqrt(squaredTotal)

In [202]:
#step2
import math

def manhattanDistance(a, b):
    """Return the Manhattan (L1) distance between vectors ``a`` and ``b``.

    Positions ``0 .. len(a) - 1`` are read from both inputs, so ``b`` must be
    indexable at every one of them.
    """
    total = 0.0
    for position in range(len(a)):
        gap = a[position] - b[position]
        total += abs(gap)
    return total

In [203]:
#step3
import numpy as np

def accuraryGeneralization(predicted, actual):
    """Return ``(accuracy, generalization_error)`` for two label sequences.

    Accuracy is the fraction of positions where ``predicted`` equals ``actual``;
    the generalization error is its complement, ``1 - accuracy``.
    """
    hits = np.array(predicted) == np.array(actual)
    accuracy = sum(hits) / float(len(actual))
    return accuracy, 1 - accuracy


In [204]:
#step4

def precisionScore(yTest, yPredict):
    """Precision of the positive class (label 1): TP / (TP + FP).

    Parameters
    ----------
    yTest : array-like of true 0/1 labels.
    yPredict : array-like of predicted 0/1 labels, same length as ``yTest``.

    Returns
    -------
    float
        The precision, or 0.0 when nothing was predicted positive
        (TP + FP == 0), so the caller never receives NaN or a
        ZeroDivisionError.
    """
    actual = np.asarray(yTest)
    predicted = np.asarray(yPredict)
    tp = np.sum((actual == 1) & (predicted == 1))
    fp = np.sum((actual == 0) & (predicted == 1))
    flagged = tp + fp
    if flagged == 0:
        return 0.0
    return tp / flagged

def recallScore(yTest, yPredict):
    """Recall of the positive class (label 1): TP / (TP + FN).

    Parameters
    ----------
    yTest : array-like of true 0/1 labels.
    yPredict : array-like of predicted 0/1 labels, same length as ``yTest``.

    Returns
    -------
    float
        The recall, or 0.0 when there are no actual positives
        (TP + FN == 0), so the caller never receives NaN or a
        ZeroDivisionError.
    """
    actual = np.asarray(yTest)
    predicted = np.asarray(yPredict)
    tp = np.sum((actual == 1) & (predicted == 1))
    fn = np.sum((actual == 1) & (predicted == 0))
    positives = tp + fn
    if positives == 0:
        return 0.0
    return tp / positives

def f1_score(yTest, yPredict):
    """F1 score of the positive class (label 1): TP / (TP + (FN + FP) / 2).

    Parameters
    ----------
    yTest : array-like of true 0/1 labels.
    yPredict : array-like of predicted 0/1 labels, same length as ``yTest``.

    Returns
    -------
    float
        The F1 score, or 0.0 when TP, FN and FP are all zero, so the caller
        never receives NaN or a ZeroDivisionError.
    """
    actual = np.asarray(yTest)
    predicted = np.asarray(yPredict)
    tp = np.sum((actual == 1) & (predicted == 1))
    fn = np.sum((actual == 1) & (predicted == 0))
    fp = np.sum((actual == 0) & (predicted == 1))
    denominator = tp + ((fn + fp) / 2)
    if denominator == 0:
        return 0.0
    return tp / denominator

In [205]:
#step5

def confusionMatrix(actual, prediction):
    """Build a confusion matrix for the classes that occur in ``actual``.

    Parameters
    ----------
    actual : array-like of true labels.
    prediction : array-like of predicted labels, same length as ``actual``.

    Returns
    -------
    numpy.ndarray
        Square float matrix; entry ``[i, j]`` counts the samples whose true
        label is the i-th sorted unique value of ``actual`` and whose
        predicted label is the j-th. Predictions of a label that never occurs
        in ``actual`` are not counted in any cell.
    """
    # Convert once so plain Python lists compare element-wise against a label.
    actual = np.asarray(actual)
    prediction = np.asarray(prediction)

    confusionLists = np.unique(actual)
    matrixSize = len(confusionLists)
    matrix = np.zeros((matrixSize, matrixSize))
    for i, actualLabel in enumerate(confusionLists):
        actualMask = actual == actualLabel
        for j, predictedLabel in enumerate(confusionLists):
            matrix[i, j] = np.sum(actualMask & (prediction == predictedLabel))
    return matrix


In [206]:
#step6

import matplotlib.pyplot as plt
%matplotlib inline

def roc_curve(y_train, y_train_pred):
    """Plot the ROC curve (TPR against FPR) of a binary classifier.

    Parameters
    ----------
    y_train : array-like of true 0/1 labels.
    y_train_pred : array-like of scores or hard 0/1 predictions, same length
        as ``y_train``. Each distinct value is used as a decision threshold
        (a sample is flagged positive when its value is >= the threshold).

    With hard 0/1 predictions the curve has three points: (0, 0), the
    classifier's single operating point, and (1, 1).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    actual = np.asarray(y_train)
    scores = np.asarray(y_train_pred, dtype=float)
    positives = np.sum(actual == 1)
    negatives = np.sum(actual == 0)

    # A threshold above every score flags nothing, so the curve starts at (0, 0).
    fprList = [0.0]
    tprList = [0.0]
    # Sweep thresholds from strictest to most lenient so both rates only grow.
    for threshold in np.unique(scores)[::-1]:
        flagged = scores >= threshold
        tp = np.sum(flagged & (actual == 1))
        fp = np.sum(flagged & (actual == 0))
        # A rate is reported as 0 when its class is absent, avoiding 0/0.
        tprList.append(tp / positives if positives else 0.0)
        fprList.append(fp / negatives if negatives else 0.0)

    #start from 0 to 1 on x and y axis
    plt.axis([0,1,0,1])
    #join the operating points into a curve
    plt.plot(fprList, tprList, marker='o')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()
    return

In [207]:
#step7

import matplotlib.pyplot as plt

def AUC(y_train, y_train_pred):
    """Print and return the area under the ROC curve of a binary classifier.

    Parameters
    ----------
    y_train : array-like of true 0/1 labels.
    y_train_pred : array-like of scores or hard 0/1 predictions, same length
        as ``y_train``. Each distinct value is used as a decision threshold
        (a sample is flagged positive when its value is >= the threshold).

    Returns
    -------
    float
        Trapezoidal area under the (FPR, TPR) points, also printed as
        ``AUC: x.xxx``.
    """
    actual = np.asarray(y_train)
    scores = np.asarray(y_train_pred, dtype=float)
    positives = np.sum(actual == 1)
    negatives = np.sum(actual == 0)

    # A threshold above every score flags nothing, so the curve starts at (0, 0).
    fprList = [0.0]
    tprList = [0.0]
    # Sweep thresholds from strictest to most lenient so FPR is non-decreasing.
    for threshold in np.unique(scores)[::-1]:
        flagged = scores >= threshold
        tp = np.sum(flagged & (actual == 1))
        fp = np.sum(flagged & (actual == 0))
        # A rate is reported as 0 when its class is absent, avoiding 0/0.
        tprList.append(float(tp / positives) if positives else 0.0)
        fprList.append(float(fp / negatives) if negatives else 0.0)

    # Trapezoidal rule written out by hand so the result does not depend on
    # which numpy integration helper the installed version provides.
    auc = 0.0
    for k in range(1, len(fprList)):
        width = fprList[k] - fprList[k - 1]
        auc += width * (tprList[k] + tprList[k - 1]) / 2.0
    print('AUC: %.3f' %auc)

    return auc

In [17]:
#step8
import matplotlib.pyplot as plt
%matplotlib inline


# testing, out of 10 data points
#tps = [1,3,5,6,8,10,12,14,15,17]
#fps = [22,20,17,14,12,10,7,5,3,1]
#fns = [1,2,4,5,8,13,18,22,25,28]
# Ten decision thresholds, 0 to 0.9 in steps of 0.1; precision_recall_curve below plots against this list.
thresholds = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]


def precisions(tp, fp):
    """Return the per-position precision ``tp[i] / (tp[i] + fp[i])`` as a list.

    ``tp`` and ``fp`` are parallel sequences of counts; one value is produced
    for every position of ``tp``.
    """
    return [tp[idx] / (tp[idx] + fp[idx]) for idx in range(len(tp))]


def recalls(tp, fn):
    """Return the per-position recall ``tp[i] / (tp[i] + fn[i])`` as a list.

    ``tp`` and ``fn`` are parallel sequences of counts; one value is produced
    for every position of ``tp``.
    """
    return [tp[idx] / (tp[idx] + fn[idx]) for idx in range(len(tp))]


#precisions = precisions(tps, fps)
#recalls = recalls(tps, fns)

# Creates a 10x6 inch figure at cell level; the recorded cell output shows it was left with 0 Axes.
fig = plt.figure(figsize=(10, 6))


def precision_recall_curve(precision, recall, threshold):
    """Plot precision and recall against the decision threshold on the current figure.

    Parameters
    ----------
    precision : sequence of precision values, one per threshold.
    recall : sequence of recall values, one per threshold.
    threshold : sequence of thresholds used as the x-axis.

    The function draws only; the caller decides when to call ``plt.show()``.
    """
    # Plot the arguments themselves, not the module-level names
    # (`precisions` and `recalls` are functions at module level).
    plt.plot(threshold, precision, "b--", linewidth=8, label="Precision")
    plt.plot(threshold, recall, "g-", linewidth=3, label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="lower right")
    plt.title('Precision-Recall Curve')
    # plt.xlim([0, 1])
    # Leave headroom above 1.0 so a line sitting at the maximum stays visible.
    plt.ylim([0, 1.1])


#precision_recall_curve(precisions, recalls, thresholds)
#plt.show()

<Figure size 720x432 with 0 Axes>

In [278]:
#step9

from math import *
from collections import defaultdict

class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Parameters
    ----------
    X : sequence of training feature vectors.
    y : sequence of training labels, indexable by position (``y[i]`` must be
        the label of ``X[i]``).
    n_neighbors : number of nearest training samples that vote (>= 1).
    weights : ``"uniform"`` (every neighbour counts once) or ``"distance"``
        (each neighbour counts ``1 / distance``).
    distanceCalc : ``"euclidean"`` or ``"manhattan"``; dispatches to the
        module-level ``euclideanDistance`` / ``manhattanDistance`` functions.

    Raises
    ------
    ValueError
        If ``n_neighbors`` is below 1, or ``weights`` / ``distanceCalc`` is
        not one of the supported names.
    """

    # Storing the values as instance variables
    def __init__(self, X, y, n_neighbors, weights = "uniform", distanceCalc = "euclidean" ):
        if n_neighbors < 1:
            raise ValueError("n_neighbors must be at least 1, got %r" % (n_neighbors,))
        if weights not in ("uniform", "distance"):
            raise ValueError("weights must be 'uniform' or 'distance', got %r" % (weights,))
        if distanceCalc not in ("euclidean", "manhattan"):
            raise ValueError("distanceCalc must be 'euclidean' or 'manhattan', got %r" % (distanceCalc,))
        self.X = X
        self.y = y
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.distanceCalc = distanceCalc

    # Calculation for inverse of distances for weights = distance
    @staticmethod
    def inverseWeights(distance):
        """Return ``1 / distance``, or 1 for a zero distance (avoids dividing by zero)."""
        if distance == 0:
            return 1
        return 1 / distance

    def _distanceTo(self, a, b):
        """Distance between two samples under the configured metric."""
        if self.distanceCalc == "euclidean":
            return euclideanDistance(a, b)
        if self.distanceCalc == "manhattan":
            return manhattanDistance(a, b)
        # Reachable only if the attribute was changed after construction.
        raise ValueError("unsupported distanceCalc: %r" % (self.distanceCalc,))

    def _votingWeights(self, nearest):
        """Turn ``[distance, index]`` pairs into ``(weight, index)`` votes."""
        if self.weights == "uniform":
            return [(1, index) for _, index in nearest]
        if self.weights == "distance":
            # An exact match must dominate: if any neighbour sits at distance
            # 0, only those neighbours vote. Otherwise a neighbour at distance
            # 0.1 (weight 10) would outvote an identical training sample.
            exact = [(1, index) for distance, index in nearest if distance == 0]
            if exact:
                return exact
            return [(self.inverseWeights(distance), index) for distance, index in nearest]
        # Reachable only if the attribute was changed after construction.
        raise ValueError("unsupported weights: %r" % (self.weights,))

    # Predicting the labels for x_test
    def predict(self, X):  # test = x_test
        """Predict one label per row of ``X``.

        For each row the ``n_neighbors`` closest training samples vote for
        their own label; the label with the largest total weight wins. Any
        label value present in ``y`` can be predicted, not just 0 and 1.
        A tie goes to the tied label whose closest supporter is nearest to
        the row.

        Returns
        -------
        list
            Predicted labels, in the order of the rows of ``X``.

        Raises
        ------
        ValueError
            If there are no training samples to vote.
        """
        labels = []
        for x in X:
            # Rank every training sample by distance; ties fall back to index.
            ranked = sorted(
                [self._distanceTo(x, sample), index]
                for index, sample in enumerate(self.X)
            )
            nearest = ranked[:self.n_neighbors]
            if not nearest:
                raise ValueError("cannot predict without training samples")

            # dict keeps insertion order, so labels are recorded nearest-first
            # and max() returns the nearest label among equal totals.
            votes = {}
            for weight, index in self._votingWeights(nearest):
                label = self.y[index]
                votes[label] = votes.get(label, 0) + weight
            labels.append(max(votes, key=votes.get))

        return labels

# Instantiating the class KNN
#model = KNN(x_train, y_train, 5)

# Predicting the label values for data samples in x_test
#y_predict = model.predict(x_test)


In [371]:
#step10
import pandas as pd

# Load the wine data from the working directory; fields are separated by ';' and spaces after a delimiter are skipped.
df = pd.read_csv('winequality-white.csv', sep = ';', skipinitialspace=True)

In [372]:
#step11

# Rename the 'quality' column to 'target' (modifies df in place); later cells look the label up under that name.
df.rename(columns={'quality':'target'}, inplace=True)

In [373]:
#step11

def categorizeQualityData(dataFrame, columnName):
    """Binarize ``dataFrame[columnName]`` in place: values above 5 become 0, all others 1.

    Parameters
    ----------
    dataFrame : pandas.DataFrame to modify.
    columnName : name of the column to binarize.

    The column keeps its original dtype. The operation is not idempotent:
    a second call sees only 0/1 values and therefore maps every row to 1.

    Returns
    -------
    None
    """
    column = dataFrame[columnName]
    # One vectorised whole-column assignment: it works for any index (not
    # just 0..n-1) and avoids chained-indexing writes. "Not above 5" is
    # spelled with ~ so that missing values land in class 1, as before.
    dataFrame[columnName] = (~(column > 5)).astype(column.dtype)
    return

# Binarize the target in place (quality > 5 -> 0, otherwise 1), then preview the first rows.
# NOTE(review): re-running this cell on the already-binarized frame maps every row to 1.
categorizeQualityData(df, 'target')
df.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,target
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,0
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,0
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,0


In [374]:
#step12
# Summary statistics for every column, the binarized target included.
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,target
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,0.334831
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.471979
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,0.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,0.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,0.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,1.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,1.0


In [375]:
#step13
# Shuffle all rows once. The train/test split later in the notebook is purely
# positional, so a fixed seed is needed for the reported scores to be
# reproducible after a kernel restart.
df = df.sample(frac=1, random_state=42)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,target
991,8.2,0.26,0.44,1.3,0.046,7.0,69.0,0.9944,3.14,0.62,10.2,1
4094,6.4,0.25,0.28,4.9,0.03,29.0,98.0,0.99024,3.09,0.58,12.8,0
1508,6.8,0.21,0.49,14.5,0.06,50.0,170.0,0.9991,3.55,0.44,9.8,0
3092,7.6,0.27,0.52,3.2,0.043,28.0,152.0,0.99129,3.02,0.53,11.4,0
3109,6.9,0.3,0.36,4.5,0.054,31.0,203.0,0.99513,3.4,0.57,10.4,1


In [257]:
#step14
import warnings
# NOTE(review): no category is given, so this silences every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')


# Matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from scipy import stats

# Calculate correlation coefficient
def corrfunc(x, y, **kws):
    """Write ``r = <Pearson coefficient>`` (two decimals) onto the current axes.

    Extra keyword arguments are accepted and ignored.
    """
    coefficient, _pvalue = stats.pearsonr(x, y)
    caption = "r = {:.2f}".format(coefficient)
    axes = plt.gca()
    # Position is in axes-fraction coordinates.
    axes.annotate(caption, xy=(.1, .6), xycoords=axes.transAxes, size = 24)
    
# Colour map used for the density plots on the lower triangle.
cmap = sns.cubehelix_palette(light=1, dark = 0.1,
                             hue = 0.5, as_cmap=True)

sns.set_context(font_scale=2)

# Pair grid set up
# NOTE(review): the grid is built over every column of df; the recorded output of this
# cell is a KeyboardInterrupt, so the run was stopped before the figure finished.
g = sns.PairGrid(df)

# Scatter plot on the upper triangle
g.map_upper(plt.scatter, s=10, color = 'red')

# Distribution on the diagonal
# NOTE(review): sns.distplot is presumably deprecated in the installed seaborn - confirm and consider sns.histplot.
g.map_diag(sns.distplot, kde=False, color = 'red')

# Density Plot and Correlation coefficients on the lower triangle
g.map_lower(sns.kdeplot, cmap = cmap)
g.map_lower(corrfunc);

KeyboardInterrupt: 

In [306]:
# Remove the 'alcohol' feature from df.
# NOTE(review): this cell's execution count (306) is lower than the data-loading cell's (371),
# so it may not have been applied to the frame used further down - confirm the intended order.
df = df.drop(columns=['alcohol'])

In [376]:
# Independent copy of df; it is standardized later, so df itself stays unscaled.
dfStandard = df.copy()

In [377]:
#step16

def partition(X,y,t):
    """Split features and labels positionally into train and test arrays.

    The first ``int((len + 1) * t)`` entries form the test part and the rest
    the training part; the cut is computed separately for ``X`` and ``y``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_train, x_test, y_train, y_test)``.
    """
    x_cut = int((len(X) + 1) * t)
    y_cut = int((len(y) + 1) * t)
    x_test, x_train = np.array(X[:x_cut]), np.array(X[x_cut:])
    y_test, y_train = np.array(y[:y_cut]), np.array(y[y_cut:])
    return x_train, x_test, y_train, y_test

#separate the target vector and features
def separateTargetVector(dataSet):
    """Split a frame into ``(features, target, test_fraction)``.

    The target is the ``'target'`` column, the features are every other
    column, and the test fraction is the constant 0.2.
    """
    labelColumn = 'target'
    targetVector = dataSet[labelColumn]
    featureMatrix = dataSet.drop(columns=labelColumn)
    return featureMatrix, targetVector, 0.2

In [378]:
#function to standardize the data
def standardizeData(dataSet):
    """Z-score every column except ``'target'``, modifying ``dataSet`` in place.

    Each feature column becomes ``(value - mean) / std`` using the column's
    own mean and (pandas default, sample) standard deviation. A column whose
    standard deviation is zero or undefined is only centred, so it does not
    turn into NaN.

    Returns
    -------
    pandas.DataFrame
        The same ``dataSet`` object that was passed in.
    """
    # Iterate over column labels directly; DataFrame.iteritems no longer
    # exists in current pandas.
    for key in list(dataSet.columns):
        if key == 'target':
            continue
        mean = dataSet[key].mean()
        std = dataSet[key].std()
        if not std > 0:
            # Constant column (std 0) or a single row (std NaN): centre only.
            std = 1
        dataSet[key] = (dataSet[key] - mean) / std
    return dataSet

# Standardize the copy; standardizeData modifies its argument and returns that same frame.
dfStandard = standardizeData(dfStandard)


In [379]:
#step17

#Naively run KNN_Classifier with df
def calculateDf(df):
    """Fit a 5-nearest-neighbour model on a positional split of ``df`` and score it.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` measured on the test part of the split.
    """
    features, target, testShare = separateTargetVector(df)
    xTrain, xTest, yTrain, yTest = partition(features, target, testShare)
    yPredict = KNN(xTrain, yTrain, 5).predict(xTest)

    matches = np.array(yPredict) == np.array(yTest)
    accuracy = sum(matches) / float(len(yTest))
    return accuracy, f1_score(yTest, yPredict)
# print("accuracy: %f" %accuracy)
# print("f1Score: %f" %score)

# accuracy: 0.757916
# f1Score: 0.617124

#without alcohol
# accuracy: 0.706844
# f1Score: 0.553655

In [380]:
#step17 after standardize the data

#Naively run KNN_Classifier with dfStandard
def calculateDFStandard(dfStandard):
    """Score a 5-nearest-neighbour model on the standardized frame.

    The evaluation is exactly the one ``calculateDf`` performs (same split,
    same model, same metrics), so this delegates to it instead of keeping a
    second copy of the code that could drift apart.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` measured on the test part of the split.
    """
    return calculateDf(dfStandard)
# print("accuracy: %f" %accuracy)
# print("f1Score: %f" %score)

#Run 1 with all features
#accuracy: 0.777324
#f1Score: 0.648387

# accuracy: 0.770174
# f1Score: 0.638844

# accuracy: 0.743616
# f1Score: 0.607199

#without alcohol
# accuracy: 0.715015
# f1Score: 0.555024

In [381]:
#step15

# Absolute correlation of every column with 'target', weakest first.
corrValue = df.corr().abs()['target'].sort_values(ascending=True)

# Each pass scores the raw and the standardized frame, prints accuracy and F1 for both,
# then drops the next-weakest column from both frames.
# NOTE(review): the loop rebinds df and dfStandard, so re-running this cell (or any later
# cell) works on frames that have already lost these columns.
for i in range(len(corrValue)-2):
    accuracy, score = calculateDf(df)
    accuracy1, score1 = calculateDFStandard(dfStandard)
    print(accuracy)
    print(score)
    print(accuracy1)
    print(score1)
    df = df.drop(columns=[corrValue.index[i]])
    dfStandard = dfStandard.drop(columns=[corrValue.index[i]])

0.6956077630234934
0.5066225165562914
0.7875383043922369
0.6687898089171974
0.6945863125638406
0.5057851239669422
0.7814096016343207
0.665625
0.7303370786516854
0.5614617940199336
0.7671092951991828
0.64375
0.72829417773238
0.5596026490066225
0.7517875383043923
0.6161137440758294
0.72829417773238
0.5596026490066225
0.7620020429009193
0.631911532385466
0.7374872318692544
0.5793780687397708
0.7752808988764045
0.6518987341772152
0.7293156281920327
0.5590682196339434
0.7650663942798774
0.6314102564102564
0.7660878447395302
0.6239737274220033
0.7579162410623085
0.6195826645264848
0.7650663942798774
0.6179401993355482
0.7579162410623085
0.6082644628099173
0.72829417773238
0.5581395348837209
0.7242083758937692
0.5573770491803278


In [None]:
#step18

def partitionSplit(df, folds):
    """Cut ``df`` into ``folds`` consecutive, equally sized row blocks.

    Parameters
    ----------
    df : pandas.DataFrame whose rows are split in their current order.
    folds : number of blocks (>= 1).

    Returns
    -------
    list of numpy.ndarray
        One array per fold, each holding ``len(df) // folds`` rows. Rows left
        over when the length is not divisible by ``folds`` are not included.

    Raises
    ------
    ValueError
        If ``folds`` is below 1.
    """
    if folds < 1:
        raise ValueError("folds must be at least 1, got %r" % (folds,))

    eachPartition = int(df.shape[0] / folds)
    s_partition = []
    for i in range(folds):
        # Positional slicing replaces the row-by-row drop loop: it is linear
        # in the number of rows and is not confused by repeated index labels.
        chunk = df.iloc[i * eachPartition:(i + 1) * eachPartition]
        s_partition.append(np.asarray(chunk.values.tolist()))

    return s_partition

def sFold(folds, data, labels, model, error_function, **model_args):
    """Run s-fold cross-validation and collect the out-of-fold predictions.

    The rows are cut, in their current order, into ``folds`` consecutive
    blocks. Each block is held out once while the model is trained on the
    remaining rows and then asked to predict the held-out block.

    Parameters
    ----------
    folds : number of folds, between 2 and the number of rows.
    data : array-like of feature rows.
    labels : array-like of labels, same length as ``data``.
    model : either a class that is instantiated per fold as
        ``model(xTrain, yTrain, **model_args)``, or an object that already
        exposes ``fit(X, y)`` and ``predict(X)`` (``model_args`` is then unused).
    error_function : callable ``(expected, predicted) -> number`` evaluated on
        every fold; ``None`` selects the module-level ``f1_score``.
    **model_args : keyword arguments forwarded to the model class.

    Returns
    -------
    dict
        ``'expected labels'`` and ``'predicted labels'`` (arrays covering all
        rows in fold order) and ``'errors'`` (one value per fold). The overall
        accuracy is also printed.

    Raises
    ------
    ValueError
        If ``data`` and ``labels`` differ in length or ``folds`` is out of range.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    if len(data) != len(labels):
        raise ValueError("data and labels must have the same length")
    if folds < 2 or folds > len(data):
        raise ValueError("folds must be between 2 and the number of rows, got %r" % (folds,))
    if error_function is None:
        error_function = f1_score

    expected = []
    predicted = []
    errors = []

    #loop to choose a partition as validation set
    for heldOut in np.array_split(np.arange(len(data)), folds):
        trainMask = np.ones(len(data), dtype=bool)
        trainMask[heldOut] = False
        xTrain, yTrain = data[trainMask], labels[trainMask]
        xTest, yTest = data[heldOut], labels[heldOut]

        # A class (such as KNN in this notebook) receives its training data
        # through the constructor; an instance is trained through fit().
        if isinstance(model, type):
            fitted = model(xTrain, yTrain, **model_args)
        else:
            model.fit(xTrain, yTrain)
            fitted = model

        foldPredicted = np.asarray(fitted.predict(xTest))
        errors.append(error_function(yTest, foldPredicted))
        expected.extend(yTest.tolist())
        predicted.extend(foldPredicted.tolist())

    y_actual = np.array(expected)
    y_train_predict = np.array(predicted)

    accuracy = np.sum(y_train_predict == y_actual) / float(len(y_actual))
    print('Accuracy: %.3f' %accuracy)

    return {'expected labels': y_actual, 'predicted labels': y_train_predict, 'errors': errors}

In [None]:
#Step 19
# Split the data and fit a 1-nearest-neighbour model, then predict the test rows.
# NOTE(review): X, y and t are never assigned at cell level in the visible notebook
# (separateTargetVector is only called inside functions) - confirm where they come from.
xTrain, xTest, yTrain, yTest = partition(X, y, t)
model = KNN(xTrain, yTrain, 1)
y_train_predict = model.predict(xTest)

def dictionary(k, distance, weights):
    """Print every (k, distance, weights) combination as a model-argument dict.

    The combinations are visited with ``k`` varying slowest and ``weights``
    fastest. One dict is reused for all of them, so the returned value holds
    the last combination only (or is empty when there is no combination).
    """
    modelArgs = dict()
    #form the dictionary
    combinations = [
        (eachK, eachDistance, eachWeights)
        for eachK in k
        for eachDistance in distance
        for eachWeights in weights
    ]
    for eachK, eachDistance, eachWeights in combinations:
        modelArgs.update(k=eachK, distance=eachDistance, weights=eachWeights)
        # The s-fold evaluation of each combination is not wired in yet.
        print('Model_Args:',modelArgs)
    return modelArgs

# Candidate hyper-parameters: 4 neighbour counts x 2 metrics x 2 weightings = 16 combinations.
k = [1,5,9,11]
distance = ['euclidean','manhattan']
weights = ['uniform', 'distance']

# Prints each combination; as the last expression of the cell, the returned dict
# (the final combination only) is what gets displayed.
dictionary(k,distance,weights)

In [None]:
#Step 20

# Fit a 5-nearest-neighbour model on a fresh split and report every metric on the test part.
# NOTE(review): X, y and t are never assigned at cell level in the visible notebook - confirm.
xTrain, xTest, yTrain, yTest = partition(X, y, t)
model = KNN(xTrain, yTrain, 5)
y_train_predict = model.predict(xTest)
y_predict = np.array(y_train_predict)

print('Precision:', precisionScore(yTest, y_predict))
print('Recall:', recallScore(yTest, y_predict))
print('F1 score:', f1_score(yTest, y_predict))
# NOTE(review): accuraryGeneralization is declared as (predicted, actual); the arguments here are in the opposite order.
print('Accuracy and Generalization Error:', accuraryGeneralization(yTest, y_predict))
print('Confusion Matrix:', confusionMatrix(yTest, y_predict))

In [None]:
#Step 21
# Relies on yTest and y_train_predict (the output of model.predict) left over from an earlier cell.
roc_curve(yTest, y_train_predict)

In [None]:
#Step 22
# Relies on yTest and y_train_predict (the output of model.predict) left over from an earlier cell.
AUC(yTest, y_train_predict)

In [105]:
#Step 24
from math import sqrt

#assume k=9 is the best
def confidenceInterval(z, accuracy, n):
    """Print ``z * sqrt(accuracy * (1 - accuracy) / n)`` rounded to three decimals.

    Nothing is returned; the value is only printed.
    """
    spread = (accuracy * (1 - accuracy)) / n
    halfWidth = z * sqrt(spread)
    print('%.3f' % halfWidth)
    
# NOTE(review): with these arguments the function prints 0.261, but the recorded output is 0.248
# (which matches n = 10) - the saved output looks stale; confirm which n was intended.
confidenceInterval(1.96, 0.2, 9)

0.248
