## Import modules

In [3]:
import pandas as pd
from scipy.io import arff
import numpy as np
import math
import operator
import seaborn as sns

## Data loading

In [4]:
# Read both ARFF splits. loadarff returns a (records, metadata) pair; only the
# records are turned into DataFrames.
train_records, _train_meta = arff.loadarff("./product-selection/trainProdSelection.arff")
train_df = pd.DataFrame(train_records)
test_records, _test_meta = arff.loadarff("./product-selection/testProdSelection.arff")
test_df = pd.DataFrame(test_records)

FileNotFoundError: [Errno 2] No such file or directory: './product-selection/trainProdSelection.arff'

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the raw training frame.
train_df.info()

## Encoding and scaling on train data

In [None]:
# Decode the byte-string Type column into ordinary text.
train_df["Type"] = train_df["Type"].str.decode("UTF-8")

In [None]:
# Decode the remaining byte-string columns (LifeStyle and label) into text.
for column in ("LifeStyle", "label"):
    train_df[column] = train_df[column].str.decode("UTF-8")


In [None]:
# Preview the first rows of the training frame after decoding.
train_df.head()


In [None]:
# Min-max scale Vacation using the training column's own minimum and range.
vacation_low = train_df["Vacation"].min()
vacation_span = train_df["Vacation"].max() - vacation_low
train_df["Vacation"] = (train_df["Vacation"] - vacation_low) / vacation_span


In [None]:
# Min-max scale eCredit using the training column's own minimum and range.
ecredit_low = train_df["eCredit"].min()
ecredit_span = train_df["eCredit"].max() - ecredit_low
train_df["eCredit"] = (train_df["eCredit"] - ecredit_low) / ecredit_span

In [None]:
# Min-max scale salary using the training column's own minimum and range.
salary_low = train_df["salary"].min()
salary_span = train_df["salary"].max() - salary_low
train_df["salary"] = (train_df["salary"] - salary_low) / salary_span

In [None]:
# Min-max scale property using the training column's own minimum and range.
property_low = train_df["property"].min()
property_span = train_df["property"].max() - property_low
train_df["property"] = (train_df["property"] - property_low) / property_span


In [None]:
# Preview the training frame after min-max scaling the numeric columns.
train_df.head()


## Encoding and scaling on test data

In [None]:
# Decode the byte-string Type column of the test split into ordinary text.
test_df["Type"] = test_df["Type"].str.decode("UTF-8")


In [None]:
# Decode the remaining byte-string columns (LifeStyle and label) of the test split.
for column in ("LifeStyle", "label"):
    test_df[column] = test_df[column].str.decode("UTF-8")

In [None]:
# Min-max scale Vacation using the test column's own minimum and range.
# NOTE(review): the test split is scaled independently of the training split,
# so the same raw value can map to different scaled values — confirm intended.
vacation_low = test_df["Vacation"].min()
vacation_span = test_df["Vacation"].max() - vacation_low
test_df["Vacation"] = (test_df["Vacation"] - vacation_low) / vacation_span

In [None]:
# Min-max scale eCredit using the test column's own minimum and range.
# NOTE(review): scaled independently of the training split — confirm intended.
ecredit_low = test_df["eCredit"].min()
ecredit_span = test_df["eCredit"].max() - ecredit_low
test_df["eCredit"] = (test_df["eCredit"] - ecredit_low) / ecredit_span

In [None]:
# Min-max scale salary using the test column's own minimum and range.
# NOTE(review): scaled independently of the training split — confirm intended.
salary_low = test_df["salary"].min()
salary_span = test_df["salary"].max() - salary_low
test_df["salary"] = (test_df["salary"] - salary_low) / salary_span

In [None]:
# Min-max scale property using the test column's own minimum and range.
# NOTE(review): scaled independently of the training split — confirm intended.
property_low = test_df["property"].min()
property_span = test_df["property"].max() - property_low
test_df["property"] = (test_df["property"] - property_low) / property_span

In [None]:
# Preview the test frame after decoding and min-max scaling.
test_df.head()


## One-hot encoding for train data

In [None]:
# Replace the categorical Type and LifeStyle columns with indicator columns,
# appended to the right of the frame (Type dummies first, then LifeStyle).
# NOTE(review): the LifeStyle dummies also carry the 'Type' prefix; the test
# split is encoded the same way, so rename both together if this is changed.
type_dummies = pd.get_dummies(train_df['Type'], prefix='Type')
lifestyle_dummies = pd.get_dummies(train_df['LifeStyle'], prefix='Type')
train_df = pd.concat([train_df, type_dummies, lifestyle_dummies], axis=1)
train_df = train_df.drop(columns=['Type', 'LifeStyle'])

In [None]:
# Move 'label' to the last column: the k-NN helpers below read row[-1] as the
# class and treat everything before it as a feature.
train_df['label'] = train_df.pop('label')
train_df.head()

## One-hot encoding for test data

In [None]:
# Replace the categorical Type and LifeStyle columns with indicator columns,
# appended to the right of the frame (Type dummies first, then LifeStyle).
# NOTE(review): the dummies are derived from the test split alone, so the
# column set only matches the training frame if both splits contain the same
# categories — confirm. The 'Type' prefix on LifeStyle mirrors the train cell.
type_dummies = pd.get_dummies(test_df['Type'], prefix='Type')
lifestyle_dummies = pd.get_dummies(test_df['LifeStyle'], prefix='Type')
test_df = pd.concat([test_df, type_dummies, lifestyle_dummies], axis=1)
test_df = test_df.drop(columns=['Type', 'LifeStyle'])

In [None]:

# Move 'label' to the last column so each test row ends with its class, as the
# k-NN helpers expect (they read row[-1]). The original read the column from
# the undefined name `testDF`, which raised NameError; the frame is `test_df`.
test_df['label'] = test_df.pop('label')
test_df.head()

## Functions for k-NN, Euclidean distance and accuracy

In [None]:
def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance over the first ``length`` positions.

    Only indices ``0 .. length - 1`` of the two instances are compared, so any
    trailing elements (such as a class label) are ignored.
    """
    squared_total = 0
    for position in range(length):
        gap = instance1[position] - instance2[position]
        squared_total += gap ** 2
    return math.sqrt(squared_total)
  
def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` rows of ``trainingSet`` nearest to ``testInstance``.

    The last element of ``testInstance`` is excluded from the distance, so only
    the first ``len(testInstance) - 1`` positions are compared. Rows come back
    ordered from nearest to farthest; ties keep their training-set order.
    Raises IndexError when ``k`` exceeds the number of training rows.
    """
    feature_count = len(testInstance) - 1
    scored = [
        (row, euclideanDistance(testInstance, row, feature_count))
        for row in trainingSet
    ]
    # sorted() is stable, so equally distant rows stay in their original order.
    ranked = sorted(scored, key=lambda pair: pair[1])
    return [ranked[rank][0] for rank in range(k)]
 
def getResponse(neighbors):
    """Return the majority class among ``neighbors``.

    Each neighbor's class is its last element. When several classes share the
    top count, the one that appears first in ``neighbors`` wins. Raises
    IndexError when ``neighbors`` is empty.
    """
    tally = {}
    for neighbor in neighbors:
        label = neighbor[-1]
        tally[label] = tally.get(label, 0) + 1
    # Stable descending sort: ties keep first-seen order.
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    winner, _count = ranked[0]
    return winner

def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose true class matches its prediction.

    Args:
        testSet: Sequence of rows; the last element of each row is its class.
        predictions: Predicted classes, aligned by position with ``testSet``.

    Returns:
        Accuracy as a float in ``[0.0, 100.0]``. An empty ``testSet`` yields
        ``0.0`` instead of raising ZeroDivisionError.

    Raises:
        IndexError: If ``predictions`` is shorter than ``testSet``.
    """
    total = len(testSet)
    # len() rather than truthiness: testSet may be a NumPy array, whose
    # truth value is ambiguous.
    if total == 0:
        return 0.0
    correct = sum(
        1 for index, row in enumerate(testSet) if row[-1] == predictions[index]
    )
    return (correct / float(total)) * 100.0

In [None]:
def knn(k):
    """Classify every row of ``test_df`` with k-NN and report the accuracy.

    Reads the module-level ``train_df`` and ``test_df`` frames, whose last
    column holds the class label. Prints the accuracy for this ``k`` and
    returns it as a percentage.
    """
    # Convert each frame to an array once; the original re-evaluated
    # ``.values`` on both frames for every test row.
    train_rows = train_df.values
    test_rows = test_df.values
    predictions = [
        getResponse(getNeighbors(train_rows, test_row, k))
        for test_row in test_rows
    ]
    accuracy = getAccuracy(test_rows, predictions)
    print('Accuracy: ' + repr(accuracy) + '%','with k=',k)
    return accuracy

In [None]:
# Evaluate k-NN for every odd k from 1 to 99 and record the accuracy per k.
k_list = list(range(1, 100, 2))
accuracy_list = [knn(k) for k in k_list]

In [None]:
# Plot accuracy against k for the sweep above.
# The original passed the undefined name `acc_list` (NameError); the sweep
# stores its results in `accuracy_list`.
import matplotlib.pyplot as plt
plt.plot(k_list,accuracy_list,color='green')
plt.xlabel('K-VALUE')
plt.ylabel('ACCURACY')
plt.grid(True)
plt.show()

In [None]:
# Stack the processed train and test frames into one frame with a fresh index.
# NOTE(review): TotalDF is not referenced again in the visible cells.
TotalDF=pd.concat([train_df, test_df], ignore_index=True)


In [None]:
knn(55) # author's note: accuracy peaked at k=55 in the sweep above (not verifiable from this file alone)
