# K Nearest Neighbors

In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
from sklearn import neighbors
from sklearn.metrics import accuracy_score
import time

In [2]:
# Load the preprocessed dataset. Later cells pass the second column (row[1])
# to sentence2vec and read the class labels from the 'y' column.
original_data = pd.read_csv('data/processed_data.csv')

In [3]:
# Convert the GloVe text file to word2vec text format, then load the
# converted vectors into `model`.
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = 'data/glove.6B.300d.txt'
word2vec_output_file = 'data/glove.6B.300d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

# The file to load is the one the conversion step just wrote.
filename = word2vec_output_file
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [4]:
# vectorize each sentence
def sentence2vec(s):
    """Return the mean word vector of the in-vocabulary tokens of ``s``.

    The sentence is split on whitespace and every token found in the
    module-level ``model`` contributes its vector to the average; unknown
    tokens are skipped.

    Args:
        s: Sentence text (a ``str``).

    Returns:
        A 1-D float array of length ``model.vector_size``. If no token is in
        the vocabulary (including the empty string) an all-zero vector is
        returned instead of dividing by zero, which previously produced a
        NaN vector and a RuntimeWarning.
    """
    # Use the model's own dimensionality rather than a hard-coded 300.
    vec = np.zeros(model.vector_size)
    count = 0
    for word in s.split():
        # `word in model` works on both gensim 3.x and 4.x; `model.vocab`
        # was removed in gensim 4.
        if word in model:
            vec += model[word]
            count += 1
    if count == 0:
        # No known words: the zero vector is the neutral result.
        return vec
    return vec / count

In [5]:
# Build the feature matrix X (one sentence vector per row of the dataset,
# taken from the second column) and the label array y (the 'y' column).
list_data = [sentence2vec(row[1]) for row in original_data.itertuples(False)]
X = np.asarray(list_data)

y_data = original_data['y'].tolist()
y = np.asarray(y_data)

  


# Build KNN from scratch with python

In [6]:
# import libraries
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import cdist
from collections import Counter
from sklearn.model_selection import train_test_split

In [7]:
# replace NaN with a number and infinite values with finite numbers
def normalize(X):
    """Return ``X`` with NaN and infinite entries replaced by finite numbers.

    Delegates to ``np.nan_to_num`` with its default settings.
    """
    return np.nan_to_num(X)

In [8]:
# calculate the distance between two points
def distance(instance1, instance2):
    """Return the norm of the difference between two points.

    Both arguments are converted to NumPy arrays first; for 1-D inputs the
    default ``np.linalg.norm`` is the Euclidean (L2) distance.
    """
    difference = np.array(instance1) - np.array(instance2)
    return np.linalg.norm(difference)

In [9]:
# get the list of the k neighbors with the smallest distance to test_instance
def get_neighbors(training_set,labels,test_instance,k):
    """Return the ``k`` training points closest to ``test_instance``.

    Args:
        training_set: Indexable collection of training points.
        labels: Indexable collection of labels, aligned with ``training_set``.
        test_instance: The point to find neighbors for.
        k: Number of neighbors to keep.

    Returns:
        A list of ``(distance, label)`` tuples in ascending order of
        distance, truncated to the first ``k`` entries.
    """
    scored = [
        (distance(test_instance, training_set[i]), labels[i])
        for i in range(len(training_set))
    ]
    # Sort on the distance only, so labels never take part in comparisons.
    ranked = sorted(scored, key=lambda pair: pair[0])
    return ranked[:k]

In [10]:
# decide which class test_instance belongs to
def vote(neighbors):
    """Return the most frequent label among ``neighbors``.

    Args:
        neighbors: Sequence of ``(distance, label)`` pairs; only the label
            (index 1) is used.

    Returns:
        The label with the highest count. On a tie, the label that appears
        first in ``neighbors`` wins.

    Raises:
        IndexError: If ``neighbors`` is empty.
    """
    tally = Counter(entry[1] for entry in neighbors)
    top_label, _ = tally.most_common(1)[0]
    return top_label

In [11]:
# predict label for test set
def predict_data(training_data,labels,test_data,k):
    """Predict a label for every sample in ``test_data`` by k-NN voting.

    For each test sample the ``k`` nearest training points are looked up with
    ``get_neighbors`` and their majority label is chosen with ``vote``.

    Args:
        training_data: Training points.
        labels: Labels aligned with ``training_data``.
        test_data: Iterable of samples to classify.
        k: Number of neighbors that vote.

    Returns:
        A NumPy array holding one predicted label per test sample.
    """
    return np.asarray([
        vote(get_neighbors(training_data, labels, sample, k))
        for sample in test_data
    ])

In [12]:
# Hold out 20% of the samples for testing, then replace NaN/inf feature
# values with finite numbers in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_test = normalize(X_train), normalize(X_test)

# Classify the held-out samples with the from-scratch KNN (k = 5).
my_pred = predict_data(X_train, y_train, X_test, k=5)

my_accuracy = 100 * accuracy_score(y_test, my_pred)
print("accuracy : ", my_accuracy)

# Use KNN model in Scikit-learn

In [17]:
# Fit scikit-learn's KNN (5 neighbors, p = 2) and predict the test set.
# The timer spans both fitting and prediction.
start_time = time.time()
clf = neighbors.KNeighborsClassifier(n_neighbors=5, p=2).fit(X_train, y_train)
y_pred = clf.predict(X_test)
end_time = time.time()

In [18]:
# Report the scikit-learn accuracy (in percent) and the elapsed wall-clock
# time measured around fit + predict.
sk_accuracy = 100 * accuracy_score(y_test, y_pred)
elapsed = end_time - start_time
print("accuracy : ", sk_accuracy)
print("testing time : %.2fs" % elapsed)

accuracy :  64.7444913267698
testing time : 11.04s


# Conclusion

My model and the KNN model in scikit-learn produce the same result.