In [1]:
# k-nearest-neighbours spam/ham email classifier.
# Imports used throughout the notebook.
import numpy as np
import scipy
from scipy.spatial import distance
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
class KNN:
    """Brute-force k-nearest-neighbours classifier for binary (0/1) labels.

    Typical use: ``fit`` stores the training data, ``setmode`` chooses ``k``
    and the distance function, ``predict`` classifies a single sample and
    ``evaluate`` reports accuracy over a labelled test set.
    """

    def __init__(self):
        # Training samples and their labels; remain None until fit() succeeds.
        self.X = None
        self.y = None

        # Number of neighbours consulted and the distance callable; both are
        # placeholders until setmode() is called.
        self.k = 0
        self.metric = None

    def fit(self, X, y):
        """Store the training samples ``X`` and their labels ``y``.

        ``X`` and ``y`` must have the same length. On a mismatch a message is
        printed and the previously stored data (if any) is left untouched.
        """
        if len(X) == len(y):
            self.X = X
            self.y = y
        else:
            print(f'Input sizes not same. X: {len(X)}, y: {len(y)}. No fitting done')

    def setmode(self, k, metric):
        """Choose the number of neighbours and the distance function.

        Args:
            k: how many nearest training samples vote on each prediction.
            metric: callable ``metric(u, v)`` returning the distance between
                two samples, e.g. ``distance.euclidean`` or ``distance.cosine``.
        """
        self.k = k
        self.metric = metric
        print(f'Set k = {k}, metric = {metric}')

    # TODO: this is a brute-force scan (one metric call per training sample,
    # then a sort); consider a spatial index such as a KD-tree.
    def predict(self, x):
        """Classify one sample by majority vote of its ``k`` nearest neighbours.

        Labels are assumed to be 0/1: the prediction is 1 only when strictly
        more than half of the ``k`` nearest labels sum to 1, so ties go to 0.

        Raises:
            ValueError: if the model has not been fitted, no metric has been
                set, or ``k`` is not between 1 and the number of training
                samples.
        """
        if self.X is None or self.y is None:
            raise ValueError('KNN is not fitted; call fit(X, y) first')
        if self.metric is None:
            raise ValueError('No metric set; call setmode(k, metric) first')
        n_train = len(self.X)
        if not 1 <= self.k <= n_train:
            raise ValueError(
                f'k must be between 1 and the number of training samples '
                f'({n_train}), got {self.k}'
            )

        # Pair every training label with its distance to the query sample.
        scored = [
            (self.metric(sample, x), label)
            for sample, label in zip(self.X, self.y)
        ]
        # sorted() is stable, so equidistant samples keep training order.
        nearest = sorted(scored, key=lambda pair: pair[0])[:self.k]

        votes = sum(label for _, label in nearest)
        return 1 if votes > (self.k / 2) else 0

    # TODO: return more information than plain accuracy
    # (e.g. per-class counts).
    def evaluate(self, Xtest, ytest, verbose=True):
        """Return the fraction of test samples that are classified correctly.

        Args:
            Xtest: indexable collection of samples.
            ytest: indexable collection of expected labels, same length.
            verbose: when True (the default) print one ``good``/``bad`` line
                per sample; pass False to keep the output quiet.

        Returns:
            Accuracy in ``[0, 1]``; ``0.0`` when the test set is empty.

        Raises:
            ValueError: if ``Xtest`` and ``ytest`` differ in length.
        """
        total = len(Xtest)
        if total != len(ytest):
            raise ValueError(
                f'Input sizes not same. Xtest: {total}, ytest: {len(ytest)}'
            )
        if total == 0:
            # Nothing to score; avoid dividing by zero.
            return 0.0

        corrects = 0
        for i in range(total):
            hit = self.predict(Xtest[i]) == ytest[i]
            if hit:
                corrects += 1
            if verbose:
                print(f'{i}/{total}: good' if hit else f'{i}/{total}: bad')

        return corrects / total
            

In [3]:
# Load the spam/ham emails and discard the leftover 'Unnamed: 0' column
# in a single chained expression.
dataset = pd.read_csv('spam_ham_dataset.csv').drop(columns='Unnamed: 0')
dataset.head()

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [4]:
# How do I give data to the classifier?
# Each x is a d-dimensional vector that I'll have to encode from the text, and each y is a number.
# I'll have one NumPy array of size N x d: each row is an email, represented by its word counts.
# I'll convert label_num to a NumPy array.
# I'll call train_test_split after tokenizing and converting label_num to a NumPy array.
# Then I'll pass X, y to KNN.fit, which will save all of it.

In [5]:
# Bag-of-words encoder (default settings), followed by a quick look at the
# raw email text column it will be applied to.
vectorizer = CountVectorizer()
print(dataset["text"])

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object


In [6]:
# Learn the vocabulary and build the sparse document-term count matrix in a
# single pass, instead of tokenizing the whole corpus twice (fit, then transform).
# NOTE(review): the vocabulary is learned from every email, including the ones
# later held out for testing -- consider fitting on the training split only.
encoded_text = vectorizer.fit_transform(dataset.text)

In [7]:
# Densify the sparse count matrix into a NumPy array.
# The name is rebound in place, so guard the conversion: re-running this cell
# is then a no-op instead of an AttributeError (ndarray has no .toarray()).
# NOTE(review): the dense array is large (see the printed shape) -- confirm it
# fits in memory.
if hasattr(encoded_text, "toarray"):
    encoded_text = encoded_text.toarray()
print(encoded_text.shape)

(5171, 50447)


In [10]:
# Numeric class labels as a 1-D NumPy array; show the shape, then the values.
y = dataset["label_num"].to_numpy()
print(y.shape, y, sep="\n")

(5171,)
[0 0 0 ... 0 0 1]


In [16]:
# NOTE(review): this import would normally live in the notebook's import cell.
from sklearn.model_selection import train_test_split

# Hold out 20% of the emails for testing. The fixed random_state makes the
# split -- and therefore the accuracy printed below -- reproducible across runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    encoded_text, y, test_size=0.2, random_state=42
)

# Fit the brute-force classifier and score it with 10 neighbours and
# Euclidean distance.
knn = KNN()
knn.fit(Xtrain, ytrain)
knn.setmode(10, distance.euclidean)
print(knn.evaluate(Xtest, ytest))

Set k = 10, metric = <function euclidean at 0x7f88ee0fc280>
0/1035: good
1/1035: bad
2/1035: good
3/1035: bad
4/1035: good
5/1035: good
6/1035: good
7/1035: good
8/1035: good
9/1035: good
10/1035: good
11/1035: good
12/1035: good
13/1035: good
14/1035: good
15/1035: good
16/1035: good
17/1035: good
18/1035: good
19/1035: good
20/1035: good
21/1035: good
22/1035: good
23/1035: bad
24/1035: good
25/1035: good
26/1035: good
27/1035: bad
28/1035: good
29/1035: good
30/1035: good
31/1035: good
32/1035: good
33/1035: good
34/1035: good
35/1035: bad
36/1035: good
37/1035: good
38/1035: good
39/1035: bad
40/1035: bad
41/1035: good
42/1035: good
43/1035: good
44/1035: good
45/1035: good
46/1035: good
47/1035: good
48/1035: bad
49/1035: bad
50/1035: good
51/1035: good
52/1035: bad
53/1035: bad
54/1035: good
55/1035: good
56/1035: good
57/1035: good
58/1035: good
59/1035: bad
60/1035: good
61/1035: bad
62/1035: bad
63/1035: good
64/1035: good
65/1035: good
66/1035: good
67/1035: good
68/1035: goo

KeyboardInterrupt: 