# K-Nearest Neighbors (K-NN)

### 參考課程實作並在datasets_483_982_spam.csv的資料集中獲得90% 以上的 accuracy (testset)

## Importing the libraries

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import codecs
import re
from sklearn.preprocessing import LabelEncoder

## Importing the dataset

In [12]:
#"讀取資料集"
# Read the spam CSV (decoded as latin-1) and preview its first rows.
spam_csv = r'datasets_483_982_spam.csv'
dataset = pd.read_csv(spam_csv, encoding='latin-1')
dataset.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### 取出訓練內文與標註

In [25]:
# Encode the string class labels in column v1 as integers and write them
# back over the original column.
le = LabelEncoder()
label = le.fit(dataset['v1']).transform(dataset['v1'])
dataset['v1'] = label

In [57]:
# X: the v2 column (message text); Y: the v1 column (class label).
X = dataset['v2']
Y = dataset['v1']

In [27]:
# Preview the first five documents.
training_preview = X[:5]
print(f'Training Data Examples : \n{training_preview}')

Training Data Examples : 
0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object


In [28]:
# Preview the first five labels.
label_preview = Y[:5]
print(f'Labeling Data Examples : \n{label_preview}')

Labeling Data Examples : 
0    0
1    0
2    1
3    0
4    0
Name: v1, dtype: int64


### 文字預處理

In [49]:
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords

import nltk

# Fetch every NLTK resource the preprocessing in this cell relies on, not only
# the stop-word list: word_tokenize needs the punkt models, pos_tag needs the
# perceptron tagger, and WordNetLemmatizer needs the WordNet corpus.
# Both the older and the newer resource names are requested because they
# differ between NLTK releases; resources that are already present are
# reported as up-to-date.
for nltk_resource in ('stopwords',
                      'punkt', 'punkt_tab',
                      'wordnet', 'omw-1.4',
                      'averaged_perceptron_tagger',
                      'averaged_perceptron_tagger_eng'):
    nltk.download(nltk_resource)

# Lemmatize with POS Tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

"""可以參考課程練習方式清理文字，或是使用自己的方式"""
def get_pos_tag(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of the resulting tag selects adjective (J), noun (N), verb (V) or
    adverb (R). Any other tag falls back to noun.
    """
    _, raw_tag = nltk.pos_tag([word])[0]
    initial = raw_tag[0].upper()
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'R':
        return wordnet.ADV
    # 'N' and every unrecognised tag both resolve to noun.
    return wordnet.NOUN

def get_clean_text(Corpus):
    """Normalise raw documents into lemmatised, stop-word-free strings.

    Each document has every non-letter character replaced by a space, is
    lower-cased, tokenised with ``nltk.word_tokenize``, filtered against
    NLTK's English stop-word list, and lemmatised using the part of speech
    reported by ``get_pos_tag``.

    Parameters
    ----------
    Corpus : iterable of str
        Raw documents.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document, in input order.
        An empty input yields an empty list.
    """
    # `A-Z`, not `A-z`: the wider range also admits the ASCII symbols
    # [ \ ] ^ _ ` that sit between 'Z' and 'a'.
    non_letters = re.compile(r'[^a-zA-Z]')
    stop_words = set(stopwords.words('english'))
    # Created locally so the function does not depend on a notebook-level
    # `lemmatizer` variable being defined by some other cell.
    lemmatizer = WordNetLemmatizer()

    Corpus_output = []
    for corpus in Corpus:
        tokens = nltk.word_tokenize(non_letters.sub(' ', corpus).lower())
        lemmas = [lemmatizer.lemmatize(word, get_pos_tag(word))
                  for word in tokens
                  if word not in stop_words]
        Corpus_output.append(' '.join(lemmas))
    return Corpus_output

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/evenpan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
# Clean the raw message column directly rather than whatever X currently
# holds, so re-running this cell never cleans already-cleaned (or already
# vectorised) data.
X = get_clean_text(dataset.loc[:, 'v2'])

### Bag of words

In [59]:
from sklearn.feature_extraction.text import CountVectorizer
# max_features is the number of columns to build; terms are kept according to
# how frequently they occur.
cv = CountVectorizer(max_features=1000)
bow_sparse = cv.fit_transform(X)
X = bow_sparse.toarray()

In [None]:
# Display the dimensions of X.
X.shape

## Splitting the dataset into the Training set and Test set

In [42]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the seed is fixed so the split
# is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [54]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [55]:
# Sweep over vocabulary size and K, reporting 10-fold cross-validation
# accuracy on the training split for every combination.
max_features = [1000, 2000, 3000, 5000]
n_neighbors  = [5, 10, 25, 50]  # feel free to try other values of K

# Text cleaning does not depend on max_features, so it is done once here
# instead of once per sweep iteration.
corpus_clean = get_clean_text(dataset.loc[:, 'v2'])

# NOTE(review): X, cv and the train/test split below are notebook-level
# names. After the loop they hold the LAST max_features setting, and later
# cells that use X_train / X_test pick those values up.
for feature in max_features:
    cv = CountVectorizer(max_features = feature)
    X = cv.fit_transform(corpus_clean).toarray()
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
    for k in n_neighbors:
        # Minkowski metric with p = 2.
        classifier = KNeighborsClassifier(n_neighbors = k, metric = 'minkowski', p = 2)
        # cv = 10 splits the training data into 10 folds.
        accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10, n_jobs = -1)
        print('設置max_feature值:{}'.format(feature))
        print('設置K值:{}'.format(k))
        print('Average Accuracy: {}'.format(accuracies.mean()))
        print('Accuracy STD: {}'.format(accuracies.std()))

設置max_feature值:1000
設置K值:5
Average Accuracy: 0.9326870559782335
Accuracy STD: 0.011142883448628004
設置max_feature值:1000
設置K值:10
Average Accuracy: 0.9073346097647
Accuracy STD: 0.009748916670463811
設置max_feature值:1000
設置K值:25
Average Accuracy: 0.8801869300146118
Accuracy STD: 0.005265672355040464
設置max_feature值:1000
設置K值:50
Average Accuracy: 0.8667269612535901
Accuracy STD: 0.001032051062554304
設置max_feature值:2000
設置K值:5
Average Accuracy: 0.9246112762634151
Accuracy STD: 0.007470854281771815
設置max_feature值:2000
設置K值:10
Average Accuracy: 0.8988093918476345
Accuracy STD: 0.00692921751678977
設置max_feature值:2000
設置K值:25
Average Accuracy: 0.8718869350531566
Accuracy STD: 0.0033905170727354505
設置max_feature值:2000
設置K值:50
Average Accuracy: 0.8660543155136796
Accuracy STD: 0.0008912552114944287
設置max_feature值:3000
設置K值:5
Average Accuracy: 0.9225923313347106
Accuracy STD: 0.007469001136575767
設置max_feature值:3000
設置K值:10
Average Accuracy: 0.8956698745402327
Accuracy STD: 0.006668910790140812
設置max

## Training the K-NN model on the Training set

In [60]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a 5-nearest-neighbours classifier on the current training split.
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X=X_train, y=y_train)

KNeighborsClassifier()

## Evaluating accuracy on the training and test sets

In [61]:
# Mean accuracy of the fitted classifier on the data it was trained on.
train_accuracy = classifier.score(X_train, y_train)
print(f'Trainset Accuracy: {train_accuracy}')

Trainset Accuracy: 0.9385236706304689


In [62]:
# Mean accuracy of the fitted classifier on the held-out test split.
test_accuracy = classifier.score(X_test, y_test)
print(f'Testset Accuracy: {test_accuracy}')

Testset Accuracy: 0.9219730941704036


## Predicting the Test set results

In [63]:
# Predicted class labels for every row of the test split.
y_pred = classifier.predict(X_test)

## Making the Confusion Matrix

In [64]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Left as the last expression so the notebook displays the accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[965   0]
 [ 87  63]]


0.9219730941704036