## KNN
#### 필요한 패키지를 불러온다:

In [None]:
%matplotlib inline
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics, preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split,RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

try:
    from scipy.stats import itemfreq
except ImportError:
    # scipy.stats.itemfreq was removed in SciPy 1.3; provide an equivalent
    # built on np.unique so the name stays available on newer SciPy.
    def itemfreq(a):
        """Return a 2-D array of the unique values of *a* and their counts."""
        items, counts = np.unique(a, return_counts=True)
        return np.array([items, counts]).T

#### 데이터를 불러온다:

In [None]:
# Switch the working directory to the data folder so that the data file can
# be referenced by its bare file name.
data_dir = r"D:\python_ML\python_ml_basic\data"
os.chdir(data_dir)

In [None]:
# Read the spam data set. Column names are inferred from the file, and the
# text is decoded as Latin-1 ('ISO-8859-1' is another name for the same codec).
csv_name = 'data_spam.csv'
df = pd.read_csv(csv_name, header='infer', encoding='latin1')

In [None]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
df.shape

In [None]:
# Preview the first five rows of the data.
df.head(n=5)

In [None]:
# Separate the predictor columns from the 'is_spam' target column.
features = df.drop(columns='is_spam')
X = np.array(features)
Y = np.array(df['is_spam'])

# Keep the column labels around: all columns, and the predictor columns only.
header = df.columns
headerX = features.columns

#### 통계적 요약과 시각화:

In [None]:
# Count how often each class label occurs in Y and plot the counts.
# np.unique(..., return_counts=True) replaces scipy.stats.itemfreq, which was
# removed in SciPy 1.3. It also keeps the counts as integers even when the
# labels are strings, so they can be used directly as bar heights.
labels, counts = np.unique(Y, return_counts=True)
plt.bar(labels, counts, color='blue')
plt.title('Category Frequency')
plt.show()

# Tabulate label/count pairs for display; a DataFrame keeps each column's own dtype.
table = pd.DataFrame({'label': labels, 'count': counts})
table

'yes', 'no' 레이블을 숫자로 변환:

In [None]:
# Encode the class labels as integers; the fitted encoder stays available
# as LE.
LE = preprocessing.LabelEncoder().fit(Y)
Y = LE.transform(Y)

In [None]:
# Re-tabulate the class frequencies of the encoded labels as a two-column
# array of (value, count). np.unique replaces scipy.stats.itemfreq, which was
# removed in SciPy 1.3.
values, counts = np.unique(Y, return_counts=True)
table = np.column_stack((values, counts))
table

NaN이 있으면 채워 넣음:

In [None]:
# Fill in missing values (NaN) with the mean of their column.
# SimpleImputer is the successor of preprocessing.Imputer, which was removed
# in scikit-learn 0.22; its defaults (NaN as the missing marker, 'mean'
# strategy) match the old class's defaults.
IPT = SimpleImputer()
X = IPT.fit_transform(X)

#### 통계적 요약 및 데이터 전처리:

In [None]:
# Summary statistics of the DataFrame's columns, rounded to five decimals.
summary = df.describe()
summary.round(5)

In [None]:
# Standardize every feature column to zero mean and unit variance.
# NOTE(review): this scales the full data set before the train/test split
# performed later, so test-set statistics influence the training features.
# Consider fitting the scaler on the training part only — confirm whether
# this simplification is intentional.
X = preprocessing.scale(X)

In [None]:
# Summary statistics of the standardized features, rounded to three decimals.
scaled = pd.DataFrame(X, columns=headerX)
scaled.describe().round(3)

#### KNN 적용:

In [None]:
# Hold out half of the samples for testing; the fixed random_state makes the
# split reproducible.
split = train_test_split(X, Y, test_size=0.5, random_state=3)
X_train, X_test, Y_train, Y_test = split

In [None]:
# Report the shape of each piece of the split, in the order
# X_train, X_test, Y_train, Y_test.
for part in (X_train, X_test, Y_train, Y_test):
    print(part.shape)

KNN with n_neighbors = 5

In [None]:
# Train a 5-nearest-neighbour classifier and evaluate it on the test split.
knn5 = KNeighborsClassifier(n_neighbors=5).fit(X_train, Y_train)
Y_pred = knn5.predict(X_test)

conf_mat = metrics.confusion_matrix(Y_test, Y_pred)
acc = metrics.accuracy_score(Y_test, Y_pred)

print(conf_mat)
print("------------------------")
print("Accuracy : " + str(np.round(acc, 3)))

KNN with n_neighbors = 100

In [None]:
# Train a 100-nearest-neighbour classifier and evaluate it on the test split.
knn100 = KNeighborsClassifier(n_neighbors=100).fit(X_train, Y_train)
Y_pred = knn100.predict(X_test)

conf_mat = metrics.confusion_matrix(Y_test, Y_pred)
acc = metrics.accuracy_score(Y_test, Y_pred)

print(conf_mat)
print("------------------------")
print("Accuracy : " + str(np.round(acc, 3)))

#### KNN의 매개변수 (분산-편향 트레이드 오프):

In [None]:
# Fit one classifier per k in 1..50 and record its test-set accuracy, so the
# effect of k on accuracy can be inspected.
k_grid = np.arange(1, 51)
accs = []
for k in k_grid:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, Y_train)
    Y_pred = knn.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred)
    accs.append(score)

In [None]:
# Scatter plot of the test accuracy obtained for each value of k.
fig, ax = plt.subplots()
ax.scatter(k_grid, accs, c='red', marker='o', s=10, alpha=0.7)
ax.set_xlabel('k')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy vs k')
plt.show()

#### KNN의 매개변수 최적화:

In [None]:
# Search space for the grid search: k from 1 to 50 and both weighting schemes.
k_grid = np.arange(1, 51)
weights = ['uniform', 'distance']
parameters = dict(n_neighbors=k_grid, weights=weights)

In [None]:
# Run a 10-fold cross-validated grid search over the parameter grid on the
# training split, then pull out the winning hyper-parameters.
gridCV = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=parameters, cv=10)
gridCV.fit(X_train, Y_train)

best_params = gridCV.best_params_
best_k, best_w = best_params['n_neighbors'], best_params['weights']

In [None]:
# Report the hyper-parameters selected by the grid search.
print("Best k : ", best_k, sep="")
print("Best weight : ", best_w, sep="")

In [None]:
# Refit a classifier with the selected hyper-parameters on the training
# split and report its accuracy on the test split.
knn_best = KNeighborsClassifier(n_neighbors=best_k, weights=best_w).fit(X_train, Y_train)
Y_pred = knn_best.predict(X_test)

best_acc = metrics.accuracy_score(Y_test, Y_pred)
print("Best Accuracy : " + str(np.round(best_acc, 3)))