# k-Nearest Neighbors (kNN) 

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [None]:
# Set Style
# sns.set_style('darkgrid')
# sns.set_style('dark')
# sns.set_style('whitegrid')
# sns.set_style('white')
# sns.set_style('ticks')

## Iris Dataset

In [None]:
# Load the Iris dataset shipped with seaborn, then preview six randomly
# chosen rows (seeded so the preview is the same on every run).
df = sns.load_dataset(name='iris')
df.sample(n=6, random_state=1)

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Distinct class labels found in the target column.
df.species.unique()

In [None]:
# How many samples does each class have?
df.groupby('species').count()
# df.groupby('species').sepal_length.count()

In [None]:
# Frequency table of the target: one (label, count) row per species.
label, count = np.unique(df.species, return_counts=True)
freq = np.array((label, count)).transpose()
print(freq)

## Data Visualization

In [None]:
# Histogram of the species column.
sns.histplot(data=df, x='species')
plt.show()

In [None]:
# Same histogram, colored per species and drawn with narrower bars (shrink=.5).
sns.histplot(data=df, x='species', hue='species', shrink=.5)
plt.show()

In [None]:
# Scatter plot of sepal length vs. sepal width using Matplotlib directly.
plt.scatter(df.sepal_length, df.sepal_width)
plt.show()

In [None]:
# The same sepal scatter plot drawn with seaborn.
sns.scatterplot(data=df, x='sepal_length', y='sepal_width')
plt.show()

In [None]:
# Sepal length vs. width; every species gets its own color and marker.
sns.scatterplot(
    x='sepal_length', y='sepal_width', data=df,
    style='species', hue='species',
    palette='Set1', s=80,
)
plt.title('Iris dataset: Sepal')
plt.show()

In [None]:
# Petal length vs. width; every species gets its own color and marker.
sns.scatterplot(
    x='petal_length', y='petal_width', data=df,
    style='species', hue='species', s=80,
)
plt.title('Iris dataset: Petal')
plt.show()

## KDE 
A kernel density estimate (KDE) plot visualizes the distribution of observations in a dataset as a smooth, continuous curve.

In [None]:
# Histogram of petal width, colored per species (bars shrunk to 80% width).
sns.histplot(data=df, x='petal_width', hue='species', shrink=.8)
plt.show()

In [None]:
# KDE of petal width, one curve per species.
# `fill=True` shades the area under each curve; it replaces the `shade`
# keyword, which seaborn deprecated and later removed.
sns.kdeplot(data=df, x='petal_width', hue='species', fill=True)
plt.show()

In [None]:
# KDE of sepal width, one curve per species.
# `fill=True` shades the area under each curve; it replaces the `shade`
# keyword, which seaborn deprecated and later removed.
sns.kdeplot(data=df, x='sepal_width', hue='species', fill=True)
plt.show()

## Pairplot

In [None]:
# Enlarge the default font for the figures that follow, then draw a grid of
# pairwise feature plots colored by species.
plt.rcParams['font.size'] = 14
sns.pairplot(data=df, hue='species')
plt.show()

In [None]:
# Pairwise plots restricted to the two petal measurements.
columns = ['petal_length', 'petal_width', 'species']
petal_df = df.loc[:, columns]
sns.pairplot(data=petal_df, hue='species', height=4)
plt.show()

## Prepare (เตรียม) X, y

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns=['species'])
X.head()

In [None]:
# Target vector: the species column (still string labels at this point).
y = df.species
y.head()

## Label Encoding

In [None]:
# Label encoding: replace each species name with an integer code.
# pd.factorize numbers the labels in order of first appearance
# (expected here: setosa -> 0, versicolor -> 1, virginica -> 2).
y, class_names = pd.factorize(df.species)

# Unique species names; position i holds the name for code i.
class_names

In [None]:
# Name of the class encoded as 1.
class_names[1]

In [None]:
# First ten encoded labels.
y[:10]

In [None]:
# First rows of the original data, to compare against the encoded labels.
df.head()

In [None]:
# Rows at positions 50-54 of the original data.
df[50:55]

In [None]:
# Encoded labels at the same positions (50-54).
y[50:55]

## Train-test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# identical between runs.
split = train_test_split(X, y, test_size=0.3, random_state=20)
X_train, X_test, y_train, y_test = split

In [None]:
# (rows, columns) of the training and test feature matrices.
X_train.shape , X_test.shape

In [None]:
# Class frequencies in the test set: one (code, count) row per class.
label, count = np.unique(y_test, return_counts=True)
freq = np.array((label, count)).transpose()
print(freq)

In [None]:
# Bar chart of the test-set class counts, with species names as tick labels.
class_codes, class_counts = freq[:, 0], freq[:, 1]
plt.bar(class_codes, class_counts)
plt.xticks(np.arange(3), class_names)
plt.show()

In [None]:
# The same class-count chart drawn with seaborn.
class_codes, class_counts = freq[:, 0], freq[:, 1]
sns.barplot(x=class_codes, y=class_counts)
plt.xticks(np.arange(3), class_names)
plt.show()

In [None]:
# First five training samples.
X_train[:5]

In [None]:
# Encoded labels of the first five training samples.
y_train[:5]

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the estimator so the fitted tree (and the importances and plots derived
# from it) is reproducible, matching the seeded train/test split.
# Pass e.g. max_depth=3 to limit the depth of the tree.
model = DecisionTreeClassifier(random_state=20)

model.fit(X_train, y_train)

In [None]:
# 5-fold cross-validation over the whole dataset. cross_val_score fits the
# estimator itself, so no prior model.fit call is needed.
cvs = cross_val_score(estimator=model, X=X, y=y, cv=5)
print(f'cross val scores {cvs.round(3)}')
print(f'mean (%) = {cvs.mean() * 100:.3f}')

In [None]:
# Importance assigned by the fitted tree to each feature (one value per column of X).
model.feature_importances_

## Tree

In [None]:
from sklearn.tree import plot_tree

# Pass plain lists: X.columns and class_names are pandas Index objects, which
# some scikit-learn releases reject when validating these arguments.
data_feature_names = list(X.columns)

plt.subplots(nrows=1, ncols=1, figsize=(6, 6), dpi=96)
plot_tree(model,
          filled=True, rounded=True,
          feature_names=data_feature_names,
          class_names=list(class_names))

plt.show()

## Evaluation

In [None]:
# Mean accuracy of the fitted model on the held-out test set.
score = model.score(X_test, y_test)
# Use the built-in round(): depending on the scikit-learn version, `score`
# may be a plain Python float, which has no `.round()` method.
print('Score =', round(score, 3))

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Predict the test set once and reuse the predictions for every report below.
y_pred = model.predict(X_test)

print(f'Score -> {model.score(X_test, y_test):.3f}')

# Per-class precision / recall / F1, labelled with species names instead of 0/1/2.
print(classification_report(y_test, y_pred, target_names=class_names))

cm = confusion_matrix(y_test, y_pred)
print(cm)

## Confusion Matrix Plot

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

plt.rcParams['font.size'] = 12

# Confusion matrix of the test-set predictions, drawn with species names on the axes.
cm = confusion_matrix(y_test, y_pred)
display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
display.plot()  # optionally: cmap='Pastel1'
plt.title('Confusion Matrix')
plt.show()

## Misclassification (ผิดพลาด)

In [None]:
# Predicted class codes for the test set.
y_pred = model.predict(X_test)

In [None]:
# Side-by-side table of true vs. predicted codes; `result` is True where they agree.
dy = pd.DataFrame({
    'y_test': y_test,
    'y_pred': y_pred,
    'result': y_test == y_pred,
})

# Keep only the rows the model got wrong.
dy[~dy.result]

In [None]:
# Number of test samples whose prediction differs from the true label.
wrong_mask = y_test != y_pred
print("Miss-classification:", len(y_test[wrong_mask]))

## Predict

In [None]:
# Three unseen samples; columns follow the order of X
# (sepal_length, sepal_width, petal_length, petal_width).
X_new = np.array([[4.2, 4.0, 1.9, 0.8],
                  [6.1, 3.5, 3.9, 1.5],
                  [6.7, 3.5, 6.3, 1.4]])

# The model was fitted on a DataFrame, so give the new samples the same column
# names; predicting on a bare array triggers scikit-learn's feature-name
# warning. X_new itself stays an ndarray because later cells index it by position.
y_pred = model.predict(pd.DataFrame(X_new, columns=X.columns))

# Translate the predicted codes back to species names.
for i in y_pred:
    print(class_names[i])

## Plot Decision Regions 

In [None]:
# Decision regions in the petal-length / petal-width plane, with the
# training samples drawn on top.

from mlxtend.plotting import plot_decision_regions

value = 0
width = 9.5

# Features 0 and 1 (the sepal measurements) are not on the axes; they are
# supplied to mlxtend as filler values / ranges instead.
sepal_idx = (0, 1)
ax = plot_decision_regions(
    X=np.array(X_train), y=y_train, clf=model,
    feature_index=[2, 3],
    filler_feature_values={j: value for j in sepal_idx},
    filler_feature_ranges={j: width for j in sepal_idx},
    legend=2,
)

# Re-create the legend with species names in place of the integer codes.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, class_names, loc='upper left', framealpha=0.5)

x_name, y_name = df.columns[2], df.columns[3]
plt.xlabel(x_name)
plt.ylabel(y_name)
plt.title('Decision Tree: Regions from Training Set')

plt.show()

In [None]:
# Decision regions with the test set and the newly predicted data points.

from mlxtend.plotting import plot_decision_regions

value = 0
width = 9.5

# Features 0 and 1 (the sepal measurements) are not on the axes; they are
# supplied to mlxtend as filler values / ranges instead.
sepal_idx = (0, 1)
ax = plot_decision_regions(
    X=np.array(X_test), y=y_test, clf=model,
    feature_index=[2, 3],  # petal length / petal width
    filler_feature_values={j: value for j in sepal_idx},
    filler_feature_ranges={j: width for j in sepal_idx},
    legend=2,
)

# Re-create the legend with species names in place of the integer codes.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, class_names, loc='upper left', framealpha=0.5)

plt.title('Decision Tree: Regions from Test set and New data points')

plt.xticks()
plt.yticks()
x_name, y_name = df.columns[2], df.columns[3]
plt.xlabel(x_name)
plt.ylabel(y_name)

# Overlay the new samples (petal columns) as large blue dots.
plt.scatter(x=X_new[:, 2], y=X_new[:, 3], s=130, c='b', marker='o')
plt.show()

## Data Points

In [None]:
# The new samples that were passed to the model.
X_new

In [None]:
# Sepal length of the new samples.

spl = X_new[: ,0]   # column 0 only: sepal length
spl

In [None]:
# Sepal width of the new samples (column 1).
spw = X_new[: ,1]
spw

In [None]:
# Sepal scatter plot of the dataset with the new samples overlaid as large blue dots.
sns.scatterplot(
    x='sepal_length', y='sepal_width', data=df,
    style='species', hue='species', s=80,
)

plt.scatter(x=spl, y=spw, s=130, c='b', marker='o')

plt.title('Iris dataset: Sepal')
plt.show()

In [None]:
# Petal length of the new samples (column 2).
ptl = X_new[:,2]
ptl


In [None]:
# Petal width of the new samples (column 3).
ptw =  X_new[:,3]
ptw

In [None]:
# Full array of new samples, for reference next to the petal columns above.
X_new

In [None]:
# Petal scatter plot of the dataset with the new samples overlaid as large blue dots.
sns.scatterplot(
    x='petal_length', y='petal_width', data=df,
    style='species', hue='species', s=80,
)

plt.scatter(x=ptl, y=ptw, s=130, c='b', marker='o')

plt.title('Iris dataset: Petal')
plt.show()

# SVM

In [None]:
from sklearn.svm import SVC

# model = SVC(kernel='linear')
model = SVC(kernel='rbf')     # default

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20)

# Actually train and score the SVM: previously the estimator was created but
# never fitted, so this section produced no SVM result at all.
model.fit(X_train, y_train)
print('SVM score: {:.4f}'.format(model.score(X_test, y_test)))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=7)

# 5-fold cross-validation over the whole dataset. cross_val_score fits the
# estimator itself, so no prior model.fit call is needed.
cvs = cross_val_score(estimator=model, X=X, y=y, cv=5)
print(f'cross val scores {cvs.round(3)}')
print(f'mean (%) = {cvs.mean() * 100:.3f}')

# k-Nearest Neighbors (kNN)

## Cross-validation

In [None]:
from sklearn.neighbors import KNeighborsClassifier

k = 27      # hyperparameter: number of neighbors consulted per prediction
model = KNeighborsClassifier(n_neighbors=k)

# 5-fold cross-validation over the whole dataset. cross_val_score fits the
# estimator itself, so no prior model.fit call is needed.
cvs = cross_val_score(estimator=model, X=X, y=y, cv=5)
print(f'cross val scores {cvs.round(3)}')
print(f'mean (%) = {cvs.mean() * 100:.3f}')

In [None]:
# Sweep over candidate values of k and record the mean 5-fold CV accuracy of each.
k_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 19, 21]
cvs_list = []
for k in k_list:  # iterate the values directly; the enumerate index was never used
    model = KNeighborsClassifier(n_neighbors=k)

    cvs = cross_val_score(model, X, y, cv=5)  # fits internally; no model.fit needed
    cvs_list.append(cvs.mean().round(4))

cvs_list

In [None]:
# Pair each k with its mean cross-validation score.
list(zip(k_list, cvs_list))

In [None]:
# Line chart of mean cross-validation score against k, with a tick at every tested k.
plt.plot(k_list, cvs_list)
plt.title('Score and k')

plt.xticks(k_list)
plt.xlabel('k')
plt.ylabel('Score')
plt.show()

## Train

In [None]:
from sklearn.neighbors import KNeighborsClassifier

k = 11

# Same 70/30 seeded split as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=20,
)

# fit() returns the estimator itself, so construction and training can be chained.
model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
model

## Evaluation

In [None]:
# Evaluate the fitted kNN model on the held-out test set.
y_pred = model.predict(X_test)
score = model.score(X_test, y_test)

print(f'Score: {score:.4f}')

# Per-class metrics labelled with species names, then the raw confusion matrix.
print(classification_report(y_test, y_pred, target_names=class_names))
print(confusion_matrix(y_test, y_pred))

## Predict

In [None]:
# Three unseen samples; columns follow the order of X
# (sepal_length, sepal_width, petal_length, petal_width).
X_new = np.array([[4.2, 4.0, 1.9, 0.8],
                  [6.1, 3.5, 3.9, 1.5],
                  [6.7, 3.5, 6.3, 1.4]])

# The model was fitted on a DataFrame, so give the new samples the same column
# names; predicting on a bare array triggers scikit-learn's feature-name
# warning. X_new itself stays an ndarray because later cells index it by position.
y_pred = model.predict(pd.DataFrame(X_new, columns=X.columns))

# Translate the predicted codes back to species names.
for i in y_pred:
    print(class_names[i])

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

plt.rcParams['font.size'] = 12

# y_pred currently holds predictions for X_new, so recompute it for the test set.
y_pred = model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
display.plot()  # optionally: cmap='Pastel1'
plt.title('Confusion Matrix')
plt.show()

## Decision Regions

In [None]:
from mlxtend.plotting import plot_decision_regions

value = 0
width = 9.5

# kNN decision regions in the petal-length / petal-width plane. Features 0 and
# 1 (the sepal measurements) are not on the axes; they are supplied to mlxtend
# as filler values / ranges instead.
sepal_idx = (0, 1)
ax = plot_decision_regions(
    X=np.array(X_test), y=y_test, clf=model,
    feature_index=[2, 3],
    filler_feature_values={j: value for j in sepal_idx},
    filler_feature_ranges={j: width for j in sepal_idx},
    legend=2,
)

# Re-create the legend with species names in place of the integer codes.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, class_names, loc='upper left', framealpha=0.5)

plt.title('kNN: Iris Prediction')

x_name, y_name = df.columns[2], df.columns[3]
plt.xlabel(x_name)
plt.ylabel(y_name)

# Overlay the new samples (petal columns) as large blue dots.
plt.scatter(x=X_new[:, 2], y=X_new[:, 3], s=130, c='b', marker='o')
plt.show()

In [None]:
from mlxtend.plotting import plot_decision_regions

value = 0
width = 9.5

# kNN decision regions with sepal_width (column 1) on the x-axis and
# petal_length (column 2) on the y-axis. The two features not on the axes
# (columns 0 and 3) are supplied to mlxtend as filler values / ranges.
ax = plot_decision_regions(np.array(X_test), y_test,
                      clf=model, feature_index=[1, 2],
                      filler_feature_values={0: value, 3: value},
                      filler_feature_ranges={0: width, 3: width},
                      legend=2)

# Re-create the legend with species names in place of the integer codes.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, class_names, framealpha=0.5, loc='upper left')

plt.title('kNN: Iris Prediction')

plt.xlabel(df.columns[1])
plt.ylabel(df.columns[2])

# The overlay must use the same two features as the axes (columns 1 and 2);
# plotting columns 2 and 3 would put the new points in the wrong place.
plt.scatter(X_new[:,1], X_new[:,2], marker='o', s=130, c='b')
plt.show()

In [None]:
from mlxtend.plotting import plot_decision_regions

value = 0
width = 9.5

# kNN decision regions with petal_width (column 3) on the x-axis and
# sepal_width (column 1) on the y-axis, drawn over the training samples.
# The two features not on the axes (columns 0 and 2) are supplied to mlxtend
# as filler values / ranges.
ax = plot_decision_regions(np.array(X_train), y_train,
                      clf=model, feature_index=[3, 1],
                      filler_feature_values={0: value, 2: value},
                      filler_feature_ranges={0: width, 2: width},
                      legend=2)

# Re-create the legend with species names in place of the integer codes.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, class_names, framealpha=0.5, loc='upper left')

plt.title('kNN: Iris Prediction')

# Axis labels and the overlay must follow feature_index=[3, 1]; the previous
# labels (columns 0, 1) and overlay (columns 2, 3) did not match the plot.
plt.xlabel(df.columns[3])
plt.ylabel(df.columns[1])

plt.scatter(X_new[:,3], X_new[:,1], marker='o', s=130, c='b')
plt.show()

# PCA : Principal component analysis

เปลี่ยน 4 features -> 2 features เพื่อ plot x-y graph

In [None]:
# Shapes of the train/test feature matrices before the PCA projection.
X_train.shape, X_test.shape

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)

# Split first, then fit the PCA on the training rows only, so the projection
# is not influenced by the test set. The seed matches the earlier splits, so
# the same rows end up in train and test.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20)
X_train = pca.fit_transform(X_train_raw)
X_test = pca.transform(X_test_raw)

# Projection of the full dataset with the train-fitted PCA; kept because
# later cells reference X_pca.
X_pca = pca.transform(X)

In [None]:
# Shapes after the PCA projection (PCA was created with n_components=2).
X_train.shape, X_test.shape

## Model

In [None]:
k = 5
model = KNeighborsClassifier(n_neighbors=k)

# Fit on the training split only. Fitting on all of X_pca would let the model
# memorize the test rows that the following plots use to judge it.
model.fit(X_train, y_train)

## Decision Regions

In [None]:
from mlxtend.plotting import plot_decision_regions

# Decision regions in the 2-D PCA space with the test samples on top. No
# filler arguments are passed here because the data has only two features.
ax = plot_decision_regions(X=np.array(X_test), y=y_test, clf=model, legend=2)

# Re-create the legend with species names in place of the integer codes.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, class_names, loc='upper left', framealpha=0.5)

plt.title(f'kNN: Iris Dataset (k={k})')

# Hide the tick marks on both axes.
plt.xticks([])
plt.yticks([])
plt.xlabel('PCA1')
plt.ylabel('PCA2')

plt.show()

In [None]:
# Project the new samples with the already-fitted PCA (transform only, no refit).
X_new_pca = pca.transform(X_new)
X_new_pca

In [None]:
# Decision regions in PCA space with the test samples, plus the projected new samples.
ax = plot_decision_regions(X=np.array(X_test), y=y_test, clf=model, legend=2)

# Re-create the legend with species names in place of the integer codes.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, class_names, loc='upper left', framealpha=0.5)

plt.title('kNN: Iris Prediction')

plt.xticks()
plt.yticks()
plt.xlabel('PCA1')
plt.ylabel('PCA2')

# Overlay the new samples as large blue dots at their PCA coordinates.
new_pc1, new_pc2 = X_new_pca[:, 0], X_new_pca[:, 1]
plt.scatter(x=new_pc1, y=new_pc2, s=130, c='b', marker='o')
plt.show()

## Predict

In [None]:
# Classify the PCA-projected new samples and print the species name of each.
y_pred = model.predict(X_new_pca)
for species_name in class_names[y_pred]:
    print(species_name)