In [None]:
# filename: knn.ipynb
# purpose: knn model implementation

# OHT knn model implementation 

### KNN implementation method 
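- scikit-learn `KNeighborsClassifier`; `n_neighbors` is chosen by a 4-fold `GridSearchCV` over `conf.N_NEIGHBORS`, then a final model is fitted with the best value
- optional `StandardScaler` feature scaling, controlled by `USESCALE`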

### Processing flow
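- read the table named by `conf.TABNAME_MIX`, sort it by the first configured column and reset the index
- split features and label into train (80%) and test (20%) sets
- grid-search the best K, fit the final model, predict on the test set
- report accuracy, confusion matrix and classification report
- repeat with 2 selected features, then plot and save the decision boundaries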

In [None]:
# packages
import time
import pathlib
import numpy as np
import pandas as pd

import humanfriendly as human

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import matplotlib.pyplot as plt

# Plotting decision boundaries
from matplotlib.colors import ListedColormap

import ohtconf as conf
import ohtcomm as comm

## Main

In [None]:
# Wall-clock start of the whole notebook run. It is not read again in the cells
# visible here; presumably a later cell reports the total elapsed time (confirm).
mainstart = time.time()

In [None]:
# Load the source table and order it by the first configured column
_start = time.time()

# Chained calls build the sorted frame with a fresh 0..n-1 index in one expression
dfmix = (
    comm.read_tabdf(conf.TABNAME_MIX)
    .sort_values(by=conf.COLUMN_NAMES[0])
    .reset_index(drop=True)
)

_elapsed = time.time() - _start
print(f"elapsed time: {human.format_timespan(_elapsed)}")

In [None]:
# Switch for StandardScaler feature scaling in the data-preparation cells below.
# Left off: no big difference was observed between scaled and unscaled features.
USESCALE = False

### KNN with all 10 features

In [None]:
_start = time.time()

# Split features and label
X = dfmix.loc[:, conf.COLUMN_GRAPH]
y = dfmix[conf.COLUMN_FLAG]  # series for 1-d array

# Split data into training and testing sets.
# This happens BEFORE scaling so the hold-out rows cannot influence the scaler statistics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: fit on the training split only, then apply the same transform everywhere
if USESCALE:
    scaler = StandardScaler().fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
    # Keep the full feature frame in the same (scaled) space as the splits
    X = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

_elapsed = time.time() - _start
print(f"elapsed time: {human.format_timespan(_elapsed)}")

In [None]:
# Sanity check: container types and the first three rows of every split
print(f"data type X={type(X)}, X_train={type(X_train)}, y_train={type(y_train)}")
print(f"X_train={X_train.iloc[:3]}")
print(f"X_test={X_test.iloc[:3]}")
print(f"y_train={y_train.iloc[:3]}")
print(f"y_test={y_test.iloc[:3]}")

In [None]:
_start = time.time()

# Candidate K values are taken from the project configuration
param_grid = dict(n_neighbors=conf.N_NEIGHBORS)
print(f"param_grid={param_grid}")

# Base estimator; the grid search clones it for every candidate and fold
knn = KNeighborsClassifier()

# 4-fold cross-validation: each candidate K is trained on 3 folds and scored on the
# remaining one, rotating the validation fold until every fold has been used once.
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=4, verbose=3)
grid_search.fit(X_train, y_train)

# K with the best mean cross-validation score
best_k = grid_search.best_params_["n_neighbors"]
print(f"best_k={best_k} on param_grid={param_grid}")

_elapsed = time.time() - _start
print(f"elapsed time: {human.format_timespan(_elapsed)}")

In [None]:
_start = time.time()

# Train the final classifier with the K selected by the grid search
# (fit() returns the estimator itself, so construction and training chain)
final_model = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)

_elapsed = time.time() - _start
print(f"elapsed time: {human.format_timespan(_elapsed)}")

In [None]:
_start = time.time()
y_pred = final_model.predict(X_test)  # labels predicted for the hold-out split
_elapsed = time.time() - _start

# Report how long the prediction took
print(f"elapsed time: {human.format_timespan(_elapsed)}")

In [None]:
# Evaluate the hold-out predictions of the all-features model

# Fraction of correctly classified test rows
_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", _accuracy)

# Counts of true label (rows) versus predicted label (columns)
_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", _confusion)

# Per-class precision, recall, f1-score and support
_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", _report)

### KNN with selected 2 features for 2-D visualization

In [None]:
_start = time.time()

# Choose 2 features. PCA was considered as a feature-reduction method, but the
# 2 features were chosen after analysing the correlation matrix heatmap.
COLUMN_NAMES_KNN = [conf.COLUMN_NAMES[1], conf.COLUMN_NAMES[6]]
COLUMN_FLAGS_KNN = [1, 6]
X = dfmix.loc[:, COLUMN_NAMES_KNN]  # TEM, NH3

# Update label for 2 features: keep the flags listed in COLUMN_FLAGS_KNN and
# clear every other flag value to 0. where() returns a new series, so dfmix is untouched.
_flag = dfmix[conf.COLUMN_FLAG]
y = _flag.where(_flag.isin(COLUMN_FLAGS_KNN), 0)  # series for 1-d array

# Split data into training and testing sets.
# This happens BEFORE scaling so the hold-out rows cannot influence the scaler statistics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: fit on the training split only, then apply the same transform everywhere
if USESCALE:
    scaler = StandardScaler().fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
    # X is reused by the plotting cell, so it must live in the same space the model is trained in
    X = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

_elapsed = time.time() - _start
print(f"elapsed time: {human.format_timespan(_elapsed)}")

In [None]:
_start = time.time()

# Candidate K values are taken from the project configuration
param_grid = dict(n_neighbors=conf.N_NEIGHBORS)
print(f"param_grid={param_grid}")

# Base estimator; the grid search clones it for every candidate and fold
knn = KNeighborsClassifier()

# 4-fold cross-validation on the 2-feature training split: each candidate K is trained
# on 3 folds and scored on the remaining one, rotating the validation fold.
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=4, verbose=3)
grid_search.fit(X_train, y_train)

# K with the best mean cross-validation score
best_k = grid_search.best_params_["n_neighbors"]
print(f"best_k={best_k} on param_grid={param_grid}")

_elapsed = time.time() - _start
print(f"elapsed time: {human.format_timespan(_elapsed)}")

In [None]:
_start = time.time()

# Final model with best K, trained on the 2-feature training split
final_model = KNeighborsClassifier(n_neighbors=best_k)
final_model.fit(X_train, y_train)

_elapsed = time.time() - _start
# Same timing message format as every other timing cell in this notebook
print(f"elapsed time: {human.format_timespan(_elapsed)}")

In [None]:
_start = time.time()

# Predictions of the 2-feature model on the hold-out split
y_pred = final_model.predict(X_test)

_elapsed = time.time() - _start
# Same timing message format as every other timing cell in this notebook
print(f"elapsed time: {human.format_timespan(_elapsed)}")

In [None]:
# Evaluate the hold-out predictions of the 2-feature model

# Fraction of correctly classified test rows
_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", _accuracy)

# Counts of true label (rows) versus predicted label (columns)
_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", _confusion)

# Per-class precision, recall, f1-score and support
_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", _report)

In [None]:
# Visualize KNN decision boundaries

# Create a mesh grid for plotting decision boundaries
# (margin of 1 on the first feature and 10 on the second, grid step of 1 on both)
x_min, x_max = X.iloc[:, 0].min() - 1, X.iloc[:, 0].max() + 1
y_min, y_max = X.iloc[:, 1].min() - 10, X.iloc[:, 1].max() + 10
xx, yy = np.meshgrid(np.arange(x_min, x_max, 1), np.arange(y_min, y_max, 1))

# Predict the class for each point in the mesh grid
# xx.ravel() - return flattened array
# np.c_[array,array] - translates slice objects to concatenation along the second axis.
dfgrid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=COLUMN_NAMES_KNN)
Z = final_model.predict(dfgrid)
Z = Z.reshape(xx.shape)

# The labels are not consecutive (0 plus the two selected flag values). Colormaps
# normalise values linearly, so plotting raw labels would put 0 and 1 into the same
# colour bin. Translate every label to its rank among the sorted unique labels.
class_labels = np.unique(y)
Z_codes = np.searchsorted(class_labels, Z)
y_codes = np.searchsorted(class_labels, y)

# Plot the decision boundaries
cmap_light = ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])
cmap_bold = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])

# Colour limits centred on integer codes: code i falls exactly into colour bin i
code_min, code_max = -0.5, cmap_light.N - 0.5

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=conf.PLOTSIZE)
ax.pcolormesh(xx, yy, Z_codes, cmap=cmap_light, vmin=code_min, vmax=code_max)

# Overlay all samples (training and test rows), coloured by their class code
ax.scatter(
    X.iloc[:, 0],
    X.iloc[:, 1],
    c=y_codes,
    cmap=cmap_bold,
    vmin=code_min,
    vmax=code_max,
    edgecolor="k",
    s=10,
    alpha=0.5,
)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xlabel(COLUMN_NAMES_KNN[0])
ax.set_ylabel(COLUMN_NAMES_KNN[1])
ax.set_title(" & ".join(COLUMN_NAMES_KNN))

fig.suptitle(f"KNN Decision Boundaries with 2 features (k={best_k})")

plt.tight_layout()
plt.show()

# Save the chart as a lower-cased PNG named after the two plotted features
pngfile = ("knn-scatter-" + "-".join(COLUMN_NAMES_KNN) + ".png").lower()
filepath = pathlib.Path(conf.DIRCHART) / pngfile

# Options for the export: resolution from the project config, white background
_save_options = dict(
    dpi=conf.DPI,
    facecolor="w",
    edgecolor="w",
    orientation="portrait",
    format=None,
    transparent=False,
    bbox_inches=None,
    pad_inches=None,
)
fig.savefig(filepath, **_save_options)