In [None]:
# filename: knn.ipynb
# purpose: KNN model implementation

# OHT KNN model implementation

### KNN implementation
- The KNN classification method is used to detect abnormal (outlier) data

### Processing flow
- Read the mixed dataset from the DuckDB table
- Reduce the dataset to the required volume
- Convert the int labels in the flag column to str labels for easier understanding of the classification report
- Split training and test dataset and labels
- Search best-K value in the configured neighbor range 
- KNN train with the searched best K value 
- Predict with test dataset
- Report the prediction result with best-K, accuracy score, confusion matrix (precision, recall, f-score, support)
- Chart the confusion matrix as a heatmap
- Chart the decision boundary scatter with 2 features
  

In [None]:
# packages
import time
import os
import tempfile
import numpy as np
import pandas as pd
import humanfriendly as human
import joblib

import ohtconf as conf
import ohtcomm as comm
import ohtkml as kml


## Main

In [None]:
# notebook-wide start timestamp; the last cell subtracts it to report the total run time
mainstart = time.time()

In [None]:
# render every float shown by pandas with exactly one decimal place
pd.options.display.float_format = "{:.1f}".format

In [None]:
# Load the mixed dataset and cut it down to the training volume
_start = time.time()

# whole table, ordered by the first configured column, re-indexed from 0
dfmix = (
    comm.read_tabdf(conf.TABNAME_MIX)
    .sort_values(by=conf.COLUMN_NAMES[0])
    .reset_index(drop=True)
)

# Keep only the first TRAIN_SIZE rows (the key factor for execution time) and
# cast the graph columns to float32; the cast prevents
# "RuntimeWarning: invalid value encountered in cast" during model fit.
dfknn = dfmix.iloc[: conf.TRAIN_SIZE].astype(dict.fromkeys(conf.COLUMN_GRAPH, np.float32))

print(f"row count, dfmix={len(dfmix)}, dfknn={len(dfknn)}, expected dfknn={conf.TRAIN_SIZE}")

print(f"elapsed time: {human.format_timespan(time.time() - _start)}")

### KNN modeling with all 10 features

In [None]:
# knn training on all feature columns
_start = time.time()

# feature columns, integer flag values and the matching string labels
ALL_COLUMNS = conf.COLUMN_GRAPH
ALL_ILABELS = list(range(11))  # flag values 0 ~ 10: 0 is normal, the others are outlier flags
ALL_LABELS = ["NORM"] + ALL_COLUMNS  # "NORM" for normal, then one outlier label per feature column

# feature matrix
X = dfknn[ALL_COLUMNS]

# label: take the flag column as an independent Series and map the integer
# flags to string labels (NORM plus the feature column names) so that the
# report is easier to read
y = kml.make_strlabel(dfknn[conf.COLUMN_FLAG].copy(), ALL_ILABELS, ALL_LABELS)

# search the K candidates derived from y, then keep the fitted model and the test split
best_k, model, X_test, y_test = kml.train(
    X,
    y,
    ALL_COLUMNS,
    ALL_LABELS,
    neighbors=kml.calc_neighbors(y),
)

print(f"elapsed time: {human.format_timespan(time.time() - _start)}")

In [None]:
# knn predict: label the held-out test rows with the all-features model
_start = time.time()

y_pred = kml.predict(model, X_test)

print(f"elapsed time: {human.format_timespan(time.time() - _start)}")

In [None]:
# knn report
# kml.report receives the chosen K, the true and predicted test labels and the
# label names; its return value is kept in dfcm and handed to kml.cm_heatmap in
# the next cell (presumably a confusion-matrix DataFrame — TODO confirm in ohtkml).

dfcm = kml.report(best_k, y_test, y_pred, ALL_LABELS, title="all 10 features")

In [None]:
# knn confusion matrix heatmap chart (all features)
# Start this cell's own clock in `_start`, the name the elapsed-time line below
# reads; storing it under any other name would report the time since the
# previous timed cell instead of the time spent drawing the chart.
_start = time.time()

# draw dfcm as a heatmap; the chart is written to pngfile by kml.cm_heatmap
pngfile = "knn-cm-heatmap-all.png"
kml.cm_heatmap(dfcm, title="all 10 features", pngfile=pngfile)

_elapsed = time.time() - _start
print(f"elapsed time: {human.format_timespan(_elapsed)}")

### KNN trained model usage example
- the training program saves the trained model to a file
- another (predicting) program loads the saved model
- the loaded model is used for prediction

In [None]:
# Persist the trained model to a named temporary file.
# delete=False keeps the file on disk after the handle is closed, so the
# following cells can load it and then remove it explicitly.
temp_joblib_file = None
with tempfile.NamedTemporaryFile(prefix="knn-model-all_", suffix=".joblib", delete=False) as model_fh:
    joblib.dump(model, model_fh)
    temp_joblib_file = model_fh.name
print(f"Temporary joblib file created: {temp_joblib_file}")

In [None]:
# Reload the saved model and check that it reproduces the in-memory prediction
load_model = joblib.load(temp_joblib_file)
print(f"Temporary joblib file loaded: {temp_joblib_file}")

# first test row, and the label the in-memory model predicted for it
# (the reference is the earlier prediction, not the true label)
load_test = X_test.iloc[:1]
expect_y = y_pred[:1]
print(f"load model test data:\n{load_test.to_string()}")

predict_y = kml.predict(load_model, load_test)
if predict_y[0] != expect_y[0]:
    print(f"loaded model predicted value unmatch with expected value: {predict_y[0]}, {expect_y[0]}")
else:
    print(f"loaded model predicted value match with expected value: {predict_y[0]}, {expect_y[0]}")

In [None]:
# Delete the model file
# The file was created with delete=False, so it has to be removed explicitly here.
os.remove(temp_joblib_file)
print(f"Temporary joblib file deleted: {temp_joblib_file}")

### KNN modeling with 2 features for 2-D visualization 

In [None]:
# knn training on two features only, so the decision boundary can be drawn in 2-D
# NOTE: X, y, best_k, model, X_test and y_test are rebound here and no longer
# hold the all-features values.
_start = time.time()

# flag values kept as classes: 0 for normal, 1 for outlier-tem, 6 for outlier-nh3
TWO_ILABELS = [0, 1, 6]
# the two feature columns are looked up in conf.COLUMN_NAMES by the outlier flag values
TWO_COLUMNS = [conf.COLUMN_NAMES[flag] for flag in TWO_ILABELS[1:]]
TWO_LABELS = ["NORM", *TWO_COLUMNS]  # NORM for normal, the others are for outlier

# feature matrix
X = dfknn[TWO_COLUMNS]

# label: independent copy of the flag column
y = dfknn[conf.COLUMN_FLAG].copy()
# every flag that is not one of the two candidate outlier flags is reset to 0
y.loc[~y.isin(TWO_ILABELS[1:])] = 0
# map y, label values TWO_ILABELS to TWO_LABELS
y = kml.make_strlabel(y, TWO_ILABELS, TWO_LABELS)

best_k, model, X_test, y_test = kml.train(
    X,
    y,
    TWO_COLUMNS,
    TWO_LABELS,
    neighbors=kml.calc_neighbors(y),
)

print(f"elapsed time: {human.format_timespan(time.time() - _start)}")

In [None]:
# knn predict: label the held-out test rows with the two-feature model
_start = time.time()

y_pred = kml.predict(model, X_test)

print(f"elapsed time: {human.format_timespan(time.time() - _start)}")

In [None]:
# knn report
# Same report call as for the all-features model, here with the two-feature
# labels; the title is the two column names joined by " & ". The return value
# replaces dfcm and is handed to kml.cm_heatmap in the next cell.

dfcm = kml.report(best_k, y_test, y_pred, TWO_LABELS, title=" & ".join(TWO_COLUMNS))

In [None]:
# knn confusion matrix heatmap chart (two features)
# Start this cell's own clock in `_start`, the name the elapsed-time line below
# reads; storing it under any other name would report the time since the
# previous timed cell instead of the time spent drawing the chart.
_start = time.time()

# the file name and the title both carry the two feature column names
pngfile = f"knn-cm-heatmap-{'-'.join(TWO_COLUMNS)}.png"
kml.cm_heatmap(dfcm, title=f"2 features - {' & '.join(TWO_COLUMNS)}", pngfile=pngfile)

_elapsed = time.time() - _start
print(f"elapsed time: {human.format_timespan(_elapsed)}")

In [None]:
# knn decision boundary scatter, 2-D visualization
# Start this cell's own clock in `_start`, the name the elapsed-time line below
# reads; storing it under any other name would report the time since the
# previous timed cell instead of the time spent drawing the chart.
_start = time.time()

# X and y are the complete two-feature data and labels (not only the test split)
pngfile = f"knn-db-scatter-{'-'.join(TWO_COLUMNS)}.png"
kml.f2_boundary_scatter(X, y, model, best_k, title=" & ".join(TWO_COLUMNS), pngfile=pngfile)

_elapsed = time.time() - _start
print(f"elapsed time: {human.format_timespan(_elapsed)}")

In [None]:
# total wall-clock time since mainstart was taken at the top of the notebook
print(f"main elapsed time: {human.format_timespan(time.time() - mainstart)}")

### eof