In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import cv2 as cv
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import random

import sys
sys.path.append('..')
import utils


In [2]:
X, y = utils.load_training_data("../training_data/")

for i in range(len(X)):
    X[i].drop(2, axis=1, inplace=True)
    X[i] = (X[i] - X[i].mean()) / X[i].std()


In [3]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)


In [4]:
LINE_BINS = 12
ANGLE_BINS = 12
BINS = LINE_BINS + ANGLE_BINS

def bucketize(x, num_buckets, range):
    res = int((x - range[0]) / (range[1] - range[0]) * num_buckets)
    if res == num_buckets:
        res -= 1
    return res
    
def featurize_point_vector(df: pd.DataFrame):
    features = []
    buckets = np.zeros((ANGLE_BINS))
    for i in range(df.shape[0]-1):
        angle = np.arctan2(df.iloc[i, 1], df.iloc[i, 0])
        bucket = bucketize(angle, ANGLE_BINS, (-np.pi, np.pi))
        magnitude = np.linalg.norm(df.iloc[i])
        buckets[bucket] += magnitude

    # res = np.histogram(features, bins=ANGLE_BINS, range=(-np.pi, np.pi))[0]
    res = buckets / buckets.sum()
    return res

def featurize_lines(df: pd.DataFrame):
    features = []
    buckets = np.zeros((LINE_BINS))

    for i in range(df.shape[0]-1):
        dir = df.iloc[i+1] - df.iloc[i]
        angle = np.arctan2(dir.iloc[1], dir.iloc[0])

        bucket = bucketize(angle, LINE_BINS, (-np.pi, np.pi))
        magnitude = np.linalg.norm(dir)
        buckets[bucket] += magnitude

    # res = np.histogram(features, bins=LINE_BINS, range=(-np.pi, np.pi))[0]
    # res = res / res.sum()
    res = buckets / buckets.sum()

    return res

def featurize(df: pd.DataFrame):
    angle_features = featurize_point_vector(df)
    line_features = featurize_lines(df)
    # line_features = []
    res = np.concatenate((line_features, angle_features))
    return res


In [5]:
num_clusters = [
    2,  # 0
    4,  # 1
    3,  # 2
    2,  # 3
    2,  # 4
    2,  # 5
    2,  # 6
    3,  # 7
    2,  # 8
    2,  # 9
]

cluster_mapping = [i for i in range(10) for _ in range(num_clusters[i])]
print(cluster_mapping)


[0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9]


In [6]:
fvs = []

for i, df in enumerate(train_X):
    fvs.append(featurize(df))


In [7]:
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from scipy.special import kl_div

def f(x, y):
    return np.sum(kl_div(x, y))

knn = KNeighborsClassifier(n_neighbors=1, metric=f)
knn.fit(fvs, train_y)

correct = 0
incorrect = 0

y_true = []
y_pred = []

for i, df in enumerate(test_X):
    test_features = featurize(df)
    selected = knn.predict([test_features])[0]

    y_true.append(test_y.iloc[i])
    y_pred.append(selected)

print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

           0       0.82      0.92      0.87        25
           1       1.00      0.52      0.68        25
           2       0.96      0.92      0.94        25
           3       0.78      0.84      0.81        25
           4       1.00      0.88      0.94        25
           5       0.96      0.92      0.94        25
           6       0.86      0.96      0.91        25
           7       0.96      0.92      0.94        25
           8       0.86      0.96      0.91        25
           9       0.69      0.88      0.77        25

    accuracy                           0.87       250
   macro avg       0.89      0.87      0.87       250
weighted avg       0.89      0.87      0.87       250



In [8]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true, y_pred)


array([[23,  0,  0,  0,  0,  0,  1,  0,  0,  1],
       [ 0, 13,  1,  3,  0,  1,  0,  0,  0,  7],
       [ 0,  0, 23,  1,  0,  0,  1,  0,  0,  0],
       [ 2,  0,  0, 21,  0,  0,  0,  0,  2,  0],
       [ 1,  0,  0,  0, 22,  0,  0,  1,  0,  1],
       [ 0,  0,  0,  0,  0, 23,  1,  0,  1,  0],
       [ 1,  0,  0,  0,  0,  0, 24,  0,  0,  0],
       [ 0,  0,  0,  1,  0,  0,  0, 23,  0,  1],
       [ 1,  0,  0,  0,  0,  0,  0,  0, 24,  0],
       [ 0,  0,  0,  1,  0,  0,  1,  0,  1, 22]])