In [149]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import cv2 as cv
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import random

import sys
sys.path.append('..')
import utils


In [157]:
X, y = utils.load_training_data("../training_data/")

for i in range(len(X)):
    X[i].drop(2, axis=1, inplace=True)
    X[i] = (X[i] - X[i].mean()) / X[i].std()


Unnamed: 0,0,1
0,-8.9283,304.45
1,-9.1475,304.36
2,-9.8688,304.32
3,-11.69,303.81
4,-14.821,302.69
5,-19.585,300.62
6,-25.511,295.96
7,-31.59,287.91
8,-36.297,275.38
9,-37.797,264.15


In [151]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)


In [152]:
LINE_BINS = 12
ANGLE_BINS = 12
BINS = LINE_BINS + ANGLE_BINS

def bucketize(x, num_buckets, range):
    res = int((x - range[0]) / (range[1] - range[0]) * num_buckets)
    if res == num_buckets:
        res -= 1
    return res
    
def featurize_point_vector(df: pd.DataFrame):
    features = []
    buckets = np.zeros((ANGLE_BINS))
    for i in range(df.shape[0]-1):
        angle = np.arctan2(df.iloc[i, 1], df.iloc[i, 0])
        bucket = bucketize(angle, ANGLE_BINS, (-np.pi, np.pi))
        magnitude = np.linalg.norm(df.iloc[i])
        buckets[bucket] += magnitude

    # res = np.histogram(features, bins=ANGLE_BINS, range=(-np.pi, np.pi))[0]
    res = buckets / buckets.sum()
    return res

def featurize_lines(df: pd.DataFrame):
    features = []
    buckets = np.zeros((LINE_BINS))

    for i in range(df.shape[0]-1):
        dir = df.iloc[i+1] - df.iloc[i]
        angle = np.arctan2(dir.iloc[1], dir.iloc[0])

        bucket = bucketize(angle, LINE_BINS, (-np.pi, np.pi))
        magnitude = np.linalg.norm(dir)
        buckets[bucket] += magnitude

    # res = np.histogram(features, bins=LINE_BINS, range=(-np.pi, np.pi))[0]
    # res = res / res.sum()
    res = buckets / buckets.sum()

    return res

def featurize(df: pd.DataFrame):
    angle_features = featurize_point_vector(df)
    line_features = featurize_lines(df)
    # line_features = []
    res = np.concatenate((line_features, angle_features))
    return res


In [153]:
num_clusters = [
    2,  # 0
    4,  # 1
    3,  # 2
    2,  # 3
    2,  # 4
    2,  # 5
    2,  # 6
    3,  # 7
    2,  # 8
    2,  # 9
]

cluster_mapping = [i for i in range(10) for _ in range(num_clusters[i])]
print(cluster_mapping)


[0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9]


In [154]:
fvs = []

for num in range(10):
    features = []
    for i, df in enumerate(train_X):
        if train_y.iloc[i][0] != num:
            continue
        features.append(featurize(df))

    km = KMeans(n_clusters=num_clusters[num], n_init=20)
    km.fit(features)
    fvs += km.cluster_centers_.tolist()
    for i in range(num_clusters[num]):
        print(f"Num: {num},Cluster {i}: {km.labels_.tolist().count(i)}")


Num: 0,Cluster 0: 51
Num: 0,Cluster 1: 24
Num: 1,Cluster 0: 31
Num: 1,Cluster 1: 21
Num: 1,Cluster 2: 8
Num: 1,Cluster 3: 15
Num: 2,Cluster 0: 29
Num: 2,Cluster 1: 23
Num: 2,Cluster 2: 23
Num: 3,Cluster 0: 31
Num: 3,Cluster 1: 44
Num: 4,Cluster 0: 52
Num: 4,Cluster 1: 23
Num: 5,Cluster 0: 64
Num: 5,Cluster 1: 11
Num: 6,Cluster 0: 43
Num: 6,Cluster 1: 32
Num: 7,Cluster 0: 21
Num: 7,Cluster 1: 24
Num: 7,Cluster 2: 30
Num: 8,Cluster 0: 31
Num: 8,Cluster 1: 44
Num: 9,Cluster 0: 52
Num: 9,Cluster 1: 23


In [155]:
from scipy.special import kl_div
from sklearn.metrics import classification_report

correct = 0
incorrect = 0

y_true = []
y_pred = []

for i, df in enumerate(test_X):
    test_features = featurize(df)

    scores = []
    for ti, template in enumerate(fvs):
        # score = chisquare(tmp, template)
        score = np.sum(kl_div(test_features, template))
        # scores.append(score.pvalue)
        scores.append(score)

    selected = cluster_mapping[np.argmin(scores)]
    # selected = np.argmax(scores)//2
    # print(f"i: {i}, Num: {num}, Estimate: {selected}")
    y_true.append(test_y.iloc[i][0])
    y_pred.append(selected)

print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

           0       0.76      0.88      0.81        25
           1       0.50      0.04      0.07        25
           2       0.89      0.96      0.92        25
           3       0.85      0.92      0.88        25
           4       0.74      1.00      0.85        25
           5       1.00      0.76      0.86        25
           6       0.74      0.92      0.82        25
           7       0.57      0.80      0.67        25
           8       0.90      0.76      0.83        25
           9       0.92      0.92      0.92        25

    accuracy                           0.80       250
   macro avg       0.79      0.80      0.76       250
weighted avg       0.79      0.80      0.76       250



In [156]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true, y_pred)


array([[22,  0,  0,  0,  0,  0,  1,  0,  2,  0],
       [ 0,  1,  0,  0,  7,  0,  0, 15,  0,  2],
       [ 0,  0, 24,  1,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0, 23,  1,  0,  1,  0,  0,  0],
       [ 0,  0,  0,  0, 25,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  2,  0, 19,  4,  0,  0,  0],
       [ 2,  0,  0,  0,  0,  0, 23,  0,  0,  0],
       [ 0,  1,  3,  0,  1,  0,  0, 20,  0,  0],
       [ 4,  0,  0,  1,  0,  0,  1,  0, 19,  0],
       [ 1,  0,  0,  0,  0,  0,  1,  0,  0, 23]])