In [1]:
from os import listdir
from os.path import isfile, join
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from collections import Counter
from sklearn.metrics import accuracy_score
from scipy.stats import skew, kurtosis
import tensorflow as tf
import pandas as pd
import random
import pandas as pd
import numpy as np




In [2]:
def get_row_id(row):
    return str(row.frame) + "-" + row.type + "-" + str(row.landmark_index)

def duplicate_vals(pf, values: np.array, iter):
    frames_to_dup = pf.loc[pf.frame.isin(values)].copy()
    frames_to_dup.frame += (frames_to_dup.frame-round(frames_to_dup.frame))*0.001 + 0.01*iter
    
    frames_to_dup.row_id = frames_to_dup.apply(get_row_id, axis=1)

    pf = pd.concat([pf, frames_to_dup], ignore_index=True).sort_values('frame')

    return pf

def remove_vals(pf, values: np.array):
    values = np.sort(values)

    for val in values:
        pf = pf.loc[pf.frame != val]

    return pf


def transform_data(pf, frame_amt_goal, iter=1):

    frame_nums = pf.frame.unique()
    frame_diff = abs(frame_amt_goal - len(frame_nums))
    operation = frame_amt_goal > len(frame_nums)

    values_to_operate = np.array([])

    if frame_diff%2 == 1:
        central_point = frame_nums[int(len(frame_nums)/2)]
        values_to_operate = np.append(values_to_operate, [central_point])
        frame_nums = np.delete(frame_nums, [int(len(frame_nums)/2)])
        frame_diff -= 1

    if frame_diff != 0:
        step_val = len(frame_nums)/frame_diff
        step_val = 1 if step_val < 1 else step_val
        
        loop_cnt = len(frame_nums) if frame_diff > len(frame_nums) else frame_diff
        values_to_operate = np.append(values_to_operate, frame_nums[[int(i*step_val) for i in range(0, loop_cnt)]])
    else:
        loop_cnt = 0

    if operation:
        pf = duplicate_vals(pf, values_to_operate, iter)
    else:
        pf = remove_vals(pf, values_to_operate)

    if frame_diff - loop_cnt != 0:
        pf = transform_data(pf, frame_amt_goal, iter+1)

    return pf

def populate_table(pf, video_data):
    frame_num = 0

    for frame in pf.frame.unique():
        x_vals = list(pf['x'].loc[pf.frame==frame])
        y_vals = list(pf['y'].loc[pf.frame==frame])
        z_vals = list(pf['z'].loc[pf.frame==frame])

        video_data[f'{frame_num}x'] = x_vals
        video_data[f'{frame_num}y'] = y_vals
        video_data[f'{frame_num}z'] = z_vals
        
        frame_num += 1

    return video_data

def create_data_table(pf):
    col_labels = ['type','landmark_index']

    for i in range(len(pf.frame.unique())):
        col_labels.append(f'{i}x')
        col_labels.append(f'{i}y')
        col_labels.append(f'{i}z')

    landmarks = []
    types = []

    for i in pf.type.unique():
        for j in pf.landmark_index.loc[pf.type==i].unique():
            landmarks.append(j)
            types.append(i)

    data = {col: [0.0] * len(types) for col in col_labels}
    data['type'] = types
    data['landmark_index'] = landmarks

    video_data = pd.DataFrame(columns=col_labels, data=data)
    video_data = populate_table(pf, video_data)

    return video_data

def apply_PCA(n_components, x_train, x_test):
    pca = PCA(n_components=n_components)
    pca.fit(x_train)
    return pca.transform(x_train), pca.transform(x_test)

def drop_empty_rows(pf):
    pf = pf.drop(pf.loc[(pf.x == 0) & (pf.y == 0) & (pf.z == 0)].index, axis=0)
    return pf

In [3]:
class Normalize(tf.Module):
  def __init__(self, x):
    # Initialize the mean and standard deviation for normalization
    self.mean = tf.Variable(tf.math.reduce_mean(x, axis=0))
    self.std = tf.Variable(tf.math.reduce_std(x, axis=0)) + 0.001

  def norm(self, x):
    # Normalize the input
    return (x - self.mean)/self.std

  def unnorm(self, x):
    # Unnormalize the input
    return (x * self.std) + self.mean

In [4]:
folders = ["alligator/", "flower/", "kiss/", "listen/", "orange/"]
aggregated_files = []

for folder in folders:
    path = "./asl-kaggle/averaged_by_labels/"+folder
    # path = "./asl-kaggle/by_labels/"+folder
    parquets = [f for f in listdir(path) if isfile(join(path, f))]

    for parquet in parquets[:50]:
        parquet_path = path+parquet
        pf = pd.read_parquet(parquet_path)
        
        pf = pf.drop(pf.loc[pf.type=="face"].index)
        pf = pf.fillna(0)

        # pf = default_to_right(pf)
        pf = transform_data(pf, 30)
        video_data = create_data_table(pf)

        # Folder is used as the label here
        aggregated_row = np.array([folder])

        # Drop unnecessary columns
        dropped_columns = video_data.drop(['type', 'landmark_index'], axis=1)

        # Calculate mean, median, min, max for each column
        basic_stats = dropped_columns.agg(['mean', 'median', 'min', 'max']).values.flatten()

        # Calculate skew and kurtosis for each column
        skew_kurtosis_stats = dropped_columns.apply(lambda x: pd.Series([skew(x), kurtosis(x)])).values.flatten()
        
        # Concatenate all the calculated values
        aggregated_values = np.concatenate([basic_stats, skew_kurtosis_stats])

        # Append the calculated values to aggregated_row
        aggregated_row = np.append(aggregated_row, aggregated_values)

        aggregated_files.append(aggregated_row)

aggregated_pf = pd.DataFrame(aggregated_files)
aggregated_pf = aggregated_pf.rename(columns={0: 'label'})

In [None]:
scores = []

for i in range(20):
    r_state = random.randint(1,1000)
    train_dataset = aggregated_pf.sample(frac=0.75, random_state=r_state)
    test_dataset = aggregated_pf.drop(train_dataset.index)

    x_train, y_train = train_dataset.iloc[:, 1:], train_dataset.iloc[:, 0]
    x_test, y_test = test_dataset.iloc[:, 1:], test_dataset.iloc[:, 0]

    x_train = tf.convert_to_tensor(x_train, dtype=tf.float32)
    x_test = tf.convert_to_tensor(x_test, dtype=tf.float32)

    norm_x = Normalize(x_train)
    x_train_norm, x_test_norm = norm_x.norm(x_train), norm_x.norm(x_test)

    # pca_x_train, pca_x_test = apply_PCA(40, x_train_norm, x_test_norm)
    pca_x_train, pca_x_test = x_train_norm, x_test_norm

    classif = LogisticRegression(random_state=0, max_iter=10000).fit(pca_x_train, y_train)
    # classif = RandomForestClassifier().fit(pca_x_train, y_train)
    # classif = SVC(gamma='auto').fit(pca_x_train, y_train)
    
    y_preds = classif.predict(pca_x_test)
    scores.append(accuracy_score(y_test, y_preds))

print(np.mean(scores))

In [5]:
# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import os

names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
    "Logistic Regression",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=42),
    SVC(gamma=2, C=1, random_state=42),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(alpha=1, max_iter=10000, random_state=42),
    AdaBoostClassifier(algorithm="SAMME", random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(max_iter=10000, random_state=42),
]

os.environ["LOKY_MAX_CPU_COUNT"] = "12"

scores_per_classif = dict(zip(names, [[] for i in range(len(names))]))

for i in range(20):
    r_state = random.randint(1,100)
    train_dataset = aggregated_pf.sample(frac=0.75, random_state=r_state)
    test_dataset = aggregated_pf.drop(train_dataset.index)

    x_train, y_train = train_dataset.iloc[:, 1:], train_dataset.iloc[:, 0]
    x_test, y_test = test_dataset.iloc[:, 1:], test_dataset.iloc[:, 0]

    x_train = tf.convert_to_tensor(x_train, dtype=tf.float32)
    x_test = tf.convert_to_tensor(x_test, dtype=tf.float32)

    norm_x = Normalize(x_train)
    x_train_norm, x_test_norm = norm_x.norm(x_train), norm_x.norm(x_test)

    pca_x_train, pca_x_test = apply_PCA(80, x_train_norm, x_test_norm)
    # pca_x_train, pca_x_test = x_train_norm, x_test_norm

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        classif = clf
        classif.fit(pca_x_train, y_train)
        y_pred = classif.predict(pca_x_test)

        scores_per_classif[name].append(accuracy_score(y_pred, y_test))

found 0 physical cores < 1
  File "c:\Users\zorko\anaconda3\envs\IS460MLApps\lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


In [7]:
pd.DataFrame(scores_per_classif).mean()

Nearest Neighbors      0.322581
Linear SVM             0.458065
RBF SVM                0.149194
Gaussian Process       0.184677
Decision Tree          0.268548
Random Forest          0.356452
Neural Net             0.467742
AdaBoost               0.307258
Naive Bayes            0.380645
QDA                    0.218548
Logistic Regression    0.431452
dtype: float64