In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
def get_row_id(row):
    """Build the ``"<frame>-<type>-<landmark_index>"`` identifier of one row.

    ``row`` is any object exposing ``frame``, ``type`` and ``landmark_index``
    attributes (for instance the row passed by ``DataFrame.apply(axis=1)``).
    ``frame`` and ``landmark_index`` go through ``str``; ``type`` must already
    be a string.
    """
    pieces = [str(row.frame), row.type, str(row.landmark_index)]
    return "-".join(pieces)

def reset_frame_num(pf):
    """Renumber ``frame`` to consecutive integers from 0 and rebuild ``row_id``.

    Every distinct frame value is replaced by its rank in order of first
    appearance (the order returned by ``pf.frame.unique()``). The mapping is
    computed per row, so it stays correct when frames hold different numbers
    of rows or when face rows are still present.

    Args:
        pf: landmark table with ``frame``, ``type`` and ``landmark_index``
            columns.

    Returns:
        The same DataFrame object, modified in place. ``row_id`` uses the
        ``"<frame>-<type>-<landmark_index>"`` format of ``get_row_id``.
    """
    if len(pf) == 0:
        # Nothing to renumber; avoids assigning empty results to columns.
        return pf

    # factorize labels each distinct value 0, 1, 2, ... by first appearance.
    frame_codes, _ = pd.factorize(pf["frame"])
    pf["frame"] = frame_codes

    # Vectorized equivalent of applying get_row_id to every row.
    pf["row_id"] = (
        pf["frame"].astype(str)
        + "-" + pf["type"].astype(str)
        + "-" + pf["landmark_index"].astype(str)
    )
    return pf

def convert_parquet_to_np_format(pf):
    """Flatten each frame of a landmark table into one 1-D coordinate vector.

    For every frame (in order of first appearance) the rows are ordered by
    ``landmark_index`` and the ``x, y, z`` values are laid out as
    ``[pose..., left_hand..., right_hand...]``, three numbers per landmark.
    A type that has no rows in a frame contributes nothing to that frame's
    vector instead of raising.

    Args:
        pf: DataFrame with ``frame``, ``type``, ``landmark_index``, ``x``,
            ``y`` and ``z`` columns.

    Returns:
        A list with one 1-D numpy array per frame.
    """
    types = ["pose", "left_hand", "right_hand"]
    arrayd_parquet = []

    # sort=False keeps the frames in the order pf.frame.unique() would give,
    # while splitting the table once instead of re-filtering it per frame.
    for _, frame_chunk in pf.groupby("frame", sort=False):
        frame_chunk = frame_chunk.sort_values("landmark_index")

        # Start from an empty float array so the result is promoted the same
        # way as when building the vector by repeated concatenation.
        parts = [np.array([])]
        for landmark_type in types:
            xyz_vals = frame_chunk.loc[
                frame_chunk["type"] == landmark_type, ["x", "y", "z"]
            ].to_numpy()
            # reshape(-1) reads row by row (x0, y0, z0, x1, ...) and, unlike
            # np.concatenate over the rows, also accepts an empty selection.
            parts.append(xyz_vals.reshape(-1))

        arrayd_parquet.append(np.concatenate(parts))

    return arrayd_parquet

In [3]:
# from os import listdir
# from os.path import isfile, join
# import sys


# folders = ["alligator/", "flower/", "kiss/", "listen/", "orange/"]
# types_pad = [[0.0 for i in range(33*3)],[0.0 for i in range(33*3)],[0.0 for i in range(33*3)]]

# for folder in folders:
#     path = "./asl-kaggle/by_labels/"+folder
#     save_path = "./asl-kaggle/no_preprocess_np_labels/"+folder
#     parquets = [f for f in listdir(path) if isfile(join(path, f))]

#     for parquet in parquets[:50]:
#         parquet_path = path+parquet
#         pf = pd.read_parquet(parquet_path)
        
#         pf = pf.drop(pf.loc[pf.type=="face"].index)
#         pf = pf.fillna(0)
#         pf = pf.sort_values('frame')

#         pf['xyz'] = pf.apply(bond_xyz, axis=1)
#         np_array = convert_parquet_to_np_format(pf)

#         np_array = np_array + [types_pad for i in range(267 - len(np_array))]
#         np_array = np.array(np_array)
        
#         np.save(save_path+parquet, np_array)

In [4]:
def duplicate_vals(pf, values: np.ndarray, iter):
    """Insert a copy of each listed frame right after the original frame.

    The copy of frame ``f`` gets the number
    ``f + (f - round(f)) * 0.001 + 0.01 * iter``, i.e. a value slightly above
    ``f``, so that sorting by ``frame`` places it after its source. ``row_id``
    of the copies is rebuilt in the ``"<frame>-<type>-<landmark_index>"``
    format of ``get_row_id``.

    Args:
        pf: landmark table with ``frame``, ``type`` and ``landmark_index``
            columns; it is not modified.
        values: frame numbers to duplicate.
        iter: pass counter used to offset the new frame numbers.

    Returns:
        A new DataFrame (original rows plus copies) sorted by ``frame``.
    """
    frames_to_dup = pf.loc[pf["frame"].isin(values)].copy()

    frame = frames_to_dup["frame"]
    frames_to_dup["frame"] = frame + ((frame - frame.round()) * 0.001 + 0.01 * iter)

    # Vectorized equivalent of applying get_row_id to every copied row.
    frames_to_dup["row_id"] = (
        frames_to_dup["frame"].astype(str)
        + "-" + frames_to_dup["type"].astype(str)
        + "-" + frames_to_dup["landmark_index"].astype(str)
    )

    # A stable sort is required: the default quicksort may shuffle the rows
    # that share a frame number, and populate_table reads them positionally.
    pf = pd.concat([pf, frames_to_dup], ignore_index=True).sort_values(
        'frame', kind='mergesort'
    )

    return pf

# def averaged_duplicate(pf, values: np.array, iter):
#     frames_to_dup = pf.loc[pf.frame.isin(values)].copy()
#     ahead_1 = pf.loc[pf.frame.isin(values+1)].copy()

#     if len(ahead_1) != len(frames_to_dup):
#         ahead_1 = pd.concat([
#             ahead_1,
#             frames_to_dup.loc[frames_to_dup.frame==max(frames_to_dup.frame)]
#             ], ignore_index=True)

#     frames_to_dup.x = (ahead_1.x.values+frames_to_dup.x)/2
#     frames_to_dup.y = (ahead_1.y.values+frames_to_dup.y)/2
#     frames_to_dup.z = (ahead_1.z.values+frames_to_dup.z)/2

#     frames_to_dup.frame += 0.01*iter

#     pf = pd.concat([pf, frames_to_dup], ignore_index=True).sort_values('frame')

#     return pf

def remove_vals(pf, values: np.ndarray):
    """Return ``pf`` without the rows whose ``frame`` is listed in ``values``.

    Args:
        pf: landmark table with a ``frame`` column; it is not modified.
        values: frame numbers to drop (may be empty).

    Returns:
        The remaining rows, with their original index labels and order.
    """
    # One vectorized membership test instead of one full-table filter per value.
    return pf.loc[~pf["frame"].isin(values)]


In [5]:
def transform_data(pf, frame_amt_goal, iter=1):
    """Resample a clip to exactly ``frame_amt_goal`` distinct frames.

    When the clip is too short, selected frames are duplicated
    (``duplicate_vals``); when it is too long, selected frames are removed
    (``remove_vals``). If the difference is odd, the middle frame is selected
    first; the remaining (even) number of frames is then picked at evenly
    spaced positions. When more frames must be added than currently exist,
    every frame is duplicated once and the function calls itself again.

    Args:
        pf: landmark table with a ``frame`` column.
        frame_amt_goal: desired number of distinct frames (>= 0).
        iter: pass counter forwarded to ``duplicate_vals``; callers normally
            leave it at its default.

    Returns:
        The resampled DataFrame (``pf`` itself when it already has the
        requested number of frames).

    Raises:
        ValueError: if ``frame_amt_goal`` is negative, or if ``pf`` has no
            frames while a positive number is requested. Without this check
            those inputs end in an IndexError or unbounded recursion.
    """
    frame_nums = pf.frame.unique()
    frame_cnt = len(frame_nums)

    if frame_amt_goal < 0:
        raise ValueError(f"frame_amt_goal must be >= 0, got {frame_amt_goal}")
    if frame_cnt == frame_amt_goal:
        return pf
    if frame_cnt == 0:
        raise ValueError("cannot create frames from a table that has no frames")

    frame_diff = abs(frame_amt_goal - frame_cnt)
    operation = frame_amt_goal > frame_cnt  # True: duplicate, False: remove

    values_to_operate = np.array([])

    if frame_diff % 2 == 1:
        # Odd difference: handle the middle frame separately so the rest of
        # the selection works on an even count.
        middle = len(frame_nums) // 2
        values_to_operate = np.append(values_to_operate, [frame_nums[middle]])
        frame_nums = np.delete(frame_nums, [middle])
        frame_diff -= 1

    if frame_diff != 0:
        step_val = len(frame_nums) / frame_diff
        step_val = 1 if step_val < 1 else step_val

        # At most one pick per existing frame in a single pass.
        loop_cnt = min(len(frame_nums), frame_diff)
        picked = [int(i * step_val) for i in range(loop_cnt)]
        values_to_operate = np.append(values_to_operate, frame_nums[picked])
    else:
        loop_cnt = 0

    if operation:
        pf = duplicate_vals(pf, values_to_operate, iter)
    else:
        pf = remove_vals(pf, values_to_operate)

    if frame_diff - loop_cnt != 0:
        # Still short of the goal: run another pass on the enlarged table.
        pf = transform_data(pf, frame_amt_goal, iter + 1)

    return pf

In [6]:
# Load one sample clip and display it to inspect the raw landmark table
# (the rendered output shows the columns frame, row_id, type, landmark_index, x, y, z).
pf = pd.read_parquet("./asl-kaggle/averaged_by_labels/alligator/20165761.parquet")
pf

Unnamed: 0,frame,row_id,type,landmark_index,x,y,z
0,22,22-face-0,face,0,0.452728,0.497625,-0.026592
1,22,22-face-1,face,1,0.444277,0.469700,-0.053501
2,22,22-face-2,face,2,0.446708,0.476862,-0.027491
3,22,22-face-3,face,3,0.438305,0.441203,-0.039765
4,22,22-face-4,face,4,0.444290,0.461555,-0.057071
...,...,...,...,...,...,...,...
5968,32,32-right_hand-16,right_hand,16,0.259397,0.725355,-0.272741
5969,32,32-right_hand-17,right_hand,17,0.159414,0.684204,-0.181114
5970,32,32-right_hand-18,right_hand,18,0.175069,0.677991,-0.243891
5971,32,32-right_hand-19,right_hand,19,0.188972,0.697104,-0.263154


In [10]:
from os import listdir
import os
from os.path import isfile, join
import sys


# folders = ["alligator/", "flower/", "kiss/", "listen/", "orange/"]
# NOTE(review): "time/" is listed twice, and this cell only iterates `optional`;
# `folders` is defined here but not used by the loop below.
folders = ["hello/", "all/", "thankyou/", "for/", "time/", "will/", "now/", "please/", "quiet/", "down/", "listen/", "close/", "have/", "time/", "no/", "nap/", "bye/"]
optional = ["if/", "noisy/", "mad/", "sad/"]

# Export every clip of each label as one .npy file per frame:
#   ./asl-kaggle/averaged_np_labels/<label>/video<i>/<j>.npy
for folder in optional:
    path = "./asl-kaggle/averaged_by_labels/"+folder
    save_path = "./asl-kaggle/averaged_np_labels/"+folder
    # listdir returns names in arbitrary order; sorting makes the video<i>
    # numbering identical on every run and machine.
    parquets = sorted(f for f in listdir(path) if isfile(join(path, f)))

    for i, parquet in enumerate(parquets, start=1):
        pf = pd.read_parquet(path+parquet)

        # Boolean mask instead of drop(index): also correct when index labels repeat.
        pf = pf.loc[pf.type != "face"]
        pf = pf.fillna(0)

        # pf = transform_data(pf, 267)

        # Kept as a list of per-frame vectors: wrapping it in np.array() adds
        # nothing for saving and fails when frames differ in length.
        np_array = convert_parquet_to_np_format(pf)

        save_path2 = save_path+f"video{i}/"
        os.makedirs(save_path2, exist_ok=True)

        for j, arr in enumerate(np_array, start=1):
            np.save(save_path2+f"{j}", arr)

------------------------------------------------------------------------------------------------------------------------------

In [None]:
def populate_table(pf, video_data):
    """Write the coordinates of every frame into ``video_data`` columns.

    The k-th distinct frame of ``pf`` (in order of first appearance) fills the
    columns ``"<k>x"``, ``"<k>y"`` and ``"<k>z"`` with that frame's values in
    row order. ``video_data`` is modified in place and also returned.
    """
    for position, frame_value in enumerate(pf.frame.unique()):
        in_frame = pf.frame == frame_value
        for axis in ('x', 'y', 'z'):
            video_data[f'{position}{axis}'] = list(pf[axis].loc[in_frame])

    return video_data

def create_data_table(pf):
    """Pivot a long landmark table into one row per (type, landmark).

    The result has the columns ``type`` and ``landmark_index`` followed by
    ``"<k>x"``, ``"<k>y"``, ``"<k>z"`` for every distinct frame ``k``. Rows
    are the (type, landmark_index) pairs in order of first appearance, grouped
    by type. Coordinate columns start at 0.0 and are filled by
    ``populate_table``.
    """
    frame_total = len(pf.frame.unique())
    col_labels = ['type', 'landmark_index'] + [
        f'{k}{axis}' for k in range(frame_total) for axis in 'xyz'
    ]

    # One entry per landmark of each type, in order of first appearance.
    pairs = [
        (kind, landmark)
        for kind in pf.type.unique()
        for landmark in pf.landmark_index.loc[pf.type == kind].unique()
    ]
    types = [kind for kind, _ in pairs]
    landmarks = [landmark for _, landmark in pairs]

    data = {'type': types, 'landmark_index': landmarks}
    for label in col_labels[2:]:
        data[label] = [0.0] * len(types)

    video_data = pd.DataFrame(columns=col_labels, data=data)
    return populate_table(pf, video_data)

In [None]:
def default_to_right(pf):
    """Keep a single hand per clip, always labelled ``right_hand``.

    The hand with fewer zero ``x`` values is kept. If that is the left hand,
    the right-hand rows are dropped and the left hand is mirrored
    (``x -> 1 - x``) and relabelled ``right_hand``. Otherwise (including
    ties) the left-hand rows are dropped and the right hand is left
    untouched.

    This corrects two defects of the earlier version: the ``else`` branch was
    commented out, so the left-hand drop never had any effect, and the
    comparison was inverted, which kept the hand with MORE zero values.

    Args:
        pf: landmark table with ``type`` and ``x`` columns; it is not
            modified.

    Returns:
        A new DataFrame without any ``left_hand`` rows.
    """
    is_left = pf["type"] == 'left_hand'
    is_right = pf["type"] == 'right_hand'

    # Number of zero x values per hand.
    # NOTE(review): presumably zeros mark missing landmarks (NaN filled with 0
    # by the callers) - confirm.
    l_non = int((pf.loc[is_left, 'x'] == 0).sum())
    r_non = int((pf.loc[is_right, 'x'] == 0).sum())

    if l_non < r_non:
        pf = pf.loc[~is_right].copy()
        left_rows = pf["type"] == 'left_hand'
        # NOTE(review): zero x values are mirrored to 1.0 as well - confirm
        # this is acceptable downstream.
        pf.loc[left_rows, 'x'] = 1 - pf.loc[left_rows, 'x']
        pf.loc[left_rows, 'type'] = 'right_hand'
    else:
        pf = pf.loc[~is_left]

    return pf

In [None]:
# Load one sample clip and display it for inspection.
pf = pd.read_parquet("./asl-kaggle/averaged_by_labels/alligator/20165761.parquet")
pf

In [None]:
from os import listdir
from os.path import isfile, join
from scipy.stats import skew, kurtosis, entropy
import sys


folders = ["alligator/", "flower/", "kiss/", "listen/", "orange/"]
aggregated_files = []

# Build one feature row per clip: [label, per-column mean, median, min, max, skew, kurtosis].
for folder in folders:
    path = "./asl-kaggle/averaged_by_labels/"+folder
    # path = "./asl-kaggle/by_labels/"+folder
    # listdir returns names in arbitrary order; sorting makes the "first 50"
    # subset the same on every run and machine.
    parquets = sorted(f for f in listdir(path) if isfile(join(path, f)))

    for parquet in parquets[:50]:
        pf = pd.read_parquet(path+parquet)

        # Boolean mask instead of drop(index): also correct when index labels repeat.
        pf = pf.loc[pf.type != "face"]
        pf = pf.fillna(0)

        # pf = default_to_right(pf)
        pf = transform_data(pf, 30)
        video_data = create_data_table(pf)

        # Only the per-frame coordinate columns are summarized.
        features = video_data.drop(['type', 'landmark_index'], axis=1)

        # Layout: all means, all medians, all mins, all maxes (one value per column).
        basic_stats = features.agg(['mean', 'median', 'min', 'max']).values.flatten()

        # Layout: all skews, then all kurtoses (one value per column).
        feature_values = features.to_numpy(dtype=float)
        skew_kurtosis_stats = np.concatenate([
            skew(feature_values, axis=0),
            kurtosis(feature_values, axis=0),
        ])

        aggregated_values = np.concatenate([basic_stats, skew_kurtosis_stats])

        # Keep the label and the numbers in a plain list: appending floats to
        # a numpy string array would cast every feature to a string.
        aggregated_files.append([folder, *aggregated_values])

In [None]:
# One row per clip; column 0 carries the class label and is renamed accordingly.
aggregated_pf = pd.DataFrame(aggregated_files).rename(columns={0: 'label'})

In [None]:
# nas = 0

# for i in aggregated_pf.columns:
#     nas += sum(aggregated_pf[i].isna())

# nas

In [None]:
import tensorflow as tf
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn.metrics as sk_metrics
import tempfile
import os

# Preset the default matplotlib figure size.
matplotlib.rcParams['figure.figsize'] = [9, 6]

# Print the TensorFlow version used for this run.
print(tf.__version__)
# To make the results reproducible, set the random seed value.
# NOTE(review): this seeds TensorFlow only; the train/test splits further down
# draw their random_state from Python's `random` module, which is not seeded here.
tf.random.set_seed(22)

In [None]:
class Normalize(tf.Module):
  """Feature-wise standardization fitted on a reference tensor.

  The mean and standard deviation are computed along axis 0 of ``x`` at
  construction time and reused by ``norm`` and ``unnorm``.
  """

  def __init__(self, x, eps=0.001):
    """Compute the statistics used for normalization.

    Args:
      x: data whose axis-0 mean and standard deviation are used.
      eps: small constant added to the standard deviation so that features
        with zero spread do not cause a division by zero.
    """
    super().__init__()
    # Initialize the mean and standard deviation for normalization
    self.mean = tf.Variable(tf.math.reduce_mean(x, axis=0))
    # Adding eps yields a plain tensor rather than a tf.Variable.
    self.std = tf.Variable(tf.math.reduce_std(x, axis=0)) + eps

  def norm(self, x):
    """Return ``(x - mean) / std``."""
    return (x - self.mean)/self.std

  def unnorm(self, x):
    """Invert ``norm``: return ``x * std + mean``."""
    return (x * self.std) + self.mean

In [None]:
import numpy as np
from sklearn.decomposition import PCA

def reduce_dims(n_components, x_train, x_test, random_state=None):
    """Project train and test features onto PCA components fitted on train.

    Args:
        n_components: forwarded to ``PCA(n_components=...)``.
        x_train: data the PCA is fitted on.
        x_test: data transformed with the PCA fitted on ``x_train``.
        random_state: optional seed forwarded to ``PCA``; it only matters
            when scikit-learn picks a randomized or arpack solver. The
            default ``None`` leaves the result unseeded.

    Returns:
        ``(transformed_train, transformed_test)``.
    """
    pca = PCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
    return pca.transform(x_train), pca.transform(x_test)

In [None]:
import random
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

scores = []

# Seeded generator for the split seeds, so repeated runs of this cell draw the
# same 20 train/test splits (the global `random` module is never seeded).
split_rng = random.Random(22)

# Repeated random 75/25 hold-out evaluation.
for _ in range(20):
    r_state = split_rng.randint(1, 1000)
    train_dataset = aggregated_pf.sample(frac=0.75, random_state=r_state)
    test_dataset = aggregated_pf.drop(train_dataset.index)

    # Column 0 is the label, everything after it is a feature.
    x_train, y_train = train_dataset.iloc[:, 1:], train_dataset.iloc[:, 0]
    x_test, y_test = test_dataset.iloc[:, 1:], test_dataset.iloc[:, 0]

    x_train = tf.convert_to_tensor(x_train, dtype=tf.float32)
    x_test = tf.convert_to_tensor(x_test, dtype=tf.float32)

    # Statistics come from the training split only and are reused for the test split.
    norm_x = Normalize(x_train)
    x_train_norm, x_test_norm = norm_x.norm(x_train), norm_x.norm(x_test)

    # pca_x_train, pca_x_test = reduce_dims(40, x_train_norm, x_test_norm)
    pca_x_train, pca_x_test = x_train_norm, x_test_norm

    classif = LogisticRegression(random_state=0, max_iter=10000).fit(pca_x_train, y_train)
    # classif = RandomForestClassifier().fit(pca_x_train, y_train)
    # classif = SVC(gamma='auto').fit(pca_x_train, y_train)

    y_preds = classif.predict(pca_x_test)
    scores.append(accuracy_score(y_test, y_preds))

# Mean accuracy over all splits.
print(np.mean(scores))

In [None]:
# With augmented points dataset :-
#   LogisticRegression - Best score at pca_comps = 30 - score = 0.439
#   RandomForest - Best score at pca_comps = 10 - score = 0.458
#   SVC - Best score at pca_comps = 70 - score = 0.405

In [None]:
# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Display names of the models under comparison. This list is parallel to
# `classifiers` below (same length, same order); the two are zipped together
# in the evaluation loop.
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
    "Logistic Regression",
]

# One estimator instance per entry of `names`.
# NOTE(review): LogisticRegression is not imported in this cell; it relies on
# the import made in an earlier cell.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=42),
    SVC(gamma=2, C=1, random_state=42),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(alpha=1, max_iter=10000, random_state=42),
    AdaBoostClassifier(algorithm="SAMME", random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(max_iter=10000, random_state=42),
]

# NOTE(review): hard-coded, machine-specific CPU count - confirm it fits the
# machine this notebook runs on.
os.environ["LOKY_MAX_CPU_COUNT"] = "12"

# Accuracy history per model: {display name: [accuracy of split 1, split 2, ...]}.
scores_per_classif = {name: [] for name in names}

# Seeded generator for the split seeds, so repeated runs of this cell draw the
# same 20 train/test splits (the global `random` module is never seeded).
split_rng = random.Random(22)

# Repeated random 75/25 hold-out evaluation of every classifier.
for _ in range(20):
    r_state = split_rng.randint(1, 100)
    train_dataset = aggregated_pf.sample(frac=0.75, random_state=r_state)
    test_dataset = aggregated_pf.drop(train_dataset.index)

    # Column 0 is the label, everything after it is a feature.
    x_train, y_train = train_dataset.iloc[:, 1:], train_dataset.iloc[:, 0]
    x_test, y_test = test_dataset.iloc[:, 1:], test_dataset.iloc[:, 0]

    x_train = tf.convert_to_tensor(x_train, dtype=tf.float32)
    x_test = tf.convert_to_tensor(x_test, dtype=tf.float32)

    # Statistics come from the training split only and are reused for the test split.
    norm_x = Normalize(x_train)
    x_train_norm, x_test_norm = norm_x.norm(x_train), norm_x.norm(x_test)

    pca_x_train, pca_x_test = reduce_dims(80, x_train_norm, x_test_norm)
    # pca_x_train, pca_x_test = x_train_norm, x_test_norm

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        clf.fit(pca_x_train, y_train)
        y_pred = clf.predict(pca_x_test)

        # (y_true, y_pred) order, matching the scikit-learn signature.
        scores_per_classif[name].append(accuracy_score(y_test, y_pred))

In [None]:
# Mean accuracy of each classifier over all recorded splits (one column per classifier).
pd.DataFrame(scores_per_classif).mean()

In [None]:
# Average of the per-classifier mean accuracies.
np.mean(pd.DataFrame(scores_per_classif).mean().values)