In [1]:
import sys
sys.path.append("d:\\SMU\\ml&applns")

import random
from collections import Counter
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from Preprocessing.DataExtration import *
from Preprocessing.DFTransformations import *




In [2]:
def apply_PCA(n_components, x_train, x_test):
    """Project both splits onto principal components learned from the training split.

    Args:
        n_components: Forwarded unchanged to ``PCA(n_components=...)``.
        x_train: Data the PCA is fitted on.
        x_test: Data that is only transformed with the already-fitted PCA.

    Returns:
        A tuple ``(transformed_train, transformed_test)``.
    """
    # Fit on the training data only, so nothing from the test split
    # influences the learned components.
    reducer = PCA(n_components=n_components)
    reducer.fit(x_train)

    reduced_train = reducer.transform(x_train)
    reduced_test = reducer.transform(x_test)
    return reduced_train, reduced_test

In [3]:
class Normalize(tf.Module):
  """Column-wise standardization using statistics taken from a reference batch.

  The mean and standard deviation of `x` (both reduced over axis 0) are
  computed once at construction and then reused by `norm` and `unnorm`.
  """

  def __init__(self, x, eps=0.001):
    """Compute and store the normalization statistics.

    Args:
      x: Input passed to `tf.math.reduce_mean` / `tf.math.reduce_std`;
        statistics are reduced over axis 0.
      eps: Python float added to the standard deviation so that `norm` never
        divides by zero for constant columns. Defaults to 0.001.
    """
    super().__init__()
    self.mean = tf.Variable(tf.math.reduce_mean(x, axis=0))
    # Fold the epsilon in *before* wrapping: `tf.Variable(...) + eps` would
    # produce a plain Tensor, so `std` would not be a Variable like `mean`.
    self.std = tf.Variable(tf.math.reduce_std(x, axis=0) + eps)

  def norm(self, x):
    """Return `x` shifted by the stored mean and scaled by the stored std."""
    return (x - self.mean)/self.std

  def unnorm(self, x):
    """Invert `norm`: rescale by the stored std and add the stored mean back."""
    return (x * self.std) + self.mean

In [4]:
def calc_stats_types(pf):
    """Compute per-group statistics for one recording and join them column-wise.

    The rows of `pf` are grouped by the 'frame' column and the 'x', 'y', 'z'
    values of every frame are stacked into one array, which is then transposed
    to (frame, coordinate, row-within-frame). The last axis is cut into four
    consecutive ranges, `calculate_stats_np` is applied to each range, and the
    four results are concatenated along axis 1.

    Args:
        pf: DataFrame with at least the columns 'frame', 'x', 'y' and 'z'.
            Every frame must contain the same number of rows, otherwise the
            frames cannot be stacked into a regular array.

    Returns:
        Array of concatenated statistics with NaN values replaced by 0.0.
    """
    xyz_by_frame = pf.groupby('frame')[['x','y','z']]
    stacked = np.array([frame_rows.values for _, frame_rows in xyz_by_frame])
    # (frame, row, coord) -> (frame, coord, row) so the slices below pick rows
    arr = np.transpose(stacked, (0, 2, 1))

    # NOTE(review): the first range is called "face" although the loading cell
    # removes rows with type == "face" before calling this function; the index
    # boundaries are kept exactly as they were - confirm the landmark layout.
    face_data = arr[:, :, :7]
    left_hand_data = arr[:, :, 7:24]
    pose_data = arr[:, :, 24:30]
    right_hand_data = arr[:, :, 30:]

    # One concatenation instead of growing the array step by step; the order
    # of the parts is unchanged (face, left hand, pose, right hand).
    parts = [face_data, left_hand_data, pose_data, right_hand_data]
    video_data = np.concatenate(
        [calculate_stats_np(part) for part in parts], axis=1
    )

    # The second positional parameter of np.nan_to_num is `copy`, not the
    # replacement value, so the fill value has to be passed by keyword.
    return np.nan_to_num(video_data, nan=0.0)

In [5]:
import warnings

# Folder names (one per label) that contain the parquet recordings
# NOTE(review): some names end with "/", so those labels keep the slash
# (e.g. "base/") and the path built below contains "//". Left unchanged so
# the label values stay exactly as before - confirm whether to normalize.
folders = ["hello", "all", "thankyou", "for", "time", "will", "now", "please", "quiet", "down", "listen", "close", "have", "no", "nap", "bye", "base/", "good/", "how/", "show/", "we/", "work/"]
optional = ["if/", "noisy/", "mad/", "sad/"]

folders = np.append(folders, optional)

# One flattened feature vector per recording, and the matching label
aggregated_files = []
labels = []

# Silence warnings only while the files are processed. The previous filters
# are restored when the block exits, so later cells still show their own
# warnings (e.g. solver convergence warnings).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Iterate over the folders in the list
    for folder in folders:

        # Directory holding the recordings of the current label
        path = "./asl-kaggle/averaged_by_labels/"+folder+"/"
        # Fetch the names of all regular files in that directory
        parquets = [f for f in listdir(path) if isfile(join(path, f))]

        # Process every file found in the folder
        for parquet in parquets:

            # Read the recording into a DataFrame
            parquet_path = path+parquet
            pf = pd.read_parquet(parquet_path)

            # Remove all rows with type of face. A boolean mask is used so
            # that only the matching rows are removed even if index labels
            # are not unique.
            pf = pf[pf.type != "face"]
            # Replace all NaN values with 0
            pf = pf.fillna(0)

            # Skip recordings with fewer than 10 distinct frames
            if pf.frame.nunique() < 10:
                continue

## ------------------------ To Tune ------------------------ ##

            # Transform data in pf to accommodate the desired frame amount
            # If less than frame num, duplicate frames
            # If more than frame num, delete frames
            pf = transform_data(pf, 65, True, True) # (int)

## --------------------------------------------------------- ##

            # Flatten the statistics of the recording into one feature vector
            video_data = calc_stats_types(pf)
            video_data = video_data.reshape((-1))

            aggregated_files.append(video_data)
            labels.append(folder)

# Convert list of file measures into a pandas dataframe
final_dataset = pd.DataFrame(aggregated_files)

In [20]:
# Accuracy of the selected classifier on every random train/test split
scores = []

# Number of random train/test splits to average over
n_splits = 1

## ------------------------ To Tune ------------------------ ##

def build_logistic_regression():
    """Return an unfitted LogisticRegression with the parameters under tuning."""
    return LogisticRegression(
        penalty = 'l2', # (l1, l2, elasticnet, None)
        dual = False, # (False, True)
        tol = 1e-7, # (1e-4 - 1e-7)
        C = 1.0, # (positive float)
        class_weight = None, # (None, default)
        solver = 'lbfgs', # (lbfgs, liblinear, newton-cg, newton-cholesky, sag, saga)
        random_state = None, # (None, int)
        verbose = 0, # (int)
        l1_ratio = None, # (None, float from 0 to 1)
        max_iter = 100000 # (Don't change unless warnings)
    )

## --------------------------------------------------------- ##

# Factories for the candidate models. Only the selected one is built and
# fitted, so no time is spent training a model whose result is thrown away.
model_factories = {
    "logistic_regression": build_logistic_regression,
    "random_forest": RandomForestClassifier,
    "svc": lambda: SVC(gamma='auto'),
    "decision_tree": DecisionTreeClassifier,
}

# The decision tree is the model whose predictions were scored before;
# switch the key to evaluate one of the other candidates.
selected_model = "decision_tree"

for _ in range(n_splits):
    # Split dataset and labels into train and test sets
    # Split ratio defined by test_size parameter (0.2 means 20% of total data is assigned to test set)
    x_train, x_test, y_train, y_test = train_test_split(final_dataset, labels, test_size=0.2, shuffle=True)

    # Convert x_train and x_test into tensorflow tensors
    x_train = tf.convert_to_tensor(x_train, dtype=tf.float32)
    x_test = tf.convert_to_tensor(x_test, dtype=tf.float32)

    # Normalize both splits with statistics taken from the training split only
    norm_x = Normalize(x_train)
    x_train_norm, x_test_norm = norm_x.norm(x_train), norm_x.norm(x_test)

    # Apply PCA to data sets
    pca_x_train, pca_x_test = apply_PCA(60, x_train_norm, x_test_norm)

    # Build a fresh model for this split and train it on the training set
    classif = model_factories[selected_model]().fit(pca_x_train, y_train)

    # Predict labels for test set
    y_preds = classif.predict(pca_x_test)
    # Calculate accuracy score of predictions and append to list
    scores.append(accuracy_score(y_test, y_preds))

# Calculate the mean accuracy score across all iterations
print(np.mean(scores))

0.28309305373525556


---

# __Ignore Below, unless you feel confident to try other models__

---

## <u> __Welcome brave one!__ </u>


For available parameters to tune follow the following URLs for info:

KNeighbors : https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

SVM : https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

GaussianProcess : https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessClassifier.html

Decision Tree : https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

Random Forest : https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

Neural Net : https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

AdaBoost : https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

Naive Bayes : https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

QDA : https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html

---

Quick Big Tip!!

Please do not try to tune every single classifier!

I would suggest focusing on the following:

- SVM or SVC (depends which name you like)
- Random Forest
- Neural Net

In [23]:
# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import os

# !! To reduce amount of classifiers tested/used, remove them from the below lists !!
# i.e. from BOTH the list `names` and the list `classifiers` (same position in each)
# This is if you want to focus on specific models, but don't want to waste time on training and testing other models
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    # "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
    "Logistic Regression",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=42),
    SVC(gamma=2, C=1, random_state=42),
    # GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(alpha=1, max_iter=10000, random_state=42),
    AdaBoostClassifier(algorithm="SAMME", random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(max_iter=10000, random_state=42),
]

# zip() stops at the shorter list without complaining, so an entry removed
# from only one of the two lists would attach scores to the wrong model name.
if len(names) != len(classifiers):
    raise ValueError(
        f"names has {len(names)} entries but classifiers has {len(classifiers)}; "
        "remove models from both lists together"
    )

# CPU count made available to loky (the process backend used through joblib).
# Currently 6; change it to the number of logical cores of your machine
# (e.g. 16 for 8 physical cores with hyperthreading).
os.environ["LOKY_MAX_CPU_COUNT"] = "6"

# One list of accuracy scores per model name, filled with one score per split
scores_per_classif = {name: [] for name in names}

# Number of random train/test splits every model is evaluated on
n_splits = 5

for _ in range(n_splits):
    # Split dataset and labels into train and test sets
    # Split ratio defined by test_size parameter (0.2 means 20% of total data is assigned to test set)
    x_train, x_test, y_train, y_test = train_test_split(final_dataset, labels, test_size=0.2, shuffle=True)

    # Convert x_train and x_test into tensorflow tensors
    x_train = tf.convert_to_tensor(x_train, dtype=tf.float32)
    x_test = tf.convert_to_tensor(x_test, dtype=tf.float32)

    # Normalize both splits with statistics taken from the training split only
    norm_x = Normalize(x_train)
    x_train_norm, x_test_norm = norm_x.norm(x_train), norm_x.norm(x_test)

    # Apply PCA to data sets
    pca_x_train, pca_x_test = apply_PCA(60, x_train_norm, x_test_norm)

    # Iterate over classifiers and their names
    for name, clf in zip(names, classifiers):
        # Train model on train data (fit() is called again on every split)
        clf.fit(pca_x_train, y_train)
        # Predict labels of test set
        y_pred = clf.predict(pca_x_test)

        # Calculate accuracy score (true labels first, predictions second)
        # and append to list in dictionary at key = name of classifier
        scores_per_classif[name].append(accuracy_score(y_test, y_pred))

In [24]:
# Average every model's accuracy over all splits (one column per model name)
mean_accuracy = pd.DataFrame(scores_per_classif).mean()
mean_accuracy

Nearest Neighbors      0.313499
Linear SVM             0.306422
RBF SVM                0.058847
Decision Tree          0.272215
Random Forest          0.347706
Neural Net             0.387287
AdaBoost               0.183879
Naive Bayes            0.209699
QDA                    0.282307
Logistic Regression    0.300786
dtype: float64