In [1]:
import json
import os
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, Subset
from torch.nn.utils.rnn import pad_sequence

In [2]:
id_joints_dict = {0: 'nose',
                1: 'left_eye',
                2: 'right_eye',
                3: 'left_ear',
                4: 'right_ear',
                5: 'left_shoulder',
                6: 'right_shoulder',
                7: 'left_elbow',
                8: 'right_elbow',
                9: 'left_wrist',
                10: 'right_wrist',
                11: 'left_hip',
                12: 'right_hip',
                13: 'left_knee',
                14: 'right_knee',
                15: 'left_ankle',
                16: 'right_ankle'}
joints_id_dict = {v: k for k, v in id_joints_dict.items()}

In [3]:
def get_df_from_preds(preds, instance_id=0):

        """
        Get a dataframe from a json file containing the poses. The dataframe contains the coordinates of the joints of the instance_id-th.

        Args:
            preds (str or list): Path of the json file containing the poses or list of poses.

        Returns:
            df (pd.DataFrame): A dataframe of poses.
        """

        if type(preds) == list:
            pose_sequence = preds
        else:
            with open(preds) as json_file:
                pose_sequence = json.load(json_file)
        
        dic_list = []

        for i in range(len(pose_sequence)):
            keypoints_dict = {}

            try:
                keypoints_list = pose_sequence[i]['instances'][instance_id]['keypoints']
            except:
                keypoints_list = [[np.nan, np.nan] for i in range(17)]

            for number, keypoint in enumerate(keypoints_list):
                keypoints_dict["X_" + id_joints_dict[number]] = keypoint[0]
                keypoints_dict["Y_" + id_joints_dict[number]] = keypoint[1]
            
            dic_list.append(keypoints_dict)
            
        df = pd.DataFrame.from_dict(dic_list)
        return df

In [4]:
def get_features_from_window(window):
        """
        Extract features from a window of data. 

        Args:
            window (np.array): A window of data

        Returns:
            features (dict): A dictionary of features
        """
        features = {}

        for keypoint in window.columns:

            if keypoint == 'label':
                continue

            features[keypoint + '_mean'] = window[keypoint].mean()
            features[keypoint + '_std'] = window[keypoint].std()
            features[keypoint + '_min'] = window[keypoint].min()
            features[keypoint + '_max'] = window[keypoint].max()
            features[keypoint + '_range'] = window[keypoint].max() - window[keypoint].min()
            features[keypoint + '_median'] = window[keypoint].median()
            features[keypoint + '_skew'] = window[keypoint].skew()
            features[keypoint + 'velocity_mean'] = window[keypoint].diff().mean()
            features[keypoint + 'velocity_std'] = window[keypoint].diff().std()

                
        return features

In [5]:
root_dir = '../../../outputs/tennis_no_vis_cluster'
classes = os.listdir(root_dir)
list_of_files = []

for classe in classes:
    for file in os.listdir(os.path.join(root_dir, classe, 'predictions')):
        list_of_files.append(os.path.join(root_dir, classe, 'predictions', file))
    list_of_files.sort()

In [6]:
windows = []
labels = []

for file in list_of_files:
    df = get_df_from_preds(file)
    labels.append(file.split('/')[-3])
    windows.append(df)

In [7]:
features = []

for window in windows:
    features.append(get_features_from_window(window))

X, y = pd.DataFrame(features), labels
X.head()

Unnamed: 0,X_nose_mean,X_nose_std,X_nose_min,X_nose_max,X_nose_range,X_nose_median,X_nose_skew,X_nosevelocity_mean,X_nosevelocity_std,Y_nose_mean,...,X_right_anklevelocity_std,Y_right_ankle_mean,Y_right_ankle_std,Y_right_ankle_min,Y_right_ankle_max,Y_right_ankle_range,Y_right_ankle_median,Y_right_ankle_skew,Y_right_anklevelocity_mean,Y_right_anklevelocity_std
0,323.435687,31.230645,261.388379,356.263636,94.875257,336.156527,-0.778076,-0.211416,3.354953,109.67723,...,0.52279,360.799669,2.271765,356.801437,365.801132,8.999695,361.476382,0.212273,0.005106,0.682806
1,307.009524,35.862794,150.830313,351.179689,200.349376,308.518677,-1.187158,0.217559,31.561884,112.312544,...,19.616039,357.213494,21.693737,176.893591,363.975758,187.082168,360.700637,-8.313788,0.027209,31.104692
2,304.675511,39.17081,238.437346,351.927037,113.489691,314.328186,-0.389963,-0.048872,4.551608,112.358458,...,0.834304,361.568135,3.803096,357.085279,369.141363,12.056084,360.387839,0.428852,-0.0146,1.045772
3,304.585725,29.355908,256.298045,389.491663,133.193618,317.451296,-0.168284,-0.322465,15.417172,116.261615,...,16.260785,353.334689,16.850947,207.203133,359.67156,152.468428,354.886475,-8.686701,0.01745,24.168932
4,299.901018,33.050078,239.455798,338.53614,99.080342,318.356994,-0.665777,0.01545,4.008972,118.75239,...,0.778375,355.669228,1.494769,352.766666,358.925846,6.15918,355.535961,0.039084,0.038253,0.642383


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
X_norm = scaler.fit_transform(X_train)

In [9]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(X_norm, y_train)

In [13]:
clf.score(scaler.transform(X_train), y_train)

1.0

In [14]:
clf.score(scaler.transform(X_test), y_test)

0.797979797979798

## Grid search

In [17]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20],
    'max_features': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 3, 4],
    'n_estimators': [100, 300, 500]
}

rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 3)

grid_search.fit(X_norm, y_train)

Fitting 3 folds for each of 972 candidates, totalling 2916 fits


[CV 2/3] END bootstrap=True, criterion=gini, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.748 total time=   0.4s
[CV 2/3] END bootstrap=True, criterion=gini, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=4, n_estimators=100;, score=0.712 total time=   0.5s
[CV 2/3] END bootstrap=True, criterion=gini, max_depth=None, max_features=2, min_samples_leaf=3, min_samples_split=3, n_estimators=100;, score=0.708 total time=   0.4s
[CV 3/3] END bootstrap=True, criterion=gini, max_depth=None, max_features=2, min_samples_leaf=3, min_samples_split=3, n_estimators=100;, score=0.680 total time=   0.4s
[CV 1/3] END bootstrap=True, criterion=gini, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.756 total time=   1.1s
[CV 3/3] END bootstrap=True, criterion=gini, max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=0.741 total time=

In [19]:
print('Best params: ', grid_search.best_params_)
print('Best score: ', grid_search.best_score_)

Best params:  {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 20, 'max_features': 6, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 500}
Best score:  0.8106060606060606


In [20]:
rf = RandomForestClassifier(**grid_search.best_params_)
rf.fit(X_norm, y_train)
print('Train score: ', rf.score(scaler.transform(X_train), y_train))
print('Test score: ', rf.score(scaler.transform(X_test), y_test))

Train score:  1.0
Test score:  0.8333333333333334
