In [50]:
import os
import sys
import pickle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.model_selection import StratifiedKFold, KFold

import boto3

In [51]:
module_path = os.path.abspath(os.path.join('../../../../'))
if module_path not in sys.path:
    sys.path.append(module_path)

using kernel ml.c5.9xlarge

In [52]:
from global_config import ROOT_DIR, emotion_id_to_emotion_abr, conf_cmap
from src.analysis.supervised_learning.evaluation.confusion_matrix import ConfusionMatrixCreator

from src.preprocessing.dataset_creation.aggregation import get_aggregate_measures, normalize

In [122]:
def evaluate_scores(X, y, svc, scoring_method):
    
    skf = KFold(n_splits=100, shuffle=True)
    
    # get scores
    scores = cross_validate(X=X, y=y,
                            estimator           = svc,
                            scoring             = [scoring_method],
                            verbose             = 1,
                            cv                  = skf.split(X, y),
                            n_jobs              = -1,
                            return_train_score  = True                        
                           )
    
    print('printing {} measures'.format(scoring_method))
    print('avg (train):', np.mean(scores['train_{}'.format(scoring_method)]))
    print('std (train):', np.std(scores['train_{}'.format(scoring_method)]))
    print('avg (validation):', np.mean(scores['test_{}'.format(scoring_method)]))
    print('std (validation):', np.std(scores['test_{}'.format(scoring_method)]))

# TODO
Investigate whether min/max normalization or standardscaling in the first step is more effective, do some comparison.

# Min max normalized 

In [96]:
from s3fs.core import S3FileSystem
s3 = S3FileSystem()
from sagemaker import get_execution_role

role = get_execution_role()
bucket='files-and-examples-01'
file = 'datasets/su_dataset/intensity_23_mode_p_minmax_normalized.pickle'

path = s3.open('s3://{}/{}'.format(bucket, file))

In [97]:
file = pickle.load(path)
x = file['x']
y = file['y']

au = x['au']
gaze = x['gaze']
pose = x['pose']

In [98]:
au_agg = get_aggregate_measures(au,
                               means=True,
                               variance=False,
                               deltas=False,
                               peaks=False)

In [99]:
scaler = MinMaxScaler()
au_agg = scaler.fit_transform(au_agg)

In [100]:
skf = KFold(n_splits=5, shuffle=True)
splits = skf.split(au_agg, y)

In [101]:
# regularization paramater, lower C -> more regularization (less overfitting), large C -> less regularization (and possibly more overfitting)
c_values = [0.1, 1, 5, 10, 25, 50, 75, 100]

# 
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
kernel = ['rbf', 'linear', 'poly', 'sigmoid']

parameters = {'class_weight': ['balanced'],
              'C': c_values,
              'gamma': gamma,
              'kernel': kernel,
              }

svc = SVC()
clf = GridSearchCV(estimator=svc,
                   param_grid=parameters,
                   scoring='accuracy',
                   verbose=5,
                   cv=splits,
                   n_jobs=-1,
                   )

In [102]:
clf.fit(au_agg, y)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 378 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:   37.5s finished


GridSearchCV(cv=<generator object _BaseKFold.split at 0x7f88d37bc4d0>,
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 5, 10, 25, 50, 75, 100],
                         'class_weight': ['balanced'],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear', 'poly', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=5)

In [103]:
clf.best_params_

{'C': 10, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'rbf'}

In [126]:
# Standard KFOLD
svc = SVC(**clf.best_params_)
evaluate_scores(au_agg, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.


printing accuracy measures
avg (train): 0.9988789426418613
std (train): 9.760880106459751e-05
avg (validation): 0.14827635327635327
std (validation): 0.06996059770650222


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   13.0s finished


### Without last step min max

In [92]:
clf.best_params_

{'C': 10, 'class_weight': 'balanced', 'gamma': 0.01, 'kernel': 'rbf'}

In [95]:
# Standard KFOLD
svc = SVC(**clf.best_params_)
evaluate_scores(au_agg, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.


printing accuracy measures
avg (train): 0.5294507575757577
std (train): 0.0027673559083716734
avg (validation): 0.12272727272727275
std (validation): 0.012061881309159592


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.6s finished


# Without normalization

In [72]:
from s3fs.core import S3FileSystem
s3 = S3FileSystem()
from sagemaker import get_execution_role

role = get_execution_role()
bucket='files-and-examples-01'
file = 'datasets/su_dataset/sliced_au_gaze_int23_modep_unnormalized.pickle'

path = s3.open('s3://{}/{}'.format(bucket, file))

In [73]:
file = pickle.load(path)
x = file['x']
y = file['y']

au = x['au']
gaze = x['gaze']
pose = x['pose']

In [74]:
au_agg = get_aggregate_measures(au,
                               means=True,
                               variance=False,
                               deltas=False,
                               peaks=False)

In [75]:
scaler = MinMaxScaler()
au_agg = scaler.fit_transform(au_agg)

In [76]:
skf = KFold(n_splits=5, shuffle=True)
splits = skf.split(au_agg, y)

In [77]:
# regularization paramater, lower C -> more regularization (less overfitting), large C -> less regularization (and possibly more overfitting)
c_values = [0.1, 1, 5, 10, 25, 50, 75, 100]

# 
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
kernel = ['rbf', 'linear', 'poly', 'sigmoid']

parameters = {'class_weight': ['balanced'],
              'C': c_values,
              'gamma': gamma,
              'kernel': kernel,
              }

svc = SVC()
clf = GridSearchCV(estimator=svc,
                   param_grid=parameters,
                   scoring='accuracy',
                   verbose=5,
                   cv=splits,
                   n_jobs=-1,
                   )

In [78]:
clf.fit(au_agg, y)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 378 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:   40.1s finished


GridSearchCV(cv=<generator object _BaseKFold.split at 0x7f88c7718d50>,
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 5, 10, 25, 50, 75, 100],
                         'class_weight': ['balanced'],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear', 'poly', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=5)

In [80]:
clf.best_params_

{'C': 10, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'rbf'}

In [84]:
# Standard KFOLD
svc = SVC(**clf.best_params_)
evaluate_scores(au_agg, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.


printing accuracy measures
avg (train): 0.9876893939393939
std (train): 0.002074706657216531
avg (validation): 0.10303030303030303
std (validation): 0.013999786827709346


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s finished


# Standarddized data

In [54]:
from s3fs.core import S3FileSystem
s3 = S3FileSystem()
from sagemaker import get_execution_role

role = get_execution_role()
bucket='files-and-examples-01'
file = 'datasets/su_dataset/sliced_au_gaze_pose_intensity_23_mode_p.pickle'

path = s3.open('s3://{}/{}'.format(bucket, file))

In [55]:
file = pickle.load(path)

In [56]:
x = file['x']
y = file['y']

au = x['au']
gaze = x['gaze']
pose = x['pose']

# Only means

In [57]:
au_agg = get_aggregate_measures(au,
                               means=True,
                               variance=False,
                               deltas=False,
                               peaks=False)

In [None]:
# regularization paramater, lower C -> more regularization (less overfitting), large C -> less regularization (and possibly more overfitting)
c_values = [0.1, 1, 5, 10, 25, 50, 75, 100]

# 
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
kernel = ['rbf', 'linear', 'poly', 'sigmoid']

parameters = {'class_weight': ['balanced'],
              'C': c_values,
              'gamma': gamma,
              'kernel': kernel,
              }

svc = SVC()
clf = GridSearchCV(estimator=svc,
                   param_grid=parameters,
                   scoring='accuracy',
                   verbose=5,
                   cv=splits,
                   n_jobs=-1,
                   )

In [59]:
#au_agg, y = shuffle(au_agg, y)

In [60]:
au_agg.shape

(2640, 85)

In [61]:
y.shape

(2640,)

In [62]:
skf = KFold(n_splits=5, shuffle=True)
splits = skf.split(au_agg, y)

In [63]:
# regularization paramater, lower C -> more regularization (less overfitting), large C -> less regularization (and possibly more overfitting)
c_values = [0.1, 1, 5, 10, 25, 50, 75, 100]

# 
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
kernel = ['rbf', 'linear', 'poly', 'sigmoid']

parameters = {'class_weight': ['balanced'],
              'C': c_values,
              'gamma': gamma,
              'kernel': kernel,
              }

svc = SVC()
clf = GridSearchCV(estimator=svc,
                   param_grid=parameters,
                   scoring='accuracy',
                   verbose=5,
                   cv=splits,
                   n_jobs=-1,
                   )

In [64]:
clf.fit(au_agg, y)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 378 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   40.6s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.2min finished


GridSearchCV(cv=<generator object _BaseKFold.split at 0x7f88d34a6cd0>,
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 5, 10, 25, 50, 75, 100],
                         'class_weight': ['balanced'],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear', 'poly', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=5)

In [65]:
clf.best_params_

{'C': 5, 'class_weight': 'balanced', 'gamma': 0.01, 'kernel': 'rbf'}

In [69]:
# Standard KFOLD
svc = SVC(**clf.best_params_)
evaluate_scores(au_agg, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.


printing accuracy measures
avg (train): 0.9379734848484848
std (train): 0.0028723013045649677
avg (validation): 0.12234848484848485
std (validation): 0.017588161767036214


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s finished


In [28]:
clf.best_params_

{'C': 25, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'rbf'}

In [49]:
# Min Max KFOLD
svc = SVC(**clf.best_params_)
evaluate_scores(au_agg, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.


printing accuracy measures
avg (train): 0.9994318181818181
std (train): 0.0003543236161717848
avg (validation): 0.1431818181818182
std (validation): 0.00947726969939152


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s finished


In [39]:
# Standardscaler
svc = SVC(**clf.best_params_)
evaluate_scores(au_agg, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.


printing accuracy measures
avg (train): 0.9993371212121211
std (train): 0.00023195925594538333
avg (validation): 0.13674242424242428
std (validation): 0.007518724712605766


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.6s finished


In [89]:
# no scaling
clf.best_params_

{'C': 10, 'class_weight': 'balanced', 'gamma': 0.1, 'kernel': 'rbf'}

In [90]:
# no scaling
svc = SVC(**clf.best_params_)
evaluate_scores(au_agg, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.


printing accuracy measures
avg (train): 0.9994318181818181
std (train): 0.00035432361617178485
avg (validation): 0.1303030303030303
std (validation): 0.009978742327514766


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s finished


In [74]:
# with min max scaler
evaluate_scores(au_agg, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.


printing accuracy measures
avg (train): 0.9994318181818181
std (train): 0.00018939393939394473
avg (validation): 0.14053030303030303
std (validation): 0.00833333333333334


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s finished


# Adding Variance

In [26]:
au_agg = get_aggregate_measures(au,
                               means=True,
                               variance=True,
                               deltas=False,
                               peaks=False)

In [27]:
clf.fit(au_agg, y)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 378 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   38.8s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.1min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 5, 10, 20, 50, 80, 100],
                         'class_weight': ['balanced'],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear', 'poly', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=5)

In [28]:
clf.best_params_

{'C': 20, 'class_weight': 'balanced', 'gamma': 0.001, 'kernel': 'rbf'}

In [29]:
svc = SVC(**clf.best_params_)

In [30]:
evaluate_scores(au_agg, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.


printing accuracy measures
avg (train): 0.3135416666666667
std (train): 0.0062843409178170795
avg (validation): 0.12045454545454545
std (validation): 0.014050937114387428


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.8s finished


# Adding deltas

In [31]:
au_agg = get_aggregate_measures(au,
                               means=True,
                               variance=False,
                               deltas=True,
                               peaks=False)

In [32]:
clf.fit(au_agg, y)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 378 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   41.1s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.2min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 5, 10, 20, 50, 80, 100],
                         'class_weight': ['balanced'],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear', 'poly', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=5)

In [33]:
clf.best_params_

{'C': 100, 'class_weight': 'balanced', 'gamma': 0.0001, 'kernel': 'rbf'}

In [34]:
svc = SVC(**clf.best_params_)

In [35]:
evaluate_scores(au_agg, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.


printing accuracy measures
avg (train): 0.21467803030303031
std (train): 0.005436628823639137
avg (validation): 0.12727272727272726
std (validation): 0.021507681161482408


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.0s finished


# Adding peaks

In [36]:
au_agg = get_aggregate_measures(au,
                               means=True,
                               variance=False,
                               deltas=False,
                               peaks=True)

In [37]:
au_agg.shape

(2640, 102)

In [38]:
clf.best_params_

{'C': 100, 'class_weight': 'balanced', 'gamma': 0.0001, 'kernel': 'rbf'}

In [39]:
svc = SVC(**clf.best_params_)
evaluate_scores(au_agg, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.


printing accuracy measures
avg (train): 0.4
std (train): 0.012187614047875895
avg (validation): 0.09848484848484848
std (validation): 0.01233243984553008


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.0s finished


# Running on all aggregates 

In [40]:
au_agg = get_aggregate_measures(au)

In [41]:
au_agg.shape

(2640, 153)

In [42]:
clf.fit(au_agg, y)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 378 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.9min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 5, 10, 20, 50, 80, 100],
                         'class_weight': ['balanced'],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear', 'poly', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=5)

In [43]:
clf.best_params_

{'C': 50, 'class_weight': 'balanced', 'gamma': 0.0001, 'kernel': 'rbf'}

In [44]:
svc = SVC(**clf.best_params_)
evaluate_scores(au_agg, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.


printing accuracy measures
avg (train): 0.35331439393939396
std (train): 0.006383446797860897
avg (validation): 0.10606060606060605
std (validation): 0.013392173886108855


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.7s finished


# Removing peaks

In [45]:
au_agg = get_aggregate_measures(au,
                               means=True,
                               variance=True,
                               deltas=True,
                               peaks=False)

In [46]:
au_agg.shape

(2640, 136)

In [47]:
clf.fit(au_agg, y)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 378 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   41.7s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.0min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 5, 10, 20, 50, 80, 100],
                         'class_weight': ['balanced'],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear', 'poly', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=5)

In [48]:
clf.best_params_

{'C': 20, 'class_weight': 'balanced', 'gamma': 0.001, 'kernel': 'rbf'}

In [49]:
svc = SVC(**clf.best_params_)
evaluate_scores(au_agg, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.


printing accuracy measures
avg (train): 0.31609848484848485
std (train): 0.007405763180309532
avg (validation): 0.11969696969696968
std (validation): 0.014344012294516277


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.3s finished


# pose

In [15]:
len(pose)

2640

In [16]:
pose[0].shape

(69, 6)

In [12]:
pose_agg = get_aggregate_measures(pose)

In [13]:
pose_agg.shape

(2640, 54)

In [14]:
y.shape

(2640,)

In [20]:
c_values = [0.1, 1, 5, 10, 20, 50]
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
kernel = ['rbf', 'linear', 'poly', 'sigmoid']

parameters = {'class_weight': ['balanced'],
              'C': c_values,
              'gamma': gamma,
              'kernel': kernel,
              }

svc = SVC()
clf = GridSearchCV(estimator=svc,
                   param_grid=parameters,
                   scoring='accuracy',
                   verbose=51,
                   n_jobs=-1,
                   )

In [None]:
clf.fit(pose_agg, y)

In [22]:
clf.best_params_

{'C': 10, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'linear'}

In [24]:
svc = SVC(**clf.best_params_)
evaluate_scores(pose_agg, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.


printing accuracy measures
avg (train): 0.40123106060606056
std (train): 0.006242821865094504
avg (validation): 0.07727272727272728
std (validation): 0.01733342604340538


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.2min finished


# gaze

In [12]:
len(gaze)

2640

In [13]:
y.shape

(2640,)

In [14]:
gaze_agg = get_aggregate_measures(gaze)

In [15]:
gaze_agg.shape

(2640, 72)

In [25]:
c_values = [0.1, 1, 5, 10, 20, 50, 80]
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
kernel = ['rbf', 'linear', 'poly', 'sigmoid']

parameters = {'class_weight': ['balanced'],
              'C': c_values,
              'gamma': gamma,
              'kernel': kernel,
              }

svc = SVC()

clf = GridSearchCV(
                estimator=svc,
                param_grid=parameters,
                scoring='accuracy',
                verbose=5,
                n_jobs=-1,
                   )

In [26]:
clf.fit(gaze_agg, y)

Fitting 5 folds for each of 140 candidates, totalling 700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 378 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed: 19.3min
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed: 60.8min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 5, 10, 20, 50, 80],
                         'class_weight': ['balanced'],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear', 'poly', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=5)

In [27]:
clf.best_params_

{'C': 0.1, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'linear'}

In [28]:
svc = SVC(**clf.best_params_)
evaluate_scores(gaze_agg, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.


printing accuracy measures
avg (train): 0.17054924242424244
std (train): 0.006577180058403531
avg (validation): 0.060227272727272727
std (validation): 0.01367838642823317


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.7s finished
