In [2]:
import os
import sys
import pickle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold

import boto3

In [3]:
# Register the directory four levels above the working directory on the import
# path, once. The project imports in the following cell (global_config, src.*)
# presumably rely on this -- confirm against the repository layout.
module_path = os.path.abspath('../../../../')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [4]:
from global_config import ROOT_DIR, emotion_id_to_emotion_abr, conf_cmap, AU_INTENSITY_COLS, GAZE_COLS, POSE_COLS
from src.analysis.supervised_learning.evaluation.confusion_matrix import ConfusionMatrixCreator


from src.analysis.data_exploration import plot_time_series_means_subplots

from src.preprocessing.dataset_creation.scaling import Scaler
from src.preprocessing.dataset_creation.helpers import slice_by, get_cols, get_fixed_col
from src.preprocessing.dataset_creation.aggregation import get_aggregate_measures
from src.preprocessing.dataset_creation.interpolation import Interpolator

In [5]:
def param_search(x_, y_, n_splits=5, random_state=None):
    """Grid-search SVC hyper-parameters with shuffled K-fold cross-validation.

    Parameters
    ----------
    x_ : array-like
        Feature matrix passed to ``GridSearchCV.fit``.
    y_ : array-like
        Target labels, one per row of ``x_``.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` keeps the previous behaviour
        (a fresh random partition on every call); pass an integer to make the
        search repeatable.

    Returns
    -------
    GridSearchCV
        The fitted search object. Its ``best_params_`` are also printed.
    """
    parameters = {
        'class_weight': ['balanced'],
        # Regularization parameter: lower C -> more regularization (less
        # overfitting), larger C -> less regularization (possibly more
        # overfitting).
        'C': [0.1, 1, 5, 10, 25, 50, 75, 100],
        # Kernel coefficient. SVC only uses gamma for 'rbf', 'poly' and
        # 'sigmoid', so each 'linear' candidate is fitted once per gamma value
        # with identical results.
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
    }

    # Plain (non-stratified) K-fold; class proportions may differ per fold.
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    clf = GridSearchCV(estimator=SVC(),
                       param_grid=parameters,
                       scoring='accuracy',
                       verbose=5,
                       # Materialise the partition once so that every
                       # candidate is scored on exactly the same folds.
                       cv=list(folds.split(x_, y_)),
                       n_jobs=-1,
                       )

    clf.fit(x_, y_)

    print("best params")
    print(clf.best_params_)
    return clf

In [6]:
def evaluate_scores(x_, y_, svc, scoring_method, n_splits=5, random_state=None):
    """Cross-validate ``svc`` and print mean/std of train and validation scores.

    Parameters
    ----------
    x_ : array-like
        Feature matrix.
    y_ : array-like
        Target labels, one per row of ``x_``.
    svc : estimator
        Unfitted estimator handed to ``cross_validate``.
    scoring_method : str
        Scorer name understood by ``cross_validate`` (the notebook uses
        ``"accuracy"``). It is also used to look up the ``train_<name>`` and
        ``test_<name>`` entries of the result.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` keeps the previous behaviour
        (a fresh random partition on every call); pass an integer to make the
        reported numbers repeatable.

    Returns
    -------
    None
        Results are only printed.
    """
    # Plain (non-stratified) shuffled K-fold.
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    scores = cross_validate(X=x_, y=y_,
                            estimator=svc,
                            scoring=[scoring_method],
                            verbose=1,
                            cv=folds.split(x_, y_),
                            n_jobs=-1,
                            return_train_score=True,
                            )

    # Passing the scorer as a one-element list makes cross_validate key its
    # results by scorer name.
    train_scores = scores['train_{}'.format(scoring_method)]
    validation_scores = scores['test_{}'.format(scoring_method)]

    print('printing {} measures'.format(scoring_method))
    print('avg (train):', np.mean(train_scores))
    print('std (train):', np.std(train_scores))
    print('avg (validation):', np.mean(validation_scores))
    print('std (validation):', np.std(validation_scores))

In [14]:
def run(x, means, variance, deltas, peaks, labels=None):
    """Aggregate features, min-max scale them, tune an SVC and report CV accuracy.

    Parameters
    ----------
    x : object
        Input accepted by ``get_aggregate_measures`` (the notebook passes the
        scaled ``au_gaze_pose`` slices).
    means, variance, deltas, peaks : bool
        Flags forwarded unchanged to ``get_aggregate_measures``.
    labels : array-like or None, default None
        Target labels aligned with the aggregated rows. When omitted, the
        notebook-level variable ``y`` is used, which matches the behaviour
        before this parameter existed.

    Returns
    -------
    None
        Progress, the best parameters and the scores are printed.
    """
    print("running tests with means={}, variance={}, deltas={} and peaks={}".format(means, variance, deltas, peaks))

    aggregated = get_aggregate_measures(x,
                                        means=means,
                                        variance=variance,
                                        deltas=deltas,
                                        peaks=peaks)

    # NOTE(review): the scaler is fitted on all rows before cross-validation,
    # so held-out folds contribute to the feature ranges. Consider moving the
    # scaling into a Pipeline that is fitted per fold.
    scaler = MinMaxScaler()
    features = scaler.fit_transform(aggregated)

    print("printing shape")
    print(features.shape)

    # Explicit labels win; otherwise fall back to the notebook global ``y``.
    if labels is None:
        labels = y

    clf = param_search(features, labels)

    # NOTE(review): the winning parameters are re-evaluated on the same rows
    # the grid search selected them on, so the validation score printed next
    # is an optimistic estimate.
    svc = SVC(**clf.best_params_)
    evaluate_scores(features, labels, svc, scoring_method="accuracy")

In [8]:
# Load the video dataset CSV from S3 into the DataFrame `df`.
# NOTE(review): these two imports live here rather than in the notebook's
# import cell; `s3` and `role` are not referenced by any other cell visible in
# this notebook -- presumably kept for credential/session setup, confirm before
# removing.
from s3fs.core import S3FileSystem
s3 = S3FileSystem()
from sagemaker import get_execution_role

# Per the original author: this data file contains features that were already
# standard-scaled on a per-actor basis.
role = get_execution_role()
bucket='files-and-examples-01'
file = 'datasets/su_dataset/query_13_videos.csv'

# Full object URL; pd.read_csv is handed the s3:// URL directly.
path = 's3://{}/{}'.format(bucket, file)

df = pd.read_csv(path)

In [9]:
# Slice the frame on the "filename" column, then hand the slices to an
# Interpolator configured with the AU-intensity, gaze and pose columns.
# NOTE(review): presumably one slice per video file, with gaps removed or
# interpolated -- confirm against slice_by / Interpolator.remove_interpolate.
slices = slice_by(df, "filename")
interpolator = Interpolator(list(AU_INTENSITY_COLS) + list(GAZE_COLS) + list(POSE_COLS))
slices = interpolator.remove_interpolate(slices)

In [10]:
# Feature columns (AU intensity + gaze + pose) for every slice.
au_gaze_pose = get_cols(slices, list(AU_INTENSITY_COLS) + list(GAZE_COLS) + list(POSE_COLS))
# Per-slice fixed columns, extracted in the same order as before: the label
# `y` (read as a notebook global inside run()) and the video id used for scaling.
y, video_ids = (get_fixed_col(slices, column) for column in ("emotion_1_id", "video_id"))

In [11]:
# Scale the feature slices with the project Scaler in "standard" mode, driven
# by the per-slice video ids, then read the result back from `scaler.slices`.
# NOTE(review): `au_gaze_pose` is rebound to the scaled output, so re-running
# this cell feeds already-scaled slices back into the Scaler. The cell only
# gives the intended result when executed once after the previous cell.
scaler = Scaler(au_gaze_pose, "standard")
scaler.scale_by_video_id(video_ids)
au_gaze_pose = scaler.slices

In [12]:
# Sanity check: number of slices that will be passed to run() below.
len(au_gaze_pose)

4540

In [15]:
# Experiment 1: aggregate with means + variance only.
run(au_gaze_pose, means=True, variance=True, deltas=False, peaks=False)

running tests with means=True, variance=True, deltas=False and peaks=False
printing shape
(4540, 186)
Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 96 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done 456 tasks      | elapsed:   39.0s
[Parallel(n_jobs=-1)]: Done 770 out of 800 | elapsed:  1.1min remaining:    2.5s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.1min finished


best params
{'C': 10, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'rbf'}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 96 concurrent workers.


printing accuracy measures
avg (train): 0.9999449339207048
std (train): 0.00011013215859030369
avg (validation): 0.3843612334801762
std (validation): 0.008838062840413602


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.3s finished


In [16]:
# Experiment 2: aggregate with means + deltas only.
run(au_gaze_pose, means=True, variance=False, deltas=True, peaks=False)

running tests with means=True, variance=False, deltas=True and peaks=False
printing shape
(4540, 217)
Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 96 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done 456 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done 770 out of 800 | elapsed:  1.2min remaining:    2.7s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.2min finished


best params
{'C': 10, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'rbf'}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 96 concurrent workers.


printing accuracy measures
avg (train): 1.0
std (train): 0.0
avg (validation): 0.37819383259911893
std (validation): 0.013336454187859571


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   11.8s finished


In [17]:
# Experiment 3: aggregate with means + peaks only.
run(au_gaze_pose, means=True, variance=False, deltas=False, peaks=True)

running tests with means=True, variance=False, deltas=False and peaks=True
printing shape
(4540, 186)
Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 96 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 456 tasks      | elapsed:   39.1s
[Parallel(n_jobs=-1)]: Done 770 out of 800 | elapsed:  1.1min remaining:    2.5s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.1min finished


best params
{'C': 10, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'rbf'}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 96 concurrent workers.


printing accuracy measures
avg (train): 1.0
std (train): 0.0
avg (validation): 0.38634361233480174
std (validation): 0.010686792485205475


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.3s finished


In [18]:
# Experiment 4: aggregate with every measure enabled (means, variance, deltas, peaks).
run(au_gaze_pose, means=True, variance=True, deltas=True, peaks=True)

running tests with means=True, variance=True, deltas=True and peaks=True
printing shape
(4540, 279)
Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 96 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 258 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 456 tasks      | elapsed:   54.4s
[Parallel(n_jobs=-1)]: Done 770 out of 800 | elapsed:  1.4min remaining:    3.3s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.5min finished


best params
{'C': 25, 'class_weight': 'balanced', 'gamma': 0.1, 'kernel': 'rbf'}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 96 concurrent workers.


printing accuracy measures
avg (train): 0.9903634361233481
std (train): 0.0013146846240433125
avg (validation): 0.3665198237885462
std (validation): 0.012448344626383065


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   11.9s finished
