In [2]:
import os
import sys
import pickle

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold

In [3]:
module_path = os.path.abspath(os.path.join('../../../../'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [5]:
from global_config import AUDIO_FUNCTIONALS_EGEMAPS_COLS
from src.preprocessing.dataset_creation.scaling.functional_scaling import scale_by

In [6]:
def evaluate_scores(x_, y_, svc, scoring_method):
    
    skf = KFold(n_splits=5, shuffle=True)
    
    # get scores
    scores = cross_validate(X=x_, y=y_,
                            estimator           = svc,
                            scoring             = [scoring_method],
                            verbose             = 1,
                            cv                  = skf.split(x_, y_),
                            n_jobs              = -1,
                            return_train_score  = True                        
                           )
    
    print('printing {} measures'.format(scoring_method))
    print('avg (train):', np.mean(scores['train_{}'.format(scoring_method)]))
    print('std (train):', np.std(scores['train_{}'.format(scoring_method)]))
    print('avg (validation):', np.mean(scores['test_{}'.format(scoring_method)]))
    print('std (validation):', np.std(scores['test_{}'.format(scoring_method)]))

In [7]:
from s3fs.core import S3FileSystem
s3 = S3FileSystem()
from sagemaker import get_execution_role

# this datafile has standardscaled data on a per actor basis
role = get_execution_role()
bucket='files-and-examples-01'
file = 'datasets/su_dataset/opensmile_query_13_videos.csv'

path = 's3://{}/{}'.format(bucket, file)

df = pd.read_csv(path)

In [8]:
df

Unnamed: 0,filename,video_id,intensity_level,emotion_1_id,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
0,A223_ang_p_4,A223,4,12,44.081318,0.112749,43.205925,45.499218,46.877730,3.671806,...,0.046794,0.006695,0.012153,1.156069,0.389105,0.930000,0.280000,1.076667,0.683341,-33.727013
1,A223_ele_v_4,A223,4,8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.006542,0.007306,0.000958,0.361533,0.000000,0.000000,0.000000,13.770000,0.000000,-88.274410
2,A102_ten_p_2,A102,2,28,37.364210,0.012795,37.054905,37.203660,37.935577,0.880672,...,0.019007,0.001844,0.003722,1.117319,0.567108,0.026667,0.009428,1.285000,1.059658,-65.307490
3,A205_sad_p_4,A205,4,6,44.120106,0.045053,42.679100,43.668730,45.004967,2.325867,...,0.027048,0.008397,0.004681,1.610542,0.737463,0.286000,0.098914,1.046000,1.222303,-56.946110
4,A227_neg_sur_v_4,A227,4,11,47.671467,0.133833,42.556465,50.469810,51.363800,8.807335,...,0.004720,0.007221,0.004899,0.623701,0.420168,0.360000,0.340000,1.330000,0.257294,-34.559720
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4539,A207_sat_v_4,A207,4,16,29.149166,0.054495,27.490206,29.333115,30.569393,3.079187,...,-0.006541,0.008077,0.002328,0.214133,0.434783,0.125000,0.015000,1.433333,1.119593,-63.634820
4540,A220_disg_v_4,A220,4,35,45.610935,0.193249,35.477707,43.274364,56.909890,21.432182,...,0.008508,0.006249,0.002458,0.601805,0.100806,0.100000,0.000000,3.260000,2.948570,-68.924150
4541,A220_anx_v_3,A220,3,34,27.079062,0.017017,26.622227,27.018835,27.524992,0.902765,...,0.003961,0.005975,0.001410,0.323102,0.162866,0.110000,0.000000,3.000000,0.880000,-69.462190
4542,A221_mov_p_2,A221,2,42,46.290650,0.001560,46.207880,46.311370,46.359245,0.151367,...,0.098675,0.006784,0.005030,3.773585,0.769231,0.030000,0.010000,0.615000,0.668375,-63.103870


# With general normalization

### Min Max

In [7]:
x = df[AUDIO_FUNCTIONALS_EGEMAPS_COLS].values
y = df["emotion_1_id"].values
video_ids = df["video_id"].values

In [8]:
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

In [9]:
skf = KFold(n_splits=5, shuffle=True)
splits = skf.split(x, y)

In [10]:
# regularization paramater, lower C -> more regularization (less overfitting), large C -> less regularization (and possibly more overfitting)
c_values = [0.1, 1, 5, 10, 25, 50, 75, 100]
#
gamma = [1, 0.1, 0.01, 0.001, 0.0001]

kernel = ['rbf', 'linear', 'poly', 'sigmoid']

parameters = {'class_weight': ['balanced'],
              'C': c_values,
              'gamma': gamma,
              'kernel': kernel,
              }

svc = SVC()
clf = GridSearchCV(estimator=svc,
                   param_grid=parameters,
                   scoring='accuracy',
                   verbose=5,
                   cv=splits,
                   n_jobs=-1,
                   )

In [11]:
clf.fit(x, y)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 72 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 306 tasks      | elapsed:   22.7s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:   36.8s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.9min finished


GridSearchCV(cv=<generator object _BaseKFold.split at 0x7f1480aed4d0>,
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 5, 10, 25, 50, 75, 100],
                         'class_weight': ['balanced'],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear', 'poly', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=5)

In [12]:
clf.best_params_

{'C': 75, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'rbf'}

In [14]:
svc = SVC(**clf.best_params_)
evaluate_scores(x, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 72 concurrent workers.


printing accuracy measures
avg (train): 0.6031027339322652
std (train): 0.0062613632445338244
avg (validation): 0.08736787775693869
std (validation): 0.001115020222025127


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.3s finished


### Standard

In [20]:
x = df[AUDIO_FUNCTIONALS_EGEMAPS_COLS].values
y = df["emotion_1_id"].values
video_ids = df["video_id"].values

scaler = StandardScaler()
x = scaler.fit_transform(x)

skf = KFold(n_splits=5, shuffle=True)
splits = skf.split(x, y)

# regularization paramater, lower C -> more regularization (less overfitting), large C -> less regularization (and possibly more overfitting)
c_values = [0.1, 1, 5, 10, 25, 50, 75, 100]
#
gamma = [1, 0.1, 0.01, 0.001, 0.0001]

kernel = ['rbf', 'linear', 'poly', 'sigmoid']

parameters = {'class_weight': ['balanced'],
              'C': c_values,
              'gamma': gamma,
              'kernel': kernel,
              }

svc = SVC()
clf = GridSearchCV(estimator=svc,
                   param_grid=parameters,
                   scoring='accuracy',
                   verbose=5,
                   cv=splits,
                   n_jobs=-1,
                   )

clf.fit(x, y)

clf.best_params_

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 72 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 306 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:   51.4s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  3.1min finished


{'C': 75, 'class_weight': 'balanced', 'gamma': 0.01, 'kernel': 'rbf'}

In [21]:
svc = SVC(**clf.best_params_)
evaluate_scores(x, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 72 concurrent workers.


printing accuracy measures
avg (train): 0.618123033761423
std (train): 0.006379448761293059
avg (validation): 0.08560721710937613
std (validation): 0.009823311841076968


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.2s finished


## With video specific normalization 

### Min max

In [8]:
x = df[AUDIO_FUNCTIONALS_EGEMAPS_COLS].values
y = df["emotion_1_id"].values
video_ids = df["video_id"].values

In [9]:
x = scale_by(x, video_ids, "min_max")

In [10]:
skf = KFold(n_splits=5, shuffle=True)
splits = skf.split(x, y)

# regularization paramater, lower C -> more regularization (less overfitting), large C -> less regularization (and possibly more overfitting)
c_values = [0.1, 1, 5, 10, 25, 50, 75, 100]
#
gamma = [1, 0.1, 0.01, 0.001, 0.0001]

kernel = ['rbf', 'linear', 'poly', 'sigmoid']

parameters = {'class_weight': ['balanced'],
              'C': c_values,
              'gamma': gamma,
              'kernel': kernel,
              }

svc = SVC()
clf = GridSearchCV(estimator=svc,
                   param_grid=parameters,
                   scoring='accuracy',
                   verbose=5,
                   cv=splits,
                   n_jobs=-1,
                   )

clf.fit(x, y)

clf.best_params_



Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 72 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 306 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:   37.1s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.1min finished


{'C': 1, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'poly'}

In [12]:
svc = SVC(**clf.best_params_)
evaluate_scores(x, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 72 concurrent workers.


printing accuracy measures
avg (train): 0.9481732423586238
std (train): 0.0029535160643462594
avg (validation): 0.10871703910479154
std (validation): 0.006085505190997724


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.6s finished


### Standard

In [14]:
x = df[AUDIO_FUNCTIONALS_EGEMAPS_COLS].values
y = df["emotion_1_id"].values
video_ids = df["video_id"].values

x = scale_by(x, video_ids, "standard")

In [15]:
skf = KFold(n_splits=5, shuffle=True)
splits = skf.split(x, y)

# regularization paramater, lower C -> more regularization (less overfitting), large C -> less regularization (and possibly more overfitting)
c_values = [0.1, 1, 5, 10, 25, 50, 75, 100]
#
gamma = [1, 0.1, 0.01, 0.001, 0.0001]

kernel = ['rbf', 'linear', 'poly', 'sigmoid']

parameters = {'class_weight': ['balanced'],
              'C': c_values,
              'gamma': gamma,
              'kernel': kernel,
              }

svc = SVC()
clf = GridSearchCV(estimator=svc,
                   param_grid=parameters,
                   scoring='accuracy',
                   verbose=5,
                   cv=splits,
                   n_jobs=-1,
                   )

clf.fit(x, y)

clf.best_params_

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 72 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 306 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:   55.8s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  3.9min finished


{'C': 100, 'class_weight': 'balanced', 'gamma': 0.01, 'kernel': 'rbf'}

In [18]:
svc = SVC(**clf.best_params_)
evaluate_scores(x, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 72 concurrent workers.


printing accuracy measures
avg (train): 0.7978101606584317
std (train): 0.0032727471949278134
avg (validation): 0.09000814178334134
std (validation): 0.004066505165894442


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.4s finished


# Scale by intensity

### Min Max

In [9]:
x = df[AUDIO_FUNCTIONALS_EGEMAPS_COLS].values
y = df["emotion_1_id"].values
video_ids = df["video_id"].values
intensity_levels = df["intensity_level"].values

x = scale_by(x, intensity_levels, "min_max")

scaler = StandardScaler()
x = scaler.fit_transform(x)

skf = KFold(n_splits=5, shuffle=True)
splits = skf.split(x, y)

# regularization paramater, lower C -> more regularization (less overfitting), large C -> less regularization (and possibly more overfitting)
c_values = [0.1, 1, 5, 10, 25, 50, 75, 100]
#
gamma = [1, 0.1, 0.01, 0.001, 0.0001]

kernel = ['rbf', 'linear', 'poly', 'sigmoid']

parameters = {'class_weight': ['balanced'],
              'C': c_values,
              'gamma': gamma,
              'kernel': kernel,
              }

svc = SVC()
clf = GridSearchCV(estimator=svc,
                   param_grid=parameters,
                   scoring='accuracy',
                   verbose=5,
                   cv=splits,
                   n_jobs=-1,
                   )

clf.fit(x, y)

clf.best_params_

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 72 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 306 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  4.8min finished


{'C': 0.1, 'class_weight': 'balanced', 'gamma': 1, 'kernel': 'linear'}

In [10]:
svc = SVC(**clf.best_params_)
evaluate_scores(x, y, svc, scoring_method="accuracy")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 72 concurrent workers.


printing accuracy measures
avg (train): 0.23811637559904547
std (train): 0.004301989346990589
avg (validation): 0.0814301914772975
std (validation): 0.012880362415746839


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.5s finished
