In [1]:
import os
from glob import glob
from pathlib import Path
from tqdm import tqdm

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sklearn as sk
import pickle
import copy

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

In [9]:
# Show the kernel's working directory; the relative paths configured below
# ('files/', '../global_config.py') are resolved against it.
# os.getcwd() is used instead of the bare `pwd` automagic so the cell does not
# depend on IPython's automagic setting and keeps working if more lines are added.
os.getcwd()

'/home/tim/work/su-thesis-project/projects/video_analysis/python/notebooks'

# Paths

In [19]:
# Inputs are read from, and results written to, the same relative folder.
# The trailing slash matters: input_path is later combined with a file name
# through plain string concatenation.
input_path = output_path = 'files/'

# Location of the shared project configuration script
glob_conf_path = '../global_config.py'

# Load global variables

In [13]:
# Execute the shared configuration script in the notebook namespace so the names
# it defines become available to the following cells (presumably including
# `seed`, `conf_cmap` and `emotion_eng_id_to_emotion_num`, which are used later
# without being defined in this notebook -- confirm against the config file).
# NOTE(review): exec runs arbitrary code; glob_conf_path must point to a trusted file.
# The context manager guarantees the file handle is closed after reading.
with open(glob_conf_path) as conf_file:
    exec(conf_file.read())

# Load and shuffle data

In [17]:
from sklearn.utils import shuffle

# Read the training features from disk, then permute the rows reproducibly.
# `seed` is not defined in this cell -- presumably it comes from the global
# configuration executed earlier.
train_intensity_df = pd.read_csv(
    input_path + 'video_data_intensity_train.csv'
)
train_shuffled_df = shuffle(
    train_intensity_df,
    random_state=seed,
)

# Last expression of the cell: render the shuffled frame
train_shuffled_df

Unnamed: 0,filename,AU01_r_mean,AU01_r_stddevNorm,AU01_r_percentile20.0,AU01_r_percentile50.0,AU01_r_percentile80.0,AU01_r_iqr60_80-20,AU01_r_numPeaks,AU02_r_mean,AU02_r_stddevNorm,...,AU45_r_mean,AU45_r_stddevNorm,AU45_r_percentile20.0,AU45_r_percentile50.0,AU45_r_percentile80.0,AU45_r_iqr60_80-20,AU45_r_numPeaks,video_id,emotion_1_id,group
1845,A21_dou_p_2,0.021336,0.315003,0.0,0.105263,0.009780,0.009780,0.254545,0.032152,0.229990,...,0.051557,0.370913,0.0,0.142857,0.022392,0.022392,0.22,A21,3,0
578,A102_nos_p_3,0.109540,0.538836,0.0,0.000000,0.056724,0.056724,0.200000,0.078868,0.302091,...,0.262638,0.355495,0.0,0.000000,0.145547,0.145547,0.12,A102,30,2
1547,A218_fea_v_4,0.230998,0.575336,0.0,0.105263,0.078240,0.078240,0.236364,0.286648,0.376600,...,0.064347,0.323707,0.0,0.000000,0.034606,0.034606,0.42,A218,10,2
223,A101_neu_sit4_v,0.017610,0.295936,0.0,0.105263,0.009780,0.009780,0.490909,0.053598,0.384569,...,0.079859,0.820094,0.0,0.142857,0.022901,0.022901,0.28,A101,22,0
1439,A218_bor_p_4,0.150204,0.534026,0.0,0.105263,0.070905,0.070905,0.272727,0.211758,0.392783,...,0.511101,0.247574,0.0,0.000000,0.460560,0.460560,0.20,A218,26,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1026,A18_sex_p_3,0.173248,0.850968,0.0,0.052632,0.009291,0.009291,0.018182,0.734962,0.503134,...,0.081261,0.211990,0.0,0.000000,0.081425,0.081425,0.10,A18,24,1
141,A101_exc_v_2,0.148121,0.363359,0.0,0.052632,0.117359,0.117359,0.290909,0.158461,0.302575,...,0.023911,0.212292,0.0,0.285714,0.017812,0.017812,0.52,A101,36,0
1317,A200_reg_v_3,0.266795,0.373462,0.0,0.052632,0.189242,0.189242,0.163636,0.229841,0.355231,...,0.364030,0.243497,0.0,0.000000,0.360814,0.360814,0.20,A200,0,1
752,A18_conc_p_1,0.217663,0.415563,0.0,0.052632,0.195599,0.195599,0.090909,0.165383,0.352493,...,0.026921,0.248905,0.0,0.285714,0.020356,0.020356,0.22,A18,27,1


# Other functions

In [18]:
from sklearn.metrics import confusion_matrix
    
def plot_save_conf_matrix(X, y, cv, clf, title, file_name, report=False):
    """Fit ``clf`` on every CV split and plot/save the fold-averaged confusion matrix.

    For each ``(train_idx, val_idx)`` pair produced by ``cv`` the classifier is
    refitted on the training rows, predictions are made for the validation rows
    and a row-normalised (``normalize='true'``) confusion matrix is computed.
    The per-fold matrices are averaged, drawn as an annotated heatmap and written
    to ``<output_path>/confusion_matrix/<file_name>``.

    Parameters
    ----------
    X : object indexable with ``.iloc`` (e.g. pandas DataFrame)
        Feature matrix.
    y : object indexable with ``.iloc`` (e.g. pandas Series)
        Target labels, aligned with ``X``.
    cv : iterable of (train_idx, val_idx)
        Positional index pairs; it is consumed once.
    clf : estimator exposing ``fit`` and ``predict``
        NOTE: it is refitted in place on every fold, so after the call it holds
        the fit of the last fold.
    title : str
        Title placed on the figure saved under ``file_name``.
    file_name : str
        File name (including extension) of the saved figure.
    report : bool, default False
        When truthy, an extra copy named ``'report_' + file_name`` is saved
        *before* the title is added, i.e. without a title.

    Raises
    ------
    ValueError
        If ``cv`` yields no splits.

    Notes
    -----
    Relies on the module-level names ``emotion_eng_id_to_emotion_num``,
    ``conf_cmap`` and ``output_path``. The keys of
    ``emotion_eng_id_to_emotion_num`` are used, in their dictionary order, as
    row/column names of the matrix -- this assumes that order matches the sorted
    order of the label ids in ``y``; TODO confirm against the global config.
    """
    # Use one fixed, sorted label set for all folds so every per-fold matrix has
    # the same shape even when a class is absent from a validation fold
    # (previously a hard-coded 18x18 accumulator was assumed).
    labels = np.unique(y)
    conf_mat_avg = np.zeros((len(labels), len(labels)))

    n_groups = 0
    for train_idx, val_idx in cv:
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        clf.fit(X_train, y_train)

        # Row-normalised confusion matrix on the held-out fold
        y_pred = clf.predict(X_val)
        conf_mat_avg += confusion_matrix(y_val,
                                         y_pred,
                                         labels=labels,
                                         normalize='true'
                                        )
        n_groups += 1

    if n_groups == 0:
        # An empty (or already exhausted) split iterator would otherwise lead to
        # a 0/0 division and a heatmap full of NaNs.
        raise ValueError("cv did not yield any (train, validation) split")

    conf_mat_avg /= n_groups

    class_names = list(emotion_eng_id_to_emotion_num.keys())
    df_cm = pd.DataFrame(conf_mat_avg, class_names, class_names)
    # Present classes alphabetically on both axes
    df_cm = df_cm.reindex(sorted(df_cm.columns)).reindex(sorted(df_cm.columns), axis=1)

    # Make sure the target folder exists before trying to save into it
    out_dir = os.path.join(output_path, 'confusion_matrix')
    os.makedirs(out_dir, exist_ok=True)

    plt.figure(figsize=(15,15))
    ax = sns.heatmap(df_cm, annot=True, fmt='.2f', vmin=0, vmax=1, cmap=conf_cmap)
    plt.yticks(va='center')
    plt.xlabel('Predicted Label')
    plt.ylabel('Actual Label')
    if report:
        # Report version is saved before the title is set, so it has no title
        plt.savefig(os.path.join(out_dir, 'report_'+file_name), bbox_inches = 'tight')
    plt.title(title)
    plt.savefig(os.path.join(out_dir, file_name), bbox_inches = 'tight')

# Elastic Net SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Obtain X, y, and groups from the training dataset.
# Every column except the identifiers, the target and the CV group is a feature.
X = train_shuffled_df.drop(columns=["filename", "video_id", "emotion_1_id", "group"])
y = train_shuffled_df.emotion_1_id
groups = train_shuffled_df.group
n_groups = len(groups.unique())   # only used in the results file name below


# Define parameters to evaluate
# 12 alphas x 20 l1_ratios x 4 n_iter_no_change values = 960 candidates;
# the remaining entries are single-valued and therefore fixed.
alpha_values            = [0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.005, 0.0025, 0.0075, 0.01, 0.05, 0.1, 1]
n_iter_no_change_values = [20, 50, 100, 150]
l1_ratio_values         = [0.049, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50,
                           0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.89, 0.95, 0.99]
# NOTE(review): scikit-learn 1.1 renamed the 'log' loss to 'log_loss' and later
# releases removed the old name; adjust 'loss' when running on a newer version.
parameters              = {'penalty': ['elasticnet'],
                           'class_weight': ['balanced'],
                           'loss': ['log'],
                           'random_state': [seed],
                           'alpha': alpha_values,
                           'l1_ratio': l1_ratio_values,
                           'n_iter_no_change': n_iter_no_change_values,
                           'max_iter': [2500]
                          }

elasticnet = SGDClassifier()
logo = LeaveOneGroupOut()

# Exhaustive search; each candidate is scored on the leave-one-group-out splits
clf = GridSearchCV(estimator  = elasticnet,
                   param_grid = parameters,
                   scoring    = 'roc_auc_ovo_weighted',
                   cv         = logo.split(X=X,groups=groups),
                   verbose    = 1,
                   n_jobs     = -1,
                  )
clf.fit(X,y)
print(clf.best_estimator_)

# Create the output folders up front so a finished search is not lost because
# a directory is missing.
results_dir = os.path.join(output_path,'results')
params_dir  = os.path.join(output_path,'best_params')
models_dir  = os.path.join(output_path,'models')
for out_dir in (results_dir, params_dir, models_dir):
    os.makedirs(out_dir, exist_ok=True)

# Store results in a DataFrame and then save them as csv
results_df = pd.DataFrame(clf.cv_results_)
results_df.to_csv(os.path.join(results_dir,'res_elastic_net_'+str(n_groups)+'_logocv_intensity_video.csv'), index=None, header=True)

# Save best parameters to disk (context manager closes the file handle)
file_path = os.path.join(params_dir,'best_params_elastic_net_intensity_video.sav')
with open(file_path, 'wb') as out_file:
    pickle.dump(clf.best_params_, out_file)

# Save the best model to disk
file_path = os.path.join(models_dir,'mod_elastic_net_intensity_video.sav')
with open(file_path, 'wb') as out_file:
    pickle.dump(clf.best_estimator_, out_file)

Fitting 3 folds for each of 960 candidates, totalling 2880 fits
