### I. All the necessary imports required

In [1]:
import pandas as pd
import glob, os
import numpy as np
import csv
import pdb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

### Extract and dump all the features for each of the modality
**Note: please change the two paths according to your folder location on drive**


*   data = < path to existing data folder where all folder for each modality are kept >
*   out = < path to folder where all the features is to be dumped >


In [2]:
data = 'irpev_clean'
out = 'hw01'

def get_statistics( arr ):
#   '''
#   Write a function to gather the statistics for a single participant for a particular modality

#   Input:
#   arr -> numpy array

#   Return:
#   mean, max, min, std -> all features of the respective columns in the csv
#   '''
    arr = arr[1:]
    mean = np.mean(arr,axis = 0)
    std = np.std(arr, axis = 0)
    mmax = np.max(arr,axis = 0)
    mmin = np.min(arr,axis=0)
    return mean, std, mmax, mmin

In [3]:
def judge(array):

    if array[0] == 'A':
        array = ['1']+ array
    elif array[0] == 'D':
        array = ['0'] + array
    return array

def extract_features( data, dest, folder, features, common ):
#   '''
#   Write a function to generate all the features for each entry for each modality

#   Input:
#   data = path to folder where all modality folders are present
#   dest = path to dump the modality features
#   folder = modality name [ vid, gsr, ego ]
#   features = feature columns names to give header for csv
#   common = common column names to give header for csv
#   '''
    cols = common + features
    with open( dest, 'w' ) as f:
        writer = csv.writer( f )
        writer.writerow( cols )
        row = []
        for filepath in glob.glob( f'{data}/{folder}/*.csv' ):
            root_split = os.path.splitext(filepath)[0]                 #i
            ii_split = root_split.split('_')[1:]                       #ii 
            ii_split[0] = ii_split[0][-1]
            ii_split = judge(ii_split)
            read_eachfile = np.genfromtxt(filepath,delimiter=',')      #iii
            re = pd.DataFrame(read_eachfile)
            re.dropna(how = 'all')
            mean, std,mmax, mmin = get_statistics(re)                  #iv
            row = np.r_[ii_split,mean,std,mmax,mmin]

            # for each entry in the modality, extract the features and dump into the csv
            # i) Extract the filename to prefix and extension (command splitext might be useful here)
            # ii) Split the prefix at '_' to find out the label 'A' or 'D'; set to 1 and 0
            # iii) Extract the data into an array from the csv file (numpy command genfromtext might be useful here )
            # iv) Call the above function get_statistics( arr ) to compute your statistics for this
            #use the command below to write the newly formed feature row into the new file using writer.writerow
            writer.writerow( row )
    print( f'Done with {folder} features' )


In [4]:
# This is a dictonary for each modality and the respective feature column names
# Modify it to match your own feature names (from the statistics you calculated)
# Note that I'm only using mean and std deviation, you should add max and min as well
d = {
    'vid': [ 'engagement_mean', 'contempt_mean', 'engagement_std', 'contempt_std', 'engagement_max', 'contempt_max', 'engagement_min', 'contempt_min' ],
    'gsr': [ 'cv_mean', 'cv_std','cv_max', 'cv_min' ],
    'ego': [ 'delta_x_mean', 'delta_y_mean', 'delta_angle_mean', 'delta_x_std', 'delta_y_std', 'delta_angle_std','delta_x_max', 'delta_y_max', 'delta_angle_max','delta_x_min', 'delta_y_min', 'delta_angle_min' ]
}
# common column names
common = ['label', 'cls', 'pair', 'participant']

In [5]:
# Collect all the features from each modality and merge them; use the pandas dataframe here
res = pd.DataFrame()
i_h = 0
for folder, features in d.items():
    # i) create the destination folder
    dest = f'{out}/{folder}.csv'
    
    # ii) call extract_features( data, dest, folder, features, common )
    extract_features( data, dest, folder, features, common )
    temp = pd.read_csv(dest)
    
    #frist time to add data in 'res'
    if i_h == 0:
        res = temp
        i_h += 2
        continue
        
    # iii) merge the features for each modality to create one aggregate feature
    res = pd.merge(res,temp,on= ['label', 'cls', 'pair', 'participant'],how='left')
    
print( 'Done extracting features for each modality.' )

Done with vid features
Done with gsr features
Done with ego features
Done extracting features for each modality.


### Now review your dataset consisting of {X, y}, then split it into training and test set with 9:1 ratio

In [6]:
# This code splits your data [X,y] into 90% training and 10%test.
print('Shape of my feature matrix: ',res.shape)

features = res.columns[ ~res.columns.isin( common ) ]
X = res[ features ]
y = res[ 'label' ]
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.1, random_state=3 )
print( f'Size -> X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}' )
        

Shape of my feature matrix:  (48, 28)
Size -> X_train: (43, 24), X_test: (5, 24), y_train: (43,), y_test: (5,)


### Run the actual classifiers and print your performance metric(s), and confusion matrix

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
res_mean = {'randomForest': -1,
           'SVM':-1,
           'NeuralNet':-1}




# i) Run Random Forest, report results
rf = RandomForestClassifier(n_estimators=4, criterion='gini',max_features=5)
rf.fit(X_train,y_train)
rf_predict = rf.predict(X_test)                                           #predict the test set
rf_accuracy = accuracy_score(y_test,rf_predict)                           # get accuracy of this model
cro_ = cross_val_score(rf, X_train, y_train, scoring='accuracy', cv=10)   #10 fold cross validation, same as below other models
print(f'RandForClassif 10-fold cross val ---> average:{cro_.mean()}, std:{cro_.std()}' )
print('random forest accuracy: ',rf_accuracy)
print('Rf confusion matrix:')
print(confusion_matrix(y_test,rf_predict))
res_mean['randomForest'] = cro_.mean()




# ii) Run SVM (remember there can be multiple kernels, just one is fine), report results
print('====================================================================================')
svm = SVC(probability=True,random_state=10)
svm.fit(X_train,y_train)
svm_predict = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test,svm_predict)
cro_svm = cross_val_score(svm, X_train, y_train, scoring='accuracy', cv=10)
print(f'SVM 10-fold cross val ---> average:{cro_svm.mean()}, std:{cro_svm.std()}' )
print('random forest accuracy: ',svm_accuracy)
print('SVM confusion matrix:')
print(confusion_matrix(y_test,svm_predict))
res_mean['SVM'] = cro_svm.mean()




# iii) Run a Neural Network classifier, report results
print('====================================================================================')
nnc = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5,), random_state=0)
nnc.fit(X_train,y_train)
nnc_predict = nnc.predict(X_test)
nnc_accuracy = accuracy_score(y_test,nnc_predict)
cro_nnc = cross_val_score(nnc, X_train, y_train, scoring='accuracy', cv=10)
print(f'NNC 10-fold cross val ---> average:{cro_nnc.mean()}, std:{cro_nnc.std()}' )
print('random forest accuracy: ',nnc_accuracy)
print('Rf confusion matrix:')
print(confusion_matrix(y_test,nnc_predict))
res_mean['NeuralNet'] = cro_nnc.mean()




print('====================================================================================')
# print(res_mean)
# iv) BONUS: Run and report on a fourth classifier of your choice that performs better than the 3 above
# print('====================================================================================')
# print('The best classifier is'+ 'RandomForestClassifier')
# rf_confus = confusion_matrix(y_test,rf_predict)                                           #get confusion matrix
# print('RandomForestClassifier confusion matrix: ')
# print(rf_confus)

print('====================================================================================')
print('GBDT')
#GBDT

from sklearn.ensemble import GradientBoostingClassifier
dt = GradientBoostingClassifier(random_state = 29)
dt = dt.fit(X_train,y_train)
dt_pre = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test,dt_pre)
dt_cro = cross_val_score(dt, X_train, y_train, scoring='accuracy', cv=10)
print(f'GBDT 10-fold cross val ---> average:{dt_cro.mean()}, std:{dt_cro.std()}' )
print('random forest accuracy: ',nnc_accuracy)
dt_confus = confusion_matrix(y_test,dt_pre)                                                #get confusion matrix
print('confusion matrix: ')
print(dt_confus)


RandForClassif 10-fold cross val ---> average:0.6900000000000001, std:0.19078784028338913
random forest accuracy:  1.0
Rf confusion matrix:
[[3 0]
 [0 2]]
SVM 10-fold cross val ---> average:0.63, std:0.10535653752852739
random forest accuracy:  0.6
SVM confusion matrix:
[[3 0]
 [2 0]]
NNC 10-fold cross val ---> average:0.45999999999999996, std:0.22671568097509268
random forest accuracy:  0.6
Rf confusion matrix:
[[1 2]
 [0 2]]
GBDT
GBDT 10-fold cross val ---> average:0.79, std:0.07348469228349534
random forest accuracy:  0.6
confusion matrix: 
[[3 0]
 [1 1]]
