In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Read the raw CSV (the path is relative to the working directory) and keep
# `rawfile` as loaded; `file` is an independent copy of it.
rawfile = pd.read_csv("bizCase/Business_case_data.csv")
file = rawfile.copy()

In [3]:
def cleanDataset(file, confidence_threshold=0.8):
    """Drop unusable frames and add compact candidate / question ids.

    Parameters
    ----------
    file : pandas.DataFrame
        Frame-level data. Must contain the columns ``pose_Tx``,
        ``DistanceHands_1_adjusted``, ``DistanceHands_2_adjusted``,
        ``confidence``, ``candidate_id`` and ``question_id``.
    confidence_threshold : float, optional
        Rows are kept only when ``confidence`` is strictly greater than this
        value. Defaults to 0.8, an empirical cut-off that would need further
        investigation.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) containing the surviving
        rows, with NaN hand distances replaced by 0 and two extra categorical
        columns, ``easyID`` and ``easyQID``, that number the remaining
        candidates / questions from 1 upwards.
    """
    # One mask for all row filters. A NaN in pose_Tx or confidence fails its
    # comparison and drops the row; a NaN hand distance passes the `!= inf`
    # test and is zero-filled below.
    keep = (
        (file['pose_Tx'] > -1000)                              # bad head translation
        & (file['DistanceHands_1_adjusted'] != float("inf"))   # infinite hand distance
        & (file['confidence'] > confidence_threshold)
    )
    # Explicit copy: the column assignments below must write into an
    # independent frame, not into a slice of the caller's data.
    cleaned = file.loc[keep].copy()

    # Replace the NaN values with zeros (sic).
    # NOTE(review): only DistanceHands_1_adjusted is screened for infinity;
    # confirm whether DistanceHands_2_adjusted needs the same filter.
    for col in ('DistanceHands_1_adjusted', 'DistanceHands_2_adjusted'):
        cleaned[col] = cleaned[col].fillna(0)

    # Re-number the ids after filtering so the numbering is contiguous
    # (1..number of remaining candidates / questions).
    for source, target in (('candidate_id', 'easyID'), ('question_id', 'easyQID')):
        as_category = cleaned[source].astype('category')
        new_labels = range(1, len(as_category.cat.categories) + 1)
        cleaned[target] = as_category.cat.rename_categories(new_labels)

    return cleaned

In [4]:
# Compute the gaze, head translation and head rotation distance
def computeDistance(file):
    """Add Euclidean distance columns for gaze, head translation and rotation.

    Three columns are written into ``file`` (the frame is modified in place
    and also returned):

    * ``gazeDist`` - norm of (gaze_angle_x, gaze_angle_y), each centred on its
      own column mean.
    * ``Tdist`` - norm of (pose_Tx, pose_Ty, pose_Tz), where only pose_Tz is
      centred on its column mean.
    * ``Rdist`` - norm of (pose_Rx, pose_Ry, pose_Rz), uncentred.
    """
    # Gaze: deviation from the mean gaze direction.
    gaze_dx = file['gaze_angle_x'] - np.mean(file['gaze_angle_x'])
    gaze_dy = file['gaze_angle_y'] - np.mean(file['gaze_angle_y'])
    file['gazeDist'] = np.sqrt(np.square(gaze_dx) + np.square(gaze_dy))

    # Head translation: x and y are used as-is, z relative to its mean.
    tz_offset = file['pose_Tz'] - np.mean(file['pose_Tz'])
    translation_sq = np.square(file['pose_Tx']) + np.square(file['pose_Ty']) + np.square(tz_offset)
    file['Tdist'] = np.sqrt(translation_sq)

    # Head rotation: plain norm of the three rotation components.
    rotation_sq = np.square(file['pose_Rx']) + np.square(file['pose_Ry']) + np.square(file['pose_Rz'])
    file['Rdist'] = np.sqrt(rotation_sq)

    return file

In [5]:
# rescale the features
def rescaleFeatures(file, features=None):
    """Divide each selected column by its maximum value.

    Parameters
    ----------
    file : pandas.DataFrame
        Frame holding the columns to rescale. It is modified in place and
        also returned.
    features : list of str, optional
        Columns to rescale. Defaults to the three distance columns plus the
        ``AU*_r`` columns listed below.

    Returns
    -------
    pandas.DataFrame
        ``file`` with the selected columns divided by their column maximum.
        A column whose maximum is 0 is left unchanged instead of being
        turned into NaN by a 0/0 division.
    """
    if features is None:
        features = ['gazeDist', 'Tdist', 'Rdist', 'AU01_r','AU02_r','AU04_r','AU05_r','AU06_r','AU07_r','AU09_r','AU10_r','AU12_r','AU14_r','AU15_r','AU17_r','AU20_r','AU23_r','AU25_r','AU26_r','AU45_r']
    features = list(features)

    column_max = file[features].max()
    # Dividing by a zero maximum would yield NaN/inf; use 1 as the divisor for
    # those columns so their values pass through untouched.
    safe_divisor = column_max.where(column_max != 0, 1)
    file[features] = file[features] / safe_divisor

    return file
    

In [6]:
def computeScore(clf, X, y, cv=2):
    """Return the cross-validation scores of ``clf`` on ``(X, y)``.

    Parameters
    ----------
    clf : estimator
        Estimator handed to ``cross_val_score``.
    X, y : array-like
        Features and targets handed to ``cross_val_score``.
    cv : optional
        Forwarded as the ``cv`` argument of ``cross_val_score``. Defaults to
        2, the value this notebook has always used.

    Returns
    -------
    Whatever ``cross_val_score`` returns (one score per fold).
    """
    return cross_val_score(clf, X, y, cv=cv)

In [7]:
def getCandidateMeanDataset(file):
    """Aggregate frame-level features into one row per candidate.

    Parameters
    ----------
    file : pandas.DataFrame
        Frame-level data containing every column in ``selectedFeatures``
        below plus ``easyID`` and ``label``. Candidates are expected to be
        numbered 1..N in ``easyID`` (N = number of distinct ids); an id in
        that range with no rows yields a NaN row.

    Returns
    -------
    xtrain : pandas.DataFrame
        One row per candidate (index 0..N-1, row ``i`` is candidate ``i+1``).
        Columns are ``<feature>m`` (per-candidate mean) for every selected
        feature, followed by ``<feature>s`` (per-candidate standard
        deviation) for every selected feature.
    ytrain : pandas.DataFrame
        Single column ``label`` holding the per-candidate mean of ``label``,
        with the same index as ``xtrain``.
    """
    # selected features for all frames
    selectedFeatures = ['gazeDist', 'Tdist', 'Rdist', 'AU01_r','AU02_r','AU04_r','AU05_r','AU06_r','AU07_r','AU09_r','AU10_r','AU12_r','AU14_r','AU15_r','AU17_r','AU20_r','AU23_r','AU25_r','AU26_r','AU45_r','AU01_c','AU02_c','AU04_c','AU05_c','AU06_c','AU07_c','AU09_c','AU10_c','AU12_c','AU14_c','AU15_c','AU17_c','AU20_c','AU23_c','AU25_c','AU26_c','AU28_c','AU45_c','Proba_hands_1_binned', 'Proba_hands_2_binned', 'confidence', 'DistanceHands_1_adjusted', 'DistanceHands_2_adjusted']

    # Number of candidates, and the ids (1..nbCand) that make up the rows.
    nbCand = file['easyID'].nunique()
    candidate_ids = pd.RangeIndex(1, nbCand + 1)

    # Aggregate every candidate in one pass instead of filling the result
    # cell by cell through chained indexing (which is slow and, with pandas
    # copy-on-write, never reaches the target frame). Grouping on a plain
    # array makes a categorical easyID column behave like its integer labels.
    grouped = file[selectedFeatures + ['label']].groupby(np.asarray(file['easyID']))
    means = grouped.mean().reindex(candidate_ids)
    stds = grouped[selectedFeatures].std().reindex(candidate_ids)

    # kept features per candidate (not for all frames): all means, then all stds
    xtrain = pd.concat(
        [means[selectedFeatures].add_suffix('m'), stds.add_suffix('s')],
        axis=1,
    ).reset_index(drop=True)
    ytrain = means[['label']].reset_index(drop=True)

    return xtrain, ytrain
    

In [8]:
# Compute the whole model and output some precision values
def computeModel(file, random_state=None):
    """Run the full pipeline and print a mean cross-validation score per model.

    The frame is passed through ``cleanDataset``, ``computeDistance``,
    ``rescaleFeatures`` and ``getCandidateMeanDataset``; five classifiers are
    then scored with ``computeScore`` and the mean score of each is printed,
    in this order: logistic regression, random forest, gradient boosting,
    SVM and k-NN.

    Parameters
    ----------
    file : pandas.DataFrame
        Raw frame-level data as expected by ``cleanDataset``.
    random_state : optional
        Forwarded to the estimators that accept a ``random_state``. The
        default ``None`` keeps the original unseeded behaviour; pass an
        integer to make the printed scores repeatable.

    Returns
    -------
    (xtrain, ytrain)
        The per-candidate feature and label frames produced by
        ``getCandidateMeanDataset``.
    """
    file = cleanDataset(file)
    file = computeDistance(file)
    file = rescaleFeatures(file)
    xtrain, ytrain = getCandidateMeanDataset(file)
    labels = np.ravel(ytrain)  # flatten once; reused for every model

    # Only settings that differ from (or have changed across) scikit-learn
    # defaults are spelled out. Several of the previously explicit default
    # values -- presort='auto', loss='deviance', max_features='auto' -- were
    # removed from later scikit-learn releases and raise there, while leaving
    # them out selects the same behaviour on old and new versions.
    models = [
        ("logistic regression: ",
         LogisticRegression(C=.1, solver='liblinear', max_iter=1000, random_state=random_state)),
        ("random forest classification: ",
         RandomForestClassifier(n_estimators=1000, random_state=random_state)),
        ("gradient boosting: ",
         GradientBoostingClassifier(learning_rate=0.1, n_estimators=10000, subsample=0.9, max_depth=3, random_state=random_state)),
        ("SVM Classifier: ",
         SVC(C=.5, kernel='rbf', gamma='auto', decision_function_shape='ovr', random_state=random_state)),
        ("k-NN Classifier: ",
         KNeighborsClassifier(n_neighbors=5)),
    ]

    # Output precision for each model, in the order listed above.
    for title, clf in models:
        print(title, np.mean(computeScore(clf, xtrain, labels)))

    return xtrain, ytrain

In [9]:
# Finally execute the computation: computeModel prints one mean
# cross-validation score per classifier and returns the per-candidate
# feature and label tables.
xtrain, ytrain = computeModel(file)

The minimum supported version is 2.4.6



('logistic regression: ', 0.6826923076923077)
('random forest classification: ', 0.6971153846153846)
('gradient boosting: ', 0.5961538461538461)
('SVM Classifier: ', 0.6826923076923077)
('k-NN Classifier: ', 0.6009615384615384)
