## Machine learning on emotion clustering labels

### Import libraries

In [1]:
# SKlearn libraries
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.svm import SVC

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import matthews_corrcoef

# Other libraries
import argparse
import numpy as np
import pandas as pd
import sys
import os

sys.path.append(os.path.join(os.path.curdir, '../', 'util'))
from load_data_basic import *


### Define global variables for this notebook

In [2]:
# Root folder of the study data, relative to this notebook
main_data_directory = '../../data'

# Timestamp format strings: date-only variant first, then the full
# variant that also carries the time of day and fractional seconds
date_only_date_time_format = '%Y-%m-%d'
date_time_format = '%Y-%m-%dT%H:%M:%S.%f'

### Read all ground-truth information for each participant: IGTB, prestudy-info ...

In [3]:
# Read IGTB, prestudy-info, participant id
UserInfo = read_user_information(main_data_directory)

# Read MGT
MGT_df = read_MGT(main_data_directory)

# Read work MGT
overall_MGT, day_MGT, night_MGT = read_all_work_MGT(main_data_directory)

print('--------------------------------------------------------------------------')
print('Number of user in total respond to IGTB and pre-study assessment: %d' % (len(UserInfo)))
print('--------------------------------------------------------------------------')
print('\n')


# Normalization scaler; the name is also embedded in the output file names below
scaler_name = 'z_norm'

# Input file path
ml_path = os.path.join('../output', 'ml_feat', 'ml_input_feat.csv')

# Output file path
ml_output_path = os.path.join('../output', 'ml_output')

# makedirs also creates a missing parent folder ('../output') and, with
# exist_ok=True, is a no-op when the folder is already there, so the cell
# can be re-run safely on a fresh checkout.
os.makedirs(ml_output_path, exist_ok=True)

# Output result file path
final_result_path = os.path.join(ml_output_path, '2_cluster_logo_rfc_all_components_' + scaler_name + '.csv')
final_feat_importance_path = os.path.join(ml_output_path, '2_cluster_logo_rfc_feature_importance_' + scaler_name + '.csv')


print('--------------------------------------------------------------------------')
print('Input feature and label file: %s' % (ml_path))
print('--------------------------------------------------------------------------')

# MGT labels of interest
lable_col_array = ['pos_af_mgt', 'neg_af_mgt', 'stress_mgt', 'anxiety_mgt']

# Shift types, but if predicting all participants, we don't need to use it
shift_array = ['day', 'night']


--------------------------------------------------------------------------
Number of user in total respond to IGTB and pre-study assessment: 212
--------------------------------------------------------------------------


--------------------------------------------------------------------------
Input feature and label file: ../output/ml_feat/ml_input_feat.csv
--------------------------------------------------------------------------


### Choose participants

In [4]:
# Cohort to analyse; supported values are
# 'nurses', 'non_nurses', 'day_nurses' and 'night_nurses'
prefix = 'night_nurses'

# Participants whose 'currentposition' is 1 or 2 form the nurse cohort
is_nurse = (UserInfo['currentposition'] == 1) | (UserInfo['currentposition'] == 2)

if prefix == 'nurses':
    select_users_df = UserInfo[is_nurse]
elif prefix == 'non_nurses':
    select_users_df = UserInfo[~is_nurse]
elif prefix == 'day_nurses':
    select_users_df = UserInfo[is_nurse]
    select_users_df = select_users_df[select_users_df['Shift'] == 'Day shift']
elif prefix == 'night_nurses':
    select_users_df = UserInfo[is_nurse]
    select_users_df = select_users_df[select_users_df['Shift'] == 'Night shift']
else:
    # Fail here instead of leaving select_users_df undefined (or stale from a
    # previous run) and crashing with a confusing error in a later cell.
    raise ValueError("Unknown prefix %r; expected 'nurses', 'non_nurses', "
                     "'day_nurses' or 'night_nurses'" % (prefix,))


### Read Machine learning input features and labels

In [5]:
# Read machine learning input features and labels
def read_df(ml_path):
    """Load the machine-learning input table (features and labels) from CSV.

    Parameters
    ----------
    ml_path : str or file-like
        Anything accepted by ``pandas.read_csv``. The first column of the
        file is used as the index.

    Returns
    -------
    pandas.DataFrame
        The table without its ``survey_time`` and ``shift`` columns.

    Raises
    ------
    KeyError
        If ``survey_time`` or ``shift`` is missing from the file.
    """
    data_df = pd.read_csv(ml_path, index_col=0)

    # Drop both columns in one call; no intermediate copy of the frame is kept.
    return data_df.drop(['survey_time', 'shift'], axis=1)

# Get col name of input feature
def get_feat_col(cols, feat_str):
    """Return, in their original order, the entries of ``cols`` that contain ``feat_str``.

    Parameters
    ----------
    cols : iterable of str
        Candidate column names.
    feat_str : str
        Substring a column name must contain to be kept.

    Returns
    -------
    list of str
        The matching column names.
    """
    return [name for name in cols if feat_str in name]

# Select normalization method
def select_scaler(scaler_name):
    """Build the scikit-learn preprocessing object that matches ``scaler_name``.

    ``'z_norm'`` gives a ``StandardScaler`` and ``'min_max'`` a
    ``MinMaxScaler``; every other value falls back to a ``Normalizer``.
    """
    if scaler_name == 'z_norm':
        return preprocessing.StandardScaler()

    if scaler_name == 'min_max':
        return preprocessing.MinMaxScaler()

    # Fallback for any name that is not recognised above
    return preprocessing.Normalizer()

# Read input feature and labels
ml_df = read_df(ml_path)
ml_select_user_df = pd.DataFrame()

# ml_df
for index, row in select_users_df.iterrows():
    # get participant id
    participant_id = UserInfo.loc[index, 'ParticipantID']
    
    
    # aggregate data for select user
    ml_select_user_df = ml_select_user_df.append(ml_df.loc[ml_df['participant_id'] == participant_id])
    
# Read col name of input feature
feat_cols = get_feat_col(ml_select_user_df.columns.values, 'feat')

### Machine learning model

In [6]:
def ramdon_forest_pred(ml_df, input_feature_col, positive_prediction_label, n_top_features=15):
    """Evaluate a random-forest grid with leave-one-subject-out validation.

    For every parameter combination a ``RandomForestClassifier`` is trained
    once per subject (all other subjects form the training set), the
    held-out predictions are pooled, and accuracy / precision / recall / f1 /
    MCC are computed on the pooled predictions. Feature importances are
    averaged over the per-subject models.

    Parameters
    ----------
    ml_df : pandas.DataFrame
        Feature columns plus a ``subject_idx`` column (grouping key) and a
        ``label`` column. Cluster sizes are counted for labels 0 and 1.
    input_feature_col : array-like of str
        Feature names; expected to line up with the feature columns of
        ``ml_df`` (i.e. ``ml_df`` without ``subject_idx`` and ``label``).
    positive_prediction_label : int
        Label treated as the positive class for precision / recall / f1.
    n_top_features : int, optional
        How many of the most and of the least important features to report
        per model (default 15). Clamped to the number of features.

    Returns
    -------
    final_result_df : pandas.DataFrame
        Rows ``accuracy, precision, recall, f1, MCC``; columns ``cluster0``
        and ``cluster1`` (number of recordings per label, repeated on every
        row) followed by one column per parameter combination.
    feature_importance_final_df : pandas.DataFrame
        One row per parameter combination with name / mean weight / std of
        the top (``feature_pos_*``) and bottom (``feature_neg_*``) features.
    """
    from itertools import product

    # Grid that is actually evaluated. A wider grid (n_estimators 50/100/200,
    # max_features auto/sqrt/log2, max_depth 4-8, gini/entropy) used to be
    # defined first but was immediately overwritten by this one.
    # NOTE(review): max_features='auto' is deprecated/removed in recent
    # scikit-learn releases (for classifiers it meant 'sqrt'); it is kept here
    # so the column names written to the result files stay unchanged.
    tuned_parameters = {'n_estimators': [100, 200],
                        'max_features': ['auto'],
                        'max_depth': [7, 8],
                        'criterion': ['gini']}

    # Accept lists as well as arrays; fancy indexing below needs an ndarray
    input_feature_col = np.asarray(input_feature_col)

    # result df; the cluster sizes do not depend on the model, so they are
    # written once (as floats, matching the metric columns)
    final_result_df = pd.DataFrame(index=['accuracy', 'precision', 'recall', 'f1', 'MCC'])
    final_result_df['cluster0'] = float((ml_df['label'] == 0).sum())
    final_result_df['cluster1'] = float((ml_df['label'] == 1).sum())
    feature_importance_rows = []

    # Split features / labels / groups once instead of copying ml_df per fold
    unique_subjects = ml_df['subject_idx'].unique()
    subject_ids = ml_df['subject_idx']
    features_df = ml_df.drop(['subject_idx', 'label'], axis=1)
    labels = ml_df['label']

    # Same iteration order as nested loops over the four parameter lists
    parameter_grid = product(tuned_parameters['n_estimators'],
                             tuned_parameters['max_features'],
                             tuned_parameters['max_depth'],
                             tuned_parameters['criterion'])

    for n_estimators, max_features, max_depth, criterion in parameter_grid:

        # a. init result for each grid parameter
        y_true_array, y_pred_array = [], []
        fold_importances = []

        # b. Leave one subject out validation
        for subject_idx in unique_subjects:
            is_test = subject_ids == subject_idx

            # 1. Train on every other subject, test on the held-out one
            x_train, y_train = features_df.loc[~is_test], labels.loc[~is_test]
            x_test, y_test = features_df.loc[is_test], labels.loc[is_test]

            # 2. Fit the model
            rfc_model = RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=n_estimators,
                                               max_features=max_features, max_depth=max_depth, criterion=criterion)
            rfc_model.fit(x_train, y_train)

            # 3. Keep feature importance of the model with one subject out
            fold_importances.append(rfc_model.feature_importances_)

            # 4. Pool the held-out predictions
            y_true_array.extend(np.array(y_test))
            y_pred_array.extend(rfc_model.predict(x_test))

        # c. mean / std of the importances over folds, ranked by decreasing mean
        fold_importances = np.array(fold_importances)
        importance_std = np.std(fold_importances, axis=0)
        importance_mean = np.sum(fold_importances, axis=0) / len(unique_subjects)

        ranking = np.argsort(importance_mean)[::-1]
        ranked_names = input_feature_col[ranking]
        ranked_weights = importance_mean[ranking]
        ranked_std = importance_std[ranking]

        # d. Accuracy
        y_true_array, y_pred_array = np.array(y_true_array), np.array(y_pred_array)
        accuracy = metrics.accuracy_score(y_true_array, y_pred_array)

        # e. MCC
        MCC = matthews_corrcoef(y_true_array, y_pred_array)

        # f. Precision, recall, f1
        result = precision_recall_fscore_support(y_true_array, y_pred_array, pos_label=positive_prediction_label, average='binary')
        precision, recall, f1_score = result[0], result[1], result[2]

        # g. save metrics under a column named after the parameters
        params_str = 'max_features_' + str(max_features) + '_max_depth_' + str(max_depth) + '_criterion_' + str(criterion) + '_n_estimators_' + str(n_estimators)

        final_result_df.loc['accuracy', params_str] = accuracy
        final_result_df.loc['precision', params_str] = precision
        final_result_df.loc['recall', params_str] = recall
        final_result_df.loc['f1', params_str] = f1_score
        final_result_df.loc['MCC', params_str] = MCC

        # h. save important features; never ask for more than exist
        n_top = min(n_top_features, len(ranked_names))
        importance_record = {}

        for i in range(n_top):
            importance_record['feature_pos_name_' + str(i)] = ranked_names[i]
            importance_record['feature_pos_weight_' + str(i)] = ranked_weights[i]
            importance_record['feature_pos_std_' + str(i)] = ranked_std[i]

        for i in range(n_top):
            idx = len(ranked_names) - i - 1
            importance_record['feature_neg_name_' + str(i)] = ranked_names[idx]
            importance_record['feature_neg_weight_' + str(i)] = ranked_weights[idx]
            # std must come from the same (least important) feature as the
            # name and weight, i.e. position idx, not i
            importance_record['feature_neg_std_' + str(i)] = ranked_std[idx]

        # columns= pins the insertion order regardless of the pandas version
        feature_importance_rows.append(
            pd.DataFrame([importance_record], index=[params_str], columns=list(importance_record)))

        # Print out results for each model
        print(params_str)
        print('Validation accuracy: %.3f, precision: %.3f, recall: %.3f, f1_score: %.3f \n' % (accuracy, precision, recall, f1_score))

    # One concat at the end instead of DataFrame.append per grid point
    if feature_importance_rows:
        feature_importance_final_df = pd.concat(feature_importance_rows)
    else:
        feature_importance_final_df = pd.DataFrame()

    return final_result_df, feature_importance_final_df


### Machine Learning

In [7]:
# Model types, options are svm and random_forest
# model_types = ['svm', 'random_forest']
# NOTE(review): the loop body below always runs the random forest and writes
# to the same output files, whatever model_type says.
model_types = ['random_forest']

final_result_df = pd.DataFrame()

for model_type in model_types:
    print('--------------------------------------------------------------------------')
    print('ML model name: %s' % (model_type))
    print('--------------------------------------------------------------------------')

    # For nurses only, we have shift difference
    # for shift in shift_array:

    # Drop the recordings when there is no responses of MGT
    # (subset takes the flat list of column names, not a nested list)
    ml_select_user_df = ml_select_user_df.dropna(subset=lable_col_array)
    # Then drop every recording that still has any missing value
    ml_select_user_df = ml_select_user_df.dropna()

    # Statistics of clustering on remaining labels
    # positive label is the cluster with higher positive affect, lower negative affect ... ...
    cluster_mean_frames = []
    for i in range(2):
        print('--------------------------------------------------------------------------')

        data_df = ml_select_user_df.loc[ml_select_user_df['cluster'] == i, lable_col_array]
        print('cluster: %d, number of recordings: %d' % (i, len(data_df)))

        # DataFrame.mean() gives the per-column mean on every pandas version;
        # np.mean(DataFrame) reduces over both axes on recent ones
        cluster_mean_frames.append(data_df.mean().to_frame().transpose())

        for affect_col in lable_col_array:
            # ddof=0 reproduces the population std that np.std reports
            print('mgt name: %s, mean: %.3f, std: %.3f' % (affect_col, data_df[affect_col].mean(), data_df[affect_col].std(ddof=0)))
        print('--------------------------------------------------------------------------')
        print('\n')

    mean_emotion_per_cluster_df = pd.concat(cluster_mean_frames)

    # NOTE(review): the comment above describes the positive class as the
    # cluster with HIGHER positive affect, but argmin selects the cluster with
    # the LOWEST mean pos_af_mgt. Left unchanged; confirm which is intended.
    positive_prediction_label = np.argmin(np.array(mean_emotion_per_cluster_df['pos_af_mgt']))

    # 1. Seperate lables, participant id, and input features
    subject_label = []
    label_frames, feature_frames = [], []

    for unique_id_idx, participant_id in enumerate(ml_df['participant_id'].unique()):
        recording_data = ml_select_user_df.loc[ml_select_user_df['participant_id'] == participant_id]

        # Participants outside the selection contribute nothing
        if len(recording_data) == 0:
            continue

        # Label
        label_frames.append(recording_data['cluster'].to_frame())

        # Data
        recording_data = recording_data[feat_cols]
        recording_data = recording_data.fillna(recording_data.mean())
        feature_frames.append(recording_data)

        # Subject label: one entry per recording of this participant
        subject_label.extend([unique_id_idx] * len(recording_data))

    if not feature_frames:
        raise ValueError('No recordings left for the selected participants after dropping missing values')

    input_label = pd.concat(label_frames)
    input_feature = pd.concat(feature_frames)

    # 2. Normalization
    # NOTE(review): the scaler is fitted on all subjects at once, so held-out
    # subjects contribute to the scaling statistics.
    scaler = select_scaler(scaler_name)
    norm_inputFeature = scaler.fit_transform(np.array(input_feature))

    # 3. ML
    input_feature_col = input_feature.columns.values

    # Build the model input from the normalised matrix; previously the raw
    # features were used and the scaler output was silently discarded.
    ml_norm_df = pd.DataFrame(data=norm_inputFeature, columns=input_feature_col, index=input_feature.index)
    ml_norm_df['subject_idx'] = np.array(subject_label)
    ml_norm_df['label'] = input_label['cluster'].values

    results_final_df, feature_importance_final_df = ramdon_forest_pred(ml_norm_df, input_feature_col, positive_prediction_label)

    # 4. Save results to csv
    results_final_df.to_csv(final_result_path)
    feature_importance_final_df.to_csv(final_feat_importance_path)

    
    

--------------------------------------------------------------------------
ML model name: random_forest
--------------------------------------------------------------------------
--------------------------------------------------------------------------
cluster: 0, number of recordings: 202
mgt name: pos_af_mgt, mean: 9.475, std: 3.427
mgt name: neg_af_mgt, mean: 8.045, std: 3.457
mgt name: stress_mgt, mean: 2.559, std: 0.949
mgt name: anxiety_mgt, mean: 2.129, std: 0.992
--------------------------------------------------------------------------


--------------------------------------------------------------------------
cluster: 1, number of recordings: 184
mgt name: pos_af_mgt, mean: 16.707, std: 4.611
mgt name: neg_af_mgt, mean: 5.549, std: 0.999
mgt name: stress_mgt, mean: 1.913, std: 0.796
mgt name: anxiety_mgt, mean: 1.424, std: 0.575
--------------------------------------------------------------------------


max_features_auto_max_depth_7_criterion_gini_n_estimators_100
Validati

### Find the most important features in the best-performing model

In [61]:
# Keep only the per-model metric columns
results_df = results_final_df.drop(['cluster0', 'cluster1'], axis=1)

# One single-row frame per metric, concatenated once after the loop
best_model_rows = []

for index, prediction_result in results_df.iterrows():

    # Model (column) that reaches the highest value of this metric;
    # idxmax returns the first one in case of ties
    max_component = prediction_result.max()
    prediction_result_col = prediction_result.idxmax()

    row_record = {'best_model_name': prediction_result_col}

    best_results = results_df[prediction_result_col]

    # The loop variable must not be called `metrics`: that name is the
    # sklearn module used by the training cell and would be overwritten.
    for metric_name, metric_value in best_results.items():
        for reported_metric in ('accuracy', 'precision', 'recall', 'f1'):
            if reported_metric in metric_name:
                row_record[reported_metric] = metric_value
                break

    row_record['best_score'] = max_component

    model = feature_importance_final_df.loc[prediction_result_col, :]

    for model_index, model_feat in model.items():
        if 'name' in model_index:
            # Strip everything up to and including the first 'feat_'; keep the
            # full name when it has no such marker instead of raising IndexError
            name_parts = model_feat.split('feat_')
            row_record[model_index] = name_parts[1] if len(name_parts) > 1 else model_feat
        else:
            # weights and stds are copied as they are
            row_record[model_index] = model_feat

    # columns= pins the insertion order regardless of the pandas version
    best_model_rows.append(
        pd.DataFrame([row_record], index=['model_with_best_' + index], columns=list(row_record)))

if best_model_rows:
    # Same column order as the last row
    feature_importance_in_best_model_df = pd.concat(best_model_rows)[list(best_model_rows[-1].columns)]
else:
    feature_importance_in_best_model_df = pd.DataFrame()

feature_importance_in_best_model_path = os.path.join(ml_output_path, 'feature_importance_in_best_model.csv')
feature_importance_in_best_model_df.to_csv(feature_importance_in_best_model_path)
