In [3]:
# Import packages
import pandas as pd
import numpy as np
import datetime
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate, KFold
from sklearn.metrics import accuracy_score, mean_squared_error, cohen_kappa_score, mean_absolute_error, f1_score
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE, SMOTENC
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic
from sklearn.decomposition import PCA
from scipy import stats
from math import sqrt



import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, GaussianNoise
from keras.optimizers import SGD
from keras import regularizers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

Using TensorFlow backend.


In [4]:
# Load the median-imputed mood data; the first CSV column becomes the index.
path = "/Users/Stijn/Documents/Master Data Science and Society/Block 3/thesis/code/thesis_Code/"
mood = pd.read_csv(path + 'mood_imputed_median.csv', sep=',', index_col=0)

# 'bored_last' is the previous 'bored' value of the same user. A user's first
# row has no predecessor, so it falls back to that row's own 'bored' value.
mood['bored_last'] = mood.groupby('user_id')['bored'].shift().fillna(mood['bored'])

# Drop the other mood columns and the identifier / response-time columns.
mood_bored = mood.drop(columns=["anxious", "content", "cheerful", "user_id", "response_time"])

In [5]:
# Min-max scale the listed predictors to [0, 1]: (x - min) / (max - min) per column.
# NOTE(review): a constant column gives max == min and therefore NaN after scaling.
features_to_normalize = ['day_time_window', 'average_TimeUse', 'messaging', 'socialnetworking', 'otherapp']
feature_values = mood_bored[features_to_normalize]
feature_min = feature_values.min()
feature_range = feature_values.max() - feature_min

mood_normz = mood_bored.copy()
mood_normz[features_to_normalize] = (feature_values - feature_min) / feature_range

In [6]:
# Target: the first column, kept as a one-column DataFrame and cast from float to int.
# (Later cells read it as y['bored'], so column 0 is presumably 'bored'.)
y = mood_normz.iloc[:, :1].astype(int)

# Predictors: every remaining column.
X = mood_normz.iloc[:, 1:]

In [7]:
# Shuffle the rows of X and y in unison. The seed is fixed (same value as the
# KFold / SMOTENC seeds used later) so a Restart & Run All gives the same row order.
X, y = shuffle(X, y, random_state=2)

In [8]:
# Keep the previous-mood predictor on its own, then remove it from the main
# feature matrix. It is selected by name and *before* the drop: slicing the last
# column after the drop would pick up a different feature, and a positional drop
# would silently remove one more column each time this cell is re-run (a re-run
# now fails loudly with a KeyError instead).
X_onlylastmood = X[['bored_last']]
X = X.drop('bored_last', axis=1)

In [9]:
# Frank & hall encoding

def multiple_appends(listname, *element):
    """Append every positional argument after ``listname`` to it, in order.

    The list is modified in place; nothing is returned.
    """
    for item in element:
        listname.append(item)

def ordinal_to_frank(y, n_thresholds=5):
    """Encode ordinal 'bored' labels as cumulative (Frank & Hall) binary targets.

    A label ``k`` becomes a row of ``k`` ones followed by zeros: column ``j``
    is 1 exactly when ``label > j``. With the default five thresholds:
    0 -> 0 0 0 0 0, 3 -> 1 1 1 0 0, 5 -> 1 1 1 1 1.

    Parameters
    ----------
    y : pandas.DataFrame
        Frame with a 'bored' column of integer-valued labels in
        ``0 .. n_thresholds``.
    n_thresholds : int, default 5
        Number of binary target columns (number of classes minus one).

    Returns
    -------
    pandas.DataFrame
        One row per input row (fresh 0..n-1 index), columns ``0 .. n_thresholds-1``,
        integer 0/1 values.

    Raises
    ------
    ValueError
        If any label is missing or outside ``0 .. n_thresholds``. Such labels
        would otherwise yield incomplete target rows.
    """
    labels = y['bored'].values
    valid = np.isin(labels, np.arange(n_thresholds + 1))
    if not valid.all():
        raise ValueError(
            "ordinal_to_frank expects integer labels in 0..%d, got invalid values: %r"
            % (n_thresholds, list(labels[~valid][:5]))
        )

    # Broadcast each label against the thresholds 0..n_thresholds-1.
    thresholds = np.arange(n_thresholds)
    encoded = (labels[:, None] > thresholds).astype(np.int64)
    return pd.DataFrame(encoded)

In [10]:
def macroaverage_mae(test, pred, n_classes=6):
    """Macro-averaged mean absolute error for ordinal classes ``0 .. n_classes-1``.

    For every class the absolute errors of the samples whose true label is that
    class are summed and divided by the number of such samples; the per-class
    values are then averaged. Classes that do not occur in ``test`` are left out
    of the average instead of causing a division by zero.

    Parameters
    ----------
    test : sequence
        True labels, indexed positionally as ``test[i]`` (pass lists or numpy
        arrays, not a pandas object with a non-default index). Entries may be
        scalars or one-element arrays (e.g. rows of an ``(n, 1)`` array).
    pred : sequence
        Predicted labels, same length as ``test``, indexed as ``pred[i]``.
    n_classes : int, default 6
        Number of ordinal classes. True labels outside ``0 .. n_classes-1``
        are ignored.

    Returns
    -------
    float or numpy.ndarray
        The macro-averaged MAE. The result keeps the shape of
        ``test[i] - pred[i]``, so an ``(n, 1)`` ``test`` with mispredictions
        yields a one-element array.

    Raises
    ------
    ValueError
        If ``test`` and ``pred`` differ in length, or if no sample in ``test``
        carries a label in ``0 .. n_classes-1``.
    """
    if len(test) != len(pred):
        raise ValueError(
            "test and pred must have the same length, got %d and %d" % (len(test), len(pred))
        )

    error_sums = [0] * n_classes
    counts = [0] * n_classes
    for i in range(len(test)):
        for label in range(n_classes):
            if test[i] == label:
                counts[label] += 1
                if pred[i] != label:
                    error_sums[label] += abs(test[i] - pred[i])
                break

    # Average only over classes that actually occur in the true labels.
    per_class_mae = [error_sums[k] / counts[k] for k in range(n_classes) if counts[k] > 0]
    if not per_class_mae:
        raise ValueError("macroaverage_mae needs at least one sample with a label in 0..%d" % (n_classes - 1))
    return sum(per_class_mae) / len(per_class_mae)

In [12]:
def transform_fh_pred(pred):
    """Turn Frank & Hall probability rows into ordinal class predictions.

    Each of the five scores in a row is thresholded at 0.5 (>= 0.5 counts as 1)
    and the resulting ones are summed, giving a label between 0 and 5.

    Parameters
    ----------
    pred : iterable of iterables
        One row of five scores per sample (e.g. the output of ``model.predict``).

    Returns
    -------
    pandas.Series
        Predicted ordinal label per row, indexed 0..n-1.
    """
    threshold_columns = ['mood_0', 'mood_1', 'mood_2', 'mood_3', 'mood_4']
    binarized_rows = [[1 if score >= 0.5 else 0 for score in row] for row in pred]
    return pd.DataFrame(data=binarized_rows, columns=threshold_columns).sum(axis=1)

In [13]:
def transform_fh_test(test):
    """Collapse Frank & Hall encoded targets to ordinal labels by summing each row.

    ``test`` must support ``.sum(axis=1)`` (e.g. a pandas DataFrame); the
    row-wise sums are returned.
    """
    return test.sum(axis=1)

In [15]:
# 10-fold cross-validation of a Frank & Hall ordinal neural network.
n_split = 10
class_labels = [0, 1, 2, 3, 4, 5]  # ordinal 'bored' classes, fixed so per-class F1 vectors align across folds
f1_scores = []          # one support-weighted F1 per fold
mmae_scores = []        # one macro-averaged MAE per fold
f1_scores_classes = []  # one per-class F1 vector per fold
for train_index, test_index in KFold(n_splits=n_split, random_state=2, shuffle=True).split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Oversample the training fold only; columns 0 and 2 of X are treated as categorical.
    # fit_resample replaces fit_sample, which was removed from imbalanced-learn.
    sm = SMOTENC(categorical_features=[0, 2], random_state=2)
    X_train_oversampl, y_train_oversampl = sm.fit_resample(X_train, y_train['bored'].values)

    # Encode the resampled ordinal labels as cumulative binary targets.
    y_train_df = pd.DataFrame(y_train_oversampl, columns=['bored'])
    y_frank = ordinal_to_frank(y_train_df)

    # A fresh network per fold; the input width follows the feature matrix
    # instead of a hard-coded column count.
    model = Sequential()
    model.add(Dense(24, input_dim=X_train_oversampl.shape[1], activation='relu'))
    model.add(Dense(12, activation='relu'))
    model.add(Dense(5, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train_oversampl, y_frank, epochs=100, batch_size=512, verbose=0)
    y_pred = model.predict(X_test)

    # Convert the five sigmoid outputs back to an ordinal label per test row.
    pred = transform_fh_pred(y_pred).values
    test = y_test.values  # kept as an (n, 1) column; macroaverage_mae then returns a one-element array

    # Explicit labels guarantee six per-class scores in every fold, even when a
    # class is absent from both the test labels and the predictions.
    f1_scores_classes.append(f1_score(test, pred, average=None, labels=class_labels))
    f1_scores.append(round(f1_score(test, pred, average='weighted'), 4))
    mmae_scores.append(macroaverage_mae(test, pred))
    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [16]:
# f1_scores holds one F1 value per fold, computed with average='weighted'
# (support-weighted, not macro-averaged), so the labels say "weighted".
print("Weighted F1-scores: ", f1_scores)
print("Mean weighted F1-scores: ", round(np.mean(f1_scores), 4))
print("Std weighted F1-scores: ", round(np.std(f1_scores), 4))

Macro-averaged F1-scores:  [0.0853, 0.1045, 0.0714, 0.087, 0.0783, 0.0925, 0.0883, 0.0773, 0.0908, 0.0893]
Mean MA F1-scores:  0.0865
Std MA F1-scores:  0.0088


In [17]:
# Each fold score may be a one-element array or a plain scalar, depending on the
# shapes passed to macroaverage_mae. np.ravel flattens both cases to plain floats,
# so the summary does not fail on a scalar entry.
mmae_values = [float(value) for fold_score in mmae_scores for value in np.ravel(fold_score)]
mmae_scores_rounded = [round(value, 4) for value in mmae_values]

print("Macro-averaged MAE: ", mmae_scores_rounded)
print("Mean MMAE-scores: ", round(np.mean(mmae_values), 4))
print("Std MMAE-scores: ", round(np.std(mmae_values), 4))

Macro-averaged MAE:  [1.4844, 1.4898, 1.4421, 1.5162, 1.5223, 1.4424, 1.4637, 1.4969, 1.5663, 1.5129]
Mean MMAE-scores:  1.4937
Std MMAE-scores:  0.0364


In [18]:
# Regroup the per-fold F1 vectors so that each tuple holds one class's scores across all folds.
f1_by_class = list(zip(*f1_scores_classes))
print("Mean F1-score per class: ", [round(np.mean(class_scores), 4) for class_scores in f1_by_class])
print("STD F1-score per class: ", [round(np.std(class_scores), 4) for class_scores in f1_by_class])

Mean F1-score per class:  [0.0019, 0.1127, 0.2345, 0.1991, 0.0452, 0.0]
STD F1-score per class:  [0.0026, 0.034, 0.0311, 0.0235, 0.0333, 0.0]
