In [1]:
# Import packages
import pandas as pd
import numpy as np
import datetime
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate, KFold
from sklearn.metrics import accuracy_score, mean_squared_error, cohen_kappa_score, mean_absolute_error, f1_score
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE, SMOTENC
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic
from sklearn.decomposition import PCA
from scipy import stats
from math import sqrt



import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, GaussianNoise
from keras.optimizers import SGD
from keras import regularizers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

Using TensorFlow backend.


In [2]:
path = "/Users/Stijn/Documents/Master Data Science and Society/Block 3/thesis/code/thesis_Code/"
mood = pd.read_csv(path+'mood_imputed_median.csv', sep = ',', index_col=0)
mood_cheerful = mood.copy()

In [3]:
cheerfulR2 = []
for a in mood['cheerful']:
    if a == 0:
      cheerfulR2.append(0)  
    if a == 1:
      cheerfulR2.append(2)
    if a == 2:
      cheerfulR2.append(3)
    if a == 3:
      cheerfulR2.append(4)
    if a == 4:
      cheerfulR2.append(5)
    if a == 5:
      cheerfulR2.append(7)
mood_cheerful['cheerfulR2'] = cheerfulR2
mood_cheerful = mood_cheerful.drop(['cheerful'], axis=1)

In [4]:
mood_cheerful['cheerful_last'] = mood_cheerful.groupby('user_id')['cheerfulR2'].shift()
mood_cheerful.loc[(pd.isnull(mood_cheerful.cheerful_last)), 'cheerful_last'] = mood_cheerful['cheerfulR2']
mood_cheerful = mood_cheerful.drop(["content", "bored", "anxious", "user_id", "response_time"], axis=1)
mood_cheerful = mood_cheerful[['cheerfulR2', 'day_time_window', 'average_TimeUse', 'bulk', 'messaging', 'socialnetworking', 'otherapp', 'cheerful_last']]

In [5]:
# Normalize predictors
mood_normz = mood_cheerful.copy()
features_to_normalize = ['day_time_window', 'average_TimeUse', 'messaging', 'socialnetworking', 'otherapp']
mood_normz[features_to_normalize] = mood_normz[features_to_normalize].apply(lambda x:(x-x.min()) / (x.max()-x.min()))

In [12]:
# X = predictors, y = target
X = mood_normz.iloc[:,1:]
y = mood_normz.iloc[:, 0:1]

# Convert float to int
y = y.astype(int)

In [13]:
X = X.iloc[:,:-1]
X_onlylastmood = X.iloc[:,-1:]

In [18]:
def macroaverage_mae(test, pred):
    mae_0, mae_1, mae_2, mae_3, mae_4, mae_5 = (0,0,0,0,0,0)

    for i in range(len(pred)):
        if test[i] == 0 and pred[i] != 0:
            mae_0 += (abs(test[i]-pred[i]))
        if test[i] == 2 and pred[i] != 2:
            mae_1 += (abs(test[i]-pred[i]))
        if test[i] == 3 and pred[i] != 3:
            mae_2 += (abs(test[i]-pred[i]))
        if test[i] == 4 and pred[i] != 4:
            mae_3 += (abs(test[i]-pred[i]))
        if test[i] == 5 and pred[i] != 5:
            mae_4 += (abs(test[i]-pred[i]))
        if test[i] == 7 and pred[i] != 7:
            mae_5 += (abs(test[i]-pred[i]))

    cnt_0, cnt_1, cnt_2, cnt_3, cnt_4, cnt_5 = (0,0,0,0,0,0)
    for z in range(len(test)):
        if test[z] == 0:
            cnt_0 += 1
        if test[z] == 2:
            cnt_1 += 1
        if test[z] == 3:
            cnt_2 += 1
        if test[z] == 4:
            cnt_3 += 1
        if test[z] == 5:
            cnt_4 += 1
        if test[z] == 7:
            cnt_5 += 1

    mae_macroaverage = ((mae_0/cnt_0) + (mae_1/cnt_1) + (mae_2/cnt_2) + (mae_3/cnt_3) + (mae_4/cnt_4) + (mae_5/cnt_5)) / 6
    return mae_macroaverage 

In [19]:
def nearest_R2(pred):   
    cat_list = [0,3,4,5,7,9]
    transf_pred = []
    for a in pred:
        for b in a:
            d = min(cat_list, key=lambda x:abs(x-b))
            transf_pred.append(round(d))
    return transf_pred

In [20]:
X[:5]

Unnamed: 0,day_time_window,average_TimeUse,bulk,messaging,socialnetworking,otherapp
4825,0.666667,0.000316,0,0.0,0.0,0.008547
4826,1.0,0.030226,0,0.009524,0.0,0.017094
245,0.0,0.009677,0,0.0,0.0,0.017094
4827,0.333333,0.0,0,0.0,0.0,0.0
4828,0.666667,0.025358,0,0.028571,0.0,0.017094


In [30]:
n_split=10
f1_scores = []
mmae_scores = []
f1_scores_classes = []

for train_index, test_index in KFold(n_splits = n_split, random_state=2, shuffle=True).split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    sm = SMOTENC(categorical_features=[0, 2], random_state = 2) 
    X_train_oversampl, y_train_oversampl = sm.fit_sample(X_train, y_train['cheerfulR2'].ravel()) 
    X_use, y_use = shuffle(X_train_oversampl, y_train_oversampl)
    
    model = Sequential()
    model.add(Dense(24, input_dim=6, activation='relu'))
    model.add(Dense(12, activation='relu'))
    model.add(Dense(1, activation='linear'))

    model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['accuracy'])
    model.fit(X_use, y_use,epochs=100, batch_size=512, verbose=0)
    y_pred = model.predict(X_test)
    test = y_test.values

    pred = nearest_R2(y_pred)

    c_test = []
    for c in test:
        c_test.append(c)
    
    f1_scores_classes.append(f1_score(c_test, pred, average=None))
    f1_scores.append(round(f1_score(c_test, pred, average='weighted'), 4))
    mmae_scores.append(macroaverage_mae(c_test, pred))
    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [31]:
print("Mean F1-score per class: ", [round(np.mean(x), 4) for x in zip(*f1_scores_classes)])
print("STD F1-score per class: ", [round(np.std(x), 4) for x in zip(*f1_scores_classes)])

Mean F1-score per class:  [0.0, 0.0, 0.311, 0.3531, 0.013, 0.0]
STD F1-score per class:  [0.0, 0.0, 0.0305, 0.0323, 0.0168, 0.0]


In [32]:
print("Macro-averaged F1-scores: ", f1_scores)
print("Mean MA F1-scores: ", round(np.mean(f1_scores), 4))
print("Std MA F1-scores: ", round(np.std(f1_scores), 4))

Macro-averaged F1-scores:  [0.178, 0.1693, 0.1742, 0.1746, 0.1644, 0.1593, 0.1717, 0.1793, 0.169, 0.1745]
Mean MA F1-scores:  0.1714
Std MA F1-scores:  0.0058


In [33]:
mmae_scores_rounded = []
for a in mmae_scores:
    for b in a:
        mmae_scores_rounded.append(round(b, 4))
        
print("Macro-averaged MAE: ", mmae_scores_rounded)
print("Mean MMAE-scores: ", round(np.mean(mmae_scores), 4))
print("Std MMAE-scores: ", round(np.std(mmae_scores), 4))

Macro-averaged MAE:  [1.804, 1.7894, 1.8097, 1.8131, 1.7807, 1.836, 1.8255, 1.7951, 1.8203, 1.8051]
Mean MMAE-scores:  1.8079
Std MMAE-scores:  0.016
