In [1]:
# Import packages
import pandas as pd
import numpy as np
import datetime
from category_encoders import BinaryEncoder
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate, KFold
from sklearn.metrics import accuracy_score, mean_squared_error, cohen_kappa_score, mean_absolute_error, f1_score
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE, SMOTENC
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic
from sklearn.decomposition import PCA
from scipy import stats
from math import sqrt



import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, GaussianNoise
from keras.optimizers import SGD
from keras import regularizers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

Using TensorFlow backend.


In [2]:
# Load the mood data set and add, per user, the previous 'cheerful' score as a feature.
path = "/Users/Stijn/Documents/Master Data Science and Society/Block 3/thesis/code/thesis_Code/"
mood = pd.read_csv(path + 'mood_imputed_median.csv', sep=',', index_col=0)

# Previous 'cheerful' value within the same user_id group (NaN where no earlier row exists).
mood['cheerful_last'] = mood.groupby('user_id')['cheerful'].shift()

# Rows without a predecessor fall back to their own 'cheerful' value.
no_previous = mood['cheerful_last'].isnull()
mood.loc[no_previous, 'cheerful_last'] = mood['cheerful']

# Drop the columns that are not used when modelling 'cheerful'.
mood_cheerful = mood.drop(columns=["anxious", "content", "bored", "user_id", "response_time"])

In [3]:
# Min-max scale the selected predictors column by column: (x - min) / (max - min).
features_to_normalize = ['day_time_window', 'average_TimeUse', 'messaging', 'socialnetworking', 'otherapp']
mood_normz = mood_cheerful.copy()
to_scale = mood_normz[features_to_normalize]
col_min = to_scale.min()
col_range = to_scale.max() - col_min
# Series operands align on column labels, so every column uses its own min and range.
mood_normz[features_to_normalize] = (to_scale - col_min) / col_range

In [4]:
# Target: the first column, kept as a one-column DataFrame and cast to int.
y = mood_normz.iloc[:, :1].astype(int)

# Predictors: every remaining column.
X = mood_normz.iloc[:, 1:]

In [5]:
# Shuffle the rows of X and y together. The seed is fixed so that a fresh
# "Restart & Run All" produces the same row order (the original call was unseeded).
X, y = shuffle(X, y, random_state=2)

In [6]:
# Split the previous-mood feature off from the other predictors.
# 'cheerful_last' is the last column of X at this point. The original code dropped it
# from X first and only then took the last column, so X_onlylastmood ended up holding
# the wrong feature. Selecting by name fixes that, and the guard makes the cell safe to
# re-run (a second run no longer strips another column from X).
if 'cheerful_last' in X.columns:
    X_onlylastmood = X[['cheerful_last']]
    X = X.drop(columns=['cheerful_last'])

In [11]:
def binary_to_ordinal(df):
    """Decode 4-bit binary predictions into ordinal 'cheerful' labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'cheerful_0', 'cheerful_1', 'cheerful_2' and
        'cheerful_3', each holding 0/1 values.

    Returns
    -------
    list of int
        Exactly one label per row of ``df``, in row order.

    Notes
    -----
    Bit patterns that do not correspond to a label decode to 0. That was already
    the fallback for the patterns 0000 and 0110; it now also applies when
    'cheerful_0' is 1 (or any bit is not 0/1). Previously such rows appended
    nothing, so the returned list was shorter than ``df`` and no longer lined up
    with the ground truth.
    """
    code_to_label = {
        (0, 0, 0, 1): 0,
        (0, 0, 1, 1): 1,
        (0, 1, 0, 0): 2,
        (0, 0, 1, 0): 3,
        (0, 1, 1, 1): 4,
        (0, 1, 0, 1): 5,
    }
    bit_columns = ['cheerful_0', 'cheerful_1', 'cheerful_2', 'cheerful_3']
    # .tolist() yields plain Python numbers; 0.0/1.0 hash like 0/1, so float bits also match.
    bit_rows = df[bit_columns].values.tolist()
    return [code_to_label.get(tuple(bits), 0) for bits in bit_rows]

In [12]:
def multiple_appends(listname, *element):
    """Append every extra positional argument to ``listname`` in place, in order."""
    for item in element:
        listname.append(item)


def ordinal_to_binary(df):
    """Encode the 'cheerful' column of ``df`` as four binary columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'cheerful' column; the values 0..5 have an encoding.

    Returns
    -------
    pandas.DataFrame
        One row per input row, built from a list of 4-element lists (so the
        columns are the default integer labels and the index is a fresh
        default index). A value without an encoding contributes an empty
        list, i.e. a row without values.
    """
    bit_patterns = {
        0: (0, 0, 0, 1),
        1: (0, 0, 1, 1),
        2: (0, 1, 0, 0),
        3: (0, 0, 1, 0),
        4: (0, 1, 1, 1),
        5: (0, 1, 0, 1),
    }
    rows = []
    for label in df['cheerful']:
        bits = []
        multiple_appends(bits, *bit_patterns.get(label, ()))
        rows.append(bits)
    return pd.DataFrame(rows)

In [13]:
def macroaverage_mae(test, pred, labels=(0, 1, 2, 3, 4, 5)):
    """Macro-averaged mean absolute error: the mean of the per-class MAEs.

    For every class in ``labels`` that occurs in ``test``, the absolute errors of
    the samples whose true label is that class are averaged; the result is the
    unweighted mean of those per-class averages.

    Parameters
    ----------
    test : sequence
        True labels. Either a flat sequence of numbers or a sequence of
        1-element arrays (e.g. ``.values`` of a one-column DataFrame).
    pred : sequence
        Predicted labels, same length and order as ``test``.
    labels : sequence, optional
        Classes to average over. Defaults to 0..5, the classes the original
        implementation hard-coded.

    Returns
    -------
    float or numpy.ndarray
        A float for flat inputs. When the rows of ``test`` are 1-element arrays
        the arithmetic stays in numpy and a 1-element array is returned, which
        matches the previous behaviour.

    Raises
    ------
    ValueError
        If ``test`` and ``pred`` differ in length (previously a shorter ``pred``
        was accepted silently and gave a wrong score), or if none of ``labels``
        occurs in ``test``.
    """
    if len(test) != len(pred):
        raise ValueError(
            "test and pred must have the same length, got %d and %d" % (len(test), len(pred))
        )

    abs_error = {label: 0 for label in labels}
    support = {label: 0 for label in labels}
    for actual, predicted in zip(test, pred):
        # Explicit comparison (rather than a dict lookup) because `actual` may be
        # an unhashable 1-element array.
        for label in labels:
            if actual == label:
                support[label] += 1
                # Non in-place addition so an array returned by abs() is never mutated.
                abs_error[label] = abs_error[label] + abs(actual - predicted)
                break

    # Classes missing from `test` have no defined MAE; skip them instead of dividing by zero.
    per_class_mae = [abs_error[label] / support[label] for label in labels if support[label]]
    if not per_class_mae:
        raise ValueError("none of the requested labels occurs in test")
    return sum(per_class_mae) / len(per_class_mae)

In [18]:
# 10-fold cross-validation of a small feed-forward network that predicts the
# 4-bit binary encoding of 'cheerful'; predictions are decoded back to ordinal labels.
n_split = 10
class_labels = [0, 1, 2, 3, 4, 5]
bit_columns = ['cheerful_0', 'cheerful_1', 'cheerful_2', 'cheerful_3']

f1_scores = []          # support-weighted F1 per fold
mmae_scores = []        # macro-averaged MAE per fold
f1_scores_classes = []  # per-class F1 per fold, always ordered as class_labels

cv = KFold(n_splits=n_split, random_state=2, shuffle=True)
for train_index, test_index in cv.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Oversample the training fold only, so the test fold keeps its real class balance.
    # Positions 0 and 2 of X are treated as categorical -- TODO confirm which features these are.
    sm = SMOTENC(categorical_features=[0, 2], random_state=2)
    # fit_resample replaces the fit_sample alias, which newer imbalanced-learn releases removed.
    X_train_oversampl, y_train_oversampl = sm.fit_resample(X_train, y_train['cheerful'].values)

    y_train_df = pd.DataFrame(y_train_oversampl, columns=['cheerful'])
    y_binary = ordinal_to_binary(y_train_df)

    # A fresh model per fold so no weights leak between folds.
    model = Sequential()
    # Input width follows the data instead of a hard-coded 6.
    model.add(Dense(24, input_dim=X_train_oversampl.shape[1], activation='relu'))
    model.add(Dense(12, activation='relu'))
    model.add(Dense(len(bit_columns), activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train_oversampl, y_binary, epochs=100, batch_size=512, verbose=0)
    y_pred = model.predict(X_test)

    # Threshold each sigmoid output at 0.5 to get the predicted bits.
    binary_pred = (y_pred >= 0.5).astype(int)
    binary_prediction = pd.DataFrame(data=binary_pred, columns=bit_columns)

    pred = binary_to_ordinal(binary_prediction)
    # Kept as an (n, 1) column array: macroaverage_mae then returns a 1-element array,
    # which is the shape the MMAE summary cell iterates over.
    test = y_test.values

    # Passing labels explicitly keeps every fold's per-class array at the same length,
    # even if a class is missing from both the fold's truth and its predictions.
    f1_scores_classes.append(f1_score(test.ravel(), pred, average=None, labels=class_labels))
    # NOTE: this is the support-weighted F1, not the macro average.
    f1_scores.append(round(f1_score(test.ravel(), pred, average='weighted'), 4))
    mmae_scores.append(macroaverage_mae(test, pred))
    

Before OverSampling, counts of labels: 3    3337
2    2689
4    2113
1    1685
0    1270
5     804
Name: cheerful, dtype: int64
After OverSampling, counts of label '0': 3337
After OverSampling, counts of label '1': 3337
After OverSampling, counts of label '2': 3337
After OverSampling, counts of label '3': 3337
After OverSampling, counts of label '4': 3337
After OverSampling, counts of label '5': 3337


KeyboardInterrupt: 

In [15]:
# f1_scores holds f1_score(..., average='weighted') for each fold, so the values are
# support-weighted F1-scores; the previous "Macro-averaged" labels were wrong.
print("Weighted F1-scores: ", f1_scores)
print("Mean weighted F1-scores: ", round(np.mean(f1_scores), 4))
print("Std weighted F1-scores: ", round(np.std(f1_scores), 4))

Macro-averaged F1-scores:  [0.085, 0.0824, 0.1053, 0.0635, 0.0725, 0.0546, 0.0847, 0.0953, 0.1013, 0.0695]
Mean MA F1-scores:  0.0814
Std MA F1-scores:  0.0156


In [16]:
# Each fold's score may be a plain float or a 1-element array, depending on the shape
# of the labels given to macroaverage_mae. np.ravel flattens both to one value per fold;
# the previous nested loop raised TypeError on plain floats.
mmae_scores_rounded = [round(float(score), 4) for score in np.ravel(mmae_scores)]

print("Macro-averaged MAE: ", mmae_scores_rounded)
print("Mean MMAE-scores: ", round(np.mean(mmae_scores), 4))
print("Std MMAE-scores: ", round(np.std(mmae_scores), 4))

Macro-averaged MAE:  [2.0442, 2.0181, 1.7775, 2.3488, 2.3037, 2.3812, 2.1409, 2.1163, 1.9286, 2.1358]
Mean MMAE-scores:  2.1195
Std MMAE-scores:  0.1803


In [17]:
# Transpose the fold-by-class scores so each tuple holds one class's F1 across all folds.
scores_by_class = list(zip(*f1_scores_classes))
class_means = [round(np.mean(fold_scores), 4) for fold_scores in scores_by_class]
class_stds = [round(np.std(fold_scores), 4) for fold_scores in scores_by_class]
print("Mean F1-score per class: ", class_means)
print("STD F1-score per class: ", class_stds)

Mean F1-score per class:  [0.1776, 0.1516, 0.0, 0.0, 0.1751, 0.1348]
STD F1-score per class:  [0.0223, 0.0642, 0.0, 0.0, 0.0524, 0.0202]
