In [3]:
# Import packages
import pandas as pd
import numpy as np
import datetime
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate, KFold
from sklearn.metrics import accuracy_score, mean_squared_error, cohen_kappa_score, mean_absolute_error, f1_score
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE, SMOTENC
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic
from sklearn.decomposition import PCA
from scipy import stats
from math import sqrt



import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, GaussianNoise
from keras.optimizers import SGD
from keras import regularizers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

In [4]:
# Folder holding the thesis data files.
# NOTE(review): absolute, machine-specific location -- consider a configurable data directory.
path = "/Users/Stijn/Documents/Master Data Science and Society/Block 3/thesis/code/thesis_Code/"

# Read the mood CSV; its first column is used as the row index.
mood_csv = path + 'mood_imputed_median.csv'
mood = pd.read_csv(mood_csv, sep=',', index_col=0)

# Work on a copy so the frame as loaded stays available under `mood`.
mood_cheerful = mood.copy()

In [5]:
# Recode the raw 'cheerful' scores (0-5) onto the unevenly spaced scale
# (0, 2, 3, 4, 5, 7) that the rest of this notebook uses as the target.
cheerful_recode = {0: 0, 1: 2, 2: 3, 3: 4, 4: 5, 5: 7}
recoded = mood['cheerful'].map(cheerful_recode)

# Scores outside the mapping (including NaN) become NaN after .map(); report
# them explicitly instead of silently producing a column of the wrong length.
unmapped_mask = recoded.isna().values
if unmapped_mask.any():
    unexpected = pd.unique(mood['cheerful'].values[unmapped_mask]).tolist()
    raise ValueError(
        "Unexpected 'cheerful' values (expected 0-5): {}".format(unexpected)
    )

# Keep `cheerfulR2` as a plain list of ints so it is assigned positionally.
cheerfulR2 = recoded.astype(int).tolist()
mood_cheerful['cheerfulR2'] = cheerfulR2
mood_cheerful = mood_cheerful.drop(['cheerful'], axis=1)

In [6]:
# Previous target value within each user's own sequence of records.
previous_mood = mood_cheerful.groupby('user_id')['cheerfulR2'].shift()
mood_cheerful['cheerful_last'] = previous_mood

# A user's first record has no predecessor; it falls back to its own current score.
# NOTE(review): for those rows the feature equals the target -- confirm this is intended.
first_records = mood_cheerful['cheerful_last'].isnull()
mood_cheerful.loc[first_records, 'cheerful_last'] = mood_cheerful['cheerfulR2']

# Remove the other mood targets and bookkeeping columns.
unused_columns = ["content", "bored", "anxious", "user_id", "response_time"]
mood_cheerful = mood_cheerful.drop(columns=unused_columns)

# Fix the column order: target first, then the predictors, with the lagged mood last.
model_columns = [
    'cheerfulR2',
    'day_time_window',
    'average_TimeUse',
    'bulk',
    'messaging',
    'socialnetworking',
    'otherapp',
    'cheerful_last',
]
mood_cheerful = mood_cheerful[model_columns]

In [7]:
# Min-max scale the selected predictors; 'bulk' and 'cheerful_last' are left as they are.
mood_normz = mood_cheerful.copy()
features_to_normalize = ['day_time_window', 'average_TimeUse', 'messaging', 'socialnetworking', 'otherapp']


def _min_max(column):
    """Rescale one column as (value - column minimum) / (column maximum - column minimum)."""
    lowest = column.min()
    highest = column.max()
    return (column - lowest) / (highest - lowest)


mood_normz[features_to_normalize] = mood_normz[features_to_normalize].apply(_min_max)

In [8]:
# Predictors: every column after the first.
X = mood_normz.iloc[:, 1:]

# Target: the first column, kept as a one-column DataFrame and cast to int.
y = mood_normz.iloc[:, :1].astype(int)

In [9]:
# Show the first five target rows.
y.head(5)

Unnamed: 0,cheerfulR2
4825,4
4826,4
245,4
4827,4
4828,4


In [11]:
# Oversample the unbalanced target with SMOTENC.
# NOTE(review): resampling happens before the cross-validation split further down,
# so synthetic rows can end up in the test folds -- confirm this is acceptable.
print("Before OverSampling, counts of labels: {}".format(y['cheerfulR2'].value_counts()))

# categorical_features are positional column indices of X:
# 0 = day_time_window, 2 = bulk, 6 = cheerful_last.
sm = SMOTENC(categorical_features=[0, 2, 6], random_state=2)
# fit_resample replaces the deprecated fit_sample alias.
X_train, y_train = sm.fit_resample(X, y['cheerfulR2'].values)

# Report each label under its real value; the previous hard-coded messages
# named labels ('3', '4', '5', '7', '9') that did not match the values counted.
for label in np.unique(y_train):
    print("After OverSampling, counts of label '{}': {}".format(label, (y_train == label).sum()))

Before OverSampling, counts of labels: 4    3719
3    2966
5    2358
2    1880
0    1413
7     885
Name: cheerfulR2, dtype: int64
After OverSampling, counts of label '0': 3719
After OverSampling, counts of label '3': 3719
After OverSampling, counts of label '4': 3719
After OverSampling, counts of label '5': 3719
After OverSampling, counts of label '7': 3719
After OverSampling, counts of label '9': 3719


In [12]:
# Wrap the oversampled predictors and target in pandas DataFrames,
# replacing the pre-oversampling X and y.
X = pd.DataFrame(X_train)
y = pd.DataFrame(y_train)

In [13]:
# Take the last column (the lagged mood, given the column order set earlier)
# BEFORE removing it from X. Doing it the other way round made X_onlylastmood
# hold the last *remaining* predictor instead of the last-mood feature.
X_onlylastmood = X.iloc[:, -1:]

# Model inputs: every predictor except the lagged mood.
X = X.iloc[:, :-1]

In [14]:
def macroaverage_mae(test, pred, classes=(0, 2, 3, 4, 5, 7)):
    """Return the macro-averaged mean absolute error of `pred` against `test`.

    The absolute error is averaged separately for every true class, and those
    per-class means are then averaged with equal weight.

    Parameters
    ----------
    test : sequence
        True labels. Elements may be scalars or one-element arrays; in the
        latter case the result is a one-element array as well.
    pred : sequence
        Predicted labels, same length as `test`.
    classes : iterable, optional
        Class labels to average over. Defaults to the recoded mood scale
        (0, 2, 3, 4, 5, 7). True labels outside `classes` are ignored.

    Returns
    -------
    The mean of the per-class mean absolute errors. Classes that never occur
    in `test` are left out of the average instead of causing a division by zero.

    Raises
    ------
    ValueError
        If `test` and `pred` differ in length, or if none of `classes`
        occurs in `test`.
    """
    if len(test) != len(pred):
        raise ValueError(
            "test and pred must have the same length ({} != {})".format(len(test), len(pred))
        )

    abs_error_sum = {label: 0 for label in classes}
    support = {label: 0 for label in classes}

    # Single pass: attribute each sample's absolute error to its true class.
    # Comparison is done per label (not by dict lookup) so that one-element
    # array labels, which are unhashable, keep working.
    for truth, guess in zip(test, pred):
        for label in support:
            if truth == label:
                support[label] += 1
                abs_error_sum[label] += abs(truth - guess)
                break

    per_class_mae = [
        abs_error_sum[label] / support[label]
        for label in support
        if support[label] > 0
    ]
    if not per_class_mae:
        raise ValueError("none of the requested classes occurs in test")

    return sum(per_class_mae) / len(per_class_mae)

In [15]:
def nearest_R2(pred):
    """Snap continuous predictions to the closest recoded mood category.

    `pred` is an iterable of rows, each row itself iterable (for example the
    (n, 1) array returned by `model.predict`). Every value is replaced by the
    entry of (0, 2, 3, 4, 5, 7) with the smallest absolute distance; on a tie
    the entry that comes first in that list wins. Returns one flat list.
    """
    cat_list = [0, 2, 3, 4, 5, 7]
    return [
        round(min(cat_list, key=lambda category: abs(category - value)))
        for row in pred
        for value in row
    ]

In [16]:
# 10-fold cross-validation of a small regression network whose continuous
# output is snapped back onto the mood categories before scoring.
n_split=10
f1_scores = []
mmae_scores = []

kfold = KFold(n_splits=n_split, random_state=2, shuffle=True)
for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Two hidden ReLU layers and one linear output unit, rebuilt for every fold.
    model = Sequential([
        Dense(24, input_dim=6, activation='relu'),
        Dense(12, activation='relu'),
        Dense(1, activation='linear'),
    ])
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=100, batch_size=512, verbose=0)

    y_pred = model.predict(X_test)
    test = y_test.values
    pred = nearest_R2(y_pred)

    # One entry per test row; each entry is that row of `test`.
    c_test = list(test)

    fold_f1 = f1_score(c_test, pred, average='weighted')
    f1_scores.append(round(fold_f1, 4))
    mmae_scores.append(macroaverage_mae(c_test, pred))
    

  'precision', 'predicted', average, warn_for)


In [17]:
# Weighted F1 per fold (already rounded when collected).
print(f1_scores)

# Each fold's macro-averaged MAE is itself iterable, so flatten while rounding.
mmae_scores_rounded = [
    round(value, 4)
    for fold_score in mmae_scores
    for value in fold_score
]
print(mmae_scores_rounded)

[0.0897, 0.0896, 0.0916, 0.0909, 0.0963, 0.089, 0.0813, 0.0901, 0.0859, 0.084]
[1.817, 1.7937, 1.784, 1.7941, 1.7735, 1.8083, 1.7978, 1.8042, 1.7932, 1.7923]


In [13]:
# Share of exact matches between predicted and true labels, shown as a percentage.
a = accuracy_score(pred, test)
accuracy_pct = a * 100
print('Accuracy is:', accuracy_pct)

Accuracy is: 11.493383742911153


In [14]:
# Cross-tabulate the prediction outcome (correct / wrong) against whether the
# true mood equals the previous mood ("Same") or differs from it ("Dif").
# NOTE(review): X_test_onlylastmood and its 'bored_last' column are not created
# anywhere in the visible part of this notebook -- confirm where they come from
# before relying on this cell after a kernel restart.
X_test_bored = X_test_onlylastmood['bored_last'].values

same_correct = 0
same_wrong = 0
dif_correct = 0
dif_wrong = 0

for i in range(len(pred)):
    unchanged = test[i] == X_test_bored[i]
    hit = test[i] == pred[i]
    # Exactly one of the four cells is incremented per sample.
    if unchanged and hit:
        same_correct += 1
    elif unchanged:
        same_wrong += 1
    elif hit:
        dif_correct += 1
    else:
        dif_wrong += 1

print("       ", "Same", "Dif")
print("correct", same_correct, dif_correct)
print("wrong  ", same_wrong, dif_wrong)

# Sanity check: the share of correct cells must match the accuracy `a` computed
# earlier. Compared with a tolerance, because exact float equality between two
# differently computed ratios is not guaranteed.
n_samples = same_correct + dif_correct + same_wrong + dif_wrong
if n_samples and np.isclose((same_correct + dif_correct) / n_samples, a):
    print("Table is right!")
else:
    print("Table is wrong")

        Same Dif
correct 193 111
wrong   1588 753
Table is right!


In [15]:
from collections import Counter


def _flat_values(values):
    """Flatten `values` to a plain 1-D list so every element is hashable."""
    return np.ravel(values).tolist()


# Label distribution of predictions, true values and previous mood.
# Flattening first lets this work when `test` is an (n, 1) array, whose rows
# cannot be used as Counter keys.
print("Predicted values: ", Counter(_flat_values(pred)))
print("True values: ", Counter(_flat_values(test)))
print("Previous mood:", Counter(_flat_values(X_test_bored)))

Predicted valxues:  Counter({5: 1368, 0: 405, 3: 344, 4: 319, 1: 116, 2: 93})
True values:  Counter({0: 1264, 1: 491, 2: 455, 3: 301, 4: 119, 5: 15})
Previous mood: Counter({0.0: 1286, 1.0: 494, 2.0: 451, 3.0: 294, 4.0: 106, 5.0: 14})


In [16]:
# Per-class breakdown of the Same/Dif x Correct/Wrong table.
# One tally per true label replaces twelve hand-copied counters. The labels are
# taken from `test` itself, so the cell works for whichever label scale is in
# use (e.g. 0-5, or the recoded 0, 2, 3, 4, 5, 7).
true_values = np.ravel(test).tolist()
class_labels = sorted(set(true_values))
outcome_names = ("same_correct", "same_wrong", "dif_correct", "dif_wrong")
per_class = {label: dict.fromkeys(outcome_names, 0) for label in class_labels}

for i in range(len(pred)):
    truth = true_values[i]
    # "same": the true mood equals the previous mood; "dif": it changed.
    persistence = "same" if truth == X_test_bored[i] else "dif"
    outcome = "correct" if truth == pred[i] else "wrong"
    per_class[truth][persistence + "_" + outcome] += 1

print("Total length: ", len(X_test_bored))

for position, label in enumerate(class_labels):
    if position:
        # Blank separator line between classes (not before the first one).
        print('')
    counts = per_class[label]
    print("Same Correct {} guesses is: ".format(label), counts["same_correct"])
    print("Same Wrong {} guesses is: ".format(label), counts["same_wrong"])
    print("Dif Correct {} guesses is: ".format(label), counts["dif_correct"])
    print("Dif Wrong {} guesses is: ".format(label), counts["dif_wrong"])

Total length:  2645
Same Correct 0 guesses is:  151
Same Wrong 0 guesses is:  888
Dif Correct 0 guesses is:  53
Dif Wrong 0 guesses is:  172

Same Correct 1 guesses is:  10
Same Wrong 1 guesses is:  262
Dif Correct 1 guesses is:  14
Dif Wrong 1 guesses is:  205

Same Correct 2 guesses is:  3
Same Wrong 2 guesses is:  229
Dif Correct 2 guesses is:  12
Dif Wrong 2 guesses is:  211

Same Correct 3 guesses is:  27
Same Wrong 3 guesses is:  156
Dif Correct 3 guesses is:  11
Dif Wrong 3 guesses is:  107

Same Correct 4 guesses is:  2
Same Wrong 4 guesses is:  53
Dif Correct 4 guesses is:  12
Dif Wrong 4 guesses is:  52

Same Correct 5 guesses is:  0
Same Wrong 5 guesses is:  0
Dif Correct 5 guesses is:  9
Dif Wrong 5 guesses is:  6


In [17]:
# Macro-averaged MAE for the current `test` / `pred`: mean absolute error per
# true class, then the unweighted mean of those per-class values.
# The classes are taken from `test` itself. The previous hard-coded labels 0-5
# divided by zero whenever one of them was absent and ignored any other label.
true_values = np.ravel(test)
pred_values = np.ravel(pred)
if true_values.shape != pred_values.shape:
    raise ValueError("test and pred must contain the same number of labels")
if true_values.size == 0:
    raise ValueError("cannot compute a macro-averaged MAE on empty input")

per_class_mae = []
for label in np.unique(true_values):
    in_class = true_values == label
    per_class_mae.append(np.abs(true_values[in_class] - pred_values[in_class]).mean())

mae_macroaverage = float(np.mean(per_class_mae))

In [18]:
print("Macro-averaged Mean Absolute error is: ", round(mae_macroaverage, 4))
# The F1 below is computed with average='weighted', so it is labelled as such
# (the label previously claimed a macro average).
print("Weighted-average F1-score is: ", round(f1_score(test, pred, average='weighted'), 4))

Macro-averaged Mean Absolute error is:  2.1174
Macro-averaged F1-score is:  0.1573
