In [1]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, f1_score, make_scorer, classification_report
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt
import mord
from collections import Counter

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

Using TensorFlow backend.


In [11]:
path = "/Users/Stijn/Documents/Master Data Science and Society/Block 3/thesis/code/thesis_Code/"
mood = pd.read_csv(path+'mood_imputed_median.csv', sep = ',', index_col=0)
mood['anxious_last'] = mood.groupby('user_id')['anxious'].shift()
mood.loc[(pd.isnull(mood.anxious_last)), 'anxious_last'] = mood['anxious']
mood_anxious = mood.drop(["bored", "content", "cheerful", "user_id", "response_time"], axis=1)

In [12]:
mood_anxious[:5]

Unnamed: 0,anxious,day_time_window,average_TimeUse,bulk,messaging,socialnetworking,otherapp,anxious_last
4825,3.0,3.0,0.673,0,0,0,1,3.0
4826,3.0,4.0,64.314667,0,1,0,2,3.0
245,3.0,1.0,20.5905,0,0,0,2,3.0
4827,3.0,2.0,0.0,0,0,0,0,3.0
4828,3.0,3.0,53.9572,0,3,0,2,3.0


In [13]:
# Normalize predictors
mood_normz = mood_anxious.copy()
features_to_normalize = ['day_time_window', 'average_TimeUse', 'messaging', 'socialnetworking', 'otherapp']
mood_normz[features_to_normalize] = mood_normz[features_to_normalize].apply(lambda x:(x-x.min()) / (x.max()-x.min()))

In [14]:
def macroaverage_mae(test, pred):
    mae_0, mae_1, mae_2, mae_3, mae_4, mae_5 = (0,0,0,0,0,0)

    for i in range(len(pred)):
        if test[i] == 0 and pred[i] != 0:
            mae_0 += (abs(test[i]-pred[i]))
        if test[i] == 1 and pred[i] != 1:
            mae_1 += (abs(test[i]-pred[i]))
        if test[i] == 2 and pred[i] != 2:
            mae_2 += (abs(test[i]-pred[i]))
        if test[i] == 3 and pred[i] != 3:
            mae_3 += (abs(test[i]-pred[i]))
        if test[i] == 4 and pred[i] != 4:
            mae_4 += (abs(test[i]-pred[i]))
        if test[i] == 5 and pred[i] != 5:
            mae_5 += (abs(test[i]-pred[i]))

    cnt_0, cnt_1, cnt_2, cnt_3, cnt_4, cnt_5 = (0,0,0,0,0,0)
    for z in range(len(test)):
        if test[z] == 0:
            cnt_0 += 1
        if test[z] == 1:
            cnt_1 += 1
        if test[z] == 2:
            cnt_2 += 1
        if test[z] == 3:
            cnt_3 += 1
        if test[z] == 4:
            cnt_4 += 1
        if test[z] == 5:
            cnt_5 += 1

    mae_macroaverage = ((mae_0/cnt_0) + (mae_1/cnt_1) + (mae_2/cnt_2) + (mae_3/cnt_3) + (mae_4/cnt_4) + (mae_5/cnt_5)) / 6
    return mae_macroaverage 



In [15]:
# X = predictors, y = target
X = mood_normz.iloc[:,1:-1]
y = mood_normz.iloc[:, 0:1]

# Convert float to int
y = y.astype(int)

In [16]:
X_train, y_train = shuffle(X, y)

In [17]:
# Convert oversample dataset to Pandas DataFrame
X = pd.DataFrame(data=X_train)
y = pd.DataFrame(data=y_train)

In [18]:
n_split=10
f1_scores = []
f1_scores_classes = []
mmae_scores = []
for train_index, test_index in KFold(n_splits = n_split, random_state=2, shuffle=True).split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    sm = SMOTENC(categorical_features=[0, 2], random_state = 2) 
    X_train_oversampl, y_train_oversampl = sm.fit_sample(X_train, y_train['anxious'].ravel()) 
    X_use, y_use = shuffle(X_train_oversampl, y_train_oversampl)
    
    model = Sequential()
    model.add(Dense(24, input_dim=6, activation='relu'))
    model.add(Dense(12, activation='relu'))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    
    model.fit(X_use, y_use, epochs=100, batch_size=512, verbose=0)
    y_pred = model.predict(X_test)
    
    pred = []
    for a in y_pred:
        for b in a:
            pred.append(round(b))
            
    print("Predicted values: ", Counter(pred))
    f1_scores.append(round(f1_score(y_test, pred, average='weighted'), 4))
    f1_scores_classes.append(f1_score(y_test, pred, average=None))
    
    test = y_test.values
    mmae_scores.append(macroaverage_mae(test, pred))

Predicted values:  Counter({3.0: 953, 2.0: 308, 1.0: 58, 0.0: 3, -1.0: 1})


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Predicted values:  Counter({3.0: 687, 2.0: 538, 1.0: 81, 0.0: 15, -1.0: 1})


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Predicted values:  Counter({3.0: 762, 2.0: 448, 1.0: 106, 0.0: 3, 4.0: 1, -4.0: 1, -1.0: 1})


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Predicted values:  Counter({2.0: 678, 3.0: 537, 1.0: 87, 4.0: 14, 0.0: 5, -1.0: 1})


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Predicted values:  Counter({2.0: 690, 3.0: 563, 1.0: 59, 0.0: 8, 4.0: 2})


  'precision', 'predicted', average, warn_for)


Predicted values:  Counter({2.0: 675, 3.0: 541, 1.0: 91, 0.0: 11, 4.0: 2, -2.0: 1, -4.0: 1})


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Predicted values:  Counter({3.0: 752, 2.0: 488, 1.0: 76, -0.0: 4, -1.0: 2})


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Predicted values:  Counter({2.0: 777, 3.0: 476, 1.0: 63, 0.0: 4, 4.0: 2})


  'precision', 'predicted', average, warn_for)


Predicted values:  Counter({3.0: 784, 2.0: 430, 1.0: 99, 4.0: 7, -1.0: 1, 0.0: 1})


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Predicted values:  Counter({3.0: 890, 2.0: 348, 1.0: 80, 4.0: 2, 0.0: 2})


  'precision', 'predicted', average, warn_for)


In [19]:
print("Mean F1-score per class: ", [round(np.mean(x), 4) for x in zip(*f1_scores_classes)])
print("STD F1-score per class: ", [round(np.std(x), 4) for x in zip(*f1_scores_classes)])

Mean F1-score per class:  [0.0022, 0.032, 0.1146, 0.1734, 0.1126, 0.0309]
STD F1-score per class:  [0.0042, 0.0391, 0.0699, 0.0423, 0.0707, 0.0619]


In [20]:
print("Macro-averaged F1-scores: ", f1_scores)
print("Mean MA F1-scores: ", round(np.mean(f1_scores), 4))
print("Std MA F1-scores: ", round(np.std(f1_scores), 4))

Macro-averaged F1-scores:  [0.0573, 0.078, 0.0692, 0.0791, 0.0658, 0.0835, 0.0569, 0.0577, 0.0644, 0.0601]
Mean MA F1-scores:  0.0672
Std MA F1-scores:  0.0094


In [21]:
print("Macro-averaged MAE: ", mmae_scores)
print("Mean MMAE-scores: ", round(np.mean(mmae_scores), 4))
print("Std MMAE-scores: ", round(np.std(mmae_scores), 4))

Macro-averaged MAE:  [array([1.45693798]), array([1.48751011]), array([1.43925808]), array([1.52255568]), array([1.53200698]), array([1.52299358]), array([1.45502025]), array([1.49323558]), array([1.50676595]), array([1.43515362])]
Mean MMAE-scores:  1.4851
Std MMAE-scores:  0.0345
