In [1]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, f1_score, make_scorer, classification_report, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt
import mord
from collections import Counter

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

Using TensorFlow backend.


In [2]:
# Read mood_imputed_median.csv; its first column is used as the row index.
# NOTE(review): `path` is an absolute, machine-specific directory. Consider a
# configurable data directory so the notebook runs elsewhere.
path = "/Users/Stijn/Documents/Master Data Science and Society/Block 3/thesis/code/thesis_Code/"
mood = pd.read_csv(path+'mood_imputed_median.csv', sep = ',', index_col=0)

# 'anxious' value of the preceding row of the same user_id. The first row of
# each user has no predecessor, so it falls back to its own 'anxious' value.
previous_anxious = mood.groupby('user_id')['anxious'].shift()
mood['anxious_last'] = previous_anxious.fillna(mood['anxious'])

# Keep only the columns used for the 'anxious' analysis.
mood_anxious = mood.drop(columns=["bored", "content", "cheerful", "user_id", "response_time"])

In [3]:
# Encode the direction of change of 'anxious' relative to the previous rating:
#   0 = decreased, 1 = unchanged, 2 = increased.
# sign(current - previous) is -1 / 0 / +1, so adding 1 gives the 0 / 1 / 2
# coding in a single vectorised step instead of a Python-level row loop.
anxious_delta = mood_anxious['anxious'] - mood_anxious['anxious_last']

# astype(int) raises if any delta is NaN, so a missing rating fails here
# instead of silently producing a label list of the wrong length.
change_list = (np.sign(anxious_delta) + 1).astype(int).tolist()

mood_anxious['change'] = change_list


In [4]:
# Remove the two columns 'change' was derived from. 'change' stays the last
# column, which the X / y cell relies on when slicing by position.
# Re-running this cell on its own raises KeyError: the columns are already gone.
mood_anxious = mood_anxious.drop(columns=['anxious', 'anxious_last'])

In [5]:
# Min-max rescale the listed predictors: (x - min) / (max - min) per column.
# Work on a copy so mood_anxious keeps the unscaled values.
# NOTE(review): min and max are taken over the whole dataset, i.e. before the
# cross-validation split further down - confirm this is acceptable.
mood_normz = mood_anxious.copy()
features_to_normalize = ['day_time_window', 'average_TimeUse', 'messaging', 'socialnetworking', 'otherapp']

unscaled = mood_normz[features_to_normalize]
column_min = unscaled.min()
column_range = unscaled.max() - column_min
mood_normz[features_to_normalize] = (unscaled - column_min) / column_range

In [6]:
def macroaverage_mae(test, pred):
    """Class-averaged absolute error for labels coded 0, 1 and 2.

    For every true class, the absolute differences ``|test[i] - pred[i]|`` are
    summed and divided by the number of samples whose true label is that
    class. The three per-class values are then added and divided by 6.

    NOTE(review): the divisor is 6, twice the number of classes. A standard
    macro-averaged MAE would divide by 3 - confirm the extra factor of 2 is an
    intended normalisation.

    Parameters
    ----------
    test : sequence indexable by integer position
        True labels.
    pred : sequence indexable by integer position
        Predicted labels, positionally aligned with ``test``.

    Returns
    -------
    A float when ``test`` holds plain numbers. When ``test`` is an ``(n, 1)``
    array, each element is itself a 1-element array and so is the result.

    Raises
    ------
    ZeroDivisionError
        If one of the classes 0, 1, 2 does not occur in ``test``.
    """
    labels = (0, 1, 2)

    # Summed absolute error per true class; only misclassified samples add to it.
    error_sums = [0, 0, 0]
    for idx in range(len(pred)):
        for label in labels:
            if test[idx] == label and pred[idx] != label:
                error_sums[label] += abs(test[idx] - pred[idx])

    # Number of samples belonging to each true class.
    class_sizes = [0, 0, 0]
    for idx in range(len(test)):
        for label in labels:
            if test[idx] == label:
                class_sizes[label] += 1

    per_class_mae = [error_sums[label] / class_sizes[label] for label in labels]
    return (per_class_mae[0] + per_class_mae[1] + per_class_mae[2]) / 6

In [7]:
# The last column of mood_normz is the target; everything before it is a predictor.
# y is sliced as a range so it stays a one-column DataFrame rather than a Series.
target_position = mood_normz.shape[1] - 1
X = mood_normz.iloc[:, :target_position]
y = mood_normz.iloc[:, target_position:]

In [8]:
# Shuffle predictors and target together (rows stay aligned).
# A fixed random_state makes the row order, and therefore every downstream
# fold, identical on each Restart & Run All.
X_train, y_train = shuffle(X, y, random_state=2)

In [9]:
# Rebind X and y to DataFrames built from the shuffled predictors and target.
# No oversampling has happened yet at this point.
X, y = pd.DataFrame(X_train), pd.DataFrame(y_train)

In [10]:
# 10-fold cross-validation. In every fold the training part is oversampled
# with SMOTENC, a random forest is fitted on it, and the untouched test part
# is scored.
n_split=10
f1_scores = []          # one weighted F1 value per fold
f1_scores_classes = []  # one array of per-class F1 values per fold
mmae_scores = []        # one macroaverage_mae result per fold
for train_index, test_index in KFold(n_splits = n_split, random_state=2, shuffle=True).split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Oversample the training fold only, so no synthetic rows reach the test fold.
    # Columns at positions 0 and 2 are declared categorical to SMOTENC.
    sm = SMOTENC(categorical_features=[0, 2], random_state = 2)
    # fit_resample is the supported method name; fit_sample was a deprecated
    # alias that later imbalanced-learn releases removed.
    X_train_oversampl, y_train_oversampl = sm.fit_resample(X_train, y_train['change'].values)
    # Seeded so the row order fed to the forest is the same on every run.
    X_use, y_use = shuffle(X_train_oversampl, y_train_oversampl, random_state=2)

    # Only the non-default settings are spelled out. The arguments dropped here
    # were all library defaults, and some of them (min_impurity_split,
    # max_features='auto') are rejected by newer scikit-learn releases.
    model = RandomForestClassifier(n_estimators=500, max_depth=8, random_state=42)

    model.fit(X_use, y_use)
    pred = model.predict(X_test)
    print("Predicted values: ", Counter(pred))

    f1_scores.append(round(f1_score(y_test, pred, average='weighted'), 4))
    f1_scores_classes.append(f1_score(y_test, pred, average=None))

    # y_test is a one-column DataFrame, so .values is 2-D with shape (n, 1).
    # macroaverage_mae therefore returns a 1-element array per fold, which the
    # MMAE reporting cell iterates over. Keep this shape.
    test = y_test.values
    mmae_scores.append(macroaverage_mae(test, pred))

Predicted values:  Counter({1: 798, 0: 310, 2: 215})
Predicted values:  Counter({1: 851, 0: 304, 2: 167})
Predicted values:  Counter({1: 783, 0: 333, 2: 206})
Predicted values:  Counter({1: 793, 0: 313, 2: 216})
Predicted values:  Counter({1: 785, 0: 337, 2: 200})
Predicted values:  Counter({1: 815, 0: 293, 2: 214})
Predicted values:  Counter({1: 807, 0: 314, 2: 201})
Predicted values:  Counter({1: 813, 0: 308, 2: 201})
Predicted values:  Counter({1: 839, 0: 335, 2: 148})
Predicted values:  Counter({1: 793, 0: 320, 2: 209})


In [11]:
# f1_scores_classes holds one per-class F1 array per fold. Transposing with
# zip(*...) groups the fold scores by class.
scores_by_class = list(zip(*f1_scores_classes))

class_means = [round(np.mean(fold_scores), 3) for fold_scores in scores_by_class]
print("Mean F1-score per class: ", class_means)

class_stds = [round(np.std(fold_scores), 3) for fold_scores in scores_by_class]
print("STD F1-score per class: ", class_stds)

Mean F1-score per class:  [0.208, 0.707, 0.186]
STD F1-score per class:  [0.012, 0.017, 0.032]


In [12]:
# f1_scores holds one f1_score(..., average='weighted') value per fold, so the
# labels say "weighted". These are not macro-averaged scores.
print("Weighted F1-scores: ", f1_scores)
print("Mean weighted F1-scores: ", round(np.mean(f1_scores), 3))
print("Std weighted F1-scores: ", round(np.std(f1_scores), 3))

Macro-averaged F1-scores:  [0.5721, 0.6054, 0.5823, 0.5698, 0.552, 0.5684, 0.5953, 0.5672, 0.6085, 0.5546]
Mean MA F1-scores:  0.578
Std MA F1-scores:  0.019


In [13]:
# Each entry of mmae_scores is itself iterable (one value per fold); flatten
# the entries into a single list of rounded numbers for display.
mmae_scores_1 = [round(value, 3) for fold_result in mmae_scores for value in fold_result]
print("Macro-averaged MAE: ", mmae_scores_1)

# Mean and standard deviation are taken over the unrounded fold results.
mmae_mean = round(np.mean(mmae_scores), 3)
print("Mean MMAE-scores: ", mmae_mean)
mmae_std = round(np.std(mmae_scores), 3)
print("Std MMAE-scores: ", mmae_std)

Macro-averaged MAE:  [0.369, 0.363, 0.39, 0.379, 0.406, 0.391, 0.397, 0.404, 0.371, 0.389]
Mean MMAE-scores:  0.386
Std MMAE-scores:  0.014
