In [11]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, f1_score, make_scorer, classification_report
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt
import mord
from collections import Counter

In [2]:
path = "/Users/Stijn/Documents/Master Data Science and Society/Block 3/thesis/code/thesis_Code/"
mood = pd.read_csv(path+'mood_imputed_median.csv', sep = ',', index_col=0)
mood['bored_last'] = mood.groupby('user_id')['bored'].shift()
mood.loc[(pd.isnull(mood.bored_last)), 'bored_last'] = mood['bored']
mood_bored = mood.drop(["anxious", "content", "cheerful", "user_id", "response_time"], axis=1)

In [25]:
mood_bored[:5]
X.iloc[:,1:]

Unnamed: 0,1,2,3,4,5
0,0.061624,0.0,0.012228,0.020000,0.000000
1,0.004454,0.0,0.000000,0.020000,0.000000
2,0.000000,0.0,0.000000,0.000000,0.000000
3,0.000000,0.0,0.000000,0.000000,0.000000
4,0.029749,0.0,0.007190,0.000000,0.008547
...,...,...,...,...,...
38413,0.000000,0.0,0.000000,0.000000,0.000000
38414,0.017027,0.0,0.068120,0.012316,0.068376
38415,0.014488,0.0,0.009524,0.020000,0.017094
38416,0.000000,0.0,0.000000,0.000000,0.000000


In [4]:
# Normalize predictors
mood_normz = mood_bored.copy()
features_to_normalize = ['day_time_window', 'average_TimeUse', 'messaging', 'socialnetworking', 'otherapp']
mood_normz[features_to_normalize] = mood_normz[features_to_normalize].apply(lambda x:(x-x.min()) / (x.max()-x.min()))

In [5]:
# X = predictors, y = target
X = mood_normz.iloc[:,1:]
y = mood_normz.iloc[:, 0:1]

# Convert float to int
y = y.astype(int)

In [6]:
# Oversampling unbalanced target
print("Before OverSampling, counts of labels: {}".format(y['bored'].value_counts()))
  
sm = SMOTENC(categorical_features=[0, 2, 6], random_state = 2) 
X_train_res, y_train_res = sm.fit_sample(X, y['bored'].ravel()) 

print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0))) 
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1))) 
print("After OverSampling, counts of label '2': {}".format(sum(y_train_res == 2))) 
print("After OverSampling, counts of label '3': {}".format(sum(y_train_res == 3))) 
print("After OverSampling, counts of label '4': {}".format(sum(y_train_res == 4))) 
print("After OverSampling, counts of label '5': {}".format(sum(y_train_res == 5))) 

Before OverSampling, counts of labels: 0    6403
1    2418
2    2295
3    1485
4     522
5      98
Name: bored, dtype: int64
After OverSampling, counts of label '0': 6403
After OverSampling, counts of label '1': 6403
After OverSampling, counts of label '2': 6403
After OverSampling, counts of label '3': 6403
After OverSampling, counts of label '4': 6403
After OverSampling, counts of label '5': 6403


In [7]:
# Convert oversample dataset to Pandas DataFrame
X_train = pd.DataFrame(data=X_train_res)
y_train = pd.DataFrame(data=y_train_res)

In [8]:
X_train_nomood = X_train.iloc[:,:-1]
X_train_onlyboredlast = X_train.iloc[:,-1:]

In [9]:
X_train = X_train_nomood.values
y_train = y_train.values.ravel()

In [10]:
# Shuffle both train and test set to cancel patterns
X, y = shuffle(X_train, y_train)

In [16]:
rfc=RandomForestClassifier(random_state=42)
parameter_candidates = { 
    'n_estimators': [100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
clf = GridSearchCV(estimator=rfc, 
                    param_grid=parameter_candidates,
                    cv=10,
                    n_jobs=-1)

In [17]:
clf_fit = clf.fit(X, y)

In [18]:
print('Best score:', clf_fit.best_score_) 
print(" ")
print('Best estimator:',clf_fit.best_estimator_) 

Best score: 0.35741058878650633
 
Best estimator: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=8, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)


In [19]:
def macroaverage_mae(test, pred):
    mae_0, mae_1, mae_2, mae_3, mae_4, mae_5 = (0,0,0,0,0,0)

    for i in range(len(pred)):
        if test[i] == 0 and pred[i] != 0:
            mae_0 += (abs(test[i]-pred[i]))
        if test[i] == 1 and pred[i] != 1:
            mae_1 += (abs(test[i]-pred[i]))
        if test[i] == 2 and pred[i] != 2:
            mae_2 += (abs(test[i]-pred[i]))
        if test[i] == 3 and pred[i] != 3:
            mae_3 += (abs(test[i]-pred[i]))
        if test[i] == 4 and pred[i] != 4:
            mae_4 += (abs(test[i]-pred[i]))
        if test[i] == 5 and pred[i] != 5:
            mae_5 += (abs(test[i]-pred[i]))

    cnt_0, cnt_1, cnt_2, cnt_3, cnt_4, cnt_5 = (0,0,0,0,0,0)
    for z in range(len(test)):
        if test[z] == 0:
            cnt_0 += 1
        if test[z] == 1:
            cnt_1 += 1
        if test[z] == 2:
            cnt_2 += 1
        if test[z] == 3:
            cnt_3 += 1
        if test[z] == 4:
            cnt_4 += 1
        if test[z] == 5:
            cnt_5 += 1

    mae_macroaverage = ((mae_0/cnt_0) + (mae_1/cnt_1) + (mae_2/cnt_2) + (mae_3/cnt_3) + (mae_4/cnt_4) + (mae_5/cnt_5)) / 6
    return mae_macroaverage 

#score = make_scorer(macroaverage_mae, greater_is_better=False)

In [27]:
# X = predictors, y = target
X = mood_normz.iloc[:,1:-1]
y = mood_normz.iloc[:, 0:1]

# Convert float to int
y = y.astype(int)

In [28]:
n_split=10
f1_scores = []
f1_scores_classes = []
mmae_scores = []
for train_index, test_index in KFold(n_splits = n_split, random_state=2, shuffle=True).split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    sm = SMOTENC(categorical_features=[0, 2], random_state = 2) 
    X_train_oversampl, y_train_oversampl = sm.fit_sample(X_train, y_train['bored'].ravel()) 
    
    model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=8, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
                                       
    model.fit(X_train_oversampl, y_train_oversampl )                                    
    pred = model.predict(X_test)
    print("Predicted values: ", Counter(pred))
    
    f1_scores.append(round(f1_score(y_test, pred, average='weighted'), 4))
    f1_scores_classes.append(f1_score(y_test, pred, average=None))
    
    test = y_test.values
    mmae_scores.append(macroaverage_mae(test, pred))

Predicted values:  Counter({3: 382, 0: 373, 5: 343, 1: 125, 4: 54, 2: 46})
Predicted values:  Counter({3: 491, 0: 372, 5: 208, 1: 152, 2: 52, 4: 47})
Predicted values:  Counter({3: 499, 0: 353, 5: 194, 1: 168, 2: 62, 4: 46})
Predicted values:  Counter({5: 374, 3: 359, 0: 296, 1: 170, 2: 70, 4: 53})
Predicted values:  Counter({0: 369, 3: 367, 5: 350, 1: 129, 4: 57, 2: 50})
Predicted values:  Counter({5: 387, 0: 377, 3: 334, 1: 120, 2: 61, 4: 43})
Predicted values:  Counter({0: 376, 3: 349, 5: 343, 1: 134, 2: 67, 4: 53})
Predicted values:  Counter({0: 371, 3: 359, 5: 347, 1: 129, 2: 66, 4: 50})
Predicted values:  Counter({3: 503, 0: 360, 5: 205, 1: 144, 2: 61, 4: 49})
Predicted values:  Counter({3: 635, 0: 376, 1: 124, 5: 80, 4: 57, 2: 50})


In [29]:
print("Mean F1-score per class: ", [round(np.mean(x), 4) for x in zip(*f1_scores_classes)])
print("STD F1-score per class: ", [round(np.std(x), 4) for x in zip(*f1_scores_classes)])

Mean F1-score per class:  [0.3852, 0.1668, 0.0706, 0.1898, 0.0414, 0.012]
STD F1-score per class:  [0.0161, 0.0261, 0.0147, 0.0193, 0.0258, 0.0063]


In [31]:
print("Macro-averaged F1-scores: ", f1_scores)
print("Mean MA F1-scores: ", round(np.mean(f1_scores), 4))
print("Std MA F1-scores: ", round(np.std(f1_scores), 4))

Macro-averaged F1-scores:  [0.2448, 0.2636, 0.2667, 0.2399, 0.2402, 0.2513, 0.2621, 0.238, 0.2581, 0.2604]
Mean MA F1-scores:  0.2525
Std MA F1-scores:  0.0104


In [32]:
print("Macro-averaged MAE: ", mmae_scores)
print("Mean MMAE-scores: ", round(np.mean(mmae_scores), 4))
print("Std MMAE-scores: ", round(np.std(mmae_scores), 4))

Macro-averaged MAE:  [array([2.09819959]), array([1.85038341]), array([1.86577965]), array([2.10866282]), array([1.89640009]), array([1.73891568]), array([2.16378582]), array([2.05197649]), array([1.74158547]), array([1.91280214])]
Mean MMAE-scores:  1.9428
Std MMAE-scores:  0.1456
