In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns 
from sklearn.metrics import mean_squared_error, r2_score

### Make benchmark performance model

In [2]:
# Read the CSV (its first column becomes the index), keep only the three
# columns used below, and parse `time` into datetimes.
df = (
    pd.read_csv('data/data_arima.csv', index_col=0)
    .loc[:, ['id', 'mood', 'time']]
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
)
df.head(4)

Unnamed: 0,id,mood,time
0,AS14.01,,2014-02-17
1,AS14.01,,2014-02-18
2,AS14.01,,2014-02-19
3,AS14.01,,2014-02-20


In [3]:
# Add a column holding the mood from the previous recorded row of the same participant.
# Shifting within each `id` group keeps one participant's last mood from spilling into
# the first row of the next participant.
# NOTE: "previous" means the previous row for that id; the dates shown in the output are
# not always consecutive, so this is not guaranteed to be the previous calendar day.
# Assumes rows are already in chronological order within each id — TODO confirm.
df["moodpreviousday"] = df.groupby('id')['mood'].shift(1)
df.head(n=10)

Unnamed: 0,id,mood,time,moodpreviousday
0,AS14.01,,2014-02-17,
1,AS14.01,,2014-02-18,
2,AS14.01,,2014-02-19,
3,AS14.01,,2014-02-20,
4,AS14.01,,2014-02-21,
5,AS14.01,,2014-02-22,
6,AS14.01,,2014-02-25,
7,AS14.01,6.25,2014-02-26,
8,AS14.01,6.333333,2014-02-27,6.25
9,AS14.01,,2014-02-28,6.333333


In [4]:
# Keep only rows where both the current mood and the previous mood are present,
# since the benchmark compares the two values row by row.
df = df.dropna(subset=['moodpreviousday', 'mood'])
print(df.head(4))

         id      mood       time  moodpreviousday
8   AS14.01  6.333333 2014-02-27             6.25
27  AS14.01  6.400000 2014-03-22             6.20
28  AS14.01  6.800000 2014-03-23             6.40
29  AS14.01  6.000000 2014-03-24             6.80


In [5]:
# Benchmarking: score the naive "mood equals the previous mood" prediction per participant.

# A prediction counts as a hit when it is strictly closer than this to the actual mood.
HIT_TOLERANCE = 0.5

# Restrict the groups to the two columns the metrics read, so the lambdas never
# see the grouping column or the datetime column.
per_id = df.groupby('id')[['mood', 'moodpreviousday']]

# One row per id: mean squared error and R^2 of previous mood vs. actual mood.
benchmark = per_id.apply(lambda g: mean_squared_error(g.mood, g.moodpreviousday)).to_frame('mse')
benchmark['r2'] = per_id.apply(lambda g: r2_score(g.mood, g.moodpreviousday))

# 'corr' is the fraction of rows per id whose prediction is a hit (not a correlation).
# The boolean flag is computed per row first and then averaged per id, so the result
# is indexed by id and aligns with `benchmark`.
is_hit = (df['mood'] - df['moodpreviousday']).abs() < HIT_TOLERANCE
benchmark['corr'] = is_hit.groupby(df['id']).mean()

benchmark.to_csv('results/benchmark.csv')

# Last expression of the cell so the table is rendered in the notebook.
benchmark