In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import tensorflow as tf
from sklearn.cross_validation import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from tensorflow.keras import optimizers
from sklearn import svm



In [2]:
root = r'C:\Users\janse\OneDrive\Bureaublad\Master\Data Mining Techniques'
dataset = root + r'\dataset_mood_smartphone.csv'
df_main = pd.read_csv(dataset)

In [3]:
# Extract daily features
df = df_main
variables = df.variable.unique()
df['time'] = pd.to_datetime(df['time'])
date_min = pd.to_datetime('2014-03-01')
df = df[df.time >= date_min]
df = pd.pivot_table(df, index=['id','time'], columns='variable', values='value').reset_index()
df['date'] = df['time'].apply(lambda x: x.date())
features = {'time':'count',
            'mood':'mean',
            'circumplex.valence':'mean',
            'circumplex.arousal' : 'mean',
            'activity':'mean',
            'screen':'sum',
            'call':'sum',
            'sms':'sum',
            'appCat.builtin': 'sum',
            'appCat.communication': 'sum',
            'appCat.entertainment': 'sum',
            'appCat.finance': 'sum',
            'appCat.game': 'sum',
            'appCat.office': 'sum',
            'appCat.other': 'sum',
            'appCat.social': 'sum',
            'appCat.travel': 'sum',
            'appCat.unknown': 'sum',
            'appCat.utilities': 'sum',
            'appCat.weather': 'sum'
           }
df = df.groupby(['id', 'date']).agg(features).reset_index()
df = df[~df.activity.isnull() & ~df.mood.isnull()]
# df = df[~((df.id == 'AS14.12') & (pd.to_datetime(df.date) == pd.to_datetime('2014-03-15')))]
# df[df.id == 'AS14.17'].head()

In [29]:
# Data quality
features = {'date':['min','max','count']}
df_quality = df.groupby('id').agg(features)
df_quality.columns = ["_".join(pair) for pair in df_quality.columns]
df_quality['days_between'] = (df_quality.date_max - df_quality.date_min).dt.days + 1
df_quality['days_missing'] = df_quality['days_between'] - df_quality['date_count']
df_quality

Unnamed: 0_level_0,date_min,date_max,date_count,days_between,days_missing
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AS14.01,2014-03-21,2014-05-04,45,45,0
AS14.02,2014-03-17,2014-04-24,36,39,3
AS14.03,2014-03-22,2014-05-07,46,47,1
AS14.05,2014-03-17,2014-05-05,50,50,0
AS14.06,2014-03-24,2014-05-08,45,46,1
AS14.07,2014-03-25,2014-05-04,41,41,0
AS14.08,2014-03-17,2014-05-05,50,50,0
AS14.09,2014-03-22,2014-04-26,36,36,0
AS14.12,2014-03-27,2014-05-05,40,40,0
AS14.13,2014-03-20,2014-05-02,44,44,0


In [21]:
# Clustering user types k=4

df_for_clustering = df.drop(['time', 'date'], axis=1)
features_mean = {'mood':'mean',
            'circumplex.valence':'mean',
            'circumplex.arousal' : 'mean',
            'activity':'mean',
            'screen':'mean',
            'call':'mean',
            'sms':'mean',
            'appCat.builtin': 'mean',
            'appCat.communication': 'mean',
            'appCat.entertainment': 'mean',
            'appCat.finance': 'mean',
            'appCat.game': 'mean',
            'appCat.office': 'mean',
            'appCat.other': 'mean',
            'appCat.social': 'mean',
            'appCat.travel': 'mean',
            'appCat.unknown': 'mean',
            'appCat.utilities': 'mean',
            'appCat.weather': 'mean'
           }
df_for_clustering = df_for_clustering.groupby('id').agg(features_mean)
kmeans = KMeans(n_clusters=4)
kmeans.fit(df_for_clustering)
group = kmeans.predict(df_for_clustering)
user = df_for_clustering.reset_index()[['id']]
user['user_group'] = group

Unnamed: 0,id,group
0,AS14.01,1
1,AS14.02,2
2,AS14.03,2
3,AS14.05,2
4,AS14.06,3
5,AS14.07,3
6,AS14.08,0
7,AS14.09,2
8,AS14.12,3
9,AS14.13,3


In [4]:
# Preprocessing
window = 5

agg_dict = {'date':'count',
            'mood':['mean','first', 'last'],
            'circumplex.valence':'mean',
            'circumplex.arousal' : 'mean',
            'activity':'sum',
            'screen':'sum',
            'call':'sum',
            'sms':'sum',
            'appCat.builtin': 'sum',
            'appCat.communication': 'sum',
            'appCat.entertainment': 'sum',
            'appCat.finance': 'sum',
            'appCat.game': 'sum',
            'appCat.office': 'sum',
            'appCat.other': 'sum',
            'appCat.social': 'sum',
            'appCat.travel': 'sum',
            'appCat.unknown': 'sum',
            'appCat.utilities': 'sum',
            'appCat.weather': 'sum'
           }

def get_features(row, df, x, agg_dict):
    current_date = row['date']
    user_id = row['id']
    current_mood = row['mood']
    
    # get dataframe from previous x days
    last_window = df[(df.id == user_id) & (df.date >= current_date - pd.Timedelta(days=x)) & (df.date < current_date)]
    date_count = last_window.shape[0]
    
    last_window = last_window.groupby('id').agg(agg_dict).reset_index()
    
    # reset columns names
    last_window.columns = ["_".join(pair) for pair in last_window.columns]
    last_window['inactive_days'] = x - date_count
    last_window['label'] = current_mood
    
    return last_window

train_data = None

for i in range(df.shape[0]):
    row = df.iloc[i]
    temp = get_features(row, df, window, agg_dict)
    if i == 0:
        train_data = temp
    else:
        if i % 100 == 0:
            print('{} rows done'.format(i))
            
        train_data = pd.concat([train_data, temp])


100 rows done
200 rows done
300 rows done
400 rows done
500 rows done
600 rows done
700 rows done
800 rows done
900 rows done
1000 rows done
1100 rows done


In [15]:
labels = train_data['label'].values
features = train_data.drop(['label','id_'], axis=1).values

min_l = labels.min()
max_l = labels.max()

y_train, y_test, x_train, x_test = train_test_split(labels, features, train_size=0.9)

print(y_train.shape, y_test.shape, x_train.shape, x_test.shape)

supp_vec = svm.SVR()
supp_vec.fit(x_train, y_train)
predictions = supp_vec.predict(x_test)
print(mean_squared_error(predictions, y_test))
results = pd.DataFrame({'prediction':predictions, 'actual':y_test})

(1010,) (113,) (1010, 23) (113, 23)
0.589858498205595


In [24]:
results

Unnamed: 0,prediction,actual
0,6.737899,7.600000
1,7.306653,7.800000
2,6.177906,6.800000
3,6.595115,7.000000
4,6.604506,6.400000
5,7.002927,7.000000
6,6.085817,5.800000
7,7.447062,8.000000
8,6.997015,6.500000
9,6.770931,7.200000
