In [118]:
import datetime as dt
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture as GMM
from sklearn import metrics
import altair as alt
# Switch Altair to its 'json' data transformer for all charts in this notebook.
# NOTE(review): presumably chosen so that large frames are not embedded inline
# in the chart spec — confirm against the installed Altair version.
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

In [2]:
# Column dtypes passed to the CSV reader: both identifiers are stored as categoricals.
types = dict(user='category', sport='category')

In [3]:
# Load the raw tour events: the columns named in `types` get the dtypes given
# there and `timestamp` is parsed to datetime.
tours = pd.read_csv(
    'tour-events.csv',
    dtype=types,
    parse_dates=['timestamp'],
)
tours.head()

Unnamed: 0,timestamp,user,sport,longitude,latitude
0,2016-05-29 06:00:00,51663fe728afd0c03701845329190f7fe93c3a8ab2ff56...,hike,16.1,47.7
1,2016-08-15 19:00:00,08f0a9cb02168dce9544c7ce237eb2a291901a048bad24...,hike,12.0,47.7
2,2016-05-14 19:00:00,0762522db4b59efa5a11a535c88477d6c7a63972febdda...,hike,9.4,51.8
3,2017-06-12 20:00:00,70cc307f4148283217dadfcbff54eb9e7d7a944a0b57f6...,touringbicycle,7.8,48.0
4,2014-05-25 09:00:00,c1fc1e63e5e3875538ca23351954ed39405ec27b0c15bb...,mtb,6.8,50.6


In [4]:
# Number of distinct users in the event log (missing values, if any, count as one).
tours['user'].nunique(dropna=False)

60201

In [5]:
# Summarise column dtypes, non-null counts and memory footprint of the raw events.
tours.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676701 entries, 0 to 676700
Data columns (total 5 columns):
timestamp    676701 non-null datetime64[ns]
user         676701 non-null category
sport        676701 non-null category
longitude    676380 non-null float64
latitude     676380 non-null float64
dtypes: category(2), datetime64[ns](1), float64(2)
memory usage: 21.7 MB


In [11]:
# Calendar day of each event (time of day dropped); used for all per-day statistics below.
tours['date'] = tours['timestamp'].dt.date

In [12]:
# Most recent activity date per user, as a two-column frame (user, date).
last_days = (
    tours
    .groupby('user')['date']
    .agg('max')
    .reset_index()
)

In [13]:
# Per (user, sport) summary: number of distinct active days plus first and last
# activity date.
# - observed=True restricts the result to user/sport pairs that actually occur;
#   both keys are categoricals, so without it the result can expand to the full
#   user x sport cartesian product depending on the pandas version.
# - String aggregation names use pandas' native implementations instead of a
#   Python lambda and the builtins min/max.
# The dict-of-list form is kept on purpose so the result still carries the
# two-level column index that the following cell expects.
sports = (
    tours
    .groupby(['user', 'sport'], observed=True)
    .agg({'date': ['nunique', 'min', 'max']})
    .reset_index()
)

In [14]:
# Replace the two-level column index produced by the aggregation with flat,
# descriptive names. Assigning the full list of labels already discards the
# old index, so no separate droplevel step is needed.
sports.columns = ['user', 'sport', 'active_days', 'first_date', 'last_date']

In [15]:
# Preview the first five per-user, per-sport summary rows.
sports.head(5)

Unnamed: 0,user,sport,active_days,first_date,last_date
0,00036b3dfee1029b02e46c50fe5316ce96974f763e79e9...,hike,1,2017-08-15,2017-08-15
1,00036b3dfee1029b02e46c50fe5316ce96974f763e79e9...,mtb,6,2017-08-15,2017-09-09
2,0003d5a0aafa8758eef81d1167cc0a37fc36f885d2daa3...,mtb_easy,1,2017-07-01,2017-07-01
3,0003d5a0aafa8758eef81d1167cc0a37fc36f885d2daa3...,touringbicycle,8,2017-06-07,2017-08-30
4,000531fe381257d0684785fb3d04805a67be563a49554c...,mtb,6,2017-05-21,2017-08-27


In [16]:
# Distinct sport labels present in the summary table.
sports['sport'].unique()

[hike, mtb, mtb_easy, touringbicycle, racebike, ..., citybike, mtb_advanced, mountaineering_advanced, snowboard, Other]
Length: 23
Categories (23, object): [hike, mtb, mtb_easy, touringbicycle, ..., mtb_advanced, mountaineering_advanced, snowboard, Other]

In [17]:
# Inclusive number of calendar days between the first and last activity in a sport.
activity_span = sports['last_date'] - sports['first_date']
sports['sports_period'] = activity_span.dt.days + 1

# Fraction of the days in that span on which the user was active.
sports['sports_frequency'] = sports['active_days'] / sports['sports_period']

In [23]:
# Wide per-user feature table: one column per (metric, sport) pair, filled
# with 0 where a user has no entry for that sport.
data = sports.pivot_table(
    index=['user'],
    columns='sport',
    values=['active_days', 'sports_frequency'],
    fill_value=0,
)
# Collapse the two-level column labels into flat "<metric>_<sport>" names.
data.columns = ['_'.join(levels) for levels in data.columns]

In [24]:
# Attach each user's last activity date under a descriptive column name.
data = (
    data
    .merge(last_days, on='user')
    .rename(columns={'date': 'last_usage_date'})
)
# Days between a user's last activity and the most recent activity of any user.
most_recent_date = data['last_usage_date'].max()
data['days_inactive'] = (most_recent_date - data['last_usage_date']).dt.days

In [25]:
# Preview the first five rows of the per-user feature table.
data.head(5)

Unnamed: 0,user,active_days_Other,active_days_citybike,active_days_climbing,active_days_downhillbike,active_days_hike,active_days_jogging,active_days_mountaineering,active_days_mountaineering_advanced,active_days_mtb,...,sports_frequency_skaten,sports_frequency_skialpin,sports_frequency_skitour,sports_frequency_sled,sports_frequency_snowboard,sports_frequency_snowshoe,sports_frequency_touringbicycle,sports_frequency_unicycle,last_usage_date,days_inactive
0,00036b3dfee1029b02e46c50fe5316ce96974f763e79e9...,0,0,0,0,1,0,0,0,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-09-09,4
1,0003d5a0aafa8758eef81d1167cc0a37fc36f885d2daa3...,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.094118,0.0,2017-08-30,14
2,000531fe381257d0684785fb3d04805a67be563a49554c...,0,0,0,0,0,0,0,0,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-08-27,17
3,0006a7346af1752583b6d5fd6872dcdd91aa1c46a64c40...,0,0,0,0,5,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-09-10,3
4,00092cf7fb3ac9faadba1f5a0b613690be75c86ec37a0c...,0,0,0,0,2,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-08-23,21


In [115]:
# Long format for plotting: one row per (user, feature) with columns
# `variable` and `value`; the raw date column is dropped first.
plot_data = (
    data
    .drop(columns='last_usage_date')
    .melt(id_vars='user')
)

In [122]:
# Preview the first five rows of the long-format plotting table.
plot_data.head(5)

Unnamed: 0,user,variable,value
0,00036b3dfee1029b02e46c50fe5316ce96974f763e79e9...,active_days_Other,0.0
1,0003d5a0aafa8758eef81d1167cc0a37fc36f885d2daa3...,active_days_Other,0.0
2,000531fe381257d0684785fb3d04805a67be563a49554c...,active_days_Other,0.0
3,0006a7346af1752583b6d5fd6872dcdd91aa1c46a64c40...,active_days_Other,0.0
4,00092cf7fb3ac9faadba1f5a0b613690be75c86ec37a0c...,active_days_Other,0.0


In [None]:
# Histogram of every feature: binned value on x, row count on y,
# one facet row per variable, each facet with its own x scale.
value_axis = alt.X('value:Q', bin=alt.Bin(maxbins=500))
histogram = alt.Chart(plot_data).mark_bar().encode(value_axis, y='count():Q')
histogram.facet(row='variable:N').resolve_scale(x='independent')

Most dimensions are zero-inflated: for any given feature, a large share of users have a value of 0.

## Clustering

In [38]:
# Feature matrix for clustering: every column except the user identifier and
# the raw last-usage date.
# NOTE(review): the name `input` shadows the Python builtin; it is kept here
# because the following cells refer to it.
input = data.drop(columns=['user', 'last_usage_date'])

In [41]:
# Baseline 4-component Gaussian mixture. The fit is seeded (random_state=0,
# matching the model grid below) so that re-running the notebook gives the
# same result. covariance_type is left at the library default
# ('full' per the scikit-learn documentation).
gmm = GMM(n_components=4, random_state=0).fit(input)
# To attach the hard cluster assignment to the feature table:
#data['label_1'] = gmm.predict(input)

In [109]:
# Model grid: every covariance structure for 3..15 mixture components.
cov = ['spherical', 'tied', 'diag', 'full']
n = np.arange(3, 16)

# Component count varies slowest and covariance type fastest; every fit uses
# the same seed.
models = [
    GMM(n_components, covariance_type=covariance, random_state=0).fit(input)
    for n_components in n
    for covariance in cov
]

In [146]:
# Information criteria of every fitted model, evaluated on the training data.
aic = [fitted.aic(input) for fitted in models]
bic = [fitted.bic(input) for fitted in models]


In [144]:
# Author notes:
#   - Latent Dirichlet allocation for high dimensions
#   - scaling: needed only when using priors

# Silhouette score (Euclidean distance) of each model's predicted labels,
# in the same order as `models`.
shil_scores = [
    metrics.silhouette_score(input, fitted.predict(input), metric='euclidean')
    for fitted in models
]

In [148]:
# One row per fitted model with its AIC, BIC and silhouette score.
results = pd.DataFrame({
    'model': models,
    'aic': aic,
    'bic': bic,
    'shil_score': shil_scores,
})

In [165]:
# Show the model-comparison table (one row per fitted model).
results

Unnamed: 0,model,aic,bic,shil_score
0,"GaussianMixture(covariance_type='spherical', i...",25409920.0,25410790.0,0.543007
1,"GaussianMixture(covariance_type='tied', init_p...",-799884.3,-788870.7,0.75398
2,"GaussianMixture(covariance_type='diag', init_p...",-19619120.0,-19617420.0,-0.224916
3,"GaussianMixture(covariance_type='full', init_p...",-19746880.0,-19725700.0,-0.166997
4,"GaussianMixture(covariance_type='spherical', i...",23208430.0,23209750.0,0.60528
5,"GaussianMixture(covariance_type='tied', init_p...",-811906.3,-800460.4,0.707373
6,"GaussianMixture(covariance_type='diag', init_p...",-22164520.0,-22161960.0,-0.269819
7,"GaussianMixture(covariance_type='full', init_p...",-21432740.0,-21400980.0,-0.224304
8,"GaussianMixture(covariance_type='spherical', i...",22214180.0,22215940.0,0.683097
9,"GaussianMixture(covariance_type='tied', init_p...",-821445.2,-809567.0,0.653972
