In [None]:
from bach_open_taxonomy import ObjectivFrame
import bach
from matplotlib import pyplot as plt

In [None]:
# Build the ObjectivFrame from data starting at 2022-02-02, using the
# 'YYYY-MM-DD' time aggregation format.
of = ObjectivFrame.from_objectiv_data(
    start_date='2022-02-02',
    time_aggregation='YYYY-MM-DD',
)

### describe all data

In [None]:
# todo : exclude user_id because uuid

In [None]:
# Describe every data column except user_id.
described_columns = [column for column in of.data_columns if column != 'user_id']
of[described_columns].describe(include='all').head()

### look at data

In [None]:
# Show the result of head() on the full frame as the cell output.
of.head()

### create a feature set 


### numerical

In [None]:
# Extract the `id` of the RootLocationContext from each location stack and
# store it as the `root` column.
root_ids = of.location_stack.ls.get_from_context_with_type_series(
    type='RootLocationContext',
    key='id',
)
of['root'] = root_ids

In [None]:
# Show the result of head() on the new `root` column.
of.root.head()

### check missing values

In [None]:
# Count how often `root` is null versus not null.
of.root.isnull().value_counts().head()

there are none, yay!

### look what we have

In [None]:
# List the distinct values found in `root`.
of.root.unique().values

### let's combine with event type

In [None]:
# List the distinct values found in `event_type`.
of.event_type.unique().values

### only interested in presses in sections of our site

In [None]:
# Keep only the PressEvent rows, then list the distinct roots among them.
press_events = of[of.event_type == 'PressEvent']
press_events.root.unique().values

### describe

In [None]:
# Describe the string columns of the PressEvent rows only.
press_events = of[of.event_type == 'PressEvent']
press_events.describe(include='string').head()

### now we create the variable: presses in a section per user

In [None]:
# Count session_hit_number values for each (user_id, root) pair, restricted
# to PressEvent rows.
press_events = of[of.event_type == 'PressEvent']
features = press_events.groupby(['user_id', 'root']).session_hit_number.count()

In [None]:
# Unstack the grouped counts; presumably, as in pandas, the `root` index
# level becomes the columns -- TODO confirm against bach's `unstack`.
features_unstacked = features.unstack()

In [None]:
# Show the result of head() on the unstacked features.
features_unstacked.head()

### empty values, let's fill them

In [None]:
# Redo the unstack from `features`, this time passing fill_value=0 so the
# empty cells seen above are filled with 0.
features_unstacked = features.unstack(fill_value=0)

### describe again and plot; one column has much higher values. We might want to do something about that at some point.

In [None]:
# Call materialize() on the frame first, then describe the result.
features_unstacked.materialize().describe().head()

### Use cut to get histograms for the entire data set

In [None]:
# Plot one bar chart per feature column, bucketing each column into five
# bins with `cut`.
column_names = list(features_unstacked.data_columns)
n_cols = 4
# Keep the 2x4 layout for up to eight columns and add rows beyond that, so
# indexing `axis.flat` can never run past the end of the grid.
n_rows = max(2, -(-len(column_names) // n_cols))  # ceiling division
figure, axis = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))

for idx, name in enumerate(column_names):
    df_bins = features_unstacked[name].cut(bins=5)
    df_bins.value_counts().to_pandas().plot(title=name, kind='bar', ax=axis.flat[idx])

# Hide the panels that have no column to show.
for unused_axis in axis.flat[len(column_names):]:
    unused_axis.set_visible(False)
plt.tight_layout()

Now you could do something about this (drop outliers, apply scaling), but in our example we accept the distribution and continue

### add another feature: time

In [None]:
# Show the result of head() on the feature set before the time feature is added.
features_unstacked.head()

In [None]:
# Per (user_id, session_id): the minimum and maximum `moment`.
session_bounds = of.groupby(['user_id', 'session_id']).aggregate({'moment': ['min', 'max']})

# Duration of a session is the span between its maximum and minimum moment.
session_bounds['session_duration'] = (
    session_bounds['moment_max'] - session_bounds['moment_min']
)

# Sum the session durations per user and attach the result as a feature.
per_session = session_bounds.reset_index()
session_duration = per_session.groupby('user_id').session_duration.sum()
features_unstacked['session_duration'] = session_duration

In [None]:
# Describe the new session_duration feature.
features_unstacked.session_duration.describe().head()

timedeltas can't be loaded directly into sklearn, though

### boolean / categorical

this is just a made-up example, using the matching user agent as a proxy for device

In [None]:
# Made-up device proxy: flag events whose HttpContext user agent equals this
# exact string.
reference_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
user_agents = of.global_contexts.gc.get_from_context_with_type_series('HttpContext', 'user_agent')
of['mobile'] = user_agents == reference_user_agent


In [None]:
# Show the result of head() on the frame, which now includes the `mobile` column.
of.head()

In [None]:
# Count how many rows fall under each value of `mobile`.
of.mobile.value_counts().head()

### create a feature per user for this

In [None]:
# Reduce the events to the distinct (user_id, mobile) pairs and attach the
# flag to the feature set, indexed by user_id.
# NOTE(review): a user with both mobile and non-mobile events keeps two rows
# after drop_duplicates -- confirm how the assignment handles that case.
user_devices = of[['user_id', 'mobile']].drop_duplicates()
features_unstacked['is_mobile'] = user_devices.set_index('user_id').mobile

In [None]:
# Filter the feature set on the is_mobile flag and show the result of head().
features_unstacked[features_unstacked.is_mobile].head()

In [None]:
# Show the result of head() on the feature set with all features added.
features_unstacked.head()

### there is our feature set, now you can do other stuff, like sklearn
see for example our [sklearn](sklearn-example.ipynb) notebook

In [None]:
# Export the feature set via to_pandas() for use with other libraries, e.g. sklearn.
df = features_unstacked.to_pandas()

In [None]:
# Display the exported feature set.
df

### What's next?
- add categorical variables
- actions after finding stuff in the data, e.g.:
  - scaling
  - filling empty values / interpolation
- using timedeltas