# Data preparation

In [None]:
import pandas as pd
import numpy as np
import hdbscan
from sklearn.mixture import GaussianMixture
from retentioneering import init_client
from retentioneering import utils, analysis, visualization, preparing

### Download events

Load the event data from a CSV file.

The table should have at least 3 columns:
- 'event_name',
- 'event_timestamp',
- 'user_pseudo_id'.

In [None]:
# Load the event table described above; the path is a placeholder that must be
# replaced with the location of your own CSV file.
data = pd.read_csv('/path/to/your/data.csv')

We split each user's events into sessions.

A `session` column is added to the input table; it contains the number of the session each event belongs to.

In [None]:
# Tell the splitter which columns of `data` hold the event name, the event
# timestamp and the user id.
splitter_columns = {
    'event_name_col': 'event_name',
    'event_timestamp_col': 'event_timestamp',
    'user_id_col': 'user_pseudo_id',
}

# Fit the session splitter on the events, then let it label them; the
# prediction result replaces `data`.
model = preparing.SessionSplitter(n_components=3)
model.fit(data, columns_config=splitter_columns)
data = model.predict(data, thr_prob=0.95, sort=True)

In [None]:
# Preview the first two rows of the event table after session splitting.
data.head(2)

### Download target events for each mechanic

We load the mechanics and all key events for each.

The table must contain the mechanic's id (name) and an event column for that mechanic.

In [None]:
# Read the mechanics table (placeholder path), forward-fill the 'id' column so
# every row carries the most recent id seen above it, and keep only the rows
# that actually name an event.
mechanics = (
    pd.read_csv('/path/to/your/mechanics.csv')
    .assign(id=lambda frame: frame['id'].ffill())
    .loc[lambda frame: frame['Events'].notnull()]
)

In [None]:
# Preview the first two rows of the cleaned mechanics table.
mechanics.head(2)

### Enrich mechanics with new events

We use a function that selects all the events related to a particular mechanic from your source data.

In [None]:
# Build the enriched per-mechanic event table from the event log and the
# key-event table; 'id' and 'Events' are the column names of `mechanics`.
# NOTE(review): q and q2 look like quantile thresholds — confirm against the
# mechanics_enrichment documentation before tuning them.
mechanics_events = analysis.mechanics_enrichment(data, mechanics, 'id', 'Events', q=.99, q2=.99)

In [None]:
# Preview the first two rows of the enriched mechanics table.
mechanics_events.head(2)

### Calculate weight for each mechanic for every session (or user)

Let's calculate the weights for each mechanic for each session.

The higher the weight of the mechanics in a session, the more likely the user wanted to use it in that session.

The maximum weight is 1. A single session can contain several mechanics with a high weight.

In [None]:
# Compute the mechanic weights from the events and the enriched mechanics.
# NOTE(review): the last argument is presumably the aggregation mode
# ('session' here, as opposed to per user) — confirm.
session_stats = analysis.calc_all_norm_mech(data, mechanics_events, 'session')

The result table includes user id, session number, number of events per session, time of the first and last events and duration in seconds.

In [None]:
# Preview the first two rows of the per-session statistics.
session_stats.head(2)

# Cluster sessions

## hdbscan

We can use the collected session statistics for clustering.

As a clustering algorithm, we use hdbscan.

In [None]:
# Require every cluster to hold at least 1% of the rows that `session_stats`
# has when this cell runs. hdbscan rejects a min_cluster_size below 2, so the
# value is clamped for small tables (fewer than 200 rows would otherwise
# produce 0 or 1).
clust = hdbscan.HDBSCAN(min_cluster_size=max(2, int(len(session_stats) * 0.01)))

Before clustering, we delete the first 5 user sessions, because our new users try different mechanics in their first sessions.

If you are not researching new users, you can skip this step.

In [None]:
# For every user_pseudo_id, drop the user's first five sessions.

# Number each user's sessions 1, 2, 3, ... in ascending `session` order.
# The numbering is computed on a sorted view and assigned back by index, so
# the row order of `session_stats` itself is left unchanged.
session_stats['user_sess_num'] = (
    session_stats
    .sort_values('session')
    .groupby('user_pseudo_id')
    .cumcount() + 1
)

# Keep only the sessions after the fifth one. The explicit copy makes the
# result an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
session_stats = session_stats.loc[session_stats.user_sess_num > 5, :].copy()

Run the clustering algorithm and get the results.

The main advantage of hdbscan is that there is no need to specify the number of clusters: the algorithm chooses the most suitable number itself.

In [None]:
# Distinct mechanic names taken from the enrichment rows whose mode is
# 'session'; these names are then used as column labels of `session_stats`.
is_session_mode = mechanics_events['mode'] == 'session'
cols = mechanics_events.loc[is_session_mode, 'mechanics'].unique()

# Feature matrix for clustering: one row per row of `session_stats`, one
# column per selected mechanic.
session_vecs = session_stats.loc[:, cols].values

In [None]:
# Fit hdbscan on the session vectors and keep the value returned by
# fit_predict (NOTE(review): presumably one integer label per input row —
# confirm against the hdbscan documentation).
clusters = clust.fit_predict(session_vecs)

Add each session's cluster number as an additional column.

In [None]:
# Store the labels in a `sess_clust` column. `session_vecs` was built from the
# rows of `session_stats`, so `clusters` is expected to line up with them
# row for row.
session_stats.loc[:, 'sess_clust'] = clusters

The following command shows the number of sessions in each cluster.

In [None]:
# Count how many sessions received each cluster label.
# NOTE(review): hdbscan is documented to label noise points as -1 — confirm,
# and treat that row as "unclustered" rather than as a real cluster.
session_stats.sess_clust.value_counts()

## Visualize graphs

Now we can build a graph of events.

In [None]:
# Scale the timestamps down by a factor of 10**6 using integer division.
# The cast uses an explicit 64-bit integer: plain `int` maps to a 32-bit type
# on some platforms (e.g. Windows with NumPy < 2), which silently overflows
# for large epoch values.
# NOTE: this cell is not idempotent — running it twice divides the timestamps
# twice.
# NOTE(review): presumably this converts microseconds to seconds — confirm
# the unit of `event_timestamp` in your export.
data.loc[:, 'event_timestamp'] = data.event_timestamp.astype('int64') // 10**6

# NOTE(review): presumably adds marker events named 'first_event' and
# 'last_event' to the event table — confirm against the preparing module.
data = preparing.add_first_and_last_events(data, first_event_name='first_event', last_event_name='last_event')

### Session

We can do this for one session of a single user.

In [None]:
# Choose a session by its number and keep only the events that belong to it.
selected_session = 20
selected_session_data = data.loc[data['session'] == selected_session]

In [None]:
# Render the event selector for the chosen session's data.
# NOTE(review): the next cell reads `result_filter`, which is not assigned in
# any visible cell — presumably this widget sets it in the kernel; confirm,
# and make sure the selection is applied before running the next cell.
visualization.tree_selectors.print_checkboxes(selected_session_data, checkbox_id='1', is_checked=True)

In [None]:
# Restrict the session to the events whose names are listed in
# `result_filter` (not assigned in the visible cells; it must exist before
# this cell runs), then request the 'trans_count' aggregate for them.
session_events = selected_session_data.loc[
    selected_session_data['event_name'].isin(result_filter), :]
agg_data_session = analysis.get_all_agg(session_events, ['trans_count'])

# Plot the aggregate as a graph using the Sugiyama layout.
visualization.plot.plot_graph(
    agg_data_session,
    'trans_count',
    settings={},
    layout=visualization.layouts.sugiyama_layout,
)

### Cluster

Or for different sessions and different users within the same cluster

In [None]:
# Choose a cluster label, collect the session numbers assigned to it, and keep
# every event that belongs to one of those sessions.
selected_cluster = 2
in_cluster = session_stats['sess_clust'] == selected_cluster
cluster_sessions = session_stats.loc[in_cluster, 'session']
selected_cluster_data = data[data['session'].isin(cluster_sessions)]

In [None]:
# Render the event selector for the chosen cluster's data.
# NOTE(review): the next cell reads `result_filter`, which is not assigned in
# any visible cell — presumably this widget sets it in the kernel; confirm,
# and make sure the selection is applied before running the next cell.
visualization.tree_selectors.print_checkboxes(selected_cluster_data, checkbox_id='2', is_checked=True)

In [None]:
# Restrict the cluster's events to the names listed in `result_filter` (not
# assigned in the visible cells; it must exist before this cell runs), then
# request the 'trans_count' aggregate for them.
cluster_events = selected_cluster_data.loc[
    selected_cluster_data['event_name'].isin(result_filter), :]
agg_data_cluster = analysis.get_all_agg(cluster_events, ['trans_count'])

# Plot the aggregate as a graph using the Sugiyama layout.
visualization.plot.plot_graph(
    agg_data_cluster,
    'trans_count',
    settings={},
    layout=visualization.layouts.sugiyama_layout,
)