In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from retentioneering import analysis, utils

import matplotlib.pyplot as plt
%matplotlib inline

data_path = '../example_datasets/train.csv'
df = pd.read_csv(data_path)

# Order the events chronologically within each user. Sorting on the raw
# numeric timestamp gives the same order as sorting on the converted one.
df.sort_values(by=['user_pseudo_id', 'event_timestamp'], inplace=True)

# Convert time from µs to DateTime. Passing unit='us' lets pandas interpret
# the raw values directly; dividing by 1e6 first would route the value through
# float seconds, which can lose precision below the microsecond.
df['event_timestamp'] = pd.to_datetime(df['event_timestamp'], unit='us')

# Split every user's event stream into sessions
session_columns = {
    'event_name_col': 'event_name',
    'event_timestamp_col': 'event_timestamp',
    'user_id_col': 'user_pseudo_id',
}
model = utils.preparing.SessionSplitter(n_components=3)
model.fit(df, columns_config=session_columns)
data = model.predict(df, thr_prob=0.95, sort=True)

# 'kuni' stands for "kompletly unique": the (user, session) pair used as a
# single session key (presumably `session` alone is only unique per user —
# TODO confirm against SessionSplitter)
data['kuni'] = data[['user_pseudo_id', 'session']].apply(tuple, axis=1)
data = data.sort_values(by=['kuni', 'event_timestamp'])

# put ranks: number each user's events 0, 1, 2, ... in the sorted order
data['rank'] = data.groupby('user_pseudo_id').cumcount()

# Load the table of (mechanics, event, label) triples. Every value is cast to
# str, so a missing label shows up as the literal string 'nan'.
# NOTE(review): this is a machine-specific absolute path — point it at your
# own copy of event_label_map.csv before running.
labelMap = pd.read_csv('/home/ivangon/retentioneering/event_label_map.csv').astype(str)

# Labelled rows are identified by all of their columns joined with '__';
# unlabelled rows are identified by the bare event name.
has_label = labelMap.label != 'nan'
toTake = labelMap[has_label].apply(lambda x: '__'.join(tuple(x)), 1)
# Series.append was removed in pandas 2.0; pd.concat is the direct equivalent.
targetEvents = pd.concat([toTake, labelMap[~has_label].event])

# select only target events; take a copy so that columns added later are
# written to an independent frame rather than to a slice of `data`
targetOnly = data[data.event_name.isin(targetEvents)].copy()

def normalize_time_by(df, by, time_col='event_timestamp'):
    """Rescale each row's timestamp to its relative position inside its group.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the datetime column ``time_col`` and the column(s)
        named in ``by``.
    by : str or list of str
        Column(s) that define the groups (e.g. a session key).
    time_col : str, default 'event_timestamp'
        Name of the datetime column to normalize.

    Returns
    -------
    pandas.Series
        Floats aligned with ``df.index``: 0 for the earliest timestamp of a
        group, 1 for the latest, linear in between. Undefined ratios (NaN,
        e.g. 0/0 when a group spans zero time) are returned as 1.
    """
    stamps = df[time_col]
    grouped = df.groupby(by)[time_col]
    # String names select the built-in reductions; passing the numpy
    # callables instead is deprecated in recent pandas versions.
    group_start = grouped.transform('min')
    group_end = grouped.transform('max')

    # Vectorized timedelta -> seconds conversion (no per-row Python lambda).
    elapsed = (stamps - group_start).dt.total_seconds()
    span = (group_end - group_start).dt.total_seconds()

    return (elapsed / span).fillna(1)


def PlotEventsHist(df, param, path, filename, label=''):
    """Plot, per event name, how many distinct ``param`` values fall in each
    third of a session, and save the figure as an SVG.

    Every row is assigned to the first, second or last third of its session
    (sessions are identified by the 'kuni' column) according to its relative
    timestamp. For each (event_name, third) pair the bar height is the number
    of distinct ``param`` values in that pair divided by the number of
    distinct ``param`` values in the whole of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Events table; must contain 'event_name', 'kuni', 'event_timestamp'
        and the ``param`` column. It is not modified.
    param : str
        Column whose distinct values are counted.
    path : str
        Directory the figure is written to.
    filename : str
        File name without extension; '.svg' is appended.
    label : str, default ''
        Text appended to the plot title.
    """
    catNum = 3
    legend_labels = ['First third', 'Second third', 'Last third']
    color_lst = ['r', 'y', 'g']
    # The category column is float (result of np.ceil), so the hue levels are
    # spelled as floats too.
    hue_levels = [float(i) for i in range(1, catNum + 1)]

    total = len(set(df[param]))

    # ceil maps (0, 1/3] -> 1, (1/3, 2/3] -> 2, (2/3, 1] -> 3; a relative
    # position of exactly 0 would give 0 and is folded into the first third.
    category = np.ceil(normalize_time_by(df, 'kuni') * catNum).replace({0: 1})
    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    events = df.assign(category=category)

    stats = (
        events.groupby(['event_name', 'category'])[param]
        .apply(lambda x: len(set(x)) / total)
        .sort_values(ascending=False)
        .reset_index()
    )
    stats.columns = ['event_name', 'category', 'frac']

    plt.figure(figsize=(20, 6))
    plt.title('Distribution of events by {}'.format(label))

    # hue_order pins the three levels (and their colors) even when one of them
    # has no rows, so the positional legend labels below always line up.
    axs = sns.barplot(data=stats, x='event_name', y='frac', hue='category',
                      hue_order=hue_levels, palette=color_lst)
    plt.xticks(rotation=90, fontsize=8)
    h, _ = axs.get_legend_handles_labels()
    plt.legend(h, legend_labels, loc="upper right", frameon=True)

    # os.path.join works whether or not `path` ends with a separator.
    plt.savefig(os.path.join(path, '{}.svg'.format(filename)), bbox_inches='tight')
    plt.show()
    plt.close()

path = './experiments/'
# exist_ok=True replaces the separate existence check (no check-then-create
# race) and makedirs also creates missing parent directories.
os.makedirs(path, exist_ok=True)

# Same chart three times; only the column whose distinct values are counted
# changes: users, sessions, and timestamps.
# NOTE(review): the last call counts distinct timestamps as a stand-in for
# the number of events, so events sharing a timestamp are counted once.
PlotEventsHist(targetOnly, 'user_pseudo_id', path, 'target_events_hist_users', 'fraction of unique users')
PlotEventsHist(targetOnly, 'kuni', path, 'target_events_hist_sessions', 'fraction of unique sessions')
PlotEventsHist(targetOnly, 'event_timestamp', path, 'target_events_hist_number', 'fraction of all events observed')