In [None]:
import datetime
import os
import sys

import pandas as pd
import plotly
import plotly.graph_objects as go
import sqlalchemy

# TEMP, REMOVE AFTER INDEX FIX
import datetime as dt

# Make the local checkouts importable *before* importing from them; previously the
# paths were only appended after the first project import had already been attempted.
# The paths are relative to the notebook's working directory
# (presumably '../../buhtuh' holds the buhtuh package and '../' holds objectiv_buhtuh).
sys.path.extend([
    '../../buhtuh',
    '../'
])

# import Objectiv buh_tuh
from buhtuh.pandasql import BuhTuhDataFrame

from objectiv_buhtuh.util import duplo_basic_features

# enable these once we visualize sankey charts
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))
# import pandas as pd

## Get website production data

In [None]:
## get some data
# The connection string is read from the OBJECTIV_DSN environment variable so that
# database credentials do not have to be written into (and committed with) the notebook.
# When the variable is not set, the local development database is used.
engine = sqlalchemy.create_engine(
    os.environ.get('OBJECTIV_DSN', 'postgresql://objectiv:@localhost:5432/objectiv')
)

In [None]:
## production website data (from sessionized_data + features)
# build the model returned by duplo_basic_features() and wrap it in a BuhTuhDataFrame
# indexed on 'event_id'; the frames used in the rest of the notebook are derived from `buh_tuh`
basic_features = duplo_basic_features()
buh_tuh = BuhTuhDataFrame.from_model(engine=engine, model=basic_features, index=['event_id'])

## Set the timeframe

In [None]:
# analysis window: the start date is inclusive, the end date is exclusive
timeframe_start = datetime.date(2021, 6, 1)
timeframe_end = datetime.date(2021, 10, 4)

on_or_after_start = buh_tuh['moment'] >= timeframe_start
before_end = buh_tuh['moment'] < timeframe_end
selector = on_or_after_start & before_end

# keep the unfiltered frame next to the one limited to the analysis window
full_df = buh_tuh
timeframe_df = buh_tuh[selector]

# preview the most recent events inside the window
timeframe_df.sort_values(by='moment', ascending=False).head()

## Set the time aggregation 

In [None]:
# choose for which level of time aggregation the rest of the analysis will run
# supports all Postgres datetime template patterns:
# https://www.postgresql.org/docs/9.1/functions-formatting.html#FUNCTIONS-FORMATTING-DATETIME-TABLE
# the pattern is passed to `.format()` on the 'moment' column and its lower-cased
# form ('yyyyiw' for the value below) is used as the name of the time-bucket column
# NOTE(review): 'YYYY' is the calendar year while 'IW' is the ISO week number; these can
# disagree in the days around New Year — consider 'IYYYIW' if such dates matter (confirm)

agg_level = 'YYYYIW'

def time_aggr(bt, format, aggr):
    """Aggregate a frame per time bucket derived from its 'moment' column.

    Args:
        bt: frame with a 'moment' column; it must support item assignment and
            `groupby`.
        format: datetime template pattern handed to `.format()` on the 'moment'
            column. Its lower-cased form is the name of the bucket column.
        aggr: mapping of column name to aggregation; its keys select the
            columns that are aggregated.

    Returns:
        Whatever `.aggregate(aggr)` returns for the selected columns, grouped
        by the bucket column.

    Note:
        The bucket column is written onto `bt` itself, so the frame passed in
        gains that column as a side effect.
    """
    bucket = format.lower()
    bt[bucket] = bt['moment'].format(format)

    columns = list(aggr.keys())
    grouped = bt.groupby([bucket])
    return grouped[columns].aggregate(aggr)

## User & session totals

In [None]:
# calculate total unique users & sessions for the timeframe, we can reuse these later
# NOTE(review): groupby() without keys presumably aggregates over all rows — confirm against the BuhTuh API
total_users = timeframe_df.groupby()['user_id'].nunique()
total_sessions = timeframe_df.groupby()['session_id'].nunique()

## Users

In [None]:
# calculate unique users per time bucket (bucket pattern is set by agg_level)
users = time_aggr(timeframe_df, agg_level, {'user_id':'nunique'})

# preview the buckets, highest bucket value first
users.sort_values(by=agg_level.lower(), ascending=False).head()

In [None]:
# visualize users: plot the first 60 buckets in ascending bucket order
users.sort_values(by=agg_level.lower(), ascending=True).head(60).plot()

## Sessions

In [None]:
# calculate unique sessions per time bucket (bucket pattern is set by agg_level)
sessions = time_aggr(timeframe_df, agg_level, {'session_id':'nunique'})

# preview the buckets, highest bucket value first
sessions.sort_values(by=agg_level.lower(), ascending=False).head()

In [None]:
# visualize sessions: plot the first 60 buckets in ascending bucket order
sessions.sort_values(by=agg_level.lower(), ascending=True).head(60).plot()

## Sessions per user

In [None]:
bucket_col = agg_level.lower()

# combine the session and user counts per time bucket
users_sessions = sessions.merge(users, how='inner', on=bucket_col)

# average number of sessions per user in each bucket
users_sessions['sessions_per_user_avg'] = (
    users_sessions['session_id_nunique'] / users_sessions['user_id_nunique']
)

# the raw counts are no longer needed in this frame
for count_col in ('session_id_nunique', 'user_id_nunique'):
    del users_sessions[count_col]

users_sessions.sort_values(by=bucket_col, ascending=False).head()

In [None]:
# visualize average sessions per user: first 60 buckets in ascending bucket order
users_sessions.sort_values(by=agg_level.lower(), ascending=True).head(60).plot()

## New users

In [None]:
bucket_col = agg_level.lower()

# first moment each user was seen; uses the full dataset so that users who
# first appeared before the timeframe are not counted as new
user_first_seen = full_df.groupby(['user_id'])['moment'].min()
user_first_seen[bucket_col] = user_first_seen['moment_min'].format(agg_level)

# number of users whose first moment falls in each bucket
new_users = user_first_seen.groupby(bucket_col)['user_id'].nunique()

# the inner merge with the per-bucket totals also limits the result to the timeframe
new_total_users = users.merge(new_users, how='inner', on=bucket_col)

# give the merged count columns readable names, then drop the originals
merged_to_readable = (
    ('user_id_nunique_x', 'total_users'),
    ('user_id_nunique_y', 'new_users'),
)
for merged_col, readable_col in merged_to_readable:
    new_total_users[readable_col] = new_total_users[merged_col]
for merged_col, _ in merged_to_readable:
    del new_total_users[merged_col]

# share of new users and share of returning users per bucket
new_total_users['new_user_share'] = (
    new_total_users['new_users'] / new_total_users['total_users']
)
new_total_users['returning_user_share'] = (
    (new_total_users['total_users'] - new_total_users['new_users'])
    / new_total_users['total_users']
)

new_total_users.sort_values(by=bucket_col, ascending=False).head()

In [None]:
# visualize new users next to total users: first 60 buckets in ascending bucket order
new_total_users[['new_users', 'total_users']].sort_values(by=agg_level.lower(), ascending=True).head(60).plot()

In [None]:
# visualize the share of returning users: first 60 buckets in ascending bucket order
new_total_users[['returning_user_share']].sort_values(by=agg_level.lower(), ascending=True).head(60).plot()

## Frequency

In [None]:
# number of unique sessions per user within the timeframe
total_sessions_user = timeframe_df.groupby(['user_id'])['session_id'].nunique()

# calculate frequency: how many users had each number of sessions
frequency = total_sessions_user.groupby(['session_id_nunique'])['user_id'].nunique()

# divide by the total number of users to get the share of users per number of sessions
# NOTE(review): `[1]` picks a single value out of the total_users result; which key/position
# that is depends on how the key-less groupby() indexes its output — confirm
frequency['share_of_users'] = frequency['user_id_nunique'] / total_users['user_id_nunique'][1]

frequency.sort_values(by='session_id_nunique', ascending=True).head()

In [None]:
# visualize frequency: share of users for the 10 lowest session counts
frequency[['share_of_users']].sort_values(by='session_id_nunique', ascending=True).head(10).plot()

## Events

In [None]:
# number of unique users per feature, per time bucket
# earlier variant without the time bucket, kept for reference:
# users_per_event = timeframe_df.groupby(['feature'])['user_id'].nunique()

# select only the columns needed and add the time-bucket column
events_users = timeframe_df[['moment', 'feature', 'user_id']]
events_users[agg_level.lower()] = events_users['moment'].format(agg_level)

# calculate unique users per time bucket and feature
users_per_event = events_users.groupby([agg_level.lower(), 'feature'])['user_id'].nunique()

users_per_event.sort_values(by=[agg_level.lower(), 'user_id_nunique'], ascending=False).head()

# TODO:
# 1) add feature aggregation magic here, so the features show off much more of what we can do
# 2) add location stack, showing the power of this very soon in the demos

## Conversion

In [None]:
# TODO:
# We can do much better here once we integrate feature selection & aggregation

# NOTE: WE NEED TO UPDATE THIS ONCE THE FIRST NEW EVENT FORMAT DATA COMES IN FOR A CONVERSION
# set the goal event that you define as conversion, using our subscribe-to-mailing
conv_selector = (timeframe_df['feature'] == '(WebDocumentContext,#document),(InputContext,keep-me-posted-input),(ButtonContext,subscribe)')

# create df with only conversion events
conversions_df = timeframe_df[conv_selector]

# calculate conversions, now per user, but can easily be aggregated to session_id instead
conversions = time_aggr(conversions_df, agg_level, {'user_id':'nunique'})

# merge with users on the time bucket, but can easily be done with sessions instead
# (a single merge: an earlier extra `conversions.merge(users)` was immediately
# overwritten by this one and has been removed)
conversion_rate = conversions.merge(users, how='inner', on=agg_level.lower())

# clean-up and rename columns: `_x` comes from conversions, `_y` from users
conversion_rate['converting_users'] = conversion_rate['user_id_nunique_x']
conversion_rate['total_users'] = conversion_rate['user_id_nunique_y']
del(conversion_rate['user_id_nunique_x'])
del(conversion_rate['user_id_nunique_y'])

# calculate conversion rate: converting users as a share of all users per time bucket
conversion_rate['conversion_rate'] = conversion_rate['converting_users'] / conversion_rate['total_users']

conversion_rate.sort_values(by=agg_level.lower(), ascending=False).head(30)

In [None]:
# visualize conversion rate: first 60 buckets in ascending bucket order
conversion_rate[['conversion_rate']].sort_values(by=agg_level.lower(), ascending=True).head(60).plot()

## Bounce rate

In [None]:
# NOTE: we need to limit this to page or screen views, instead of all events. Do this once we have integration feature selection.

bucket_col = agg_level.lower()

# hits with their session, tagged with the time bucket they fall in
hits_sessions = timeframe_df[['moment', 'session_id', 'session_hit_number']]
hits_sessions[bucket_col] = hits_sessions['moment'].format(agg_level)

# number of distinct hits for every session within a bucket
hits_grouped = hits_sessions.groupby([bucket_col, 'session_id'])
hits_per_session = hits_grouped['session_hit_number'].nunique()

# a bounced session is one with exactly one hit
hit_selector = (hits_per_session['session_hit_number_nunique'] == 1)
single_hit_sessions = hits_per_session[hit_selector].to_frame()

# number of bounced sessions per bucket
bounced_grouped = single_hit_sessions.groupby([bucket_col])
bounced_sessions = bounced_grouped['session_id'].nunique()

# bounce rate = bounced sessions (`_x`) divided by all sessions (`_y`) in the bucket
bounce_rate = bounced_sessions.merge(sessions, how='inner', on=bucket_col)
bounce_rate['bounce_rate'] = (
    bounce_rate['session_id_nunique_x'] / bounce_rate['session_id_nunique_y']
)
for merged_col in ('session_id_nunique_x', 'session_id_nunique_y'):
    del bounce_rate[merged_col]

bounce_rate.sort_values(by=bucket_col, ascending=False).head()

In [None]:
# visualize bounce rate: first 60 buckets in ascending bucket order
bounce_rate[['bounce_rate']].sort_values(by=agg_level.lower(), ascending=True).head(60).plot()

## Session duration

In [None]:
# calculate duration of each session: time between its first and its last moment
# (the aggregate call yields the 'moment_min' and 'moment_max' columns used below)
session_duration = timeframe_df.groupby(['session_id']).aggregate(['moment','moment'],['min','max'])
session_duration['session_duration'] = session_duration['moment_max'] - session_duration['moment_min']

# check which sessions have duration of zero
# NOTE(review): compares the duration against the string '0'; presumably the database
# casts this to a zero-length interval — confirm
session_duration['session_duration_zero'] = session_duration['session_duration'] == '0'

# adding time aggregation (based on the session start), so we can group on this
session_duration[agg_level.lower()] = session_duration['moment_min'].format(agg_level)

# calculate average session duration per time bucket, split by zero / non-zero duration
avg_session_duration = session_duration.groupby([agg_level.lower(), 'session_duration_zero'])['session_duration'].average()

# count the number of sessions with zero and with non-zero duration per time bucket
session_counts = session_duration.groupby([agg_level.lower(), 'session_duration_zero'])['session_id'].count()

# merge avg session duration and counts
avg_duration_counts = avg_session_duration.merge(session_counts, how='inner', on=[(agg_level.lower()),('session_duration_zero')])

# merge with total sessions per time bucket and calculate the share of sessions in each group
duration_breakdown = avg_duration_counts.merge(sessions, how='inner', on=agg_level.lower())

duration_breakdown['share_of_sessions'] = duration_breakdown['session_id_count'] / duration_breakdown['session_id_nunique']
del(duration_breakdown['session_id_nunique'])

duration_breakdown.sort_values(by=agg_level.lower(), ascending=False).head(6)

## WIP Session duration between specific events

In [None]:
# define the start and stop events to measure the duration
start_event = '(WebDocumentContext,#document)'
stop_event = '(WebDocumentContext,#document),(SectionContext,footer)'

start_event_selector = (timeframe_df['feature'] == start_event)
stop_event_selector = (timeframe_df['feature'] == stop_event)

# create df filtered on these events
start_event_df = timeframe_df[start_event_selector]
stop_event_df = timeframe_df[stop_event_selector]

# select only the columns needed
start_event_df = start_event_df[['moment', 'session_id']]
stop_event_df = stop_event_df[['moment', 'session_id']]

# merge based on session_id: pairs every start event with every stop event of the same session
start_stop_moments = start_event_df.merge(stop_event_df, how='inner', on='session_id')

# clean-up and rename columns: `_x` is the start event's moment, `_y` the stop event's
start_stop_moments['moment_start'] = start_stop_moments['moment_x']
start_stop_moments['moment_stop'] = start_stop_moments['moment_y']
del(start_stop_moments['moment_x'])
del(start_stop_moments['moment_y'])

# calculate diff between start & stop, to use later for filtering to real duration
start_stop_moments['moment_diff'] = start_stop_moments['moment_stop'] - start_stop_moments['moment_start']

# diff can not be negative: despite its name, this selector KEEPS the pairs where the
# stop event is not before the start event
negative_diff_selector = (start_stop_moments['moment_stop'] >= start_stop_moments['moment_start'])
start_stop_filtered = start_stop_moments[negative_diff_selector]

# for each stop event, select the closest preceding start event (smallest diff)
start_stop_diff = start_stop_filtered.groupby(['session_id', 'moment_stop'])['moment_diff'].min()

# BELOW DOES NOT WORK YET UNTIL WE HAVE A PLAN FOR INDEXES
# start_stop_diff[agg_level.lower()] = start_stop_diff['moment_stop'].format(agg_level)

# TEMP UGLY WORKAROUND UNTIL WE HAVE INDEX PLAN
# take at most 100000 rows via head() and continue on that result with reset_index()
# NOTE(review): the code below treats the head() result as a pandas DataFrame
# (`.dt` accessor, `.agg`) — confirm
temp_fix = start_stop_diff.head(100000)
temp_fix_index = temp_fix.reset_index()

# adding time aggregation, based on stop event, so we can group on this
# (groups per calendar date here, not per agg_level)
temp_fix_index['date'] = temp_fix_index['moment_stop'].dt.date

# sum the start-to-stop durations per date
start_stop_duration = temp_fix_index.groupby('date').agg({'moment_diff_min':'sum'})
start_stop_duration.head(100)

# TODO

In [None]:
# below parts first require some next steps in dub_buh_tuh

## Conversion funnel

In [None]:
# TODO
# Self-merge is not giving the output we expect.
# Without that, we can not create a sankey that looks like a familiar funnel.
# See example here https://gitlab.com/newrelity/objectiv-taxonomy-prototypes/-/blob/web-analytics/data-science/issue_example_self_merge.ipynb

# showing the sequence of events for converting users

# reuse the df with only conversion events, select the users and their conversion moment
# NOTE(review): columns are selected with a tuple key here, while other cells in this
# notebook use a list (`df[['a', 'b']]`) — confirm the tuple form is still supported
converting_users = conversions_df['user_id', 'moment']

# for now, we focus on the first conversion event. Later it is nice to also make it possible to see events between first and 2nd conversion, and so on.
converting_users = converting_users.groupby(['user_id'])['moment'].min()
converting_users['first_conversion_moment'] = converting_users['moment_min']
del(converting_users['moment_min'])

# merge with the df that has all user events in the timeframe
# NOTE(review): merge keys are passed positionally as (left, right) pairs, unlike the
# `how=..., on=...` form used in other cells — confirm against the current merge signature
converting_users_events = timeframe_df.merge(converting_users, [('user_id', 'user_id')])

# select all events that converting users had up to their first conversion moment
event_selector = (converting_users_events['moment'] <= converting_users_events['first_conversion_moment'])
pre_conversion_events = converting_users_events[event_selector]

# create pairs of from-to events based on session hit number
event_sequence = pre_conversion_events['session_id', 'session_hit_number', 'feature']

# self-merge on session_id; see the TODO at the top of this cell about its output
event_pairs = event_sequence.merge(event_sequence, [('session_id')])

event_pairs.head(50)


In [None]:
# load the rows for the sankey chart; the next cell reads the 'source', 'target' and 'value' columns
# NOTE(review): 'buh.csv' is read from the working directory and is not written by any cell in this notebook
df_sank = pd.read_csv('buh.csv')

In [None]:
# one shared category set, so source and target codes refer to the same list of nodes
categories = set(df_sank['source']).union(set(df_sank['target']))
for endpoint in ('source', 'target'):
    df_sank[endpoint] = pd.Categorical(df_sank[endpoint], categories=categories)

text_in_title = str('title')

# node styling; labels follow the category order so the link codes line up with them
node = {
    'pad': 15,
    'thickness': 20,
    'line': {'color': "black", 'width': 0.5},
    'label': df_sank.source.cat.categories,
    'color': 'blue',
}

# links are the integer category codes of both endpoints plus the flow value
link = {
    'source': df_sank['source'].cat.codes.tolist(),
    'target': df_sank['target'].cat.codes.tolist(),
    'value': df_sank['value'].tolist(),
}

sankey_trace = go.Sankey(arrangement="fixed", link=link, node=node)
fig = go.Figure(sankey_trace, {'clickmode': 'event+select'})
fig.update_layout(title_text=text_in_title, font_size=10)

## User timeline

In [None]:
# show the timeline of an individual user's events
# NOTE: we can make this better with feature selection & aggregation

# select the specific user we want to replay
user_id_selector = (buh_tuh['user_id'] == '320db8ee-847c-424b-8291-c65d021575aa')

# create df with only this user's events
# NOTE: timeframe_df['user_id_selector'] breaks: "# We only support first level boolean indices for now", so doing on full df for now
selected_user_df = buh_tuh[user_id_selector]

# left join conversions df, so we can check if the user converted
# NOTE(review): no join key is given, so the merge relies on its default key selection — confirm
user_timeline = selected_user_df.merge(conversions_df,how='left')

# rename and clean-up columns
# NOTE(review): reads `_left`/`_right` suffixed columns, while other cells in this notebook
# read merge output with `_x`/`_y` suffixes — confirm which suffixes the merge produces
user_timeline['moment'] = user_timeline['moment_left'] 
user_timeline['feature'] = user_timeline['feature_left']
user_timeline['conversion_feature'] = user_timeline['feature_right']

# show relevant columns
# NOTE(review): tuple-style column selection and dict-style sort_values differ from the
# list / `by=` forms used in other cells — confirm they are still supported
user_timeline['moment','feature','conversion_feature'].sort_values({'moment':True}).head()

## Retention cohorts

In [None]:
# TODO:
# continue on this when we have datetime intervals, so we can calculate start & end moments of cohorts.

# get the time aggregations where there are users; head() materialises (at most) the
# first 100 buckets and reset_index() turns the index into a regular column
timeframes = users
timeframes = timeframes.head(100).reset_index()

# cleanup columns we don't need. 'share_of_total' is not created anywhere in this
# notebook, so deleting it unconditionally raised a KeyError; drop whichever of the
# columns are present instead.
# assumes head() returned a pandas DataFrame (as the reset_index/rename_axis calls imply)
timeframes = timeframes.drop(columns=['user_id_nunique', 'share_of_total'], errors='ignore')

# reset index and use that as cohort numbering
timeframes2 = timeframes.rename_axis('cohort_nr').reset_index()
timeframes2.head()

## Events flow

In [None]:
# events per session hit number: unique sessions per (session_hit_number, feature) pair,
# limited to the analysis timeframe via `selector`
events_per_hit_number = buh_tuh[selector].groupby(['session_hit_number', 'feature'])['session_id'].nunique()

# NOTE(review): sort_values is given a dict here instead of the `by=` form used in other cells — confirm
events_per_hit_number.sort_values({'session_hit_number':True}).head()

## Recency

In [None]:
# TODO: pick this up once we have window functions

# "the number of days between the close of one session and the opening of another"
# first step only: first and last moment of every session, per user
test = timeframe_df.groupby(['user_id', 'session_id']).aggregate(['moment','moment'],['min','max'])

test.head()

## Traffic source

In [None]:
# TODO
# For Traffic Source, Geo and Device metrics, we would need to get source/geo/device data from GlobalContext in an easy way.
# We can then also blend it in all metrics above as slicing option.

## Geo 

## Devices