In [None]:
import datetime
import os
import sys

# Add two relative directories to the import search path. Relative entries are
# resolved against the kernel's working directory, so run this notebook from its own folder.
# NOTE(review): presumably these are the locations of the `buhtuh` and
# `objectiv_buhtuh` packages imported in the next cell — confirm.
sys.path.extend([
    '../../buhtuh',
    '../'
])

In [None]:
import sqlalchemy
from buhtuh import BuhTuhDataFrame
from objectiv_buhtuh.util import duplo_basic_features

## Get website production data

In [None]:
## Connect to the database that holds the data.
# The connection URL is taken from the OBJECTIV_DB_URL environment variable, so
# database credentials never have to be typed into (and committed with) this
# notebook. When the variable is not set we fall back to the local development database.
engine = sqlalchemy.create_engine(
    os.environ.get('OBJECTIV_DB_URL', 'postgresql://objectiv:@localhost:5432/objectiv')
)

In [None]:
## production website data (from sessionized_data + features)
# duplo_basic_features() supplies the model; the resulting frame is indexed on
# event_id and is the source for every other frame in this notebook.
basic_features = duplo_basic_features()
buh_tuh = BuhTuhDataFrame.from_model(engine=engine, model=basic_features, index=['event_id'])
# Alternative: load from an existing 'basic_features' table instead of the model.
#buh_tuh = BuhTuhDataFrame.from_table(engine=engine, table_name='basic_features', index=['event_id'])

## Set the timeframe

In [None]:
# Analysis window: from 2021-06-01 (inclusive) up to 2021-09-06 (exclusive).
timeframe_start = datetime.date(2021, 6, 1)
timeframe_end = datetime.date(2021, 9, 6)

# boolean selector on the event moment, built from the two window bounds
on_or_after_start = buh_tuh['moment'] >= timeframe_start
before_end = buh_tuh['moment'] < timeframe_end
selector = on_or_after_start & before_end

# full_df keeps the complete dataset, timeframe_df only the rows inside the window
full_df = buh_tuh
timeframe_df = buh_tuh[selector]

# preview the selection, ordered on moment
timeframe_df.sort_values({'moment': False}).head()

## Set the time aggregation 

In [None]:
# Choose the level of time aggregation that the rest of the analysis runs on.
# Supports all Postgres datetime template patterns:
# https://www.postgresql.org/docs/current/functions-formatting.html#FUNCTIONS-FORMATTING-DATETIME-TABLE
# The lowercased pattern (agg_level.lower()) is also used as the name of the time column.

agg_level = 'YYYYMMDD'

def time_aggr(bt, format, aggr):
    """Aggregate the columns named in `aggr` per time bucket of `bt['moment']`.

    bt:     frame with a 'moment' column. NOTE: it is modified in place, the
            bucket column is added to it.
    format: datetime template pattern handed to `bt['moment'].format()`; its
            lowercased form is the name of the bucket column.
    aggr:   mapping of column name -> aggregation. The keys select the columns,
            the mapping itself is passed on to `.aggregate()`.

    Returns the result of `.aggregate(aggr)` on the grouped column selection.
    """
    bucket_column = format.lower()

    # store the formatted moment on the frame so it can be grouped on
    bt[bucket_column] = bt['moment'].format(format)

    per_bucket = bt.groupby([bucket_column])
    selected = per_bucket[list(aggr.keys())]
    return selected.aggregate(aggr)

## User & session totals

In [None]:
# calculate total unique users & sessions in the timeframe, we can reuse these later
# NOTE(review): groupby() without arguments presumably aggregates over all rows — confirm.
total_users = timeframe_df.groupby()['user_id'].nunique()
total_sessions = timeframe_df.groupby()['session_id'].nunique()

## Users

In [None]:
# calculate unique users per time bucket
users = time_aggr(timeframe_df, agg_level, {'user_id':'nunique'})

# for each timeframe, add share of total users
# NOTE(review): [1] picks one value out of the totals — confirm which index it relies on.
users['share_of_total'] = users['user_id_nunique'] / total_users['user_id_nunique'][1]

# preview, ordered on the time column
users.sort_values({agg_level.lower():False}).head()

In [None]:
# visualize users: unique users per time bucket, limited to 60 buckets ordered on the time column
users['user_id_nunique'].sort_values({agg_level.lower():True}).head(60).plot()

## Sessions

In [None]:
# calculate unique sessions per time bucket
sessions = time_aggr(timeframe_df, agg_level, {'session_id':'nunique'})

# for each timeframe, add share of total sessions
# NOTE(review): [1] picks one value out of the totals — confirm which index it relies on.
sessions['share_of_total'] = sessions['session_id_nunique'] / total_sessions['session_id_nunique'][1]

# preview, ordered on the time column
sessions.sort_values({agg_level.lower():False}).head()

In [None]:
# visualize sessions: unique sessions per time bucket, limited to 60 buckets ordered on the time column
sessions['session_id_nunique'].sort_values({agg_level.lower():True}).head(60).plot()

## New users

In [None]:
# define first seen per user, based on the full dataset (not only the timeframe),
# so a user only counts as new in the bucket of their very first event
user_first_seen = full_df.groupby(['user_id'])['moment'].min()
user_first_seen[agg_level.lower()] = user_first_seen['moment_min'].format(agg_level)

# calculate new users for each timeframe
new_users = user_first_seen.groupby(agg_level.lower())['user_id'].nunique()

# merge with total users, to calculate ratio and limit to timerange
# (both frames have a user_id_nunique column; after the merge they are read back
#  as user_id_nunique_left for `users` and user_id_nunique_right for `new_users`)
new_total_users = users.merge(new_users)

# rename and clean-up columns
new_total_users['total_users'] = new_total_users['user_id_nunique_left']
new_total_users['new_users'] = new_total_users['user_id_nunique_right']
del(new_total_users['user_id_nunique_left','user_id_nunique_right','share_of_total'])

# calculate new user share
new_total_users['new_user_share'] = new_total_users['new_users'] / new_total_users['total_users']

# preview, ordered on the time column
new_total_users.sort_values({agg_level.lower():False}).head()

In [None]:
# visualize new users next to total users per time bucket, limited to 60 buckets
new_total_users['new_users', 'total_users'].sort_values({agg_level.lower():True}).head(60).plot()

## Sessions per user

In [None]:
# merge users and sessions
# (both frames have a share_of_total column; they are removed below under the
#  names share_of_total_left and share_of_total_right)
users_sessions = sessions.merge(users)

# calculate average sessions per user
users_sessions['sessions_per_user_average'] = users_sessions['session_id_nunique'] / users_sessions['user_id_nunique']

# clean-up columns, keeping only the average
del(users_sessions['session_id_nunique','share_of_total_left', 'share_of_total_right', 'user_id_nunique'])

# preview, ordered on the time column
users_sessions.sort_values({agg_level.lower():False}).head()

## Session duration

In [None]:
# calculate duration of each session: time between its first and last event
session_duration = timeframe_df.groupby(['session_id']).aggregate(['moment','moment'],['min','max'])
session_duration['session_duration'] = session_duration['moment_max'] - session_duration['moment_min']

# adding time aggregation, so we can group on this
# (a session is assigned to the bucket of its first event)
session_duration[agg_level.lower()] = session_duration['moment_min'].format(agg_level)

# calculate average session duration per time bucket
avg_session_duration = session_duration.groupby(agg_level.lower())['session_duration'].average()

# preview, ordered on the time column
avg_session_duration.sort_values({agg_level.lower():False}).head()

## Frequency

In [None]:
# number of total sessions per user
total_sessions_user = timeframe_df.groupby(['user_id'])['session_id'].nunique()

# calculate frequency: number of users for each session count
frequency = total_sessions_user.groupby(['session_id_nunique'])['user_id'].nunique()

# add total users and calculate share per number of sessions
# NOTE(review): [1] picks one value out of the totals — confirm which index it relies on.
frequency['share_of_users'] = frequency['user_id_nunique'] / total_users['user_id_nunique'][1]

# preview, ordered on the number of sessions
frequency.sort_values({'session_id_nunique':True}).head()

In [None]:
# visualize frequency: number of users per session count, limited to 10 rows ordered on session count
frequency['user_id_nunique'].sort_values({'session_id_nunique':True}).head(10).plot()

## Events

In [None]:
# number of unique users per feature
users_per_event = timeframe_df.groupby(['feature'])['user_id'].nunique()

# preview, ordered on the number of users
users_per_event.sort_values({'user_id_nunique':False}).head()

# NOTE: we should start adding feature aggregation & location stack as next step

## Conversion

In [None]:
# NOTE: we can do better here once we integrate feature aggregation
# NOTE: timeframe_df['feature'] breaks: "# We only support first level boolean indices for now", so doing on full df for now

# set the goal event that you define as conversion, using our subscribe-to-mailing
conv_selector = (buh_tuh['feature'] == '(WebDocumentContext,#document),(InputContext,keep-me-posted-input),(ButtonContext,subscribe)')
                 
# create df with only conversion events (from the full dataset, see NOTE above)
conversions_df = buh_tuh[conv_selector]

# calculate conversions, now per user, but can easily be aggregated to session_id instead
# (time_aggr also adds the time column to conversions_df itself)
conversions = time_aggr(conversions_df, agg_level, {'user_id':'nunique'})

# merge with users, but can easily be done with sessions instead
conversion_rate = conversions.merge(users)

# clean-up and rename columns: _left comes from conversions, _right from users
conversion_rate['converting_users'] = conversion_rate['user_id_nunique_left']
conversion_rate['total_users'] = conversion_rate['user_id_nunique_right']
del(conversion_rate['user_id_nunique_left','user_id_nunique_right','share_of_total'])

# calculate conversion rate
conversion_rate['conversion_rate'] = conversion_rate['converting_users'] / conversion_rate['total_users']

# preview, ordered on the time column
conversion_rate.sort_values({agg_level.lower():False}).head(30)

In [None]:
# visualize conversion rate per time bucket, limited to 60 buckets ordered on the time column
conversion_rate['conversion_rate'].sort_values({agg_level.lower():True}).head(60).plot()

## User timeline

In [None]:
# show the timeline of an individual user's events
# NOTE: we can make this better with feature selection & aggregation

# select the specific user we want to replay
user_id_selector = (buh_tuh['user_id'] == '320db8ee-847c-424b-8291-c65d021575aa')

# create df with only this user's events
# NOTE: timeframe_df['user_id_selector'] breaks: "# We only support first level boolean indices for now", so doing on full df for now
selected_user_df = buh_tuh[user_id_selector]

# left join conversions df, so we can check if the user converted
# (columns present in both frames are read back below with _left / _right suffixes)
user_timeline = selected_user_df.merge(conversions_df,how='left')

# rename and clean-up columns
user_timeline['moment'] = user_timeline['moment_left'] 
user_timeline['feature'] = user_timeline['feature_left']
user_timeline['conversion_feature'] = user_timeline['feature_right']

# show relevant columns, ordered on moment
user_timeline['moment','feature','conversion_feature'].sort_values({'moment':True}).head()

# TODO

In [None]:
# below parts first require some next steps in dub_buh_tuh

## Retention cohorts

In [None]:
## NOTE: continue on this when we have datetime intervals, so we can calculate start & end moments of cohorts.

# take the time aggregations that have users (first 100 rows) with the index turned into a column
timeframes = users.head(100).reset_index()

# drop the columns we don't need, only the time aggregation itself remains
for unused_column in ('user_id_nunique', 'share_of_total'):
    del timeframes[unused_column]

# name the fresh index 'cohort_nr' and reset it, so it becomes the cohort numbering column
timeframes2 = timeframes.rename_axis('cohort_nr').reset_index()
timeframes2.head()

## Events flow

In [None]:
# events per session hit number
# unique sessions per (session_hit_number, feature) combination, using the same
# timeframe selector that timeframe_df was built with
events_per_hit_number = buh_tuh[selector].groupby(['session_hit_number', 'feature'])['session_id'].nunique()

# preview, ordered on the hit number
events_per_hit_number.sort_values({'session_hit_number':True}).head()

## Conversion funnel

## Recency

In [None]:
# "the number of days between the close of one session and the opening of another"
# first step only: first and last moment of every session, per user
test = timeframe_df.groupby(['user_id', 'session_id']).aggregate(['moment','moment'],['min','max'])

test.head()
# NOTE: pick this up once we have window functions
# (recency needs the previous session's end next to the current session's start)

## Traffic source

## Geo 

## Devices