In [49]:
import pandas as pd
import numpy as np
from customer_journey import remove_page_duplicates, remove_pages, group_by
from collections import Counter

In [2]:
df = pd.read_csv("user_journey_raw.csv")

In [96]:
# df.head()

### 1. Page count

In [4]:
# The most fundamental metric; it counts how many times each page can be found in all user journeys.
def page_count(datafr, target_col='user_journey', subscription='all'):
    all_sequences = []
    for sequence in datafr[target_col]:
        page_list = sequence.split('-')
        page_to_count ={}
        for page in page_list:
            if page not in page_to_count.keys():
                page_to_count[page] = 1
            else:
                page_to_count[page] = page_to_count[page]+1
        all_sequences.append(page_to_count)
    return all_sequences

In [95]:
# page_count(df)

### 2. Page presence

In [5]:
# counts each page only once if it exists in a journey; it shows how many times each page is part of a journey
def page_presence(datafr, subscription='all'):
    present_pages = []

    for dict_ in page_count(datafr):
        present_pages.append(list(dict_.keys()))
    return present_pages

In [94]:
# page_presence(df)

### 3. Page sequences

In [92]:
""" look at what the most popular run of N pages is. I will consult this metric if I’m interested in the sequence of
three (or any other number) pages that most often show up. Count each sequence only once per journey.
"""
def page_sequence(datafr, n_pages:int, top_n:int, target_col='user_journey'):
    data_ = group_by(remove_page_duplicates(datafr, target_col))
    nested_trunc_tuples = []
    for sequence in data_[target_col]:
        list_ = sequence.split('-')
        # truncate the list_ to n len
        # turn the lists into tuples, so they're hashable and can be used with Counter
        trunc_list = tuple(list_[:n_pages])
        # only return tuples with a min length of n
        if len(trunc_list) >= n_pages:
            nested_trunc_tuples.append(trunc_list)

    counter = Counter(nested_trunc_tuples)
    top_most_common = counter.most_common(top_n)
    return top_most_common

In [93]:
page_sequence(df, 5, 5)

[(('Other', 'Other', 'Other', 'Other', 'Other'), 43),
 (('Other', 'Log in', 'Log in', 'Log in', 'Log in'), 35),
 (('Other', 'Log in', 'Other', 'Log in', 'Log in'), 6),
 (('Other', 'Homepage', 'Log in', 'Homepage', 'Log in'), 6),
 (('Homepage', 'Log in', 'Homepage', 'Log in', 'Homepage'), 5)]

### 4. Journey length

In [88]:
# the average length of a user's journey in terms of pages, per user session, I assume?
def journey_length(datafr, target_col='user_journey'):
    journey_lengths = []
    for sequence in datafr[target_col]:
        list_ = sequence.split('-')
        length = len(list_)
        journey_lengths.append(length)
    return np.mean(journey_lengths)


In [89]:
journey_length(df)

8.671363865123302