In [8]:
import pandas as pd
import numpy as np
from customer_journey import remove_page_duplicates, remove_pages, group_by
from collections import Counter

In [9]:
df = pd.read_csv("user_journey_raw.csv")

In [10]:
df.head()

Unnamed: 0,user_id,session_id,subscription_type,user_journey
0,1516,2980231,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
1,1516,2980248,Annual,Other-Sign up-Sign up-Sign up-Sign up-Sign up-...
2,1516,2992252,Annual,Log in-Log in-Log in-Log in-Log in-Log in
3,1516,3070491,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
4,1516,3709807,Annual,Log in-Log in-Log in-Log in-Log in-Log in-Log ...


In [16]:
user_journeys = np.array(df['user_journey'])
user_journeys

array(['Homepage-Log in-Log in-Log in-Log in-Log in-Log in-Log in-Log in-Log in-Log in-Log in-Log in-Log in-Log in-Log in-Log in-Other',
       'Other-Sign up-Sign up-Sign up-Sign up-Sign up-Log in-Log in-Log in-Log in-Log in-Log in',
       'Log in-Log in-Log in-Log in-Log in-Log in', ..., 'Other-Other',
       'Other-Other-Other-Other-Other-Other-Other-Other-Other-Other-Other-Other-Other-Other-Other-Other-Other-Other-Other-Other',
       'Other-Other-Other-Other-Other-Other-Coupon-Coupon-Coupon-Coupon-Coupon-Coupon-Coupon-Coupon-Coupon-Coupon'],
      dtype=object)

In [13]:
user_journeys = np.array(df['user_journey'])
for i in range(len(user_journeys)):
        user_journeys[i] = np.array(user_journeys[i].split("-"))
user_journeys

array([array(['Homepage', 'Log in', 'Log in', 'Log in', 'Log in', 'Log in',
              'Log in', 'Log in', 'Log in', 'Log in', 'Log in', 'Log in',
              'Log in', 'Log in', 'Log in', 'Log in', 'Log in', 'Other'],
             dtype='<U8')                                                  ,
       array(['Other', 'Sign up', 'Sign up', 'Sign up', 'Sign up', 'Sign up',
              'Log in', 'Log in', 'Log in', 'Log in', 'Log in', 'Log in'],
             dtype='<U7')                                                    ,
       array(['Log in', 'Log in', 'Log in', 'Log in', 'Log in', 'Log in'],
             dtype='<U6')                                                 ,
       ..., array(['Other', 'Other'], dtype='<U5'),
       array(['Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
              'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
              'Other', 'Other', 'Other', 'Other', 'Other', 'Other'], dtype='<U5'),
       array(['Other', 'Oth

## Supporting functions


In [None]:
def make_mask(data, match, target_column):
    """
    Returns a boolean list (mask) indicating when match is contained in the target_column of data.

        Parameters:
            data (pandas.DataFrame): The DataFrame for which we will create the mask

            match (str): The string to be matched in target_column. If "All", creates
                a mask full of True values (i.e. matching the whole DataFrame)

            target_column (str): The column in which to search for the string match


        Returns:
            mask (bool list): A list of bool values mapping the locations where match is contained in target_column
    """

    if match == 'All':
        size = len(data[target_column])
        mask =[True]*size
    else:
        mask = list(data[target_column] == match)
    return mask


In [None]:
def split_pages(data, taqrget_column='user_journey'):
    """
    Given a pandas DataFrame, split the user journey strings into separate pages.
    It does not update the DataFrame, rather returns a new numpy array.

    A user journey string is a string of the pages a user visited before purchassing, separated by a dash '-'.
    This function removes the dash and obtains a numpy array of all the pages (as strings).


        Parameters:
            data (pandas.DataFrame): The DataFrame containg user journeys data

            target_column (str): The column containing the user journey strings
                Default: 'user_journey', the expected name for the column


        Returns:
            user_journey (numpy.array): A numpy array containing arrays of the pages (str)
    """
     Transform the user journey column to a numpy array
    # numpy.array needed in order to be able to use a mask (filter)
    user_journeys = np.array(data[target_column])

    # Split the journey strings into pages -> [["page1", "page2", ..."pageN"], ...]
    for i in range(len(user_journeys)):
        user_journeys[i] = np.array(user_journeys[i].split("-"))


    return user_journeys

## Main functions


### 1. Page count

In [6]:
# The most fundamental metric; it counts how many times each page can be found in all user journeys.
def page_count(datafr, target_col='user_journey', subscription='all'):
    all_sequences = []
    for sequence in datafr[target_col]:
        page_list = sequence.split('-')
        page_to_count ={}
        for page in page_list:
            if page not in page_to_count.keys():
                page_to_count[page] = 1
            else:
                page_to_count[page] = page_to_count[page]+1
        all_sequences.append(page_to_count)
    return all_sequences

In [7]:
page_count(df)

[{'Homepage': 1, 'Log in': 16, 'Other': 1},
 {'Other': 1, 'Sign up': 5, 'Log in': 6},
 {'Log in': 6},
 {'Homepage': 1, 'Log in': 9},
 {'Log in': 14},
 {'Checkout': 4},
 {'Checkout': 6},
 {'Checkout': 26},
 {'Checkout': 2},
 {'Coupon': 2},
 {'Checkout': 4},
 {'Checkout': 12},
 {'Checkout': 2},
 {'Other': 2},
 {'Pricing': 1, 'Sign up': 2, 'Log in': 3},
 {'Homepage': 1, 'Pricing': 1},
 {'Pricing': 1, 'Checkout': 1},
 {'Checkout': 6},
 {'Homepage': 2},
 {'Homepage': 7, 'Career tracks': 4, 'Sign up': 1, 'Log in': 10},
 {'Homepage': 1, 'Resources center': 4, 'Other': 1},
 {'Homepage': 1, 'Career tracks': 3},
 {'Career tracks': 6, 'Courses': 2},
 {'Career tracks': 1, 'Log in': 7},
 {'Homepage': 1, 'Log in': 7},
 {'Checkout': 2},
 {'Checkout': 14},
 {'Log in': 10},
 {'Checkout': 2},
 {'Log in': 10},
 {'Log in': 16},
 {'Checkout': 2},
 {'Checkout': 2},
 {'Checkout': 2},
 {'Homepage': 1, 'Log in': 5},
 {'Homepage': 1, 'Log in': 3},
 {'Homepage': 1, 'Log in': 1},
 {'Homepage': 1, 'Log in': 3},
 {

### 2. Page presence

In [5]:
# counts each page only once if it exists in a journey; it shows how many times each page is part of a journey
def page_presence(datafr, subscription='all'):
    present_pages = []

    for dict_ in page_count(datafr):
        present_pages.append(list(dict_.keys()))
    return present_pages

In [94]:
# page_presence(df)

### 3. Page sequences

In [92]:
""" look at what the most popular run of N pages is. I will consult this metric if I’m interested in the sequence of
three (or any other number) pages that most often show up. Count each sequence only once per journey.
"""
def page_sequence(datafr, n_pages:int, top_n:int, target_col='user_journey'):
    data_ = group_by(remove_page_duplicates(datafr, target_col))
    nested_trunc_tuples = []
    for sequence in data_[target_col]:
        list_ = sequence.split('-')
        # truncate the list_ to n len
        # turn the lists into tuples, so they're hashable and can be used with Counter
        trunc_list = tuple(list_[:n_pages])
        # only return tuples with a min length of n
        if len(trunc_list) >= n_pages:
            nested_trunc_tuples.append(trunc_list)

    counter = Counter(nested_trunc_tuples)
    top_most_common = counter.most_common(top_n)
    return top_most_common

In [97]:
page_sequence(df, 4, 5)

[(('Other', 'Other', 'Other', 'Other'), 57),
 (('Other', 'Log in', 'Log in', 'Log in'), 42),
 (('Homepage', 'Sign up', 'Homepage', 'Log in'), 12),
 (('Other', 'Log in', 'Other', 'Log in'), 10),
 (('Homepage', 'Sign up', 'Log in', 'Log in'), 9)]

### 4. Journey length

In [88]:
# the average length of a user's journey in terms of pages, per user session, I assume?
def journey_length(datafr, target_col='user_journey'):
    journey_lengths = []
    for sequence in datafr[target_col]:
        list_ = sequence.split('-')
        length = len(list_)
        journey_lengths.append(length)
    return np.mean(journey_lengths)


In [89]:
journey_length(df)

8.671363865123302