In [8]:
import pandas as pd
import numpy as np
from customer_journey import remove_page_duplicates, remove_pages, group_by
from collections import Counter

In [9]:
df = pd.read_csv("user_journey_raw.csv")

In [24]:
df.head()

Unnamed: 0,user_id,session_id,subscription_type,user_journey
0,1516,2980231,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
1,1516,2980248,Annual,Other-Sign up-Sign up-Sign up-Sign up-Sign up-...
2,1516,2992252,Annual,Log in-Log in-Log in-Log in-Log in-Log in
3,1516,3070491,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
4,1516,3709807,Annual,Log in-Log in-Log in-Log in-Log in-Log in-Log ...


## Supporting functions


In [27]:
def make_mask(data, match, target_column):
    """
    Returns a boolean list (mask) indicating when match is contained in the target_column of data.

        Parameters:
            data (pandas.DataFrame): The DataFrame for which we will create the mask

            match (str): The string to be matched in target_column. If "All", creates
                a mask full of True values (i.e. matching the whole DataFrame)

            target_column (str): The column in which to search for the string match


        Returns:
            mask (bool list): A list of bool values mapping the locations where match is contained in target_column
    """

    if match == 'All':
        size = len(data[target_column])
        mask =[True]*size
    else:
        mask = list(data[target_column] == match)
    return mask


In [28]:
def split_pages(data, target_column='user_journey'):
    """
    Given a pandas DataFrame, split the user journey strings into separate pages.
    It does not update the DataFrame, rather returns a new numpy array.

    A user journey string is a string of the pages a user visited before purchassing, separated by a dash '-'.
    This function removes the dash and obtains a numpy array of all the pages (as strings).


        Parameters:
            data (pandas.DataFrame): The DataFrame containg user journeys data

            target_column (str): The column containing the user journey strings
                Default: 'user_journey', the expected name for the column


        Returns:
            user_journey (numpy.array): A numpy array containing arrays of the pages (str)
    """
    #  Transform the user journey column to a numpy array
    # numpy.array needed in order to be able to use a mask (filter)
    user_journeys = np.array(data[target_column])

    # Split the journey strings into pages -> [["page1", "page2", ..."pageN"], ...]
    for i in range(len(user_journeys)):
        user_journeys[i] = np.array(user_journeys[i].split("-"))


    return user_journeys

In [32]:
split_pages(df)

array([array(['Homepage', 'Log in', 'Log in', 'Log in', 'Log in', 'Log in',
              'Log in', 'Log in', 'Log in', 'Log in', 'Log in', 'Log in',
              'Log in', 'Log in', 'Log in', 'Log in', 'Log in', 'Other'],
             dtype='<U8')                                                  ,
       array(['Other', 'Sign up', 'Sign up', 'Sign up', 'Sign up', 'Sign up',
              'Log in', 'Log in', 'Log in', 'Log in', 'Log in', 'Log in'],
             dtype='<U7')                                                    ,
       array(['Log in', 'Log in', 'Log in', 'Log in', 'Log in', 'Log in'],
             dtype='<U6')                                                 ,
       ..., array(['Other', 'Other'], dtype='<U5'),
       array(['Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
              'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
              'Other', 'Other', 'Other', 'Other', 'Other', 'Other'], dtype='<U5'),
       array(['Other', 'Oth

## Main functions


### 1. Page count

In [79]:
# The most fundamental metric.
def page_count(data, target_column="user_journey", plan="All", mask=None):
    """
    Calculates the number of times each page is found in the user journey strings.

        Parameters:
            data (pandas.DataFrame): The DataFrame containing the user journeys data

            target_column (str): The column containing the user journey strings
                Default: 'user_journey', the expected name of the column

            plan (str): The subscription plan to filter the user journeys by. If "All", consideres all user journeys
                without filtering
                Default: "All", includes all data

            mask (bool list): A user provided boolean list to use when selecting which data to analyze
                Default: None, consider plan instead

            sort (bool): Whether to sort the resultant dictionary by values in decreasing order
                Default: True, return a sorted dictionary


        Returns:
            dict {str: int}: A dictionary of every page and its count
                {page: count}
    """
    # Set the proper mask if none was provided by the user
    if mask is None:
        mask = make_mask(data, plan, "subscription_type")
    print(mask)

    # Obtain the relevant array of user journey pages
    user_journeys = split_pages(data, target_column)
    user_journeys = user_journeys[mask]

    # Populate the dictionary with page counts
    page_to_count = {}
    for journey in user_journeys:
        for page in journey:
            if page not in page_to_count.keys():
                page_to_count[page] = 1
            else:
                page_to_count[page] = page_to_count[page] + 1
    sorted_keys = sorted(page_to_count, key = page_to_count.get, reverse = True)
    sorted_results = {key: page_to_count[key] for key in sorted_keys}
    return sorted_results

In [80]:
page_count(df)

[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, Tru

{'Checkout': 17896,
 'Log in': 17265,
 'Coupon': 11855,
 'Courses': 7149,
 'Sign up': 6824,
 'Other': 6820,
 'Career tracks': 4910,
 'Homepage': 3808,
 'Career track certificate': 3044,
 'Resources center': 2266,
 'Pricing': 2262,
 'Course certificate': 1114,
 'Success stories': 604,
 'Upcoming courses': 188,
 'Instructors': 76,
 'Blog': 36,
 'About us': 33}

### 2. Page presence

In [81]:
def page_presence(data, target_column = 'user_journey', plan = "All", mask = None):
    """
    Finds the number of journeys each page is present in (from the user jounrey strings), i.e. the most popular pages in all user journeys

        Parameters:
            data (pandas.DataFrame): The DataFrame containing the user journeys data

            target_column (str): The column containing the user journey strings
                Default: 'user_journey', the expected name of the column

            plan (str): The subscription plan to filter the user journeys by. If "All", consideres all user journeys
                without filtering
                Default: "All", includes all data

            mask (bool list): A user provided boolean list to use when selecting which data to analyze
                Default: None, consider plan instead

            sort (bool): Whether to sort the resultant dictionary by values in decreasing order
                Default: True, return a sorted dictionary


        Returns:
            dict {str: int}: A dictionary of every page and the number of journeys it is present in
                {page: count_journeys}
    """
    # Set the proper mask if none was provided by the user
    if mask is None:
        mask = make_mask(data, plan, "subscription_type")

    # Obtain the relevant array of user journey pages
    user_journeys = split_pages(data, target_column)
    user_journeys = user_journeys[mask]


    present_pages = {}

    for journey in user_journeys:
        pages = set(journey) # Unique pages in a journey
        for page in pages:
            present_pages[page] = present_pages.get(page,0)+1

        # Create a new dictionary, sorted by values (page counts) in decreasing order

        sorted_keys = sorted(present_pages, key = present_pages.get, reverse = True)
        sorted_results = {key: present_pages[key] for key in sorted_keys}



    return sorted_results

In [82]:
page_presence(df)

{'Log in': 3798,
 'Homepage': 2396,
 'Checkout': 2021,
 'Other': 1535,
 'Sign up': 1210,
 'Coupon': 1041,
 'Pricing': 929,
 'Courses': 908,
 'Career tracks': 747,
 'Career track certificate': 355,
 'Resources center': 339,
 'Course certificate': 191,
 'Upcoming courses': 101,
 'Success stories': 49,
 'Instructors': 26,
 'About us': 22,
 'Blog': 15}

### 3. Page sequences

In [92]:
""" look at what the most popular run of N pages is. I will consult this metric if I’m interested in the sequence of
three (or any other number) pages that most often show up. Count each sequence only once per journey.
"""
def page_sequence(datafr, n_pages:int, top_n:int, target_col='user_journey'):
    data_ = group_by(remove_page_duplicates(datafr, target_col))
    nested_trunc_tuples = []
    for sequence in data_[target_col]:
        list_ = sequence.split('-')
        # truncate the list_ to n len
        # turn the lists into tuples, so they're hashable and can be used with Counter
        trunc_list = tuple(list_[:n_pages])
        # only return tuples with a min length of n
        if len(trunc_list) >= n_pages:
            nested_trunc_tuples.append(trunc_list)

    counter = Counter(nested_trunc_tuples)
    top_most_common = counter.most_common(top_n)
    return top_most_common

In [97]:
page_sequence(df, 4, 5)

[(('Other', 'Other', 'Other', 'Other'), 57),
 (('Other', 'Log in', 'Log in', 'Log in'), 42),
 (('Homepage', 'Sign up', 'Homepage', 'Log in'), 12),
 (('Other', 'Log in', 'Other', 'Log in'), 10),
 (('Homepage', 'Sign up', 'Log in', 'Log in'), 9)]

### 4. Journey length

In [88]:
# the average length of a user's journey in terms of pages, per user session, I assume?
def journey_length(datafr, target_col='user_journey'):
    journey_lengths = []
    for sequence in datafr[target_col]:
        list_ = sequence.split('-')
        length = len(list_)
        journey_lengths.append(length)
    return np.mean(journey_lengths)


In [89]:
journey_length(df)

8.671363865123302