In [1]:
import pandas as pd
import numpy as np
from customer_journey import remove_page_duplicates, remove_pages, group_by
from collections import Counter

In [2]:
df = pd.read_csv("user_journey_raw.csv")

In [3]:
df.head()

Unnamed: 0,user_id,session_id,subscription_type,user_journey
0,1516,2980231,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
1,1516,2980248,Annual,Other-Sign up-Sign up-Sign up-Sign up-Sign up-...
2,1516,2992252,Annual,Log in-Log in-Log in-Log in-Log in-Log in
3,1516,3070491,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
4,1516,3709807,Annual,Log in-Log in-Log in-Log in-Log in-Log in-Log ...


## Supporting functions


In [4]:
def make_mask(data, match, target_column):
    """
    Returns a boolean list (mask) indicating when match is contained in the target_column of data.

        Parameters:
            data (pandas.DataFrame): The DataFrame for which we will create the mask

            match (str): The string to be matched in target_column. If "All", creates
                a mask full of True values (i.e. matching the whole DataFrame)

            target_column (str): The column in which to search for the string match


        Returns:
            mask (bool list): A list of bool values mapping the locations where match is contained in target_column
    """

    if match == 'All':
        size = len(data[target_column])
        mask =[True]*size
    else:
        mask = list(data[target_column] == match)
    return mask


In [5]:
def split_pages(data, target_column='user_journey'):
    """
    Given a pandas DataFrame, split the user journey strings into separate pages.
    It does not update the DataFrame, rather returns a new numpy array.

    A user journey string is a string of the pages a user visited before purchassing, separated by a dash '-'.
    This function removes the dash and obtains a numpy array of all the pages (as strings).


        Parameters:
            data (pandas.DataFrame): The DataFrame containg user journeys data

            target_column (str): The column containing the user journey strings
                Default: 'user_journey', the expected name for the column


        Returns:
            user_journey (numpy.array): A numpy array containing arrays of the pages (str)
    """
    #  Transform the user journey column to a numpy array
    # numpy.array needed in order to be able to use a mask (filter)
    user_journeys = np.array(data[target_column])

    # Split the journey strings into pages -> [["page1", "page2", ..."pageN"], ...]
    for i in range(len(user_journeys)):
        user_journeys[i] = np.array(user_journeys[i].split("-"))


    return user_journeys

In [6]:
split_pages(df)

array([array(['Homepage', 'Log in', 'Log in', 'Log in', 'Log in', 'Log in',
              'Log in', 'Log in', 'Log in', 'Log in', 'Log in', 'Log in',
              'Log in', 'Log in', 'Log in', 'Log in', 'Log in', 'Other'],
             dtype='<U8')                                                  ,
       array(['Other', 'Sign up', 'Sign up', 'Sign up', 'Sign up', 'Sign up',
              'Log in', 'Log in', 'Log in', 'Log in', 'Log in', 'Log in'],
             dtype='<U7')                                                    ,
       array(['Log in', 'Log in', 'Log in', 'Log in', 'Log in', 'Log in'],
             dtype='<U6')                                                 ,
       ..., array(['Other', 'Other'], dtype='<U5'),
       array(['Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
              'Other', 'Other', 'Other', 'Other', 'Other', 'Other', 'Other',
              'Other', 'Other', 'Other', 'Other', 'Other', 'Other'], dtype='<U5'),
       array(['Other', 'Oth

In [7]:
def remove_consecutive_duplicates(journey):
    """Removes consecutive duplicates from a list."""
    result = []
    # previous = None
    for page in journey:
        if page not in result:
            result.append(page)
            # previous = page
    return result

## Main functions


### 1. Page count

In [8]:
# The most fundamental metric.
def page_count(data, target_column="user_journey", plan="All", mask=None):
    """
    Calculates the number of times each page is found in the user journey strings.

        Parameters:
            data (pandas.DataFrame): The DataFrame containing the user journeys data

            target_column (str): The column containing the user journey strings
                Default: 'user_journey', the expected name of the column

            plan (str): The subscription plan to filter the user journeys by. If "All", consideres all user journeys
                without filtering
                Default: "All", includes all data

            mask (bool list): A user provided boolean list to use when selecting which data to analyze
                Default: None, consider plan instead

            sort (bool): Whether to sort the resultant dictionary by values in decreasing order
                Default: True, return a sorted dictionary


        Returns:
            dict {str: int}: A dictionary of every page and its count
                {page: count}
    """
    # Set the proper mask if none was provided by the user
    if mask is None:
        mask = make_mask(data, plan, "subscription_type")
    # print(mask)

    # Obtain the relevant array of user journey pages
    user_journeys = split_pages(data, target_column)
    user_journeys = user_journeys[mask]

    # Populate the dictionary with page counts
    page_to_count = {}
    for journey in user_journeys:
        for page in journey:
            if page not in page_to_count.keys():
                page_to_count[page] = 1
            else:
                page_to_count[page] = page_to_count[page] + 1
    sorted_keys = sorted(page_to_count, key = page_to_count.get, reverse = True)
    sorted_results = {key: page_to_count[key] for key in sorted_keys}
    return sorted_results

### 2. Page presence

In [9]:
def page_presence(data, target_column = 'user_journey', plan = "All", mask = None):
    """
    Finds the number of journeys each page is present in (from the user jounrey strings), i.e. the most popular pages in all user journeys

        Parameters:
            data (pandas.DataFrame): The DataFrame containing the user journeys data

            target_column (str): The column containing the user journey strings
                Default: 'user_journey', the expected name of the column

            plan (str): The subscription plan to filter the user journeys by. If "All", consideres all user journeys
                without filtering
                Default: "All", includes all data

            mask (bool list): A user provided boolean list to use when selecting which data to analyze
                Default: None, consider plan instead

            sort (bool): Whether to sort the resultant dictionary by values in decreasing order
                Default: True, return a sorted dictionary


        Returns:
            dict {str: int}: A dictionary of every page and the number of journeys it is present in
                {page: count_journeys}
    """
    # Set the proper mask if none was provided by the user
    if mask is None:
        mask = make_mask(data, plan, "subscription_type")

    # Obtain the relevant array of user journey pages
    user_journeys = split_pages(data, target_column)
    user_journeys = user_journeys[mask]


    present_pages = {}

    for journey in user_journeys:
        pages = set(journey) # Unique pages in a journey
        for page in pages:
            present_pages[page] = present_pages.get(page,0)+1

        # Create a new dictionary, sorted by values (page counts) in decreasing order

        sorted_keys = sorted(present_pages, key = present_pages.get, reverse = True)
        sorted_results = {key: present_pages[key] for key in sorted_keys}



    return sorted_results

### 3. Page sequences

In [10]:
def page_sequence(data, number_of_pages:int, show_results:int, target_column='user_journey', plan = "All", mask = None, sort = True):
    """
    Finds the top consecutive page strings and their count. I will consult this metric if I’m interested in the sequence of
    n pages that most often show up.

    Parameters:
            data (pandas.DataFrame): The DataFrame containing the user journeys data

            number_of_pages (int): The amount of consecutive pages to include in each subset
                Default: 3

            show_results (int): The number of results to return
                Default: 10

            target_column (str): The column containing the user journey strings
                Default: 'user_journey', the expected name of the column

            plan (str): The subscription plan to filter the user journeys by. If "All", consideres all user journeys
                without filtering
                Default: "All", includes all data

            mask (bool list): A user provided boolean list to use when selecting which data to analyze
                Default: None, consider plan instead

            sort (bool): Whether to sort the resultant dictionary by values in decreasing order
                Default: True, return a sorted dictionary


        Returns:
            dict {tuple(str): int}: A dictionary where the key is a page combination tuple and the value is its count.
                The result is sorted and contains only the top (show_results) tuples.
                If sort = False, return all page combinations, unsorted.
                {(page1, page2, ... pageN): count}
    """

    # Set the proper mask if none was provided by the user
    if mask is None:
        mask = make_mask(data, plan, "subscription_type")


    # Obtain the relevant array of user journey pages
    user_journeys = split_pages(data, target_column)
    user_journeys = user_journeys[mask]


    # Define the empty result dictionary
    results = {}

    # Populate the result dictionary
    for journey in user_journeys:
        journey = remove_consecutive_duplicates(journey)

        flag = {} # Used to stop double counting of the same page combination in the same journey

        for i in range(len(journey) - number_of_pages + 1):

            page_combination = tuple(journey[i : i + number_of_pages])

            if flag.get(page_combination, False): continue # If we have counted it already, skip it

            results[page_combination] = results.get(page_combination, 0) + 1
            flag[page_combination] = True # Flag this combination as already counted in this journey

        if not sort:
            return results

    # Create a new dictionary, sorted by values (page counts) in decreasing order
    # Take only top 'show_results' pages
    sorted_keys = sorted(results, key = results.get, reverse = True)[:show_results]
    sorted_results = {key: results[key] for key in sorted_keys}



    return sorted_results


### 4. Journey length

In [21]:
# the average length of a user's journey in terms of pages, per user session, I assume?
def journey_length(data, target_column = 'user_journey', plan = "All", mask = None):

    # Set the proper mask if none was provided by the user
    if mask is None:
        mask = make_mask(data, plan, "subscription_type")


    # Obtain the relevant array of user journey pages
    user_journeys = split_pages(data, target_column)
    user_journeys = user_journeys[mask]

    journey_lengths = []
    for journey in user_journeys:
        length = len(journey)
        journey_lengths.append(length)
    return np.mean(journey_lengths)

## Results


#### 1. Page count

In [22]:
page_count(df)

{'Checkout': 17896,
 'Log in': 17265,
 'Coupon': 11855,
 'Courses': 7149,
 'Sign up': 6824,
 'Other': 6820,
 'Career tracks': 4910,
 'Homepage': 3808,
 'Career track certificate': 3044,
 'Resources center': 2266,
 'Pricing': 2262,
 'Course certificate': 1114,
 'Success stories': 604,
 'Upcoming courses': 188,
 'Instructors': 76,
 'Blog': 36,
 'About us': 33}

#### 2. Page presence

In [23]:
page_presence(df)

{'Log in': 3798,
 'Homepage': 2396,
 'Checkout': 2021,
 'Other': 1535,
 'Sign up': 1210,
 'Coupon': 1041,
 'Pricing': 929,
 'Courses': 908,
 'Career tracks': 747,
 'Career track certificate': 355,
 'Resources center': 339,
 'Course certificate': 191,
 'Upcoming courses': 101,
 'Success stories': 49,
 'Instructors': 26,
 'About us': 22,
 'Blog': 15}

#### 3. Page sequences

In [24]:
page_sequence(df, number_of_pages = 3, show_results = 10)

{('Homepage', 'Pricing', 'Checkout'): 103,
 ('Homepage', 'Sign up', 'Log in'): 70,
 ('Homepage', 'Career tracks', 'Sign up'): 58,
 ('Homepage', 'Career tracks', 'Courses'): 54,
 ('Homepage', 'Courses', 'Sign up'): 40,
 ('Homepage', 'Pricing', 'Sign up'): 40,
 ('Courses', 'Sign up', 'Log in'): 35,
 ('Homepage', 'Courses', 'Career tracks'): 35,
 ('Homepage', 'Pricing', 'Courses'): 34,
 ('Homepage', 'Career tracks', 'Pricing'): 30}

#### 4. Average journey length

In [25]:
journey_length(df)

8.671363865123302