In [36]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from collections import Counter, deque
import itertools

# Lift pandas' column display limit so wide frames are shown without truncated columns.
pd.set_option('display.max_columns', None)

In [2]:
# Load the raw session-level export (per the preview below: one row per session, with the pages
# visited in that session stored as a '-'-separated string in `user_journey`).
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook only runs
# where the file exists at exactly this location — consider a path relative to a data directory.
df = pd.read_csv(r'D:\DONE PROJECTS\user_journey\3 - user_journey_dumped.csv')
# Preview the first 25 rows.
df.head(25)

Unnamed: 0,user_id,session_id,subscription_type,user_journey
0,1516,2980231,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
1,1516,2980248,Annual,Other-Sign up-Sign up-Sign up-Sign up-Sign up-...
2,1516,2992252,Annual,Log in-Log in-Log in-Log in-Log in-Log in
3,1516,3070491,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
4,1516,3709807,Annual,Log in-Log in-Log in-Log in-Log in-Log in-Log ...
5,1516,3723132,Annual,Checkout-Checkout-Checkout-Checkout
6,1516,3723365,Annual,Checkout-Checkout-Checkout-Checkout-Checkout-C...
7,1516,3723382,Annual,Checkout-Checkout-Checkout-Checkout-Checkout-C...
8,1516,3723427,Annual,Checkout-Checkout
9,1516,3723483,Annual,Coupon-Coupon


In [3]:
# Check column dtypes and non-null counts before transforming the data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9935 entries, 0 to 9934
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   user_id            9935 non-null   int64 
 1   session_id         9935 non-null   int64 
 2   subscription_type  9935 non-null   object
 3   user_journey       9935 non-null   object
dtypes: int64(2), object(2)
memory usage: 310.6+ KB


In [4]:
def group_by_sessions(data: pd.DataFrame, group_col: str, target_col: str, num_of_sessions: str|int = 'All', count_from: str = 'First') -> pd.DataFrame:
    '''Group user pages visited into a single one big string,
        [num_of_sessions] is the amount of sessions to group from,
        [count_from] is the method to set a direction from where to fetch the amount of sessions, either from the start or the end of user journey'''

    # Validate parameters to match required data types and formats
    if count_from not in {'First', 'Last'}:
        raise ValueError('count_from parameter has to be either "First" or "Last"')
    if not isinstance(num_of_sessions, int) and num_of_sessions != 'All':
        raise ValueError('num_of_sessions parameter has to be either "All" or numeric value') 

    # Make a copy of inputed dataframe
    new_df = data.copy()

    # Return a full user journey string 
    if num_of_sessions == 'All':
        new_df = new_df.groupby(group_col)[target_col].agg(lambda x: (x + '-').sum().strip('-')).reset_index()
        #new_df['user_journey'] = new_df['user_journey'].map(cleanup_session_journey)
        return new_df

    # Return first [num_of_sessions] 
    elif count_from == 'First':
        new_df['row_number'] = new_df.sort_values(['user_id', 'session_id'], ascending=[True, True]).groupby('user_id').cumcount() + 1
        new_df = new_df[new_df['row_number'] <= num_of_sessions]
        new_df = new_df.groupby(group_col)[target_col].agg(lambda x: (x + '-').sum().strip('-')).reset_index()
        #new_df['user_journey'] = new_df['user_journey'].map(cleanup_session_journey)
        return new_df
    
    # Return last [num_of_sessions]
    elif count_from == 'Last':
        new_df['row_number'] = new_df.sort_values(['user_id', 'session_id'], ascending=[True, False]).groupby('user_id').cumcount() + 1
        new_df = new_df[new_df['row_number'] <= num_of_sessions]
        new_df = new_df.groupby(group_col)[target_col].agg(lambda x: (x + '-').sum().strip('-')).reset_index()
        #new_df['user_journey'] = new_df['user_journey'].map(cleanup_session_journey)
        return new_df
    

In [5]:
def remove_subsequent_pages(journey: str):
    '''Collapse every run of identical consecutive pages into a single page.

    For example `Homepage-Homepage-Homepage-Login` becomes `Homepage-Login`.
    Pages that repeat later in the journey, but not back to back, are kept.
    '''
    visited_pages = journey.split('-')
    # groupby() without a key clusters equal neighbours, so each run yields its page exactly once
    collapsed = (page for page, _run in itertools.groupby(visited_pages))
    return '-'.join(collapsed)

In [6]:
def remove_specific_pages(journey: str, pages_to_remove: list):
    '''Return `journey` without any page listed in `pages_to_remove`.

    `journey` is a '-'-separated string of page names; the result uses the same
    format and keeps the remaining pages in their original order.

    Raises ValueError if `pages_to_remove` is not a list.
    '''
    # Ensure that pages to be removed are present as list
    if not isinstance(pages_to_remove, list):
        raise ValueError('The pages to be removed should be presented in the form of a list')
    # Build a new list instead of popping while iterating: popping shifts the remaining
    # elements left, which made the loop skip the page right after every removed one.
    kept_pages = [page for page in journey.split('-') if page not in pages_to_remove]
    return '-'.join(kept_pages)


In [27]:
def get_users_by_subscription(data: pd.DataFrame, subscription: str) -> pd.DataFrame:
    '''Return the rows of `data` whose `subscription_type` equals `subscription`.

    Raises ValueError when `subscription` is not one of Annual, Quarterly, Monthly.
    '''
    known_plans = {'Annual', 'Quarterly', 'Monthly'}
    if subscription in known_plans:
        has_requested_plan = data['subscription_type'] == subscription
        return data[has_requested_plan]

    raise ValueError('There are only 3 subscription types: Annual, Quarterly, Monthly.')

## Make a dataset for future analysis

In [60]:
# Concatenate all sessions of each user into one journey string per user_id.
df1 = group_by_sessions(df, 'user_id', 'user_journey', num_of_sessions='All', count_from='First')

# One subscription_type per user: max() keeps the alphabetically greatest value of the group.
# NOTE(review): presumably every user has exactly one plan, making max() a plain lookup — confirm.
subscription_types = df.groupby('user_id')['subscription_type'].max().reset_index()
all_subs = pd.merge(df1, subscription_types, on='user_id') # Add subscription types 

# Collapse back-to-back repeats of the same page inside every journey.
all_subs['user_journey'] = all_subs['user_journey'].map(remove_subsequent_pages)
# all_subs['user_journey'] = all_subs['user_journey'].map(lambda x: remove_specific_pages(x, ['Homepage']))
all_subs

Unnamed: 0,user_id,user_journey,subscription_type
0,1516,Homepage-Log in-Other-Sign up-Log in-Homepage-...,Annual
1,3395,Other-Pricing-Sign up-Log in-Homepage-Pricing-...,Annual
2,10107,Homepage-Career tracks-Homepage-Career tracks-...,Annual
3,11145,Homepage-Log in-Homepage-Log in-Homepage-Log i...,Monthly
4,12400,Homepage-Career tracks-Sign up-Log in-Other-Ca...,Monthly
...,...,...,...
1345,509060,Other,Annual
1346,509061,Coupon,Annual
1347,509085,Coupon,Annual
1348,509095,Other,Annual


In [31]:
# Split the per-user journeys by plan. This must filter `all_subs`: `df1` only has the
# `user_id` and `user_journey` columns, so filtering it on `subscription_type` raises KeyError.
annual_subs = get_users_by_subscription(all_subs, 'Annual')
monthly_subs = get_users_by_subscription(all_subs, 'Monthly')
quarterly_subs = get_users_by_subscription(all_subs, 'Quarterly')

## Page count is the most fundamental metric; it counts how many times each page can be found in all user journeys.

In [32]:
def total_page_count(data: pd.DataFrame) -> None:
    '''Print how many times each page occurs across all journeys in `data`.

    `data` must have a `user_journey` column of '-'-separated page names.
    One line `<page>: <count>` is printed per page, in order of first
    appearance. Missing journeys and empty page names are ignored, and nothing
    is printed for an empty frame. The function returns nothing.
    '''
    # Count page by page instead of summing all journeys into one huge string:
    # this is linear in the number of pages and also works when `data` has no rows.
    page_count = Counter(
        page
        for journey in data['user_journey'].dropna()
        for page in journey.split('-')
        if page
    )
    for page, occurrences in page_count.items():
        print(f"{page}: {occurrences}")

# Page counts over the journeys of all users, regardless of subscription plan.
total_page_count(all_subs)

Homepage: 2679
Log in: 2234
Other: 1189
Sign up: 1247
Checkout: 1351
Coupon: 720
Pricing: 1053
Career tracks: 1070
Resources center: 546
Courses: 1087
Career track certificate: 468
Instructors: 43
Course certificate: 212
Success stories: 49
Upcoming courses: 110
About us: 20
Blog: 20


## Page presence is similar to ‘page count’ but counts each page at most once per journey; it shows how many journeys contain each page.

In [33]:
# Same idea as the page count, but every page is counted at most once per journey
def page_presence(data: pd.DataFrame) -> None:
    '''Print, for each page, the number of journeys in `data` that contain it.

    `data` must have a `user_journey` column of '-'-separated page names.
    One line `<page>: <count>` is printed per page, in order of first
    appearance. Missing journeys and empty page names are ignored, and nothing
    is printed for an empty frame. The function returns nothing.
    '''
    journeys_containing = Counter()
    for journey in data['user_journey'].dropna():
        # dict.fromkeys de-duplicates while keeping first-seen order, so the printed order is
        # reproducible between runs (iterating a set of strings is not). Convert to a list:
        # Counter.update() would treat a mapping as {page: count} instead of counting keys.
        unique_pages = list(dict.fromkeys(page for page in journey.split('-') if page))
        journeys_containing.update(unique_pages)
    for page, journey_total in journeys_containing.items():
        print(f"{page}: {journey_total}")

# Page presence over the journeys of all users, regardless of subscription plan.
page_presence(all_subs)

Log in: 756
Checkout: 821
Homepage: 843
Sign up: 738
Coupon: 606
Other: 623
Pricing: 476
Courses: 453
Career tracks: 380
Resources center: 184
Instructors: 25
Career track certificate: 228
Course certificate: 151
Success stories: 38
Upcoming courses: 83
About us: 17
Blog: 13


## Page destination is a metric that shows the most frequent follow-ups after every page. It looks at every page and counts which pages follow next. <br>
### The number of pages in a sequence is configurable.

In [56]:
def page_followups(data: pd.DataFrame, sequence_of_pages: int = 2) -> None:
    '''Print how often each sequence of consecutive pages occurs in the journeys.

    `data` must have a `user_journey` column of '-'-separated page names.
    `sequence_of_pages` is the length of the sequences to count (2 = a page and
    its direct follow-up). Sequences are counted inside each journey only, so
    the last page of one row is never paired with the first page of the next
    row. Journeys shorter than `sequence_of_pages` contribute nothing.

    One line `page -> page -> ...: <count>` is printed per sequence, most
    frequent first; ties keep their first-seen order. The function returns nothing.

    Raises ValueError if `sequence_of_pages` is not a positive integer.
    '''
    if isinstance(sequence_of_pages, bool) or not isinstance(sequence_of_pages, int) or sequence_of_pages < 1:
        raise ValueError('sequence_of_pages has to be a positive integer')

    sequence_counts = Counter()
    for journey in data['user_journey'].dropna():
        pages = journey.split('-')
        # Zipping the journey with shifted copies of itself yields every window of
        # `sequence_of_pages` consecutive pages; zip stops at the shortest slice.
        windows = zip(*(pages[offset:] for offset in range(sequence_of_pages)))
        sequence_counts.update(windows)

    line_template = ' -> '.join(['{}'] * sequence_of_pages) + ': {}'
    # sorted() is stable, so equally frequent sequences stay in first-seen order
    for sequence, occurrences in sorted(sequence_counts.items(), key=lambda item: -item[1]):
        print(line_template.format(*sequence, occurrences))


# Most frequent two-page sequences (a page and its direct follow-up) over all users.
page_followups(all_subs, sequence_of_pages=2)

Homepage -> Log in -> Homepage -> Log in: 382
Log in -> Homepage -> Log in -> Homepage: 341
Resources center -> Other -> Resources center -> Other: 248
Other -> Resources center -> Other -> Resources center: 236
Log in -> Homepage -> Log in -> Checkout: 122
Career tracks -> Courses -> Career tracks -> Courses: 122
Homepage -> Log in -> Checkout -> Homepage: 110
Courses -> Career tracks -> Courses -> Career tracks: 103
Homepage -> Career tracks -> Homepage -> Career tracks: 83
Log in -> Checkout -> Log in -> Checkout: 76
Log in -> Checkout -> Homepage -> Log in: 65
Sign up -> Log in -> Homepage -> Log in: 59
Checkout -> Homepage -> Log in -> Checkout: 58
Log in -> Checkout -> Other -> Log in: 58
Pricing -> Checkout -> Pricing -> Checkout: 56
Homepage -> Log in -> Checkout -> Other: 55
Career tracks -> Homepage -> Career tracks -> Homepage: 49
Homepage -> Pricing -> Checkout -> Pricing: 49
Homepage -> Pricing -> Homepage -> Pricing: 49
Homepage -> Log in -> Checkout -> Log in: 48
Log in 

## Journey length is a metric that considers the average length of a user journey in terms of pages.

In [59]:
def user_journey_avg_length(data: pd.DataFrame) -> int:
    '''Print the mean number of pages per journey, truncated to a whole number.

    `data` must have a `user_journey` column of '-'-separated page names.
    The result is only printed; nothing is returned.
    '''
    # A journey with k separators consists of k + 1 pages
    pages_per_journey = data["user_journey"].map(lambda journey: journey.count("-") + 1)
    average_pages = pages_per_journey.mean()
    length = int(average_pages)
    print(f'The average length of a user journey is {length} pages.')

# Average journey length over all users, regardless of subscription plan.
user_journey_avg_length(all_subs)

The average length of a user journey is 10 pages.



# Recall, however, that the data provided also has a subscription plan column. A vital part of the analysis is finding patterns and differences in the behavior of buyers of different plans — for example, comparing the journeys of monthly users with those of annual users. This is why it’s essential to incorporate a plan parameter in these functions, allowing a data scientist to obtain the metrics for all subscription plans or any specific one.



In [61]:
# Show the per-user journeys again, including the subscription_type column.
all_subs

Unnamed: 0,user_id,user_journey,subscription_type
0,1516,Homepage-Log in-Other-Sign up-Log in-Homepage-...,Annual
1,3395,Other-Pricing-Sign up-Log in-Homepage-Pricing-...,Annual
2,10107,Homepage-Career tracks-Homepage-Career tracks-...,Annual
3,11145,Homepage-Log in-Homepage-Log in-Homepage-Log i...,Monthly
4,12400,Homepage-Career tracks-Sign up-Log in-Other-Ca...,Monthly
...,...,...,...
1345,509060,Other,Annual
1346,509061,Coupon,Annual
1347,509085,Coupon,Annual
1348,509095,Other,Annual
