In [13]:
import pandas as pd
from collections import Counter
from itertools import islice

In [14]:
# Load the preprocessed data (path is relative to the notebook's working directory).
# As the preview shows, the CSV's saved index comes back as an 'Unnamed: 0' column,
# and `user_journey` holds the visited page names joined by '-'.
df = pd.read_csv('processed_user_journeys.csv')
df.head()

Unnamed: 0,user_id,user_journey
0,1516,Homepage-Log in-Other-Other-Sign up-Log in-Log...
1,3395,Other-Pricing-Sign up-Log in-Homepage-Pricing-...
2,10107,Homepage-Homepage-Career tracks-Homepage-Caree...
3,11145,Homepage-Log in-Homepage-Log in-Homepage-Log i...
4,12400,Homepage-Career tracks-Sign up-Log in-Other-Ca...


In [15]:
def page_count(data, target_column='user_journey'):
    """Tally how often each page occurs across all journeys.

    Each journey string is split on '-' and every resulting page name is
    counted once per occurrence, so repeat visits within one journey all count.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the journeys.
    target_column : str, default 'user_journey'
        Name of the column whose values are '-'-separated page names.

    Returns
    -------
    pandas.DataFrame
        Columns 'Page' and 'Count', ordered by 'Count' from highest to lowest.
        Index labels follow the order in which pages were first encountered.
    """
    # Flatten every journey into one stream of page names and count it in one go.
    visits = Counter(
        page
        for journey in data[target_column]
        for page in journey.split('-')
    )

    tally = pd.DataFrame(list(visits.items()), columns=['Page', 'Count'])
    return tally.sort_values(by='Count', ascending=False)

In [16]:
# Total visits per page over every journey in df; preview the five most visited.
page_counts = page_count(df)
page_counts.head()

Unnamed: 0,Page,Count
1,Log in,3876
0,Homepage,2738
4,Checkout,2064
2,Other,1763
3,Sign up,1303


In [17]:
def page_presence(data, target_column='user_journey'):
    """Count in how many journeys each page appears at least once.

    Unlike a raw visit count, a page contributes at most 1 per journey no
    matter how many times it was visited within that journey.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the journeys.
    target_column : str, default 'user_journey'
        Name of the column whose values are '-'-separated page names.

    Returns
    -------
    pandas.DataFrame
        Columns 'Page' and 'Presence', ordered by 'Presence' from highest to
        lowest. Index labels follow the order in which pages were first
        encountered, and pages with equal presence keep that same order, so
        the result is identical from one interpreter run to the next.
    """
    page_presence_counter = Counter()
    for journey in data[target_column]:
        # dict.fromkeys de-duplicates while keeping first-visit order. A set
        # iterates strings in hash order, which changes between interpreter
        # runs and would make index labels and tie order irreproducible.
        # The keys are materialised as a list because Counter.update treats a
        # mapping argument as {element: count} rather than as elements.
        unique_pages = list(dict.fromkeys(journey.split('-')))
        page_presence_counter.update(unique_pages)

    page_presence_df = pd.DataFrame(
        list(page_presence_counter.items()), columns=['Page', 'Presence']
    ).sort_values(by='Presence', ascending=False, kind='stable')
    return page_presence_df

In [18]:
# Number of journeys in df that contain each page at least once; preview the top five.
page_presences = page_presence(df)
page_presences.head()

Unnamed: 0,Page,Presence
0,Homepage,843
5,Checkout,821
4,Log in,756
1,Sign up,738
2,Other,623


In [19]:
def page_destination(data, target_column='user_journey'):
    """Count page-to-next-page transitions across all journeys.

    Every pair of consecutive pages inside a journey counts as one transition,
    including a page followed by itself. Journeys with a single page
    contribute nothing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the journeys.
    target_column : str, default 'user_journey'
        Name of the column whose values are '-'-separated page names.

    Returns
    -------
    pandas.DataFrame
        Columns 'Page', 'Next_Page' and 'Frequency', ordered by 'Page'
        ascending and, within each page, by 'Frequency' descending.
    """
    transitions = Counter()
    for journey in data[target_column]:
        steps = journey.split('-')
        # Zipping the list with itself shifted by one yields each
        # (page, following page) pair in visit order.
        transitions.update(zip(steps, steps[1:]))

    rows = [
        (origin, target, occurrences)
        for (origin, target), occurrences in transitions.items()
    ]
    table = pd.DataFrame(rows, columns=['Page', 'Next_Page', 'Frequency'])
    return table.sort_values(by=['Page', 'Frequency'], ascending=[True, False])

In [20]:
page_destinations = page_destination(df)

# Keep only the first row seen for each 'Next_Page' value. page_destination
# returns rows sorted by 'Page' first, so the surviving rows come from whichever
# source page sorts earliest -- the output below is entirely 'About us'.
# NOTE(review): if the goal is the most frequent destination for each page,
# de-duplicating on 'Page' instead would give that -- confirm the intent.
unique_page_destinations = page_destinations.drop_duplicates(subset=['Next_Page'])

# Display the dataframe
unique_page_destinations

Unnamed: 0,Page,Next_Page,Frequency
119,About us,Other,6
203,About us,Pricing,3
201,About us,Resources center,2
215,About us,Homepage,2
219,About us,About us,2
199,About us,Sign up,1
220,About us,Career track certificate,1
222,About us,Instructors,1
226,About us,Courses,1
228,About us,Upcoming courses,1


In [21]:
def page_sequences(data, target_column='user_journey', sequence_length=3):
    """Count in how many journeys each run of consecutive pages occurs.

    A sliding window of `sequence_length` pages is moved over every journey.
    Each distinct window counts at most once per journey, so 'Frequency' is
    the number of journeys containing that sequence, not the number of times
    it occurs. Journeys shorter than `sequence_length` contribute nothing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the journeys.
    target_column : str, default 'user_journey'
        Name of the column whose values are '-'-separated page names.
    sequence_length : int, default 3
        Number of consecutive pages per sequence; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Columns 'Sequence' (a tuple of page names) and 'Frequency', ordered by
        'Frequency' from highest to lowest. Index labels follow the order in
        which sequences were first encountered, and equally frequent sequences
        keep that order, so the result is reproducible across interpreter runs.

    Raises
    ------
    ValueError
        If `sequence_length` is smaller than 1 (a window of zero or negative
        width has no meaningful page sequence).
    """
    if sequence_length < 1:
        raise ValueError(
            f"sequence_length must be at least 1, got {sequence_length}"
        )

    sequence_counter = Counter()
    for journey in data[target_column]:
        pages = journey.split('-')
        windows = (
            tuple(pages[i:i + sequence_length])
            for i in range(len(pages) - sequence_length + 1)
        )
        # De-duplicate per journey while keeping first-occurrence order. A set
        # would iterate in hash order, which varies between interpreter runs
        # for strings and would make index labels and tie order irreproducible.
        # The keys are materialised as a list because Counter.update treats a
        # mapping argument as {element: count} rather than as elements.
        unique_sequences = list(dict.fromkeys(windows))
        sequence_counter.update(unique_sequences)

    # Convert to DataFrame and sort by frequency
    sequence_df = pd.DataFrame(
        [(sequence, count) for sequence, count in sequence_counter.items()],
        columns=['Sequence', 'Frequency']
    ).sort_values(by='Frequency', ascending=False, kind='stable')
    return sequence_df

In [22]:
# 3-page sequences ranked by the number of journeys that contain them; preview the top five.
sequences = page_sequences(df, sequence_length=3)
sequences.head()

Unnamed: 0,Sequence,Frequency
1,"(Log in, Homepage, Log in)",220
59,"(Log in, Log in, Log in)",198
35,"(Homepage, Log in, Checkout)",192
0,"(Log in, Checkout, Checkout)",186
6,"(Log in, Log in, Checkout)",167


In [23]:
def journey_length(data, target_column='user_journey'):
    """Return the mean number of pages per journey.

    A journey's length is the number of '-'-separated page names it contains.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the journeys.
    target_column : str, default 'user_journey'
        Name of the column whose values are '-'-separated page names.

    Returns
    -------
    float or int
        The arithmetic mean of the journey lengths, or the integer 0 when the
        column holds no journeys at all.
    """
    total_pages = 0
    journey_total = 0
    for journey in data[target_column]:
        total_pages += len(journey.split('-'))
        journey_total += 1

    # Guard against dividing by zero on an empty column.
    if journey_total == 0:
        return 0
    return total_pages / journey_total

In [24]:
# Mean number of pages per journey over the whole of df.
average_journey_length = journey_length(df)
print(f"Average Journey Length: {average_journey_length}")

Average Journey Length: 13.134074074074075


In [25]:
# Run all metrics
page_counts = page_count(df)
page_presences = page_presence(df)
page_destinations = page_destination(df)
sequences = page_sequences(df, sequence_length=3)
average_journey_length = journey_length(df)

# Display results
print("Page Counts:\n", page_counts)
print("\nPage Presences:\n", page_presences)
print("\nPage Destinations:\n", page_destinations)
print("\nMost Common Sequences:\n", sequences)
print(f"\nAverage Journey Length: {average_journey_length}")

Page Counts:
                         Page  Count
1                     Log in   3876
0                   Homepage   2738
4                   Checkout   2064
2                      Other   1763
3                    Sign up   1303
9                    Courses   1187
7              Career tracks   1135
6                    Pricing   1094
5                     Coupon   1041
8           Resources center    580
10  Career track certificate    481
12        Course certificate    217
14          Upcoming courses    114
13           Success stories     53
11               Instructors     43
15                  About us     22
16                      Blog     20

Page Presences:
                         Page  Presence
0                   Homepage       843
5                   Checkout       821
4                     Log in       756
1                    Sign up       738
2                      Other       623
3                     Coupon       606
6                    Pricing       476
7       

In [26]:
from functions import group_by

# Answer Questions

Q1 : How many user journeys are there if you consider just the first three sessions?

In [27]:
# Group only the first three sessions
# NOTE(review): group_by comes from the local `functions` module, which is not
# shown here; `sessions=3, count_from='first'` presumably keeps each user's
# first three sessions -- confirm against its implementation.
grouped_data_first3 = group_by(df, sessions=3, count_from='first')

# Count the number of records
record_count = len(grouped_data_first3)
record_count

1350

Q2 : What is the 3rd most popular page for quarterly users? (Consider all sessions and pages.)

In [None]:
# TODO: Q2 is still unanswered -- no code has been written for this cell yet.

Q3 : What is the 4th most popular page after the user has been on Pricing? (Consider all plans, sessions, and pages.)

In [None]:
# Get page destinations (transition counts for every page -> next page pair in df)
page_destinations = page_destination(df)

# Filter for follow-ups after "Pricing" and sort by frequency
# (page_destination already orders by Frequency within each Page; the explicit
# sort keeps this cell correct even if that ordering changes.)
pricing_followups = page_destinations[page_destinations['Page'] == 'Pricing']
sorted_followups = pricing_followups.sort_values(by='Frequency', ascending=False)
# Positional lookup: raises IndexError if 'Pricing' has fewer than four distinct follow-up pages.
fourth_most_popular_after_pricing = sorted_followups.iloc[3]  # Index 3 gives the 4th most popular
fourth_most_popular_after_pricing

Page         Pricing
Next_Page    Courses
Frequency        112
Name: 46, dtype: object

Q4 : What is the average length of a user journey if you consider just the last three sessions?

In [32]:
# Group only the last three sessions
grouped_data_last3 = group_by(df, sessions=3, count_from='last')

# Calculate average journey length
average_length = journey_length(grouped_data_last3)
average_length

13.134074074074075

Q5 : What is the page with the 4th highest presence in the last three sessions of journeys (not the absolute number of page visits)?

In [33]:
# Group by last three sessions
grouped_data_last3 = group_by(df, sessions=3, count_from='last')

# Calculate page presence
page_presences = page_presence(grouped_data_last3)
sorted_presence = page_presences.sort_values(by='Presence', ascending=False)
fourth_highest_presence = sorted_presence.iloc[3]  # Index 3 gives the 4th highest
fourth_highest_presence

Page        Sign up
Presence        738
Name: 1, dtype: object

Q6 : In how many journeys is the most popular sequence of 4 pages encountered (last 3 sessions, all plans)?

In [34]:
# Group by last three sessions
grouped_data_last3 = group_by(df, sessions=3, count_from='last')

# Calculate 4-page sequences
sequences = page_sequences(grouped_data_last3, sequence_length=4)
most_popular_sequence = sequences.iloc[0]  # Index 0 gives the most popular sequence
print(most_popular_sequence)


Sequence     (Log in, Log in, Log in, Log in)
Frequency                                 137
Name: 67, dtype: object
