In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('user_journey_raw.csv')

In [None]:
df.head()

In [None]:
df['user_journey'][0].split('-')

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df['subscription_type'].unique()

In [None]:
df['user_journey'].nunique()

# Data preprocessing

### 1. Remove duplicates from the user_journey column

In [3]:
def remove_page_duplicates(datafr, target_column):
    new_journey_col = []
    for journey in datafr[target_column]:
        list_ = journey.split('-')
        no_duplicates = []
        for page in list_:
            if page not in no_duplicates:
                no_duplicates.append(page)
        new_journey_col.append(no_duplicates)

    # turn the list of lists (new_journey_col) into a series to be able to join the words back with '-'
    s = pd.Series(new_journey_col)
    new_journey = s.str.join('-')

    # create a copy of the original dataframe with an updated column
    data = datafr.copy(deep=True)
    data[target_column] = new_journey
    return data

In [4]:
data = remove_page_duplicates(df, 'user_journey')

In [5]:
data.head()

Unnamed: 0,user_id,session_id,subscription_type,user_journey
0,1516,2980231,Annual,Homepage-Log in-Other
1,1516,2980248,Annual,Other-Sign up-Log in
2,1516,2992252,Annual,Log in
3,1516,3070491,Annual,Homepage-Log in
4,1516,3709807,Annual,Log in


### 2. Group user_journey(s) by user_id

In [15]:
def group_by(datafr, group_by_col='user_id', target_col='user_journey', sessions='all', count_from='last'):

    # Create a new dataframe to populate with the results
    df = pd.DataFrame(columns = datafr.columns)

    # Set the start and end of the sessions to be aggreagated. i.e. if sessions is an int and count_from='first', the function should aggregate the first int sessions
    if sessions == 'all':
        start = 0
        end = None
    elif sessions == 'all_except_last':
        start = 0
        end = -1
    elif count_from == 'last':
        start = - sessions
        end = None
    elif count_from == 'first':
        start = 0
        end = sessions


    # get unique values for group_by_col
    groups = set(datafr[group_by_col])

    for group in groups:
        # perform a boolean mask to return a list of all the instances in the dataframe that are True
        group_mask = datafr[group_by_col] == group
        # filter all the rows that are True. This returns all the sessions with the corresponding user_id
        group_table = datafr[group_mask]
        # aggregate all the desirable 'user_journey' rows. This returns a string
        user_journey = '-'.join(list(group_table[target_col])[start:end])
        # At this point we(group_table) still have the repeat ids beside each journey. So we only need one of each
        new_user_journey_series = group_table.iloc[0].copy()
        # replace the user_journey with the new value
        new_user_journey_series[target_col] = user_journey
        # Append a row to the new DataFrame corresponding to a single (aggregated) group
        new_index = len(df)
        df.loc[new_index] = group_table.iloc[0].copy()
        df.loc[new_index][target_col] = user_journey

        # Reset the index (precautionary) and sort the new DataFrame by user_id
        df.sort_values(by=[group_by_col], ignore_index = True, inplace = True)
        df.reset_index(drop = True, inplace = True)

    return df

In [14]:
# group_by(data)

In [None]:
# def inspect(x):
#     return print(type(x))


In [None]:
# data.groupby(['user_id'])[['user_journey']].transform(lambda x: inspect(x))

In [16]:
# data['user_journey'] = data.groupby(['user_id'])['user_journey'].transform(lambda x: "-".join(x.str.strip()))
# data = data.drop_duplicates(subset="user_id")
clean_data = group_by(data, sessions=3)
clean_data.shape

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df.loc[new_index][target_col] = user_journey
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[new_index][

(1350, 4)

In [17]:
clean_data.isnull().sum()

user_id              0
session_id           0
subscription_type    0
user_journey         0
dtype: int64

In [None]:
#  save clean_data as a csv for analysis in next step
clean_data.to_csv('clean_data.csv')

In [None]:
''' find out why groupby(['user_id'])[['user_journey']] was different -- the [[]] makes the result of the groupby
a dataframe, this then doesn't work on transform, as transform only works on one Series at a time. In our case, it
concatenates all the journeys grouoped by user. If we wanted to subtract 2 columns grouped by user, we'd have to use apply
'''
# find out why x.str.join('') was different -- str.join is a method for Series, join only works on iterables, not dataframes

### 3. Remove all unnecessary pages (data scientist's choice)

In [None]:
def remove_pages(data, pages:list, target_col='user_journey'):
    kept_pages_col = []
    for list_ in data[target_col]:
        lst = list_.split('-')
        kept_pages = [page if page not in pages else "" for page in lst]
        kept_pages_col.append(kept_pages)

    # replace empty list with '' to not get NaNs

    # turn the list of lists (new_journey_col) into a series to be able to join the words back with '-'
    s = pd.Series(kept_pages_col)
    new_pages = s.str.join('-')

    # create a copy of the original dataframe with an updated column
    stripped_pages_data = data.copy(deep=True)
    stripped_pages_data[target_col] = new_pages
    return stripped_pages_data

In [None]:
remove_pages(clean_data, ['Log in']).isnull().sum()