In [58]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('user_journey_raw.csv')

In [4]:
df.head()

Unnamed: 0,user_id,session_id,subscription_type,user_journey
0,1516,2980231,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
1,1516,2980248,Annual,Other-Sign up-Sign up-Sign up-Sign up-Sign up-...
2,1516,2992252,Annual,Log in-Log in-Log in-Log in-Log in-Log in
3,1516,3070491,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
4,1516,3709807,Annual,Log in-Log in-Log in-Log in-Log in-Log in-Log ...


In [22]:
df['user_journey'][0].split('-')

['Homepage',
 'Log in',
 'Log in',
 'Log in',
 'Log in',
 'Log in',
 'Log in',
 'Log in',
 'Log in',
 'Log in',
 'Log in',
 'Log in',
 'Log in',
 'Log in',
 'Log in',
 'Log in',
 'Log in',
 'Other']

In [6]:
df.shape

(9935, 4)

In [12]:
df.isnull().sum()

user_id              0
session_id           0
subscription_type    0
user_journey         0
dtype: int64

In [17]:
df['subscription_type'].unique()

array(['Annual', 'Monthly', 'Quarterly'], dtype=object)

In [19]:
df['user_journey'].nunique()

1841

# Data preprocessing

### 1. Remove duplicates from the user_journey column

In [111]:
def remove_page_duplicates(datafr, target_column):
    new_journey_col = []
    for journey in datafr[target_column]:
        list_ = journey.split('-')
        no_duplicates = []
        for page in list_:
            if page not in no_duplicates:
                no_duplicates.append(page)
        new_journey_col.append(no_duplicates)

    # turn the list of lists (new_journey_col) into a series to be able to join the words back with '-'
    s = pd.Series(new_journey_col)
    new_journey = s.str.join('-')

    # create a copy of the original dataframe with an updated column
    data = datafr.copy(deep=True)
    data[target_column] = new_journey
    return data

In [164]:
data = remove_page_duplicates(df, 'user_journey')

In [165]:
data.head()

Unnamed: 0,user_id,session_id,subscription_type,user_journey
0,1516,2980231,Annual,Homepage-Log in-Other
1,1516,2980248,Annual,Other-Sign up-Log in
2,1516,2992252,Annual,Log in
3,1516,3070491,Annual,Homepage-Log in
4,1516,3709807,Annual,Log in


### 2. Group user_journey(s) by user_id

In [174]:
def group_by(datafr, group_by_col='user_id', target_col='user_journey', sessions='all', count_from='last'):
    datafr[target_col] = datafr.groupby([group_by_col])[target_col].transform(lambda x: "-".join(x.str.strip()))
    data_cl = datafr.copy(deep=True)
    data_cl = data_cl.drop_duplicates(subset="user_id")
    return data_cl


In [175]:
# def inspect(x):
#     return print(type(x))


In [176]:
# data.groupby(['user_id'])[['user_journey']].transform(lambda x: inspect(x))

In [177]:
# data['user_journey'] = data.groupby(['user_id'])['user_journey'].transform(lambda x: "-".join(x.str.strip()))
# data = data.drop_duplicates(subset="user_id")
clean_data = group_by(data)
clean_data.head()

Unnamed: 0,user_id,session_id,subscription_type,user_journey
0,1516,2980231,Annual,Homepage-Log in-Other-Other-Sign up-Log in-Log...
13,3395,1415870,Annual,Other-Pricing-Sign up-Log in-Homepage-Pricing-...
18,10107,360608,Annual,Homepage-Homepage-Career tracks-Sign up-Log in...
34,11145,501166,Monthly,Homepage-Log in-Homepage-Log in-Homepage-Log i...
45,12400,3981254,Monthly,Homepage-Career tracks-Sign up-Log in-Other-Ca...


In [182]:
clean_data.isnull().sum()

user_id              0
session_id           0
subscription_type    0
user_journey         0
dtype: int64

In [178]:
#  save clean_data as a csv for analysis in next step
clean_data.to_csv('clean_data.csv')

In [145]:
''' find out why groupby(['user_id'])[['user_journey']] was different -- the [[]] makes the result of the groupby
a dataframe, this then doesn't work on transform, as transform only works on one Series at a time. In our case, it
concatenates all the journeys grouoped by user. If we wanted to subtract 2 columns grouped by user, we'd have to use apply
'''
# find out why x.str.join('') was different -- str.join is a method for Series, join only works on iterables, not dataframes

" find out why groupby(['user_id'])[['user_journey']] was different -- the [[]] makes the result of the groupby\na dataframe, this then doesn't work on transform, as transform only works on one Series at a time. In our case, it\nconcatenates all the journeys grouoped by user. If we wanted to subtract 2 columns grouped by user, we'd have to use apply\n"

### 3. Remove all unnecessary pages (data scientist's choice)

In [202]:
def remove_pages(data, pages:list, target_col='user_journey'):
    kept_pages_col = []
    for list_ in data[target_col]:
        lst = list_.split('-')
        kept_pages = [page if page not in pages else "" for page in lst]
        kept_pages_col.append(kept_pages)

    # replace empty list with '' to not get NaNs
    
    # turn the list of lists (new_journey_col) into a series to be able to join the words back with '-'
    s = pd.Series(kept_pages_col)
    new_pages = s.str.join('-')

    # create a copy of the original dataframe with an updated column
    stripped_pages_data = data.copy(deep=True)
    stripped_pages_data[target_col] = new_pages
    return stripped_pages_data

In [203]:
remove_pages(clean_data, ['Log in']).isnull().sum()

user_id                 0
session_id              0
subscription_type       0
user_journey         1204
dtype: int64