In [204]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import sys
import sqlite3
from collections import Counter, OrderedDict
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
cwd = os.getcwd()
PROJECT_PATH = cwd.split("quince_assignment")[0] + 'quince_assignment'
sys.path.insert(0, PROJECT_PATH)

In [3]:
event_data = pd.read_csv(
    f"{PROJECT_PATH}/src/data/event_202012182112.csv.gz", compression="gzip")

In [4]:
event_data['event_time'] = pd.to_datetime(event_data['event_time'],
                                          format="%Y-%m-%d %H:%M:%S")

In [268]:
event_data.head(3)

Unnamed: 0,event_id,event_type,session_id,event_time,lb_user_id,email,product_id,product_category,product_type,order_id,collection_name,ip,device,url,device_category,user_agent,user_gender,user_state,traffic_source
0,1,collection_view,_3yqjlmvar,2020-09-15,_gubus5rdy,,,,,,Best Sellers,66.65.111.174,mobile,https://www.onequince.com/women/best-sellers,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like M...,,,
1,2,product_view,,2020-09-15,,,,,,,,66.249.79.246,mobile,https://54.183.41.207/products/100-organic-lin...,Android,Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Bu...,,,
2,2,product_view,_khkq9os37,2020-09-15,_um8huipsf,,2070045000000.0,Accessories,Accessories,,,66.249.79.228,desktop,https://www.onequince.com/women/leather/tall-i...,Unknown,"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Ge...",,,


## Overall stats

In [5]:
print(f"Event data has {event_data.shape[0]} rows and {event_data.shape[1]} columns")

Event data has 3638376 rows and 19 columns


In [6]:
min_date = event_data['event_time'].min().strftime('%d%b%Y')
max_date = event_data['event_time'].max().strftime('%d%b%Y')
print(f"Event data is from {min_date} to {max_date}")

Event data is from 15Sep2020 to 01Dec2020


In [198]:
event_data['event_type'].value_counts().cumsum()

product_view                              1355986
collection_view                           2665640
email_subscription_view                   3211952
add_to_cart                               3406773
checkout_page_view_contact_information    3480792
checkout_page_view_processing             3514666
order_complete                            3547623
checkout_page_view_payment_method         3579495
checkout_page_view_shipping_method        3609950
checkout_page_view_stock_problems         3613050
checkout_page_view_forward                3613763
checkout_page_view_review                 3613956
Name: event_type, dtype: int64

In [193]:
event_data.head()

Unnamed: 0,event_id,event_type,session_id,event_time,lb_user_id,email,product_id,product_category,product_type,order_id,collection_name,ip,device,url,device_category,user_agent,user_gender,user_state,traffic_source
0,1,collection_view,_3yqjlmvar,2020-09-15 00:00:00,_gubus5rdy,,,,,,Best Sellers,66.65.111.174,mobile,https://www.onequince.com/women/best-sellers,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like M...,,,
1,2,product_view,,2020-09-15 00:00:00,,,,,,,,66.249.79.246,mobile,https://54.183.41.207/products/100-organic-lin...,Android,Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Bu...,,,
2,2,product_view,_khkq9os37,2020-09-15 00:00:00,_um8huipsf,,2070045000000.0,Accessories,Accessories,,,66.249.79.228,desktop,https://www.onequince.com/women/leather/tall-i...,Unknown,"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Ge...",,,
3,1,collection_view,_hze69q9et,2020-09-15 00:00:03,_ol42z2v2t,dd71fec29676f26fb0b5ceace4f73879,,,,,Silk,107.77.214.106,mobile,https://www.onequince.com/collections/silk,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like M...,Female,California,
4,1,collection_view,_g263lpdmo,2020-09-15 00:00:06,_pbkmff3ji,305696f95c4ac02cd70379c2fba1babf,,,,,Tees & Tanks,69.115.146.167,mobile,https://www.onequince.com/women/tees,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_6_1 like...,Female,New York,


## Column wise analysis

In [7]:
unique_items = event_data.nunique().to_dict()

In [18]:
event_data = event_data.assign(event_id=event_data['event_type']) 
event_data.replace({'event_id': event_type_order}, inplace=True)

In [11]:
for col in event_data.columns:
    print("="*50)
    print(f"Analysis for {col}")
    print(f"{col} has {unique_items[col]} unique items - {np.round(unique_items[col]/event_data.shape[0]*100, 2)}%")
    if unique_items[col] <= 20:
        print("Printing the frequency of categories")
        print(event_data[col].value_counts(normalize=True).cumsum())
    else:
        print("Printing sample values")
        print(list(event_data[col][0:5]))
    print("="*50)

Analysis for event_id
event_id has 3638376 unique items - 100.0%
Printing sample values
['1', '2', '2', '1', '1']
Analysis for event_type
event_type has 12 unique items - 0.0%
Printing the frequency of categories
product_view                              0.375208
collection_view                           0.737596
email_subscription_view                   0.888763
add_to_cart                               0.942671
checkout_page_view_contact_information    0.963153
checkout_page_view_processing             0.972526
order_complete                            0.981645
checkout_page_view_payment_method         0.990464
checkout_page_view_shipping_method        0.998892
checkout_page_view_stock_problems         0.999749
checkout_page_view_forward                0.999947
checkout_page_view_review                 1.000000
Name: event_type, dtype: float64
Analysis for session_id
session_id has 817575 unique items - 22.47%
Printing sample values
['_3yqjlmvar', nan, '_khkq9os37', '_hze69q9et', '_g

## Session level data

In [25]:
aggregations = {'event_time': ['max', 'min'],
                'event_id': [lambda x: '->'.join(x), 'count']}
session_data = event_data.groupby(['session_id', 'lb_user_id', 'device',
                                   'device_category', 'user_gender', 'user_state']).agg(aggregations)

In [33]:
session_data.reset_index(inplace=True)
session_data.columns = ['session_id', 'lb_user_id', 'device', 'device_category', 'user_gender',
                        'user_state', 'session_end_time', 'session_start_time', 'event_sequence', 'no_of_events']

In [82]:
session_data = session_data.assign(session_duration_sec=(
    session_data['session_end_time']-session_data['session_start_time']).dt.total_seconds())
session_data = session_data.assign(made_purchase=np.where(session_data['event_sequence'].str.contains("6"), "Yes", "No"))
session_data = session_data.assign(visit_count=session_data.groupby('lb_user_id')['session_end_time'].rank(method='min'))
session_data = session_data.assign(bought_on_first_visit=np.where((session_data['visit_count']==1)&(session_data['made_purchase']=='Yes'), 'Yes', 'No'))

In [84]:
session_data[session_data['visit_count']==1]['bought_on_first_visit'].value_counts()

Yes    116872
No      30475
Name: bought_on_first_visit, dtype: int64

In [35]:
print(f"Number of sessions happened is {session_data.shape[0]}")

Number of sessions happened is 256827


In [39]:
session_data['device_category'].value_counts(normalize=True).cumsum()

iPhone           0.569333
macOS            0.767789
Android          0.865622
Windows          0.950628
iPad             0.994599
Chrome OS        0.999537
Generic Linux    0.999961
iPod             0.999981
BlackBerry       0.999992
Unknown          1.000000
Name: device_category, dtype: float64

In [40]:
session_data['user_gender'].value_counts(normalize=True).cumsum()

Female    0.912587
Male      1.000000
Name: user_gender, dtype: float64

In [71]:
session_data['no_of_events'].value_counts(normalize=True).cumsum()

1      0.227402
2      0.448391
3      0.558785
4      0.628914
5      0.680392
6      0.720734
7      0.752417
8      0.781339
9      0.807372
10     0.830189
11     0.849661
12     0.865886
13     0.879834
14     0.892028
15     0.902802
16     0.912155
17     0.920300
18     0.927994
19     0.934851
20     0.940894
21     0.946240
22     0.950955
23     0.955196
24     0.959019
25     0.962531
26     0.965747
28     0.968446
27     0.971117
29     0.973605
30     0.975700
         ...   
123    0.999883
120    0.999891
146    0.999895
220    0.999899
125    0.999903
130    0.999907
122    0.999910
247    0.999914
132    0.999918
117    0.999922
116    0.999926
115    0.999930
112    0.999934
223    0.999938
95     0.999942
337    0.999945
148    0.999949
200    0.999953
319    0.999957
126    0.999961
138    0.999965
179    0.999969
177    0.999973
172    0.999977
171    0.999981
168    0.999984
162    0.999988
153    0.999992
143    0.999996
182    1.000000
Name: no_of_events, Leng

In [64]:
session_data[['session_duration_sec']].describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).transpose()

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
session_duration_sec,256827.0,3940.43342,95419.480645,0.0,0.0,0.0,0.0,6.0,53.0,337.0,1431.0,3702.0,11207.48,6144060.0


In [69]:
session_data['made_purchase'].value_counts(normalize=True).cumsum()

False    0.507205
True     1.000000
Name: made_purchase, dtype: float64

In [None]:
event_type_order = {'checkout_page_view_contact_information': '4a', 'email_subscription_view': '6', 
                    'checkout_page_view_payment_method': '4c', 'checkout_page_view_stock_problems': '4e', 
                    'checkout_page_view_review': '4f', 'checkout_page_view_processing': '4d', 'add_to_cart': '3', 
                    'collection_view': '1', 'checkout_page_view_forward': '4g', 'product_view': '2', 
                    'checkout_page_view_shipping_method': '4b', 'order_complete': '5', np.NaN: '7'
                   }

## User wise aggregation

In [125]:
user_level_aggregation = {
    'session_duration_sec': 'mean', 'no_of_events' : 'mean', 'session_duration_sec' : 'mean', 'visit_count': 'count',
    'made_purchase' : lambda x: (x=="Yes").sum(), 'bought_on_first_visit': lambda x: (x=="Yes").sum(),
    'device': lambda x: set(x), 'device_category': lambda x: set(x), 'user_state': lambda x: set(x)
}

user_data = session_data.groupby(['lb_user_id', 'user_gender']).agg(user_level_aggregation)

In [153]:
user_data.reset_index(inplace=True)
user_data = user_data.iloc[:, -10:]
user_data.columns = ['lb_user_id', 'user_gender', 'avg_session_duration', 'avg_no_of_events', 'times_visited', 
                     'times_purchased', 'bought_on_first_visit', 'devices_used', 'device_categories_used', 'states_used_from']

In [155]:
user_data['bought_on_first_visit'].value_counts()

1    116872
0     31945
Name: bought_on_first_visit, dtype: int64

In [161]:
event_data['event_type'].value_counts()

product_view                              1355986
collection_view                           1309654
email_subscription_view                    546312
add_to_cart                                194821
checkout_page_view_contact_information      74019
checkout_page_view_processing               33874
order_complete                              32957
checkout_page_view_payment_method           31872
checkout_page_view_shipping_method          30455
checkout_page_view_stock_problems            3100
checkout_page_view_forward                    713
checkout_page_view_review                     193
Name: event_type, dtype: int64

In [244]:
session_data

Unnamed: 0,session_id,lb_user_id,device,device_category,user_gender,user_state,session_end_time,session_start_time,event_sequence,no_of_events,session_duration_sec,made_purchase,visit_count,bought_on_first_visit
0,_00022uznd,_oqprf6nav,desktop,Windows,Male,New York,2020-11-23 02:59:50,2020-11-23 02:59:03,1->1->2,3,47.0,No,10.0,No
1,_0004g0yuv,_m8i0dtjj6,mobile,iPhone,Female,Maryland,2020-11-11 12:25:05,2020-11-11 12:24:56,2->6,2,9.0,Yes,1.0,Yes
2,_000dfqvrj,_kn3otbq1a,mobile,iPhone,Female,Massachusetts,2020-10-04 23:32:01,2020-10-04 23:31:39,2->1,2,22.0,No,1.0,No
3,_000frx5lw,_u6ia8ze08,mobile,iPhone,Female,Virginia,2020-09-20 14:35:27,2020-09-20 13:07:20,1->6->1->2->3->2->3->2,8,5287.0,Yes,1.0,Yes
4,_000q43w78,_an7s8iurw,mobile,iPhone,Female,New York,2020-09-29 13:22:18,2020-09-29 13:22:13,2->6,2,5.0,Yes,1.0,Yes
5,_000v8ce6r,_kyfaw2l2b,desktop,macOS,Female,Oregon,2020-11-10 20:43:20,2020-11-10 20:41:02,1->2->2->1->1,5,138.0,No,3.0,No
6,_0011us1s6,_s9niq0pkc,mobile,iPhone,Female,Minnesota,2020-11-22 01:37:15,2020-11-22 01:32:45,6->1->2->1->2->1->2->1->2->1->1->2->1,13,270.0,Yes,1.0,Yes
7,_00149c4po,_xxsl3hx8n,mobile,iPhone,Female,Florida,2020-11-25 03:11:41,2020-11-25 03:07:36,1->2->6->1->2->1->2->1->2->1->2->2,12,245.0,Yes,1.0,Yes
8,_001c6nx95,_jz6zpaexz,desktop,macOS,Female,California,2020-10-08 05:15:52,2020-10-08 05:06:45,4a->2->6->3->1->2->1->2->1->1,10,547.0,Yes,1.0,Yes
9,_001m2vjto,_dfbf4z0fu,mobile,iPhone,Male,Illinois,2020-10-09 11:19:11,2020-10-09 11:19:11,2,1,0.0,No,2.0,No


In [189]:
consequent_events = []
event_sequences = session_data['event_sequence'].values
for event_sequence in event_sequences:
    event_sequence = event_sequence.split('->')
    event_sequence.append('e')
    event_sequence.insert(0, 's')
    consequent_events.extend(list(zip(event_sequence, event_sequence[1:])))

In [242]:
unique_events = []
event_sequences = session_data['event_sequence'].values
for event_sequence in event_sequences:
    event_sequence = event_sequence.split('->')
    event_set_list = list(set(event_sequence))
    unique_events.extend(event_set_list)

In [208]:
inv_event_type_order = {v: k for k, v in event_type_order.items()}

In [264]:
event_data['email'].isna().sum()

1898148

In [265]:
event_data.shape[0]

3638376

In [266]:
event_data

Unnamed: 0,event_id,event_type,session_id,event_time,lb_user_id,email,product_id,product_category,product_type,order_id,collection_name,ip,device,url,device_category,user_agent,user_gender,user_state,traffic_source
0,1,collection_view,_3yqjlmvar,2020-09-15 00:00:00,_gubus5rdy,,,,,,Best Sellers,66.65.111.174,mobile,https://www.onequince.com/women/best-sellers,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like M...,,,
1,2,product_view,,2020-09-15 00:00:00,,,,,,,,66.249.79.246,mobile,https://54.183.41.207/products/100-organic-lin...,Android,Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Bu...,,,
2,2,product_view,_khkq9os37,2020-09-15 00:00:00,_um8huipsf,,2.070045e+12,Accessories,Accessories,,,66.249.79.228,desktop,https://www.onequince.com/women/leather/tall-i...,Unknown,"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Ge...",,,
3,1,collection_view,_hze69q9et,2020-09-15 00:00:03,_ol42z2v2t,dd71fec29676f26fb0b5ceace4f73879,,,,,Silk,107.77.214.106,mobile,https://www.onequince.com/collections/silk,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like M...,Female,California,
4,1,collection_view,_g263lpdmo,2020-09-15 00:00:06,_pbkmff3ji,305696f95c4ac02cd70379c2fba1babf,,,,,Tees & Tanks,69.115.146.167,mobile,https://www.onequince.com/women/tees,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_6_1 like...,Female,New York,
5,2,product_view,_13ftwo5x7,2020-09-15 00:00:07,_tv5zz7zxn,,2.067474e+12,Apparel,Sweatshirts & Sweatpants,,,64.151.14.190,mobile,https://www.onequince.com/men/cashmere/mongoli...,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like M...,,,
6,6,email_subscription_view,_13ftwo5x7,2020-09-15 00:00:12,_tv5zz7zxn,,,,,,,64.151.14.190,mobile,https://www.onequince.com/men/cashmere/mongoli...,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like M...,,,
7,6,email_subscription_view,_g263lpdmo,2020-09-15 00:00:17,_pbkmff3ji,305696f95c4ac02cd70379c2fba1babf,,,,,,69.115.146.167,mobile,https://www.onequince.com/women/tees,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_6_1 like...,Female,New York,
8,2,product_view,_tbr84l90y,2020-09-15 00:00:24,_7ysphgq06,,4.351683e+12,Home,default,,,76.93.163.113,mobile,https://www.onequince.com/home/sheets/organic-...,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like M...,,,
9,6,email_subscription_view,_tbr84l90y,2020-09-15 00:00:31,_7ysphgq06,,,,,,,76.93.163.113,mobile,https://www.onequince.com/home/sheets/organic-...,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like M...,,,


In [261]:
session_data['lb_user_id'].isna().sum()

0

In [243]:
keys = ['1', '2', '3' ,'4a', '4b', '4c', '4d', '4e', '4f', '4g', '5', '6', '7']

user_funnel = Counter(unique_events)

print(f"No of sessions is {session_data.shape[0]}")

for key in keys:
    desc = inv_event_type_order[key]
    print(f"Count for {desc} is {user_funnel[key]} - {np.round(user_funnel[key]/session_data.shape[0]*100,2)}%")

No of sessions is 256827
Count for collection_view is 158158 - 61.58%
Count for product_view is 211270 - 82.26%
Count for add_to_cart is 58322 - 22.71%
Count for checkout_page_view_contact_information is 39648 - 15.44%
Count for checkout_page_view_shipping_method is 23263 - 9.06%
Count for checkout_page_view_payment_method is 23302 - 9.07%
Count for checkout_page_view_processing is 21357 - 8.32%
Count for checkout_page_view_stock_problems is 2074 - 0.81%
Count for checkout_page_view_review is 148 - 0.06%
Count for checkout_page_view_forward is 558 - 0.22%
Count for order_complete is 29546 - 11.5%
Count for email_subscription_view is 126563 - 49.28%
Count for nan is 10254 - 3.99%


In [224]:
unique_events = []

event_sequences = session_data[session_data['device']=='desktop']['event_sequence'].values
for event_sequence in event_sequences:
    event_sequence = event_sequence.split('->')
    event_set_list = list(set(event_sequence))
    unique_events.extend(event_set_list)
    
keys = ['1', '2', '3' ,'4a', '4b', '4c', '4d', '4e', '4f', '4g', '5', '6', '7']

user_funnel = Counter(unique_events)

den = session_data[session_data['device']=='desktop'].shape[0]

print(f"No of sessions is {den}")
for key in keys:
    desc = inv_event_type_order[key]
    print(f"Count for {desc} is {user_funnel[key]} - {np.round(user_funnel[key]/den*100,2)}%")

No of sessions is 75106
Count for collection_view is 54339 - 72.35%
Count for product_view is 61511 - 81.9%
Count for add_to_cart is 22238 - 29.61%
Count for checkout_page_view_contact_information is 15653 - 20.84%
Count for checkout_page_view_shipping_method is 10273 - 13.68%
Count for checkout_page_view_payment_method is 10255 - 13.65%
Count for checkout_page_view_processing is 9534 - 12.69%
Count for checkout_page_view_stock_problems is 944 - 1.26%
Count for checkout_page_view_review is 52 - 0.07%
Count for checkout_page_view_forward is 286 - 0.38%
Count for order_complete is 12470 - 16.6%
Count for email_subscription_view is 31608 - 42.08%
Count for nan is 3641 - 4.85%


In [255]:
event_data[event_data['device_category']=='iPhone'].reset_index()['user_agent'][0]

'Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Instagram 158.1.0.29.120 (iPhone9,2; iOS 13_7; en_US; en-US; scale=2.88; 1080x1920; 244229785)'

In [256]:
event_data[event_data['device_category']=='iPhone'].reset_index()['user_agent'][1]

'Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Mobile/15E148 Safari/604.1'

In [257]:
event_data[event_data['device_category']=='iPhone'].reset_index()['user_agent'][2]

'Mozilla/5.0 (iPhone; CPU iPhone OS 13_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Mobile/15E148 Safari/604.1'

In [258]:
event_data[event_data['device_category']=='iPhone'].reset_index()['user_agent'][3]

'Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Instagram 158.1.0.29.120 (iPhone9,3; iOS 13_7; en_US; en-US; scale=2.00; 750x1334; 244229785)'

In [234]:
event_data

Unnamed: 0,event_id,event_type,session_id,event_time,lb_user_id,email,product_id,product_category,product_type,order_id,collection_name,ip,device,url,device_category,user_agent,user_gender,user_state,traffic_source
0,1,collection_view,_3yqjlmvar,2020-09-15 00:00:00,_gubus5rdy,,,,,,Best Sellers,66.65.111.174,mobile,https://www.onequince.com/women/best-sellers,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like M...,,,
1,2,product_view,,2020-09-15 00:00:00,,,,,,,,66.249.79.246,mobile,https://54.183.41.207/products/100-organic-lin...,Android,Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Bu...,,,
2,2,product_view,_khkq9os37,2020-09-15 00:00:00,_um8huipsf,,2.070045e+12,Accessories,Accessories,,,66.249.79.228,desktop,https://www.onequince.com/women/leather/tall-i...,Unknown,"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Ge...",,,
3,1,collection_view,_hze69q9et,2020-09-15 00:00:03,_ol42z2v2t,dd71fec29676f26fb0b5ceace4f73879,,,,,Silk,107.77.214.106,mobile,https://www.onequince.com/collections/silk,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like M...,Female,California,
4,1,collection_view,_g263lpdmo,2020-09-15 00:00:06,_pbkmff3ji,305696f95c4ac02cd70379c2fba1babf,,,,,Tees & Tanks,69.115.146.167,mobile,https://www.onequince.com/women/tees,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_6_1 like...,Female,New York,
5,2,product_view,_13ftwo5x7,2020-09-15 00:00:07,_tv5zz7zxn,,2.067474e+12,Apparel,Sweatshirts & Sweatpants,,,64.151.14.190,mobile,https://www.onequince.com/men/cashmere/mongoli...,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like M...,,,
6,6,email_subscription_view,_13ftwo5x7,2020-09-15 00:00:12,_tv5zz7zxn,,,,,,,64.151.14.190,mobile,https://www.onequince.com/men/cashmere/mongoli...,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like M...,,,
7,6,email_subscription_view,_g263lpdmo,2020-09-15 00:00:17,_pbkmff3ji,305696f95c4ac02cd70379c2fba1babf,,,,,,69.115.146.167,mobile,https://www.onequince.com/women/tees,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_6_1 like...,Female,New York,
8,2,product_view,_tbr84l90y,2020-09-15 00:00:24,_7ysphgq06,,4.351683e+12,Home,default,,,76.93.163.113,mobile,https://www.onequince.com/home/sheets/organic-...,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like M...,,,
9,6,email_subscription_view,_tbr84l90y,2020-09-15 00:00:31,_7ysphgq06,,,,,,,76.93.163.113,mobile,https://www.onequince.com/home/sheets/organic-...,iPhone,Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like M...,,,


In [225]:
unique_events = []

event_sequences = session_data[session_data['device']=='mobile']['event_sequence'].values
for event_sequence in event_sequences:
    event_sequence = event_sequence.split('->')
    event_set_list = list(set(event_sequence))
    unique_events.extend(event_set_list)
    
keys = ['1', '2', '3' ,'4a', '4b', '4c', '4d', '4e', '4f', '4g', '5', '6', '7']

user_funnel = Counter(unique_events)

den = session_data[session_data['device']=='mobile'].shape[0]

print(f"No of sessions is {den}")

for key in keys:
    desc = inv_event_type_order[key]
    print(f"Count for {desc} is {user_funnel[key]} - {np.round(user_funnel[key]/den*100,2)}%")

No of sessions is 171795
Count for collection_view is 98547 - 57.36%
Count for product_view is 142071 - 82.7%
Count for add_to_cart is 34270 - 19.95%
Count for checkout_page_view_contact_information is 22853 - 13.3%
Count for checkout_page_view_shipping_method is 12256 - 7.13%
Count for checkout_page_view_payment_method is 12316 - 7.17%
Count for checkout_page_view_processing is 11148 - 6.49%
Count for checkout_page_view_stock_problems is 1078 - 0.63%
Count for checkout_page_view_review is 93 - 0.05%
Count for checkout_page_view_forward is 266 - 0.15%
Count for order_complete is 16201 - 9.43%
Count for email_subscription_view is 90493 - 52.67%
Count for nan is 6293 - 3.66%


In [226]:
unique_events = []

event_sequences = session_data[session_data['device']=='tablet']['event_sequence'].values
for event_sequence in event_sequences:
    event_sequence = event_sequence.split('->')
    event_set_list = list(set(event_sequence))
    unique_events.extend(event_set_list)
    
keys = ['1', '2', '3' ,'4a', '4b', '4c', '4d', '4e', '4f', '4g', '5', '6', '7']

user_funnel = Counter(unique_events)

den = session_data[session_data['device']=='tablet'].shape[0]

print(f"No of sessions is {den}")

for key in keys:
    desc = inv_event_type_order[key]
    print(f"Count for {desc} is {user_funnel[key]} - {np.round(user_funnel[key]/den*100,2)}%")

No of sessions is 9926
Count for collection_view is 5272 - 53.11%
Count for product_view is 7688 - 77.45%
Count for add_to_cart is 1814 - 18.28%
Count for checkout_page_view_contact_information is 1142 - 11.51%
Count for checkout_page_view_shipping_method is 734 - 7.39%
Count for checkout_page_view_payment_method is 731 - 7.36%
Count for checkout_page_view_processing is 675 - 6.8%
Count for checkout_page_view_stock_problems is 52 - 0.52%
Count for checkout_page_view_review is 3 - 0.03%
Count for checkout_page_view_forward is 6 - 0.06%
Count for order_complete is 875 - 8.82%
Count for email_subscription_view is 4462 - 44.95%
Count for nan is 320 - 3.22%


In [233]:
unique_events = []

event_sequences = session_data[session_data['event_sequence'].str.startswith('2')]['event_sequence'].values
for event_sequence in event_sequences:
    event_sequence = event_sequence.split('->')
    event_set_list = list(set(event_sequence))
    unique_events.extend(event_set_list)
    
keys = ['1', '2', '3' ,'4a', '4b', '4c', '4d', '4e', '4f', '4g', '5', '6', '7']

user_funnel = Counter(unique_events)

den = session_data[session_data['event_sequence'].str.startswith('2')].shape[0]

print(f"No of sessions is {den}")

for key in keys:
    desc = inv_event_type_order[key]
    print(f"Count for {desc} is {user_funnel[key]} - {np.round(user_funnel[key]/den*100,2)}%")

No of sessions is 125947
Count for collection_view is 39227 - 31.15%
Count for product_view is 125947 - 100.0%
Count for add_to_cart is 26312 - 20.89%
Count for checkout_page_view_contact_information is 15984 - 12.69%
Count for checkout_page_view_shipping_method is 9652 - 7.66%
Count for checkout_page_view_payment_method is 9678 - 7.68%
Count for checkout_page_view_processing is 8934 - 7.09%
Count for checkout_page_view_stock_problems is 540 - 0.43%
Count for checkout_page_view_review is 52 - 0.04%
Count for checkout_page_view_forward is 205 - 0.16%
Count for order_complete is 12219 - 9.7%
Count for email_subscription_view is 59204 - 47.01%
Count for nan is 3240 - 2.57%


In [240]:
unique_events = []

event_sequences = session_data[session_data['event_sequence'].str.contains('4a')]['event_sequence'].values
for event_sequence in event_sequences:
    event_sequence = event_sequence.split('->')
    event_set_list = list(set(event_sequence))
    unique_events.extend(event_set_list)
    
keys = ['1', '2', '3' ,'4a', '4b', '4c', '4d', '4e', '4f', '4g', '5', '6', '7']

user_funnel = Counter(unique_events)

den = session_data[session_data['event_sequence'].str.contains('4a')].shape[0]

print(f"No of sessions is {den}")

for key in keys:
    desc = inv_event_type_order[key]
    print(f"Count for {desc} is {user_funnel[key]} - {np.round(user_funnel[key]/den*100,2)}%")

No of sessions is 39648
Count for collection_view is 28569 - 72.06%
Count for product_view is 35637 - 89.88%
Count for add_to_cart is 33618 - 84.79%
Count for checkout_page_view_contact_information is 39648 - 100.0%
Count for checkout_page_view_shipping_method is 21788 - 54.95%
Count for checkout_page_view_payment_method is 21705 - 54.74%
Count for checkout_page_view_processing is 20173 - 50.88%
Count for checkout_page_view_stock_problems is 993 - 2.5%
Count for checkout_page_view_review is 133 - 0.34%
Count for checkout_page_view_forward is 501 - 1.26%
Count for order_complete is 27368 - 69.03%
Count for email_subscription_view is 21222 - 53.53%
Count for nan is 3713 - 9.36%


In [215]:
29546/256827

0.11504242155225113

In [210]:
event_data['event_type'].value_counts()

product_view                              1355986
collection_view                           1309654
email_subscription_view                    546312
add_to_cart                                194821
checkout_page_view_contact_information      74019
checkout_page_view_processing               33874
order_complete                              32957
checkout_page_view_payment_method           31872
checkout_page_view_shipping_method          30455
checkout_page_view_stock_problems            3100
checkout_page_view_forward                    713
checkout_page_view_review                     193
Name: event_type, dtype: int64

In [211]:
32957/1355986

0.024304823206139296

In [192]:
Counter(consequent_events)

Counter({('s', '1'): 102526,
         ('1', '1'): 150542,
         ('1', '2'): 305985,
         ('2', 'e'): 106903,
         ('s', '2'): 125947,
         ('2', '6'): 64447,
         ('6', 'e'): 43828,
         ('2', '1'): 218270,
         ('1', 'e'): 60245,
         ('1', '6'): 41674,
         ('6', '1'): 35648,
         ('2', '3'): 95054,
         ('3', '2'): 19498,
         ('2', '2'): 83435,
         ('s', '6'): 18704,
         ('s', '4a'): 4641,
         ('4a', '2'): 4287,
         ('6', '3'): 9860,
         ('3', '1'): 51604,
         ('2', '4a'): 8335,
         ('4a', '5'): 7139,
         ('5', '1'): 2762,
         ('6', '2'): 36457,
         ('4a', '4b'): 22915,
         ('4b', '4c'): 23502,
         ('4c', '4d'): 20758,
         ('4d', '4d'): 9000,
         ('4d', '5'): 20979,
         ('3', '4a'): 25584,
         ('4a', '1'): 4936,
         ('5', 'e'): 25147,
         ('4c', 'e'): 1692,
         ('4c', '4a'): 850,
         ('s', '4e'): 860,
         ('4e', '4a'): 790,
        