# EB5202 Web Analytics - Retail Rocket - Clustering

## Load libraries

In [1]:
import os
import pandas as pd
import random
import pickle
import datetime

## Load data

In [2]:
# Read the per-session table from the local data directory.
# NOTE(review): read_pickle unpickles the file — only use it on trusted data.
sessions_path = os.path.join('data', 'sessions.pkl')
sessions_df = pd.read_pickle(sessions_path)

In [3]:
# Preview the first rows of the freshly loaded session table.
sessions_df.head()

Unnamed: 0,visitorid,startdate,enddate,addtocart,transaction,view
0,0,2015-09-11 20:49:49.439,2015-09-11 20:55:17.175,[],[],"[285930, 357564, 67045]"
1,1,2015-08-13 17:46:06.444,2015-08-13 17:46:06.444,[],[],[72028]
2,2,2015-08-07 17:51:44.567,2015-08-07 18:20:57.845,[],[],"[325215, 325215, 259884, 216305, 342816, 34281..."
3,3,2015-08-01 07:10:35.296,2015-08-01 07:10:35.296,[],[],[385090]
4,4,2015-09-15 21:24:27.167,2015-09-15 21:24:27.167,[],[],[177677]


## Create features

### Pages

In [5]:
# Pages per session = number of entries in the session's `view` list.
# `len` is passed directly; wrapping it in a lambda adds nothing.
sessions_df['pages'] = sessions_df['view'].map(len)
sessions_df.head()

Unnamed: 0,visitorid,startdate,enddate,addtocart,transaction,view,pages
0,0,2015-09-11 20:49:49.439,2015-09-11 20:55:17.175,[],[],"[285930, 357564, 67045]",3
1,1,2015-08-13 17:46:06.444,2015-08-13 17:46:06.444,[],[],[72028],1
2,2,2015-08-07 17:51:44.567,2015-08-07 18:20:57.845,[],[],"[325215, 325215, 259884, 216305, 342816, 34281...",8
3,3,2015-08-01 07:10:35.296,2015-08-01 07:10:35.296,[],[],[385090],1
4,4,2015-09-15 21:24:27.167,2015-09-15 21:24:27.167,[],[],[177677],1


In [11]:
# Scratch check on session 0: total session duration divided by its page count.
# Note: the `pagetime` feature built in the Pagetime section divides by (pages - 1) instead.
(sessions_df.loc[0, 'enddate'] - sessions_df.loc[0, 'startdate']) / sessions_df.loc[0, 'pages']

numpy.timedelta64(109245333333,'ns')

In [19]:
# Summary statistics of the page count; the minimum shows whether zero-view sessions exist.
sessions_df['pages'].describe()

count    1.786340e+06
mean     1.491492e+00
std      1.591950e+00
min      0.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.900000e+02
Name: pages, dtype: float64

In [20]:
# Inspect sessions that recorded no page views at all.
no_view_mask = sessions_df['pages'].eq(0)
sessions_df[no_view_mask].head()

Unnamed: 0,visitorid,startdate,enddate,addtocart,transaction,view,pages
6,6,2015-08-30 06:03:48.202,2015-08-30 06:03:48.202,[65273],[],[],0
211,155,2015-09-11 06:12:12.678,2015-09-11 06:19:00.395,"[368372, 452082, 181405, 41882, 442601, 224623]",[],[],0
212,155,2015-09-12 04:38:00.026,2015-09-11 06:19:00.395,[389974],[],[],0
256,186,2015-08-12 16:28:16.523,2015-08-12 16:34:57.040,[49029],[49029],[],0
527,419,2015-07-29 05:03:12.695,2015-07-29 04:03:49.136,[],[19278],[],0


### Pagetime

In [35]:
# Boolean mask selecting sessions with at least two page views.
pages_more_than1 = sessions_df['pages'].gt(1)

In [36]:
# Average time between consecutive page views for multi-page sessions:
# session duration divided by the number of page-to-page transitions (pages - 1).
# NOTE(review): some rows show enddate earlier than startdate; if any of them has
# more than one page this yields a negative pagetime — confirm and clean upstream.
multi_page = sessions_df.loc[pages_more_than1, ['startdate', 'enddate', 'pages']]
session_length = multi_page['enddate'] - multi_page['startdate']
transitions = multi_page['pages'] - 1
sessions_df.loc[pages_more_than1, 'pagetime'] = session_length / transitions

In [37]:
# Complement of the multi-page mask, i.e. sessions with pages <= 1 (despite the name).
# `~` negates the boolean Series in one vectorised step instead of a per-row lambda.
pages_less_than1 = ~pages_more_than1

In [38]:
# Sessions with at most one page view have no between-page interval, so store a zero duration.
sessions_df.loc[pages_less_than1, 'pagetime'] = pd.Timedelta(0)

In [39]:
# Confirm every session received a pagetime (no nulls).
# Raising `display.max_info_rows` above the row count makes info() print the
# non-null count for this large frame without the `null_counts` keyword, which
# newer pandas releases removed in favour of `show_counts`.
with pd.option_context('display.max_info_rows', len(sessions_df) + 1):
    sessions_df['pagetime'].to_frame().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1786340 entries, 0 to 1786339
Data columns (total 1 columns):
pagetime    1786340 non-null timedelta64[ns]
dtypes: timedelta64[ns](1)
memory usage: 13.6 MB


In [40]:
# Preview the table with the new `pagetime` column.
sessions_df.head()

Unnamed: 0,visitorid,startdate,enddate,addtocart,transaction,view,pages,pagetime
0,0,2015-09-11 20:49:49.439,2015-09-11 20:55:17.175,[],[],"[285930, 357564, 67045]",3,00:02:43.868000
1,1,2015-08-13 17:46:06.444,2015-08-13 17:46:06.444,[],[],[72028],1,00:00:00
2,2,2015-08-07 17:51:44.567,2015-08-07 18:20:57.845,[],[],"[325215, 325215, 259884, 216305, 342816, 34281...",8,00:04:10.468285
3,3,2015-08-01 07:10:35.296,2015-08-01 07:10:35.296,[],[],[385090],1,00:00:00
4,4,2015-09-15 21:24:27.167,2015-09-15 21:24:27.167,[],[],[177677],1,00:00:00


### Diffcat

In [41]:
# Load the item -> category history lookup.
# The context manager closes the file handle even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open(os.path.join('data', 'itemtocat.pkl'), 'rb') as infile:
    itemtocat_dict = pickle.load(infile)

In [69]:
# Item 177677 (the only item viewed in session 4) has no entry in the lookup.
# Catch and report the KeyError so the notebook still runs top to bottom.
try:
    itemtocat_dict[177677]
except KeyError as err:
    print('KeyError:', err)

KeyError: 177677

In [52]:
# First item viewed in session 0.
sessions_df.loc[0, 'view'][0]

285930

In [53]:
# Category history recorded for the first item viewed in session 0.
first_item = sessions_df.loc[0, 'view'][0]
itemtocat_dict[first_item]

[(Timestamp('2015-05-17 03:00:00'), '1188')]

In [71]:
def to_categoryid(itemid, enddate, lookup=None):
    """Resolve the category id to use for an item viewed in a session.

    Parameters
    ----------
    itemid : hashable
        Item identifier used as key into the lookup.
    enddate : timestamp-like
        Session end time; compared against the timestamps in the item's history.
    lookup : dict, optional
        Mapping of item id -> list of ``(timestamp, category_id)`` tuples.
        Defaults to the notebook-level ``itemtocat_dict``.

    Returns
    -------
    str
        * the category of the first history entry whose timestamp is at or
          after ``enddate``;
        * otherwise the category of the last history entry;
        * otherwise ``'<itemid>_no_cat_id'`` when the item has no history, so
          each unknown item counts as its own category.

    Notes
    -----
    Assumes each history list is ordered by ascending timestamp — TODO confirm.
    NOTE(review): this picks the first entry at/after ``enddate`` rather than
    the latest entry at/before it — confirm that is the intended semantics.
    """
    if lookup is None:
        lookup = itemtocat_dict

    # Fetch the history once instead of repeating the dictionary lookup.
    history = lookup.get(itemid, [])

    catid = ''
    for time, cat in history:
        if enddate <= time:
            catid = cat
            break

    if not catid:
        if history:  # enddate did not match any entry
            catid = history[-1][1]  # fall back to the last recorded category id
        else:
            catid = str(itemid) + '_no_cat_id'  # itemid not in the lookup
    return catid

In [72]:
# Distinct categories among the items viewed in session 0.
session0_end = sessions_df.loc[0, 'enddate']
set(to_categoryid(item, session0_end) for item in sessions_df.loc[0, 'view'])

{'1188', '256', '333'}

In [74]:
def _diffcat(row):
    """Return distinct categories viewed divided by items viewed for one session row.

    `row` must provide 'view' (list of item ids) and 'enddate'. Sessions with
    an empty `view` list return 0 to avoid dividing by zero.
    """
    viewed = row['view']
    if not viewed:
        return 0
    categories = {to_categoryid(item, row['enddate']) for item in viewed}
    return len(categories) / len(viewed)


# Row-wise because each session needs its own enddate for the category lookup.
sessions_df['diffcat'] = sessions_df.loc[:, ['enddate', 'view']].apply(_diffcat, axis=1)

In [75]:
# Preview the table with the new `diffcat` column.
sessions_df.head()

Unnamed: 0,visitorid,startdate,enddate,addtocart,transaction,view,pages,pagetime,diffcat
0,0,2015-09-11 20:49:49.439,2015-09-11 20:55:17.175,[],[],"[285930, 357564, 67045]",3,00:02:43.868000,1.0
1,1,2015-08-13 17:46:06.444,2015-08-13 17:46:06.444,[],[],[72028],1,00:00:00,1.0
2,2,2015-08-07 17:51:44.567,2015-08-07 18:20:57.845,[],[],"[325215, 325215, 259884, 216305, 342816, 34281...",8,00:04:10.468285,0.25
3,3,2015-08-01 07:10:35.296,2015-08-01 07:10:35.296,[],[],[385090],1,00:00:00,1.0
4,4,2015-09-15 21:24:27.167,2015-09-15 21:24:27.167,[],[],[177677],1,00:00:00,1.0


In [76]:
# Distinct categories among the items viewed in session 2 (spot check of its diffcat value).
session2_end = sessions_df.loc[2, 'enddate']
set(to_categoryid(item, session2_end) for item in sessions_df.loc[2, 'view'])

{'299', '444'}

In [78]:
# Print the raw category history of every item viewed in session 2.
for viewed_item in sessions_df.loc[2, 'view']:
    print(itemtocat_dict[viewed_item])

[(Timestamp('2015-06-28 03:00:00'), '299')]
[(Timestamp('2015-06-28 03:00:00'), '299')]
[(Timestamp('2015-05-24 03:00:00'), '299')]
[(Timestamp('2015-05-31 03:00:00'), '299')]
[(Timestamp('2015-05-10 03:00:00'), '444')]
[(Timestamp('2015-05-10 03:00:00'), '444')]
[(Timestamp('2015-05-31 03:00:00'), '299')]
[(Timestamp('2015-06-28 03:00:00'), '299')]


In [79]:
# Confirm every session received a diffcat value (no nulls).
# Raising `display.max_info_rows` above the row count makes info() print the
# non-null count for this large frame without the `null_counts` keyword, which
# newer pandas releases removed in favour of `show_counts`.
with pd.option_context('display.max_info_rows', len(sessions_df) + 1):
    sessions_df['diffcat'].to_frame().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1786340 entries, 0 to 1786339
Data columns (total 1 columns):
diffcat    1786340 non-null float64
dtypes: float64(1)
memory usage: 13.6 MB


In [80]:
# Summary statistics of diffcat; values should lie between 0 and 1.
sessions_df['diffcat'].describe()

count    1.786340e+06
mean     8.866262e-01
std      2.385307e-01
min      0.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
Name: diffcat, dtype: float64

In [None]:
# Persist the frame, including the new feature columns, to a temporary pickle.
checkpoint_path = os.path.join('data', 'sessions_temp.pkl')
sessions_df.to_pickle(checkpoint_path)

## Split data into train / test dataset

In [21]:
# train_idx = random.sample(range(len(sessions_df)), int(len(sessions_df) / 2))
# test_idx = [ i for i in range(len(sessions_df)) if i not in train_idx ]