In [1]:
import pandas as pd
import os
import numpy as np
import random
import pickle

In [4]:
sessions_df = pd.read_pickle(os.path.join('data', 'sessions.pkl'))

In [5]:
sessions_df.head()

Unnamed: 0,visitorid,startdate,enddate,addtocart,transaction,view
0,0,2015-09-11 20:49:49.439,2015-09-11 20:55:17.175,[],[],"[285930, 357564, 67045]"
1,1,2015-08-13 17:46:06.444,2015-08-13 17:46:06.444,[],[],[72028]
2,2,2015-08-07 17:51:44.567,2015-08-07 18:20:57.845,[],[],"[325215, 325215, 259884, 216305, 342816, 34281..."
3,3,2015-08-01 07:10:35.296,2015-08-01 07:10:35.296,[],[],[385090]
4,4,2015-09-15 21:24:27.167,2015-09-15 21:24:27.167,[],[],[177677]


In [9]:
sessions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1786340 entries, 0 to 1786339
Data columns (total 6 columns):
visitorid      int64
startdate      datetime64[ns]
enddate        datetime64[ns]
addtocart      object
transaction    object
view           object
dtypes: datetime64[ns](2), int64(1), object(3)
memory usage: 81.8+ MB


In [7]:
# Split into train/test
train_idx = random.sample(range(len(sessions_df)), int(len(sessions_df) * .7))
test_idx = list(set(range(len(sessions_df))) - set(train_idx))

In [8]:
print(len(train_idx), len(test_idx))

1250438 535902


## Create a transactions file for association mining

In [10]:
sessions_df['basketid'] = sessions_df.index

In [11]:
sessions_df.head()

Unnamed: 0,visitorid,startdate,enddate,addtocart,transaction,view,basketid
0,0,2015-09-11 20:49:49.439,2015-09-11 20:55:17.175,[],[],"[285930, 357564, 67045]",0
1,1,2015-08-13 17:46:06.444,2015-08-13 17:46:06.444,[],[],[72028],1
2,2,2015-08-07 17:51:44.567,2015-08-07 18:20:57.845,[],[],"[325215, 325215, 259884, 216305, 342816, 34281...",2
3,3,2015-08-01 07:10:35.296,2015-08-01 07:10:35.296,[],[],[385090],3
4,4,2015-09-15 21:24:27.167,2015-09-15 21:24:27.167,[],[],[177677],4


In [12]:
train_sessions_df = sessions_df.loc[train_idx, ['basketid', 'view']].reset_index(drop=True)
train_sessions_df.head()

Unnamed: 0,basketid,view
0,1480710,[168624]
1,1550519,[352044]
2,1082085,[98467]
3,1169255,[188171]
4,1361431,"[248740, 248740]"


In [13]:
test_sessions_df = sessions_df.loc[test_idx, ['basketid', 'view']].reset_index(drop=True)
test_sessions_df.head()

Unnamed: 0,basketid,view
0,6,[]
1,12,[222422]
2,1048588,[450819]
3,1048589,[458945]
4,1048591,[5411]


In [14]:
train_rows = train_sessions_df.apply(lambda x: len(x[1]), axis=1).sum()
train_rows

1864046

1863706

In [15]:
test_rows = test_sessions_df.apply(lambda x: len(x[1]), axis=1).sum()
test_rows

800606

In [14]:
train_view_arr = np.zeros((train_rows, 2), dtype=np.int)

In [15]:
train_view_arr[:10]

array([[0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0]])

In [16]:
row_idx = 0
def update_train_view_arr(x):
    global row_idx
    for i in x[1]:
        train_view_arr[row_idx, 0] = x[0]
        train_view_arr[row_idx, 1] = i
        row_idx += 1

train_sessions_df.apply(lambda x: update_train_view_arr(x), axis=1)

0          None
1          None
2          None
3          None
4          None
5          None
6          None
7          None
8          None
9          None
10         None
11         None
12         None
13         None
14         None
15         None
16         None
17         None
18         None
19         None
20         None
21         None
22         None
23         None
24         None
25         None
26         None
27         None
28         None
29         None
           ... 
1250408    None
1250409    None
1250410    None
1250411    None
1250412    None
1250413    None
1250414    None
1250415    None
1250416    None
1250417    None
1250418    None
1250419    None
1250420    None
1250421    None
1250422    None
1250423    None
1250424    None
1250425    None
1250426    None
1250427    None
1250428    None
1250429    None
1250430    None
1250431    None
1250432    None
1250433    None
1250434    None
1250435    None
1250436    None
1250437    None
Length: 1250438, dtype: 

In [17]:
train_view_arr[:20]

array([[1004780,  431499],
       [ 693465,  309436],
       [ 132907,  408003],
       [ 792695,  277556],
       [ 227612,   64104],
       [ 633341,  456578],
       [1071643,  254703],
       [1071643,  254703],
       [ 381561,  433441],
       [1372149,  163780],
       [ 797507,  174529],
       [1018026,   32805],
       [ 536176,  296561],
       [ 896420,  198747],
       [ 896420,  198747],
       [ 896420,  198747],
       [ 896420,  198747],
       [  42411,   94980],
       [1045321,    5305],
       [1045321,  354233]])

In [18]:
test_view_arr = np.zeros((test_rows, 2), dtype=np.int)
row_idx = 0
def update_test_view_arr(x):
    global row_idx
    for i in x[1]:
        test_view_arr[row_idx, 0] = x[0]
        test_view_arr[row_idx, 1] = i
        row_idx += 1
test_sessions_df.apply(lambda x: update_test_view_arr(x), axis=1)

0         None
1         None
2         None
3         None
4         None
5         None
6         None
7         None
8         None
9         None
10        None
11        None
12        None
13        None
14        None
15        None
16        None
17        None
18        None
19        None
20        None
21        None
22        None
23        None
24        None
25        None
26        None
27        None
28        None
29        None
          ... 
535872    None
535873    None
535874    None
535875    None
535876    None
535877    None
535878    None
535879    None
535880    None
535881    None
535882    None
535883    None
535884    None
535885    None
535886    None
535887    None
535888    None
535889    None
535890    None
535891    None
535892    None
535893    None
535894    None
535895    None
535896    None
535897    None
535898    None
535899    None
535900    None
535901    None
Length: 535902, dtype: object

In [19]:
test_view_arr[:10]

array([[      0,  285930],
       [      0,  357564],
       [      0,   67045],
       [1048577,  458318],
       [1048578,  406376],
       [1048579,  219821],
       [1048579,  219821],
       [1048579,  219821],
       [      4,  177677],
       [1048580,  219821]])

In [20]:
train_view_df = pd.DataFrame(train_view_arr, columns=['basketID', 'item']).sort_values('basketID')
test_view_df = pd.DataFrame(test_view_arr, columns=['basketID', 'item']).sort_values('basketID')

In [21]:
train_view_df.head()

Unnamed: 0,basketID,item
1373426,1,72028
1476422,2,325215
1476421,2,216305
1476416,2,325215
1476417,2,259884


In [22]:
train_view_df.to_csv(os.path.join('data', 'train_view.csv'), index=False)
test_view_df.to_csv(os.path.join('data', 'test_view.csv'), index=False)

## Create a baskets file for sequence mining

In [23]:
train_seq_arr = np.zeros((train_rows, 4), dtype=np.int)
test_seq_arr = np.zeros((test_rows, 4), dtype=np.int)

In [24]:
row_idx = 0
def update_train_seq_arr(x):
    global row_idx
    for idx, value in enumerate(x[1]):
        train_seq_arr[row_idx, 0] = x[0]
        train_seq_arr[row_idx, 1] = idx
        train_seq_arr[row_idx, 2] = 1
        train_seq_arr[row_idx, 3] = value
        row_idx += 1

train_sessions_df.apply(lambda x: update_train_seq_arr(x), axis=1)

0          None
1          None
2          None
3          None
4          None
5          None
6          None
7          None
8          None
9          None
10         None
11         None
12         None
13         None
14         None
15         None
16         None
17         None
18         None
19         None
20         None
21         None
22         None
23         None
24         None
25         None
26         None
27         None
28         None
29         None
           ... 
1250408    None
1250409    None
1250410    None
1250411    None
1250412    None
1250413    None
1250414    None
1250415    None
1250416    None
1250417    None
1250418    None
1250419    None
1250420    None
1250421    None
1250422    None
1250423    None
1250424    None
1250425    None
1250426    None
1250427    None
1250428    None
1250429    None
1250430    None
1250431    None
1250432    None
1250433    None
1250434    None
1250435    None
1250436    None
1250437    None
Length: 1250438, dtype: 

In [25]:
row_idx = 0
def update_test_seq_arr(x):
    global row_idx
    for idx, value in enumerate(x[1]):
        test_seq_arr[row_idx, 0] = x[0]
        test_seq_arr[row_idx, 1] = idx
        test_seq_arr[row_idx, 2] = 1
        test_seq_arr[row_idx, 3] = value
        row_idx += 1

test_sessions_df.apply(lambda x: update_test_seq_arr(x), axis=1)

0         None
1         None
2         None
3         None
4         None
5         None
6         None
7         None
8         None
9         None
10        None
11        None
12        None
13        None
14        None
15        None
16        None
17        None
18        None
19        None
20        None
21        None
22        None
23        None
24        None
25        None
26        None
27        None
28        None
29        None
          ... 
535872    None
535873    None
535874    None
535875    None
535876    None
535877    None
535878    None
535879    None
535880    None
535881    None
535882    None
535883    None
535884    None
535885    None
535886    None
535887    None
535888    None
535889    None
535890    None
535891    None
535892    None
535893    None
535894    None
535895    None
535896    None
535897    None
535898    None
535899    None
535900    None
535901    None
Length: 535902, dtype: object

In [35]:
train_sequence_df = pd.DataFrame(train_seq_arr, columns=['sequenceid', 'eventid', 'size', 'item'])\
    .sort_values(['sequenceid', 'eventid'])
test_sequence_df = pd.DataFrame(test_seq_arr, columns=['sequenceid', 'eventid', 'size', 'item'])\
    .sort_values(['sequenceid', 'eventid'])

In [36]:
train_sequence_df.head(10)

Unnamed: 0,sequenceid,eventid,size,item
1373426,1,0,1,72028
1476415,2,0,1,325215
1476416,2,1,1,325215
1476417,2,2,1,259884
1476418,2,3,1,216305
1476419,2,4,1,342816
1476420,2,5,1,342816
1476421,2,6,1,216305
1476422,2,7,1,325215
1444679,3,0,1,385090


In [37]:
train_sequence_df.to_csv(os.path.join('data', 'train_sequence.txt'), header=False, index=False, sep='\t')
test_sequence_df.to_csv(os.path.join('data', 'test_sequence.txt'), header=False, index=False, sep='\t')

## Create a transactions file using categoryid for association mining

In [2]:
infile = open(os.path.join('data', 'itemtocat.pkl'), 'rb')
itemtocat_dict = pickle.load(infile)
infile.close()

In [16]:
train_cat_arr = np.zeros((train_rows, 2), dtype=np.int)
test_cat_arr = np.zeros((test_rows, 2), dtype=np.int)

In [19]:
train_cat_sessions_df = sessions_df.loc[train_idx, ['basketid', 'enddate', 'view']].reset_index(drop=True)
train_cat_sessions_df.head()

Unnamed: 0,basketid,enddate,view
0,1480710,2015-08-21 21:11:03.764,[168624]
1,1550519,2015-05-18 21:03:50.552,[352044]
2,1082085,2015-07-10 22:45:11.445,[98467]
3,1169255,2015-05-17 19:44:04.907,[188171]
4,1361431,2015-07-22 18:29:31.592,"[248740, 248740]"


In [20]:
test_cat_sessions_df = sessions_df.loc[test_idx, ['basketid', 'enddate', 'view']].reset_index(drop=True)
test_cat_sessions_df.head()

Unnamed: 0,basketid,enddate,view
0,6,2015-08-30 06:03:48.202,[]
1,12,2015-07-08 17:36:47.285,[222422]
2,1048588,2015-05-28 23:18:18.745,[450819]
3,1048589,2015-06-09 15:13:53.256,[458945]
4,1048591,2015-07-14 22:27:50.103,[5411]


In [22]:
def to_categoryid(itemid, enddate):
    catid = ''
    
    for time, cat in itemtocat_dict.get(itemid, []):
        if enddate <= time:
            catid = cat
            break
    if not catid:
        if itemtocat_dict.get(itemid, []):  # enddate did not match
            catid = itemtocat_dict[itemid][-1][1]  # set categiryid to last value
        else:
            catid = itemid  # itemid not in dictionary
    return catid

In [26]:
row_idx = 0
def update_train_cat_arr(x):
    global row_idx
    for i in x[2]:
        train_cat_arr[row_idx, 0] = x[0]
        train_cat_arr[row_idx, 1] = to_categoryid(i, x[1])
        row_idx += 1

train_cat_sessions_df.apply(lambda x: update_train_cat_arr(x), axis=1)

0          None
1          None
2          None
3          None
4          None
5          None
6          None
7          None
8          None
9          None
10         None
11         None
12         None
13         None
14         None
15         None
16         None
17         None
18         None
19         None
20         None
21         None
22         None
23         None
24         None
25         None
26         None
27         None
28         None
29         None
           ... 
1250408    None
1250409    None
1250410    None
1250411    None
1250412    None
1250413    None
1250414    None
1250415    None
1250416    None
1250417    None
1250418    None
1250419    None
1250420    None
1250421    None
1250422    None
1250423    None
1250424    None
1250425    None
1250426    None
1250427    None
1250428    None
1250429    None
1250430    None
1250431    None
1250432    None
1250433    None
1250434    None
1250435    None
1250436    None
1250437    None
Length: 1250438, dtype: 

In [28]:
train_cat_arr[:10]

array([[1480710,  168624],
       [1550519,      50],
       [1082085,    1613],
       [1169255,    1051],
       [1361431,  248740],
       [1361431,  248740],
       [ 930192,     730],
       [ 211822,    1565],
       [1195593,     491],
       [1467024,     196]])

In [29]:
row_idx = 0
def update_test_cat_arr(x):
    global row_idx
    for i in x[2]:
        test_cat_arr[row_idx, 0] = x[0]
        test_cat_arr[row_idx, 1] = to_categoryid(i, x[1])
        row_idx += 1

test_cat_sessions_df.apply(lambda x: update_test_cat_arr(x), axis=1)

0         None
1         None
2         None
3         None
4         None
5         None
6         None
7         None
8         None
9         None
10        None
11        None
12        None
13        None
14        None
15        None
16        None
17        None
18        None
19        None
20        None
21        None
22        None
23        None
24        None
25        None
26        None
27        None
28        None
29        None
          ... 
535872    None
535873    None
535874    None
535875    None
535876    None
535877    None
535878    None
535879    None
535880    None
535881    None
535882    None
535883    None
535884    None
535885    None
535886    None
535887    None
535888    None
535889    None
535890    None
535891    None
535892    None
535893    None
535894    None
535895    None
535896    None
535897    None
535898    None
535899    None
535900    None
535901    None
Length: 535902, dtype: object

In [30]:
train_cat_df = pd.DataFrame(train_cat_arr, columns=['basketID', 'item']).sort_values('basketID')
test_cat_df = pd.DataFrame(test_cat_arr, columns=['basketID', 'item']).sort_values('basketID')

In [31]:
train_cat_df.head()

Unnamed: 0,basketID,item
843726,0,333
843725,0,256
843724,0,1188
129947,1,1192
1677890,2,299


In [32]:
train_cat_df.to_csv(os.path.join('data', 'train_cat.csv'), index=False)
test_cat_df.to_csv(os.path.join('data', 'test_cat.csv'), index=False)