In [47]:
import pandas as pd
import numpy as np
from scipy import sparse as spsp
from scipy.sparse import linalg

Read data

In [19]:
# Load the two input tables. Both are space-separated and have no header row,
# so pandas labels their columns by position (0, 1, ...).
sessions = pd.read_csv(
    'keep-events.gz',
    sep=' ',
    header=None,
)
item_attrs = pd.read_csv(
    'keep-hashes.gz',
    sep=' ',
    header=None,
)

In [11]:
# Preview the raw session-event table. Column 0 is used below as the session id
# and column 1 as the item id; column 2 looks like a timestamp — TODO confirm.
sessions

Unnamed: 0,0,1,2
0,0,285930,1442004589439
1,0,357564,1442004759591
2,0,67045,1442004917175
3,2,325215,1438969904567
4,2,325215,1438970013790
...,...,...,...
1343619,1407573,57720,1437973995037
1343620,1407573,363109,1438402327230
1343621,1407573,463766,1438402365311
1343622,1407573,82278,1438648367715


Collect all unique items and map the original item Ids to a contiguous Id space. All items in the item attribute table exist in the session item table.

In [32]:
# Assign every distinct raw item id in the attribute table a dense index,
# numbered in order of first appearance.
item_map = {}
for raw_item in np.array(item_attrs[0]):
    # setdefault inserts only when the id is new, so a repeated id keeps
    # the index it received the first time it was seen.
    item_map.setdefault(raw_item, len(item_map))
num_items = len(item_map)

Remap all session Ids and item Ids in the session item table to contiguous Id space.

In [35]:
num_interacts = len(sessions[0])
sids = np.array(sessions[0], dtype=np.int64)
items = np.array(sessions[1], dtype=np.int64)

# Walk the events once, keeping only those whose item has a dense index in
# item_map, and hand out dense session ids in order of first retained event.
sid_map = {}
new_sids = []
new_items = []
for sid, item in zip(sids, items):
    dense_item = item_map.get(item)
    if dense_item is None:
        # Item is absent from the attribute table: drop this event.
        continue
    # setdefault allocates the next dense session id only for unseen sessions.
    new_sids.append(sid_map.setdefault(sid, len(sid_map)))
    new_items.append(dense_item)
num_sids = len(sid_map)

new_sids = np.array(new_sids, dtype=np.int64)
new_items = np.array(new_items, dtype=np.int64)
print('iterations:', len(new_sids))

iterations: 1255669


Construct the session item sparse matrix.

In [76]:
# Build the session x item interaction matrix with one stored 1.0 per retained
# event. COO keeps repeated (session, item) pairs as separate entries, so nnz
# below equals the number of events.
num_interactions = len(new_sids)
# Pass the shape explicitly: when it is omitted scipy infers it from the largest
# row/column index, which would silently drop trailing items that never occur in
# a session (misaligning columns with the num_items-row item features) and
# raises on empty input.
sess_item_spm = spsp.coo_matrix(
    (np.ones(num_interactions), (new_sids, new_items)),
    shape=(num_sids, num_items),
)
print(sess_item_spm.shape)
print(sess_item_spm.nnz)

(370542, 130922)
1255669


In [20]:
# Preview the raw item-attribute table. Column 0 is used below as the item id
# and column 1 as the attribute id.
item_attrs

Unnamed: 0,0,1
0,3,1182824
1,3,1305767
2,3,138228
3,3,150169
4,3,261419
...,...,...
2708156,466864,580465
2708157,466864,610340
2708158,466864,625815
2708159,466864,631756


Remap the items in the item attribute table and construct an item-attribute sparse matrix.

In [41]:
# Translate the raw item ids of the attribute table into the dense index space
# defined by item_map; attribute ids are kept as they are.
raw_attr_items = np.array(item_attrs[0], dtype=np.int64)
attrs = np.array(item_attrs[1], dtype=np.int64)
items = np.fromiter(
    (item_map[raw_item] for raw_item in raw_attr_items),
    dtype=np.int64,
    count=len(raw_attr_items),
)

In [42]:
# Item x attribute matrix with one stored 1.0 per row of the attribute table.
# No shape is given, so scipy infers it from the largest row and column index.
attr_values = np.ones(len(items))
attr_coords = (items, attrs)
item_attr_spm = spsp.coo_matrix((attr_values, attr_coords))
print(item_attr_spm.shape)

(130922, 1339686)


Dimension reduction on the item-attribute sparse matrix to generate item features.

In [74]:
# Truncated SVD of the item x attribute matrix, keeping svd_rank components.
svd_rank = 100
# Without v0 the ARPACK solver starts from a random vector, so the singular
# vectors differ from run to run. A seeded starting vector of length
# min(A.shape) makes the generated features reproducible.
svd_v0 = np.random.RandomState(0).standard_normal(min(item_attr_spm.shape))
u, s, _ = linalg.svds(item_attr_spm, k=svd_rank, v0=svd_v0)
# Scale each column of u by the square root of its singular value. s is 1-D,
# so it broadcasts across the rows of u directly (no transpose needed).
item_feats = u * np.sqrt(s)
print(item_feats.shape)

(130922, 100)


Save the session-item data and item-feature data to files.

In [49]:
import pickle

# Write through context managers so each file handle is flushed and closed
# as soon as the dump finishes instead of being left open.
with open('retail_sess_item_full.pkl', 'wb') as sess_item_file:
    pickle.dump(sess_item_spm, sess_item_file)
# NOTE(review): this saves the raw left singular vectors `u`, not the
# sqrt(s)-scaled `item_feats` computed above — confirm which one is intended.
with open('retail_item_feats_full.pkl', 'wb') as item_feats_file:
    pickle.dump(u, item_feats_file)

Some of the items are accessed in a small number of sessions.

In [52]:
# Per-item degree: column sums of the session x item matrix, obtained by
# multiplying its transpose with an all-ones vector of length num_sessions.
num_sessions = sess_item_spm.shape[0]
session_ones = np.ones(num_sessions)
item_deg = sess_item_spm.T.dot(session_ones)
print(len(item_deg))
# Count the items whose degree is below 20.
rare_item_mask = item_deg < 20
print(np.sum(rare_item_mask))

130922
116388


In [70]:
# Keep only items whose degree is at least min_item_deg, then drop the
# sessions that are left without any interaction.
min_item_deg = 10
keep_items = item_deg >= min_item_deg
sess_item_spm_slice = sess_item_spm.tocsc()[:, keep_items]
# Size the all-ones vector from the sliced matrix itself. The previous
# hard-coded length (28701) only matched one particular dataset/threshold and
# raised a dimension mismatch for any other.
sess_deg = sess_item_spm_slice.dot(np.ones(sess_item_spm_slice.shape[1]))
sess_item_spm_slice = sess_item_spm_slice[sess_deg > 0]
# Select the feature rows of the same kept items so they stay aligned with the
# columns of the sliced matrix.
# NOTE(review): this slices the raw `u`, not the scaled `item_feats` — confirm.
item_attr_slice = u[keep_items]

In [71]:
# Write through context managers so each file handle is flushed and closed
# as soon as the dump finishes instead of being left open.
with open('retail_sess_item_slice.pkl', 'wb') as sess_item_slice_file:
    pickle.dump(sess_item_spm_slice, sess_item_slice_file)
with open('retail_item_feats_slice.pkl', 'wb') as item_feats_slice_file:
    pickle.dump(item_attr_slice, item_feats_slice_file)

In [72]:
# Report the dimensions of the filtered session x item matrix, followed by
# those of the matching item-feature rows.
for sliced in (sess_item_spm_slice, item_attr_slice):
    print(sliced.shape)

(314874, 28701)
(28701, 100)
