# Build Graph

In [7]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
import pickle
from utils import load_data, EasyDict
# from scipy.sparse import coo_matrix
import seaborn as sns
import dgl
import tqdm

In [23]:
# Load the four inputs returned by utils.load_data from the dataset directory.
# NOTE(review): `browsing` is later indexed with .loc['test', ...], so it
# presumably carries a train/test outer index level -- confirm in utils.load_data.
browsing, search, sku, info = load_data('../dataset/new/')

load browsing done...
load search done...
load sku done...
load info done...


In [3]:
# Vocabulary sizes, read off the index->entity lookup tables held in `info`.
n_items, n_urls, n_sess = (
    len(table) for table in (info.idx2item, info.idx2url, info.idx2sess)
)
n_items, n_urls, n_sess

(66386, 517740, 5485256)

In [4]:
# Shared node-id space: sessions first, then items, then urls.
sess_offset, item_offset, url_offset = 0, n_sess, n_sess + n_items

In [5]:
# Order rows by action code so that later "keep first" de-duplication
# prefers the rows with the smallest product_action value.
# NOTE(review): the default sort kind is not stable, so ties within one action
# code keep no guaranteed relative order.
browsing = browsing.sort_values(by='product_action')

In [6]:
# How many browsing rows have no product attached before back-filling from urls.
browsing['product_sku_hash'].isna().sum()

26699519

In [7]:
# Back-fill missing product ids from the page url.
#
# For every url that appears on at least one fully populated row, remember the
# product of the FIRST such row (rows are already ordered by product_action),
# then assign that product to every row sharing the url. Urls never seen with
# a product stay NaN.
#
# Keeping only the first row per url replaces the previous
# groupby -> list -> x[0] -> vstack -> zip round trip with the same result,
# and also works when no row has a product (the old np.vstack raised on
# empty input).
_url_first_sku = browsing.dropna().drop_duplicates(subset='hashed_url')
url2item = dict(zip(
    _url_first_sku['hashed_url'].astype(int).tolist(),
    _url_first_sku['product_sku_hash'].astype(int).tolist(),
))
del _url_first_sku  # large intermediate; only the dict is needed afterwards

# Mapping with the dict itself (rather than dict.get) lets pandas do the
# lookup in bulk; unmatched urls become NaN.
browsing['product_sku_hash'] = browsing['hashed_url'].map(url2item)

In [8]:
# Rows still lacking a product after the url-based back-fill.
browsing['product_sku_hash'].isna().sum()

15174125

In [9]:
# Keep only the first row for each (session, url) pair.
repeat_visit = browsing.duplicated(subset=['session_id_hash', 'hashed_url'])
browsing = browsing[~repeat_visit]

In [10]:
# Display the de-duplicated browsing frame (pandas truncates the middle rows).
browsing 

Unnamed: 0,Unnamed: 1,session_id_hash,product_action,product_sku_hash,server_timestamp_epoch_ms,hashed_url
train,0,0,0,31231.0,1550885210881,0
train,18427417,2532912,0,8747.0,1551137116480,3740
train,18427419,2532912,0,30987.0,1551137120150,4894
train,18427422,2532913,0,23261.0,1550916315045,5022
train,18427427,2532913,0,31025.0,1550916364052,2706
train,...,...,...,...,...,...
train,28342316,3875553,4,61002.0,1549381069130,747
train,28342317,3875553,4,61002.0,1549381257069,411161
train,28342295,3875553,4,61002.0,1549380342372,3782
train,28342312,3875553,4,61002.0,1549380854448,744


In [11]:
# Shift every id column into its own slice of the shared node-id space
# (sessions, then items, then urls) so the three entity types never collide.
#
# assign() builds a new frame instead of writing column-by-column into the
# filtered result of drop_duplicates, which avoids pandas' chained-assignment
# (SettingWithCopy) warning.
#
# NOTE: this cell is not idempotent -- re-running it without reloading
# `browsing` shifts the ids a second time.
browsing = browsing.assign(
    session_id_hash=browsing['session_id_hash'] + sess_offset,
    product_sku_hash=browsing['product_sku_hash'] + item_offset,
    hashed_url=browsing['hashed_url'] + url_offset,
)

In [12]:
# Group rows by session and order each session's events by server timestamp.
browsing = browsing.sort_values(by=['session_id_hash', 'server_timestamp_epoch_ms'])

In [13]:
# Session -> url pairs (one row per distinct page view).
s2u = browsing.loc[:, ['session_id_hash', 'hashed_url']]

In [14]:
# Display the session -> url pairs.
s2u

Unnamed: 0,Unnamed: 1,session_id_hash,hashed_url
train,16758585,0,5557512
train,16758591,0,5553829
train,16758604,0,5551643
train,0,0,5551642
train,5,0,5551644
...,...,...,...
test,1303283,5408427,5556480
test,1303284,5408427,5552598
test,1303285,5408427,5552600
test,1303286,5408427,5554472


In [15]:
# Session -> item pairs; rows with a missing value in any column are dropped first.
s2i = browsing.dropna().loc[:, ['session_id_hash', 'product_sku_hash']]

In [16]:
# 2 x N array: row 0 holds session ids, row 1 the matching url ids.
sess2url = np.stack((s2u['session_id_hash'].values, s2u['hashed_url'].values), axis=0)

In [17]:
# Display the session -> item pairs.
s2i

Unnamed: 0,Unnamed: 1,session_id_hash,product_sku_hash
train,16758585,0,5527368.0
train,16758604,0,5515750.0
train,0,0,5516487.0
train,16,0,5502083.0
train,20,0,5504124.0
...,...,...,...
test,1303283,5408427,5534894.0
test,1303284,5408427,5542283.0
test,1303285,5408427,5515602.0
test,1303286,5408427,5526521.0


In [18]:
# 2 x N array: row 0 holds session ids, row 1 the matching item ids.
sess2item = np.stack((s2i['session_id_hash'].values, s2i['product_sku_hash'].values), axis=0)

In [19]:
# Display the session -> item edge array.
sess2item

array([[      0.,       0.,       0., ..., 5408427., 5408427., 5408428.],
       [5527368., 5515750., 5516487., ..., 5515602., 5526521., 5543232.]])

In [20]:
# Display the session -> url edge array.
sess2url

array([[      0,       0,       0, ..., 5408427, 5408427, 5408428],
       [5557512, 5553829, 5551643, ..., 5552600, 5554472, 5561065]])

In [21]:
# Put the item edges and url edges side by side (columns = edges) and cast the
# ids to integers.
edges = np.hstack((sess2item, sess2url)).astype(int)

In [22]:
# One row per edge: transpose the 2 x N edge array into an N x 2 frame.
graph = pd.DataFrame(np.transpose(edges))

In [23]:
# Name the two endpoint columns, then show the edges ordered by (source, destination).
graph = graph.rename(columns={0: 'src_id', 1: 'dst_id'})
graph.sort_values(by=['src_id', 'dst_id'])

Unnamed: 0,src_id,dst_id
3,0,5502083
4,0,5504124
1,0,5515750
2,0,5516487
0,0,5527368
...,...,...
28019289,5408427,5552600
28019290,5408427,5554472
28019287,5408427,5556480
9197772,5408428,5543232


In [24]:
# Persist the edge list as an N x 2 array.
edge_pairs = graph.values
np.save('../dataset/prepared/i-s-u.npy', edge_pairs)

In [25]:
import torch

In [26]:
# Emit every edge in both directions: the second half of each array is the
# first half with source and destination swapped.
heads = graph['src_id'].values
tails = graph['dst_id'].values
src_ids = np.concatenate((heads, tails))
dst_ids = np.concatenate((tails, heads))

In [27]:
# Build a DGL graph from the paired endpoint arrays; since each edge was added
# in both directions, the edge count is twice the number of rows in `graph`.
g = dgl.graph((src_ids, dst_ids))

In [28]:
# Write the graph to disk in DGL's own format.
dgl.save_graphs('../dataset/prepared/i-s-u.dgl', g)

In [29]:
# Sanity check: number of nodes in the graph.
g.num_nodes()

6069382

In [30]:
# Sanity check: number of (directed) edges in the graph.
g.num_edges()

56038584

In [31]:
# Export the edge list as plain text: one "src dst" pair per line, space
# separated, with no header row and no index column.
graph.to_csv('../dataset/prepared/i-s-u.txt', index=False, header=False, sep=' ')

In [4]:
# Peek at the embedding file: a two-number header line (presumably node count
# and vector dimension), then one "<node_id> <vector values...>" row per node.
!head -n 2 ./deepwalk/deepwalk.txt

5628145 128
0 0.4994327 0.12726587 -0.027790219 -0.48828262 0.4506812 -0.28060484 -0.4875493 0.62042034 -0.07670465 -0.63580567 -0.5657148 0.14789246 0.50255185 -0.020030126 -0.2060916 0.15345676 0.26373643 -0.45672342 0.4083202 -0.32718456 -0.12983443 -0.023214897 -0.091956116 0.21912357 -0.24596201 -0.394155 -0.5212407 0.4753517 0.6072154 0.35876873 0.7731848 0.23297629 -0.1569679 0.29665664 -0.4629792 0.22425805 0.12827586 0.026289023 0.44236562 0.037490644 0.38673767 -0.782304 -0.45497724 0.17532767 -0.22079006 0.39620486 -0.15203534 -0.10208359 -0.14580393 0.28251457 -0.47726864 0.32036272 -0.43612897 -0.09965496 0.7598768 0.010068815 0.2567218 -0.30063564 0.37460154 -0.5213604 0.29501447 0.21126814 0.1323055 0.6681945 -0.18075651 0.048518308 0.298775 0.026097434 -0.2197725 -0.26472113 0.50297385 -0.05403956 -0.28318897 0.0039339466 0.8466319 -0.34298763 -0.5566619 -0.058480375 -0.3603743 0.8381076 0.061375223 0.18920843 -0.47329074 -0.2981344 -0.05962424 -0.5673387 -0.34831032 -0

# Load Embedding

In [8]:
# Restore the id-mapping tables and expose them through attribute access.
# (pickle.load is only appropriate here because the file is locally produced.)
map_info_path = '../dataset/new/' + 'map_info.pkl'
with open(map_info_path, 'rb') as fh:
    info = EasyDict(pickle.load(fh))

In [9]:
# Recompute the vocabulary sizes from the freshly loaded mapping tables.
n_items, n_urls, n_sess = map(len, (info.idx2item, info.idx2url, info.idx2sess))
n_items, n_urls, n_sess

(66386, 517740, 5485256)

In [10]:
# Start index of each entity type inside the shared node-id space
# (sessions, then items, then urls).
sess_offset, item_offset = 0, n_sess
url_offset = item_offset + n_items

In [11]:
def _load_text_embeddings(path, n_rows):
    """Read a text embedding file into a dense ``(n_rows, dim)`` float tensor.

    The file starts with a header line ``<node_count> <dim>`` followed by one
    line per node: ``<node_id> <v_1> ... <v_dim>``.

    Args:
        path: location of the embedding text file.
        n_rows: number of rows to allocate; node ids index directly into it.

    Returns:
        A tensor in which row ``i`` holds the vector of node ``i``. Ids that
        never appear in the file keep an all-zero row.

    Raises:
        ValueError: if the header line is missing or malformed.
    """
    with open(path, 'r') as f:
        header = f.readline().split()
        if len(header) != 2:
            raise ValueError(f'unexpected embedding header in {path!r}: {header!r}')
        n_listed, dim = int(header[0]), int(header[1])

        # Use the dimension announced by the file instead of a hard-coded 128.
        table = torch.zeros(n_rows, dim)
        for raw in tqdm.tqdm(f, total=n_listed):
            # split() without an argument tolerates trailing spaces/newlines,
            # which split(' ') would turn into empty, unparsable fields.
            fields = raw.split()
            if not fields:
                continue  # skip blank lines, e.g. at the end of the file
            node_id = int(fields[0])
            table[node_id] = torch.tensor([float(v) for v in fields[1:]])
    return table


embedding = _load_text_embeddings('deepwalk/deepwalk.txt', n_sess + n_items + n_urls)

5980004it [04:22, 22784.94it/s]


In [12]:
# Reload the N x 2 edge list written earlier to i-s-u.npy.
# NOTE(review): this rebinds `graph` (previously a DataFrame) to a NumPy array,
# and the array is not used by the cells visible below -- confirm it is needed.
graph = np.load('../dataset/prepared/i-s-u.npy')

In [18]:
# Session vectors occupy the first block of the embedding table. The raw
# vectors are saved; the in-memory copy is L2-normalised per row afterwards.
raw_sess = embedding[sess_offset:item_offset]
with open('../dataset/prepared/dw_sess.pkl', 'wb') as out:
    pickle.dump(raw_sess.numpy(), out)
sess_embed = F.normalize(raw_sess, dim=1)

In [19]:
# Item vectors sit between the session block and the url block. The raw
# vectors are saved; the in-memory copy is L2-normalised per row afterwards.
raw_sku = embedding[item_offset:url_offset]
with open('../dataset/prepared/dw_sku.pkl', 'wb') as out:
    pickle.dump(raw_sku.numpy(), out)
sku_embed = F.normalize(raw_sku, dim=1)

In [20]:
# Url vectors are everything from url_offset to the end of the table. The raw
# vectors are saved; the in-memory copy is L2-normalised per row afterwards.
raw_url = embedding[url_offset:]
with open('../dataset/prepared/dw_url.pkl', 'wb') as out:
    pickle.dump(raw_url.numpy(), out)
url_embed = F.normalize(raw_url, dim=1)

In [21]:
# Sanity check: one row per url, one column per embedding dimension.
url_embed.shape

torch.Size([517740, 128])

In [24]:
# Distinct session ids found under the 'test' index label, as a long tensor
# suitable for indexing the embedding rows.
test_sessions = browsing.loc['test', 'session_id_hash']
test_ids = torch.from_numpy(test_sessions.unique()).long()

  """Entry point for launching an IPython kernel.


In [29]:
# Dot product of every test-session vector with every item vector; both sides
# are row-normalised, so each entry is a cosine similarity.
results = torch.mm(sess_embed[test_ids], sku_embed.t())

In [30]:
# Add a large constant to the columns listed in sku_test_items.npy so those
# items outrank every other item in the top-k selection.
test_items = np.load('../dataset/prepared/sku_test_items.npy')
results[:, test_items] = results[:, test_items] + 100

In [31]:
# Column indices of the 20 highest-scoring items for each test session.
topks = results.topk(k=20, dim=1).indices

In [32]:
# Pair the session ids with their ranked item lists as plain Python lists.
session_id_list = test_ids.tolist()
ranked_item_lists = topks.tolist()
result = (session_id_list, ranked_item_lists)

In [33]:
# Set form of the test item ids for O(1) membership checks.
test_item_set = {sku_idx for sku_idx in test_items}

In [34]:
# Persist the (session ids, ranked item lists) pair.
result_path = '../results/deepwalk_i_s_u.pkl'
with open(result_path, 'wb') as sink:
    pickle.dump(result, sink)

In [35]:
# Count recommended items that are not in the test item set.
missing = sum(
    1
    for ranked in result[1]
    for iid in ranked
    if iid not in test_item_set
)