# Build Graph

In [1]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
import pickle
from utils import load_data
# from scipy.sparse import coo_matrix
import seaborn as sns
import dgl
import tqdm

Using backend: pytorch


In [2]:
# Load the dataset through the project helper `load_data` (imported from `utils`).
# `browsing` is the event frame used throughout this notebook; `info` carries the
# idx2item / idx2url / idx2sess lookups whose lengths are read in the next cell.
browsing, search, sku, info = load_data('../dataset/new/')

load browsing done...
load search done...
load sku done...
load info done...


In [3]:
# Number of distinct items, URLs and sessions, read off the idx2* lookups on `info`.
n_items, n_urls, n_sess = (
    len(lookup) for lookup in (info.idx2item, info.idx2url, info.idx2sess)
)
n_items, n_urls, n_sess

(66386, 498843, 5153009)

In [4]:
# All node types share a single id space, laid out back to back:
# sessions start at 0, items start after the sessions, URLs start after the items.
sess_offset, item_offset, url_offset = 0, n_sess, n_sess + n_items

In [5]:
# Order rows by product_action (ascending); the later "keep first" de-duplication
# steps therefore favour rows with the smallest product_action value.
browsing = browsing.sort_values(by='product_action')

In [6]:
# How many rows have no SKU before URL-based back-filling?
browsing['product_sku_hash'].isna().sum()

25963435

In [7]:
# Build a URL -> SKU lookup and use it to back-fill the SKU column.
#
# Only fully populated rows (dropna) contribute to the lookup. For each URL the
# SKU of its first such row, in the current row order, is used -- the same choice
# as the previous groupby/agg(list)/x[0] pipeline, but without building a Python
# list per URL and without failing when no fully populated row exists.
known_pairs = browsing.dropna().drop_duplicates(subset=['hashed_url'])
url2item = dict(zip(
    known_pairs['hashed_url'].astype(int).tolist(),
    known_pairs['product_sku_hash'].astype(int).tolist(),
))
# URLs absent from the lookup map to None, which pandas stores as NaN; the column
# is overwritten for every row, including rows that already had a SKU.
browsing['product_sku_hash'] = browsing['hashed_url'].map(url2item.get)

In [8]:
# Remaining rows without a SKU after the URL -> SKU back-fill.
browsing['product_sku_hash'].isna().sum()

14438082

In [9]:
# Keep only the first row for each (session, URL) pair. With the frame sorted by
# product_action beforehand, the surviving row is the one with the smallest
# product_action for that pair.
browsing = browsing.drop_duplicates(subset=['session_id_hash', 'hashed_url'])

In [10]:
# Inspect the de-duplicated frame (pandas truncates the repr of large frames).
browsing 

Unnamed: 0,Unnamed: 1,session_id_hash,product_action,product_sku_hash,server_timestamp_epoch_ms,hashed_url
train,0,0,0,31231.0,1550885210881,0
train,17461068,2402655,0,40863.0,1552340834849,965
train,17461070,2402656,0,40863.0,1552345086081,965
train,17461074,2402658,0,59397.0,1552502999526,6794
train,17461077,2402659,0,3067.0,1550351042835,13163
train,...,...,...,...,...,...
train,28916897,3954730,4,61002.0,1550450108643,744
train,32409673,4430748,4,41131.0,1548105007429,747
train,32409659,4430748,4,42812.0,1548104958010,453466
train,17566891,2417261,4,24448.0,1550253552788,293410


In [11]:
# Shift every id column into its own slice of the shared node-id space.
# NOTE: this cell is not idempotent -- running it twice shifts the ids twice.
browsing = browsing.assign(
    session_id_hash=browsing['session_id_hash'] + sess_offset,
    product_sku_hash=browsing['product_sku_hash'] + item_offset,
    hashed_url=browsing['hashed_url'] + url_offset,
)

In [12]:
# Order events by session, then chronologically within each session.
browsing = browsing.sort_values(by=['session_id_hash', 'server_timestamp_epoch_ms'])

In [13]:
# Session -> URL edge list: one (session id, URL node id) pair per remaining row.
s2u = browsing[['session_id_hash', 'hashed_url']]

In [14]:
# Inspect the session -> URL pairs.
s2u

Unnamed: 0,Unnamed: 1,session_id_hash,hashed_url
train,16758585,0,5225265
train,16758591,0,5221582
train,16758608,0,5219396
train,0,0,5219395
train,5,0,5219397
...,...,...,...
test,559511,5076752,5219411
test,559512,5076752,5227233
test,559513,5076752,5244004
test,559514,5076753,5219571


In [15]:
# Session -> item edge list. dropna() removes rows with a missing value in ANY
# column, so only rows that still have a SKU after back-filling contribute.
s2i = browsing.dropna()[['session_id_hash', 'product_sku_hash']]

In [16]:
# 2 x n_edges array: row 0 holds session ids, row 1 the URL node ids.
sess2url = np.stack(
    (s2u['session_id_hash'].to_numpy(), s2u['hashed_url'].to_numpy()),
    axis=0,
)

In [17]:
# Inspect the session -> item pairs (the SKU column is float because it held NaNs).
s2i

Unnamed: 0,Unnamed: 1,session_id_hash,product_sku_hash
train,16758585,0,5195121.0
train,16758608,0,5183503.0
train,0,0,5184240.0
train,16,0,5169836.0
train,20,0,5171877.0
...,...,...,...
test,559505,5076751,5213900.0
test,559512,5076752,5160118.0
test,559513,5076752,5166501.0
test,559514,5076753,5210297.0


In [18]:
# 2 x n_edges array: row 0 holds session ids, row 1 the item node ids.
# Stacking an int column with a float column yields a float array.
sess2item = np.stack(
    (s2i['session_id_hash'].to_numpy(), s2i['product_sku_hash'].to_numpy()),
    axis=0,
)

In [19]:
# Inspect the session -> item edge array.
sess2item

array([[      0.,       0.,       0., ..., 5076752., 5076753., 5076754.],
       [5195121., 5183503., 5184240., ..., 5166501., 5210297., 5189663.]])

In [20]:
# Inspect the session -> URL edge array.
sess2url

array([[      0,       0,       0, ..., 5076752, 5076753, 5076754],
       [5225265, 5221582, 5219396, ..., 5244004, 5219571, 5223647]])

In [21]:
# Put item edges and URL edges side by side (edges are columns) and cast the
# combined float array back to integer node ids.
edges = np.hstack((sess2item, sess2url)).astype(int)

In [22]:
# One row per edge: column 0 is the session id, column 1 the item/URL node id.
graph = pd.DataFrame(edges.T)

In [23]:
# Name the two edge endpoints.
graph.columns = ['src_id', 'dst_id']
# Display-only: the sorted frame is shown but not assigned, so `graph` itself
# keeps its original (item edges first, then URL edges) row order.
graph.sort_values(by=['src_id', 'dst_id'])

Unnamed: 0,src_id,dst_id
3,0,5169836
4,0,5171877
1,0,5183503
2,0,5184240
0,0,5195121
...,...,...
26568324,5076752,5244004
8739600,5076753,5210297
26568325,5076753,5219571
8739601,5076754,5189663


In [24]:
# Persist the edge list as a two-column array (src_id, dst_id).
np.save('../dataset/prepared/i-s-u.npy', graph.values)

In [29]:
import torch

In [32]:
# Emit each edge in both directions (src -> dst followed by dst -> src).
edge_heads = graph['src_id'].to_numpy()
edge_tails = graph['dst_id'].to_numpy()
src_ids = np.concatenate((edge_heads, edge_tails))
dst_ids = np.concatenate((edge_tails, edge_heads))

In [47]:
# Build the DGL graph from the doubled edge list. No node count is passed, so DGL
# derives it from the ids present -- presumably max id + 1; TODO confirm it equals
# n_sess + n_items + n_urls.
g = dgl.graph((src_ids, dst_ids))

In [34]:
# Persist the DGL graph next to the raw edge array.
dgl.save_graphs('../dataset/prepared/i-s-u.dgl', g)

In [42]:
# Node count of the graph; the embedding table loaded later must have this many rows.
g.num_nodes()

5718238

In [None]:
# Edge count of the graph (each original edge was added in both directions).
g.num_edges()

In [50]:
# Write the edge list as space-separated "src dst" lines without header or index --
# presumably the input format of the external DeepWalk run; TODO confirm.
graph.to_csv('../dataset/prepared/i-s-u.txt', index=False, header=False, sep=' ')

In [4]:
# Peek at the DeepWalk output: a header line "<node count> <dimension>" followed
# by one "<node id> <vector components...>" line per node.
!head -n 2 ./deepwalk/deepwalk.txt

5628145 128
0 0.4994327 0.12726587 -0.027790219 -0.48828262 0.4506812 -0.28060484 -0.4875493 0.62042034 -0.07670465 -0.63580567 -0.5657148 0.14789246 0.50255185 -0.020030126 -0.2060916 0.15345676 0.26373643 -0.45672342 0.4083202 -0.32718456 -0.12983443 -0.023214897 -0.091956116 0.21912357 -0.24596201 -0.394155 -0.5212407 0.4753517 0.6072154 0.35876873 0.7731848 0.23297629 -0.1569679 0.29665664 -0.4629792 0.22425805 0.12827586 0.026289023 0.44236562 0.037490644 0.38673767 -0.782304 -0.45497724 0.17532767 -0.22079006 0.39620486 -0.15203534 -0.10208359 -0.14580393 0.28251457 -0.47726864 0.32036272 -0.43612897 -0.09965496 0.7598768 0.010068815 0.2567218 -0.30063564 0.37460154 -0.5213604 0.29501447 0.21126814 0.1323055 0.6681945 -0.18075651 0.048518308 0.298775 0.026097434 -0.2197725 -0.26472113 0.50297385 -0.05403956 -0.28318897 0.0039339466 0.8466319 -0.34298763 -0.5566619 -0.058480375 -0.3603743 0.8381076 0.061375223 0.18920843 -0.47329074 -0.2981344 -0.05962424 -0.5673387 -0.34831032 -0

# Load Embedding

In [5]:
# Parse the DeepWalk text dump into a dense lookup table indexed by node id.
with open('deepwalk/deepwalk.txt', 'r') as f:
    # Header line: "<number of embedded nodes> <embedding dimension>".
    n_embedded, embed_dim = map(int, f.readline().split())
    # One row per node of the shared id space (sessions, items, URLs). The dump
    # can list fewer nodes than that; nodes it omits keep an all-zero vector.
    embedding = torch.zeros(n_sess + n_items + n_urls, embed_dim)
    for line in tqdm.tqdm(f, total=n_embedded):
        # split() without an argument also discards the trailing newline.
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. at the end of the file).
            continue
        node_id = int(fields[0])
        embedding[node_id] = torch.tensor([float(v) for v in fields[1:]])

5628145it [03:47, 24779.95it/s]


In [12]:
# Reload the saved edge array. NOTE(review): this rebinds `graph` from the
# DataFrame built earlier to a plain ndarray, and the value is not used by any
# later cell shown here -- confirm whether this cell is still needed.
graph = np.load('../dataset/prepared/i-s-u.npy')

In [6]:
# Session nodes occupy the first n_sess rows of the embedding table.
sess_raw = embedding[:n_sess]
# Store the un-normalised session vectors as a NumPy array.
with open('../dataset/prepared/dw_sess.pkl', 'wb') as out:
    pickle.dump(sess_raw.numpy(), out)
# Row-wise L2 normalisation, so dot products become cosine similarities.
sess_embed = F.normalize(sess_raw, p=2, dim=1)

In [7]:
# Item nodes occupy the n_items rows that follow the session rows.
sku_raw = embedding[n_sess:n_sess + n_items]
# Store the un-normalised item vectors as a NumPy array.
with open('../dataset/prepared/dw_sku.pkl', 'wb') as out:
    pickle.dump(sku_raw.numpy(), out)
# Row-wise L2 normalisation, so dot products become cosine similarities.
sku_embed = F.normalize(sku_raw, p=2, dim=1)

In [8]:
# Distinct session ids of the rows whose first index level is 'test'. They are used
# below to index sess_embed directly, which relies on the session offset being 0.
test_ids = torch.from_numpy(browsing.loc['test', 'session_id_hash'].unique()).long()

In [33]:
# Score every item for every test session. Both operands were L2-normalised, so
# each entry is a cosine similarity; the result has one row per test session and
# one column per item.
results = sess_embed[test_ids].mm(sku_embed.t())

In [35]:
# Item indices loaded from disk -- presumably the items that occur in the test
# split; TODO confirm against the script that wrote this file.
test_items = np.load('../dataset/prepared/test_items.npy')
# Add a large constant to those item columns so they outrank all other items in
# the top-k below (cosine scores lie in [-1, 1]). Not idempotent: re-running the
# cell adds the constant again.
results[:, test_items] += 100

In [36]:
# Column indices (item indices without item_offset) of the 20 best-scoring items per session.
_, topks = results.topk(20, dim=1)

In [38]:
# Pair the test session ids with their top-20 item lists as plain Python lists.
result = (test_ids.tolist(), topks.numpy().tolist())

In [29]:
# Set form of test_items for O(1) membership checks in the sanity check below.
test_item_set = set(test_items)

In [39]:
# Persist the (session ids, top-20 item lists) tuple.
with open('../results/deepwalk_i_s_u.pkl', 'wb') as f:
    pickle.dump(result, f)

In [40]:
# Sanity check: count recommended items that are not in test_item_set.
missing = sum(
    1
    for recommended in result[1]
    for item_idx in recommended
    if item_idx not in test_item_set
)