In [7]:
import pandas as pd
import numpy as np
from itertools import chain
import pickle
import time
import networkx as nx
from walker import RandomWalker
from sklearn.preprocessing import LabelEncoder
import argparse
from datetime import datetime
import gzip
import json
import contextlib
from queue import PriorityQueue


class StubLogger(object):
    def __getattr__(self, name):
        return self.log_print

    def log_print(self, msg, *args):
        print(msg % args)


LOGGER = StubLogger()
LOGGER.info("Hello %s!", "world")


@contextlib.contextmanager
def elapsed_timer(message):
    start_time = time.time()
    yield
    LOGGER.info(message.format(time.time() - start_time))


def cnt_session(data, time_cut=30, cut_type=2):
    sku_list = data['sku_id']
    time_list = data['action_time']
    type_list = data['type']
    session = []
    tmp_session = []
    for i, item in enumerate(sku_list):
        # if type_list[i] == cut_type or (i < len(sku_list)-1 and (time_list[i+1] - time_list[i]).seconds/60 > time_cut) or i == len(sku_list)-1:
        #     tmp_session.append(item)
        #     session.append(tmp_session)
        #     tmp_session = []
        # else:
        #     tmp_session.append(item)
        if i == len(sku_list)-1:
            tmp_session.append(item)
            session.append(tmp_session)
            tmp_session = []
        else:
            tmp_session.append(item)
    return session


def get_session(action_data, use_type=None):
    if use_type is None:
        use_type = [1, 2, 3, 5]
    action_data = action_data[action_data['type'].isin(use_type)]
    action_data = action_data.sort_values(by=['user_id', 'action_time'], ascending=True)
    group_action_data = action_data.groupby('user_id').agg(list)
    session_list = group_action_data.apply(cnt_session, axis=1)
    return session_list.to_numpy()


def get_graph_context_all_pairs(walks, window_size):
    all_pairs = []
    for k in range(len(walks)):
        for i in range(len(walks[k])):
            for j in range(i - window_size, i + window_size + 1):
                if i == j or j < 0 or j >= len(walks[k]):
                    continue
                else:
                    all_pairs.append([walks[k][i], walks[k][j]])
    return np.array(all_pairs, dtype=np.int32)


def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)


def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

Hello world!


In [9]:
data_path = "D:\\Developer\\Amazon\\"
p = 0.25
q = 2
num_walks = 10
walk_length = 10
window_size = 5

with elapsed_timer("-- {0}s - %s" % ("find topk timestamp",)):
    queue = PriorityQueue()
    k = 500000
    with open(data_path + 'Clothing_Shoes_and_Jewelry.csv', 'r') as action:
        for line in action:
            action_columns = line.split(',')
            queue.put(int(action_columns[3]))
            if queue.qsize() > k:
                queue.get()
    top_k_timestamp = queue.get()
    print("top_k_timestamp: " + str(top_k_timestamp))

item_ids = set()
with elapsed_timer("-- {0}s - %s" % ("reduce amazon dataset size",)):
    with open(data_path + 'Clothing_Shoes_and_Jewelry.csv', 'r') as action:
        with open(data_path + 'Clothing_Shoes_and_Jewelry_reduced.csv', 'w') as action_reduced:
            for line in action:
                action_columns = line.split(',')
                if int(action_columns[3]) >= top_k_timestamp:
                    action_reduced.write(line)
                    item_ids.add(action_columns[0])

with elapsed_timer("-- {0}s - %s" % ("reduce meta dataset size",)):
    with open(data_path + 'meta_Clothing_Shoes_and_Jewelry.json', 'r') as meta:
        with open(data_path + 'meta_Clothing_Shoes_and_Jewelry_reduced.json', 'w') as meta_reduced:
            for line in meta:
                meta_map = json.loads(line)
                if meta_map["asin"] in item_ids:
                    meta_reduced.write(line)
    with open(data_path + 'meta_Clothing_Shoes_and_Jewelry_reduced.json', "rb") as meta_reduced:
        with gzip.open(data_path + 'meta_Clothing_Shoes_and_Jewelry_reduced.json.gz', "wb") as meta_reduced_gz:
            meta_reduced_gz.writelines(meta_reduced)

with elapsed_timer("-- {0}s - %s" % ("read action data",)):
    action_data = pd.read_csv(data_path + 'Clothing_Shoes_and_Jewelry_reduced.csv', header=None,
                              names=["sku_id", "user_id", "rating", "timestamp"])
    # action_data = action_data.loc[action_data["timestamp"] > 1514736000]
    action_data.sort_values("timestamp", inplace=True)
    timestamps = action_data["timestamp"].tolist()
    datetimes = []
    for timestamp in timestamps:
        datetimes.append(datetime.fromtimestamp(timestamp))
    action_data["action_time"] = datetimes
    action_data["type"] = 1

with elapsed_timer("-- {0}s - %s" % ("filter",)):
    user_count = {}
    sku_count = {}
    for index, row in action_data.iterrows():
        user_id = row["user_id"]
        sku_id = row["sku_id"]
        user_count.setdefault(user_id, 0)
        user_count[user_id] += 1
        sku_count.setdefault(sku_id, 0)
        sku_count[sku_id] += 1

    index_list = []
    for index, row in action_data.iterrows():
        user_id = row["user_id"]
        sku_id = row["sku_id"]
        if 5 <= user_count[user_id] <= 20 and 5 <= sku_count[sku_id] <= 20:
            index_list.append(index)
    action_data = action_data.loc[index_list]

with elapsed_timer("-- {0}s - %s" % ("split",)):
    test_ratio = 0.25

    user_items_map = {}
    for index, row in action_data.iterrows():
        user = row["user_id"]
        item = row["sku_id"]
        user_items_map.setdefault(user, [])
        user_items_map[user].append({"item": item, "index": index})

    train_index = []
    test_index = []
    for user, maps in user_items_map.items():
        for i in range(len(maps)):
            if i < (1 - test_ratio) * len(maps):
                train_index.append(maps[i]["index"])
            else:
                test_index.append(maps[i]["index"])

    all_skus = action_data['sku_id'].unique()
    all_skus = pd.DataFrame({'sku_id': list(all_skus)})
    sku_lbe = LabelEncoder()
    # Fit label encoder and return encoded labels.
    all_skus['sku_id'] = sku_lbe.fit_transform(all_skus['sku_id'])
    # Transform labels to normalized encoding.
    action_data['sku_id'] = sku_lbe.transform(action_data['sku_id'])

    action_data_test = action_data.loc[test_index]
    action_data_train = action_data.loc[train_index]

    with open("../../../data/amazon/user-event-rsvp_test.tsv", "w") as test:
        for index, row in action_data_test.iterrows():
            user = row["user_id"]
            item = row["sku_id"]
            test.write(str(user) + "\t" + str(item) + "\n")

    with open("../../../data/amazon/train.tsv", "w") as test:
        for index, row in action_data_train.iterrows():
            user = row["user_id"]
            item = row["sku_id"]
            test.write(str(user) + "\t" + str(item) + "\n")

with elapsed_timer("-- {0}s - %s" % ("make session list",)):
    print('make session list\n')
    start_time = time.time()
    session_list = get_session(action_data_train, use_type=[1, 2, 3, 5])
    session_list_all = []
    for item_list in session_list:
        for session in item_list:
            if len(session) > 1:
                session_list_all.append(session)

    print('make session list done, time cost {0}'.format(str(time.time() - start_time)))

# session2graph
with elapsed_timer("-- {0}s - %s" % ("session2graph",)):
    node_pair = dict()
    for session in session_list_all:
        for i in range(1, len(session)):
            if (session[i - 1], session[i]) not in node_pair.keys():
                node_pair[(session[i - 1], session[i])] = 1
            else:
                node_pair[(session[i - 1], session[i])] += 1

    in_node_list = list(map(lambda x: x[0], list(node_pair.keys())))
    out_node_list = list(map(lambda x: x[1], list(node_pair.keys())))
    weight_list = list(node_pair.values())
    graph_df = pd.DataFrame({'in_node': in_node_list, 'out_node': out_node_list, 'weight': weight_list})
    graph_df.to_csv('../../../data/amazon/graph.csv', sep=' ', index=False, header=False)

    G = nx.read_edgelist('../../../data/amazon/graph.csv', create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
    walker = RandomWalker(G, p=p, q=q)
    print("Preprocess transition probs...")
    walker.preprocess_transition_probs()

    session_reproduce = walker.simulate_walks(num_walks=num_walks, walk_length=walk_length, workers=4,
                                              verbose=1)
    session_reproduce = list(filter(lambda x: len(x) > 2, session_reproduce))

# add side info
with elapsed_timer("-- {0}s - %s" % ("add side info",)):
    df = getDF(data_path + 'meta_Clothing_Shoes_and_Jewelry_reduced.json.gz')
    product_data = df.loc[:, ["asin", "brand"]]
    product_data = product_data.rename(columns={'asin': 'sku_id'})

    # Transform labels back to original encoding.
    all_skus['sku_id'] = sku_lbe.inverse_transform(all_skus['sku_id'])
    print("sku nums: " + str(all_skus.count()))
    sku_side_info = pd.merge(all_skus, product_data, on='sku_id', how='left').fillna("NaN")

    # id2index
    for feat in sku_side_info.columns:
        if feat != 'sku_id':
            lbe = LabelEncoder()
            sku_side_info[feat] = lbe.fit_transform(sku_side_info[feat])
        else:
            sku_side_info[feat] = sku_lbe.transform(sku_side_info[feat])

    sku_side_info = sku_side_info.sort_values(by=['sku_id'], ascending=True)
    sku_side_info.to_csv('../../../data/amazon/sku_side_info.csv', index=False, header=False, sep='\t')

# get pair
with elapsed_timer("-- {0}s - %s" % ("get pair",)):
    all_pairs = get_graph_context_all_pairs(session_reproduce, window_size)
    np.savetxt('../../../data/amazon/all_pairs', X=all_pairs, fmt="%d", delimiter=" ")

top_k_timestamp: 1530489600
-- 329.522807598114s - find topk timestamp
-- 44.00921583175659s - reduce amazon dataset size
-- 200.28831148147583s - reduce meta dataset size
-- 2.5627169609069824s - read action data
-- 127.42916679382324s - filter
-- 1.4748566150665283s - split
make session list

make session list done, time cost 1.6420793533325195
-- 1.6569674015045166s - make session list
Preprocess transition probs...


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   12.6s finished


-- 12.928093910217285s - session2graph
sku nums: sku_id    4091
dtype: int64
-- 17.032688856124878s - add side info
-- 1.759772539138794s - get pair


In [47]:
product_data = df.loc[:, ["asin", "category"]]
product_data = product_data.rename(columns={'asin': 'sku_id'})

sku_category = pd.merge(all_skus, product_data, on='sku_id', how='left').fillna("NaN")
sku_category

Unnamed: 0,sku_id,category
0,B01DQNMPT6,"[Clothing, Shoes & Jewelry, Women, Clothing, S..."
1,B01DTOI3MU,"[Clothing, Shoes & Jewelry, Women, Clothing, J..."
2,B01D5S3YBK,"[Clothing, Shoes & Jewelry, Baby, Baby Boys, C..."
3,B01FM06ZT2,"[Clothing, Shoes & Jewelry, Women, Clothing, S..."
4,B00XKUWRQW,"[Clothing, Shoes & Jewelry, Boys, Clothing, Pa..."
5,B01BHI294S,"[Clothing, Shoes & Jewelry, Men, Accessories, ..."
6,B01AGGQX2Q,"[Clothing, Shoes & Jewelry, Girls, Shoes, Oxfo..."
7,B01AEFRUSK,"[Clothing, Shoes & Jewelry, Women, Jewelry, Ea..."
8,B01AVON8SK,"[Clothing, Shoes & Jewelry, Women, Clothing, D..."
9,B01ATZ5GCC,"[Clothing, Shoes & Jewelry, Women, Clothing, D..."


In [48]:
sku_category['sku_id'] = sku_lbe.transform(sku_category['sku_id'])
sku_category = sku_category.sort_values(by=['sku_id'], ascending=True)
sku_category

Unnamed: 0,sku_id,category
2347,0,"[Clothing, Shoes & Jewelry, Baby, Baby Boys, C..."
376,1,"[Clothing, Shoes & Jewelry, Men, Shoes, Athlet..."
487,2,"[Clothing, Shoes & Jewelry, Girls, Shoes, Athl..."
1915,3,"[Clothing, Shoes & Jewelry, Men, Shoes, Athlet..."
2673,4,"[Clothing, Shoes & Jewelry, Men, Shoes, Slippers]"
385,5,"[Clothing, Shoes & Jewelry, Men, Surf, Skate &..."
173,6,"[Clothing, Shoes & Jewelry, Men, Shoes]"
1496,7,"[Clothing, Shoes & Jewelry, Men, Clothing, Sho..."
445,8,"[Clothing, Shoes & Jewelry, Women, Clothing, L..."
24,9,"[Clothing, Shoes & Jewelry, Men, Clothing, Jeans]"


In [49]:
class TrieNode(object):
    def __init__(self, val, leaf, children, index):
        self.val, self.leaf, self.children, self.index = val, leaf, children, index

class Trie(object):
    def __init__(self):
        self.root = TrieNode("", False, {}, -1)
        self.num_category = 0
        self.index_category_map = {}
    
    def insert(self, leaf, path):
        category_path = []
        
        node = self.root
        for category in path:
            if category not in node.children:
                node.children[category] = TrieNode(category, False, {}, self.num_category)
                self.index_category_map[self.num_category] = node.children[category]
                self.num_category += 1
            category_path.append(node.children[category].index)
            node = node.children[category]
        node.children[leaf] = TrieNode(leaf, True, {}, -1)
        
        return category_path

category_column = []
trie = Trie()
for index, row in sku_category.iterrows():
    leaf = row['sku_id']
    path = row['category']
    category_column.append(trie.insert(leaf, path))
sku_category['category'] = category_column
sku_category

Unnamed: 0,sku_id,category
2347,0,"[0, 1, 2, 3, 4, 5]"
376,1,"[0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]"
487,2,"[0, 16, 17, 18, 19]"
1915,3,"[0, 6, 7, 8, 20]"
2673,4,"[0, 6, 7, 21]"
385,5,"[0, 6, 22, 23, 24, 25]"
173,6,"[0, 6, 7]"
1496,7,"[0, 6, 26, 27, 28]"
445,8,"[0, 29, 30, 31, 32, 33, 34]"
24,9,"[0, 6, 26, 35]"


In [50]:
category_item_children_map = {}
category_category_children_map = {}
for i in range(trie.num_category):
    node = trie.index_category_map[i]
    item_children = []
    category_children = []
    for _, child_node in node.children.items():
        if child_node.leaf:
            item_children.append(child_node.val)
        else:
            category_children.append(child_node.index)
    if len(item_children) > 0:
        category_item_children_map[i] = item_children
    if len(category_children) > 0:
        category_category_children_map[i] = category_children

In [51]:
category_item_children_map

{5: [0],
 6: [597, 2643, 3229],
 7: [6,
  69,
  721,
  925,
  1328,
  1449,
  1539,
  1716,
  1717,
  1769,
  2520,
  2565,
  2601,
  3125],
 8: [1064, 2522],
 10: [2563, 2961],
 15: [1],
 16: [1133],
 17: [2335],
 18: [2963],
 19: [2],
 20: [3, 1486, 2358, 2440, 2725, 3874, 3939, 4002],
 21: [4, 4058],
 25: [5],
 26: [816, 2345, 3270],
 27: [560],
 28: [7, 140, 457, 1565, 2087],
 29: [186,
  242,
  364,
  517,
  697,
  971,
  1002,
  1603,
  1776,
  2074,
  2312,
  3081,
  3610,
  3611,
  4044,
  4070],
 30: [346, 640, 986, 1018, 1028, 1132, 1881, 2253, 2389, 2785, 2844, 3207],
 32: [1048],
 34: [8, 879, 1349, 1451, 2210, 2258, 2299, 2597, 3278, 3643],
 35: [9,
  10,
  38,
  289,
  306,
  307,
  308,
  693,
  717,
  833,
  1291,
  1605,
  2303,
  2854,
  2889,
  3279,
  3312,
  3436,
  3640,
  4011],
 36: [2836],
 37: [11, 485, 1386, 2624, 3618],
 38: [1908, 2848],
 39: [12, 147, 191, 320, 379, 394, 445, 468, 2341, 2355, 3128, 3969],
 40: [170, 1821, 3417],
 47: [13, 14],
 49: [15,
  

In [52]:
category_category_children_map

{0: [1, 6, 16, 29, 69, 71, 124, 350, 380, 459, 1622],
 1: [2, 53],
 2: [3, 217, 1523],
 3: [4, 171, 655, 664, 1108, 1451, 2200, 2313, 2593, 3352],
 4: [5, 1502],
 6: [7, 22, 26, 82, 130, 202, 244, 317, 1909],
 7: [8, 21, 88, 226, 240, 357, 445, 482, 1750, 2328, 3940, 4303],
 8: [9, 20, 79, 803, 1951, 2048, 2219, 2248, 2769, 4558, 5906],
 9: [10, 1748, 5900],
 10: [11, 237],
 11: [12],
 12: [13],
 13: [14],
 14: [15],
 16: [17, 98, 738, 1671, 2119, 4502, 4983],
 17: [18, 66, 572, 828, 962, 2235, 4015],
 18: [19, 642, 2753, 4874],
 19: [3358],
 20: [149, 715, 1091, 2479],
 22: [23, 181],
 23: [24, 289, 825, 1260],
 24: [25, 4133],
 26: [27, 35, 36, 38, 77, 89, 108, 162, 195, 254, 1134, 2656, 2831, 5754],
 27: [28, 1413, 3094, 5286],
 28: [101],
 29: [30, 50, 80, 85, 138, 325, 513, 592, 638, 829, 1006, 1914, 1967, 1995],
 30: [31,
  67,
  90,
  164,
  179,
  301,
  322,
  348,
  354,
  391,
  551,
  555,
  641,
  672,
  681,
  705,
  1322,
  1443,
  1943,
  2514,
  2899,
  5073,
  5446],


In [56]:
import itertools

In [60]:
with open("../../../data/amazon/category_category_children.csv", "r") as reader, \
        open("../../../data/amazon/category_category_children_edge.txt", "w") as writer:
    for line in reader:
        columns = line.strip().split("\t")
        category = columns[0]
        items = columns[1].split(",")
        edges = itertools.permutations(items, 2)
        for edge in edges:
            writer.write(category + " " + " ".join(edge) + "\n")

with open("../../../data/amazon/category_item_children.csv", "r") as reader, \
        open("../../../data/amazon/category_item_children_edge.txt", "w") as writer:
    for line in reader:
        columns = line.strip().split("\t")
        category = columns[0]
        items = columns[1].split(",")
        edges = itertools.permutations(items, 2)
        for edge in edges:
            writer.write(category + " " + " ".join(edge) + "\n")

In [61]:
[5]+[6]

[5, 6]

In [62]:
all_skus['sku_id']

0       B01DQNMPT6
1       B01DTOI3MU
2       B01D5S3YBK
3       B01FM06ZT2
4       B00XKUWRQW
5       B01BHI294S
6       B01AGGQX2Q
7       B01AEFRUSK
8       B01AVON8SK
9       B01ATZ5GCC
10      B01AHXSZ6U
11      B01AJLG98G
12      B01AJH0LFM
13      B01CTLTKOY
14      B01COUVYRQ
15      B01D3YE8WK
16      B01C1H3N44
17      B01C6U7RTI
18      B009YRM6MK
19      B00A4MW6DI
20      B009LJW5HM
21      B009OCZ84I
22      B00AHOQBAM
23      B009ESZFFO
24      B008L1EJHU
25      B00BW0UKRU
26      B00C3XMHOO
27      B00AO9I2U2
28      B00AO9I2DE
29      B00AR3BS24
           ...    
4061    B01FQHSJ22
4062    B01DK79RJE
4063    B01FGOEEIS
4064    B01GVDVW9W
4065    B01H59Z31O
4066    B01E2DL1XK
4067    B01HFWXFO8
4068    B01FQU83T8
4069    B01CQ09DFY
4070    B01BHCWUH0
4071    B01FS66YR8
4072    B01DK7G9UE
4073    B01DYU92B0
4074    B01H334QCO
4075    B01FRR2QWK
4076    B01GP08RSK
4077    B01ET80YO0
4078    B01GQZ0WCS
4079    B01GR09NHW
4080    B01ESWURGW
4081    B01EIJ3MO4
4082    B01E

In [3]:
import itertools
from deepwalk import graph
import random

ModuleNotFoundError: No module named 'deepwalk'

In [63]:
def load_category_edgelist(file_, undirected=True):
    category_graph_map = {}
    with open(file_) as f:
        for l in f:
            c, x, y = l.strip().split()[:3]
            x = int(x)
            y = int(y)
            category_graph_map.setdefault(c, graph.Graph())
            G = category_graph_map[c]
            G[x].append(y)
            if undirected:
                G[y].append(x)

    for category, G in category_graph_map.items():
        G.make_consistent()

    return category_graph_map


def random_walk(category_graph_map, num_paths, path_length, alpha=0, rand=random.Random(0)):
    for category, G in category_graph_map.items():
        nodes = list(G.nodes())

        for cnt in range(num_paths):
            rand.shuffle(nodes)
            for node in nodes:
                walk = G.random_walk(path_length, rand=rand, alpha=alpha, start=node)
                yield [category] + walk


def get_category_graph_context_all_pairs(category_category_children_walks, category_item_children_walks,
                                         window_size, num_items):
    all_pairs = []

    # category_category_children_walks
    for walk in category_category_children_walks:
        for i in range(len(walk)):
            for j in range(i - window_size, i + window_size + 1):
                if i == j or j < 1 or j >= len(walk):
                    continue
                else:
                    # (category, category)
                    all_pairs.append([num_items + walk[i], num_items + walk[j]])

    # category_item_children_walks
    for walk in category_item_children_walks:
        for i in range(len(walk)):
            for j in range(i - window_size, i + window_size + 1):
                if i == j or j < 1 or j >= len(walk):
                    continue
                elif i == 0:
                    # (category, item)
                    all_pairs.append([num_items + walk[i], walk[j]])
                else:
                    # (item, item)
                    all_pairs.append([walk[i], walk[j]])

    return np.array(all_pairs, dtype=np.int32)

NameError: name 'random' is not defined