In [1]:
import argparse
import os
import operator
import pickle
import time

import csv
import datetime
import numpy as np
from collections import Counter

In [2]:
parser = argparse.ArgumentParser()
parser.add_argument('--save-dir', default='yoochoose_temp', help='save directory name')
opt = parser.parse_args([])
print(opt)

os.makedirs(f'../{opt.save_dir}_4', exist_ok=True)
os.makedirs(f'../{opt.save_dir}_64', exist_ok=True)

dataset = '../raw/yoochoose-clicks.dat'

Namespace(save_dir='yoochoose_temp')


In [3]:
print("-- Starting @ %ss" % datetime.datetime.now())

with open(dataset, "r") as f:
    reader = csv.reader(f, delimiter=',')
    sess_clicks = {}
    sess_date = {}
    ctr = 0
    curid = -1
    curdate = None
    for data in reader:
        sessid = data[0]
        if curdate and not curid == sessid:
            date = ''
            date = time.mktime(time.strptime(curdate[:19], '%Y-%m-%dT%H:%M:%S'))
            sess_date[curid] = date
        curid = sessid
        item = data[2]
        curdate = ''
        curdate = data[1]

        if sessid in sess_clicks:
            sess_clicks[sessid] += [item]
        else:
            sess_clicks[sessid] = [item]
        ctr += 1
    date = time.mktime(time.strptime(curdate[:19], '%Y-%m-%dT%H:%M:%S'))
    # add
    for i in list(sess_clicks):
        sorted_clicks = sorted(sess_clicks[i], key=operator.itemgetter(1))
        sess_clicks[i] = [c for c in sorted_clicks]
    sess_date[curid] = date
print("-- Reading data @ %ss" % datetime.datetime.now())

-- Starting @ 2022-10-10 16:03:35.855900s
-- Reading data @ 2022-10-10 16:10:58.035800s


In [4]:
# Filter out length shorter than 2
for s in list(sess_clicks):
    if len(sess_clicks[s]) < 2 :
        del sess_clicks[s]
        del sess_date[s]

# Counter number of times each appears
iid_counts = {}
for s in sess_clicks:
    seq = sess_clicks[s]
    for iid in seq:
        if iid in iid_counts:
            iid_counts[iid] += 1
        else:
            iid_counts[iid] = 1

sorted_counts = sorted(iid_counts.items(), key=operator.itemgetter(1))

In [5]:
length = len(sess_clicks)
for s in list(sess_clicks):
    curseq = sess_clicks[s]
    filseq = list(filter(lambda i: iid_counts[i] >= 5, curseq))

    if len(filseq) < 2 :
        del sess_clicks[s]
        del sess_date[s]
    else:
        sess_clicks[s] = filseq

dates = list(sess_date.items())
maxdate = dates[0][1]

for _, date in dates:
    if maxdate < date:
        maxdate = date

# 7 days for test
splitdate = 0
splitdate = maxdate - 86400 * 1

print('Splitting date', splitdate)      # Yoochoose: ('Split date', 1411930799.0)
tra_sess = filter(lambda x: x[1] < splitdate, dates)
tes_sess = filter(lambda x: x[1] > splitdate, dates)

# Sort sessions by date
tra_sess = sorted(tra_sess, key=operator.itemgetter(1))     # [(session_id, timestamp), (), ]
tes_sess = sorted(tes_sess, key=operator.itemgetter(1))     # [(session_id, timestamp), (), ]
print(f'# train sessions: {len(tra_sess)}')    # 186670    # 7966257
print(f'# test sessions: {len(tes_sess)}')    # 15979     # 15324
print(f'train sessions example: {tra_sess[:3]}')
print(f'test sessions example: {tes_sess[:3]}')

print("-- Splitting train set and test set @ %ss" % datetime.datetime.now())

Splitting date 1411927199.0
# train sessions: 7966257
# test sessions: 15324
train sessions example: [('171168', 1396288832.0), ('345618', 1396288875.0), ('263073', 1396288902.0)]
test sessions example: [('11532683', 1411927253.0), ('11464959', 1411927271.0), ('11296119', 1411927295.0)]
-- Splitting train set and test set @ 2022-10-10 16:12:20.225155s


In [6]:
# split train dataset 1/4 and 1/64
split4 = int(len(tra_sess) / 4)
split64 = int(len(tra_sess) / 64)
tra_sess4 = tra_sess[-split4:]
tra_sess64 = tra_sess[-split64:]

print(f"# 1/4 train sessions before preprocess: {len(tra_sess4)}")
print(f"# 1/64 train sessions before preprocess : {len(tra_sess64)}")

print("-- Splitting train set and test set @ %ss" % datetime.datetime.now())

# 1/4 train sessions before preprocess: 1991564
# 1/64 train sessions before preprocess : 124472
-- Splitting train set and test set @ 2022-10-10 16:12:20.306506s


**preprocess yoochoose 1/4**

In [7]:
item_dict, item_cnt = {}, {}

# train
train_ids, train_seqs, train_dates = [], [], []
item_ctr = 1
for s, date in tra_sess4:
    seq = sess_clicks[s]
    outseq = []
    for i in seq:
        if i in item_dict:
            outseq += [item_dict[i]]
            item_cnt[item_dict[i]] += 1
        else:
            outseq += [item_ctr]
            item_dict[i] = item_ctr
            item_cnt[item_dict[i]] = 1
            item_ctr += 1
    if len(outseq) < 2: 
        continue
    train_ids += [s]
    train_dates += [date]
    train_seqs += [outseq]

# test
test_ids = []
test_seqs = []
test_dates = []
for s, date in tes_sess:
    seq = sess_clicks[s]
    outseq = []
    for i in seq:
        if i in item_dict:
            outseq += [item_dict[i]]
    if len(outseq) < 2:
        continue
    test_ids += [s]
    test_dates += [date]
    test_seqs += [outseq]

In [8]:
def process_seqs(iseqs, idates, train=True):
    out_seqs, labs = [], []
    
    if train:
        out_dates, ids = [], []

        for id, seq, date in zip(range(len(iseqs)), iseqs, idates):
            for i in range(1, len(seq)):
                tar = seq[-i]
                labs += [tar]
                out_seqs += [seq[:-i]]
                out_dates += [date]
                ids += [id]
        return out_seqs, out_dates, labs, ids
    else:
        for seq in iseqs:
            labs += [seq[-1]]
            out_seqs += [seq[:-1]]
        return out_seqs, labs

In [9]:
tra_seqs, tra_dates, tra_labs, tra_ids = process_seqs(train_seqs, train_dates)

In [10]:
tra_seqs[-100:]

[[372, 1415, 6905, 14192, 7827, 4582, 4288, 2666],
 [372, 1415, 6905, 14192, 7827, 4582, 4288],
 [372, 1415, 6905, 14192, 7827, 4582],
 [372, 1415, 6905, 14192, 7827],
 [372, 1415, 6905, 14192],
 [372, 1415, 6905],
 [372, 1415],
 [372],
 [30218],
 [29484, 29419, 29299],
 [29484, 29419],
 [29484],
 [30163],
 [30297],
 [7983],
 [29299],
 [1659, 357, 2661, 3548, 4048],
 [1659, 357, 2661, 3548],
 [1659, 357, 2661],
 [1659, 357],
 [1659],
 [5885],
 [29271],
 [30488, 30174, 30164, 30164, 29295, 168, 29220, 29299, 4699, 14605],
 [30488, 30174, 30164, 30164, 29295, 168, 29220, 29299, 4699],
 [30488, 30174, 30164, 30164, 29295, 168, 29220, 29299],
 [30488, 30174, 30164, 30164, 29295, 168, 29220],
 [30488, 30174, 30164, 30164, 29295, 168],
 [30488, 30174, 30164, 30164, 29295],
 [30488, 30174, 30164, 30164],
 [30488, 30174, 30164],
 [30488, 30174],
 [30488],
 [25035, 30363, 30218, 30150],
 [25035, 30363, 30218],
 [25035, 30363],
 [25035],
 [30306, 30565, 30213, 30218, 30375, 30411],
 [30306, 3056

In [11]:
tes_seqs, tes_labs = process_seqs(test_seqs, None, False)

In [12]:
print(max(tes_seqs), max(tes_labs), max(tra_seqs), max(tes_labs))

[30618] 30579 [30653, 1290, 1291, 1291] 30579


In [13]:
max(item_dict.values())

30653

In [14]:
print(f"1/4 # train sessions {len(tra_seqs)}, ex. {tra_seqs[:3], tra_labs[:3]}")
print(f"1/4 # test sessions {len(tes_seqs)}, ex. {tes_seqs[:3], tes_labs[:3]}")
print(f"1/4 # train clicks {sum(item_cnt.values())}")
print(f"1/4 # items {len(item_cnt.keys())}")
print(f"1/4 avg. length : {sum(map(len, tra_seqs)) / len(tra_seqs)}")

1/64 # train sessions 6145883, ex. ([[1], [2], [4, 5]], [1, 3, 6])
1/64 # test sessions 15317, ex. ([[17788, 29299, 5885], [30153, 29576, 30148, 30151], [2628, 185]], [350, 30148, 359])
1/64 # train clicks 8137447
1/64 # items 30653
1/64 avg. length : 4.710282802324743


In [15]:
tra = (tra_seqs, tra_labs)
tes = (tes_seqs, tes_labs)

In [16]:
print(min(item_cnt.keys()), max(item_cnt.keys()))

1 30653


In [17]:
pickle.dump(tra, open(f'../{opt.save_dir}_4/train.txt', 'wb'))
pickle.dump(tes, open(f'../{opt.save_dir}_4/test.txt', 'wb'))
pickle.dump(len(item_cnt.keys()) + 1, open(f'../{opt.save_dir}_4/n_node.txt', 'wb'))

In [18]:
# Popularity dict
total_ctr = sum(item_cnt.values())
pop_dict = {key : (value / total_ctr) for key, value in item_cnt.items()}

pickle.dump(pop_dict, open(f"../{opt.save_dir}_4/pop_dict.pickle", "wb"))

In [19]:
sorted = sorted(item_cnt.items(), reverse=True, key=lambda item: item[1])

In [68]:
type(item_cnt)

dict

In [20]:
# head tail dict 
sorted_item_cnt = sorted(item_cnt.items(), reverse=True, key=lambda item: item[1])
sorted_keys = np.array(sorted_item_cnt)[:, 0].astype(int)
sorted_values = np.array(sorted_item_cnt)[:, 1]

split_point = int(len(sorted_keys) * 0.2)
point_cnt_value = sorted_values[split_point]
split_idx = [i for i, cnt in enumerate(sorted_values) if cnt == (point_cnt_value-1)][0]

ht_dict = dict()
ht_dict['head'] = sorted_keys[:split_idx]
ht_dict['tail'] = sorted_keys[split_idx:]

print(f'# head items : {len(ht_dict["head"])}, # tail items : {len(ht_dict["tail"])}')
pickle.dump(ht_dict, open(f'../{opt.save_dir}_4/ht_dict.pickle', 'wb'))

# head items : 6179, # tail items : 24474


#### Preprocess 1/64 


In [None]:
item_dict, item_cnt = {}, {}

# train
train_ids, train_seqs, train_dates = [], [], []
item_ctr = 1
for s, date in tra_sess64:
    seq = sess_clicks[s]
    outseq = []
    for i in seq:
        if i in item_dict:
            outseq += [item_dict[i]]
            item_cnt[item_dict[i]] += 1
        else:
            outseq += [item_ctr]
            item_dict[i] = item_ctr
            item_cnt[item_dict[i]] = 1
            item_ctr += 1
    if len(outseq) < 2: 
        continue
    train_ids += [s]
    train_dates += [date]
    train_seqs += [outseq]

# test
test_ids = []
test_seqs = []
test_dates = []
for s, date in tes_sess:
    seq = sess_clicks[s]
    outseq = []
    for i in seq:
        if i in item_dict:
            outseq += [item_dict[i]]
    if len(outseq) < 2:
        continue
    test_ids += [s]
    test_dates += [date]
    test_seqs += [outseq]

In [None]:
def process_seqs(iseqs, idates, train=True):
    out_seqs, labs = [], []
    
    if train:
        out_dates, ids = [], []

        for id, seq, date in zip(range(len(iseqs)), iseqs, idates):
            for i in range(1, len(seq)):
                tar = seq[-i]
                labs += [tar]
                out_seqs += [seq[:-i]]
                out_dates += [date]
                ids += [id]
        return out_seqs, out_dates, labs, ids
    else:
        for seq in iseqs:
            labs += [seq[-1]]
            out_seqs += [seq[:-1]]
        return out_seqs, labs

In [None]:
tra_seqs, tra_dates, tra_labs, tra_ids = process_seqs(train_seqs, train_dates)
tes_seqs, tes_labs = process_seqs(test_seqs, None, False)
print(max(tes_seqs), max(tes_labs), max(tra_seqs), max(tes_labs))

In [None]:
max(item_dict.values())

In [None]:
print(f"1/64 # train sessions {len(tra_seqs)}, ex. {tra_seqs[:3], tra_labs[:3]}")
print(f"1/64 # test sessions {len(tes_seqs)}, ex. {tes_seqs[:3], tes_labs[:3]}")
print(f"1/64 # train clicks {sum(item_cnt.values())}")
print(f"1/64 # items {len(item_cnt.keys())}")
print(f"1/64 avg. length : {sum(map(len, tra_seqs)) / len(tra_seqs)}")

In [None]:
tra = (tra_seqs, tra_labs)
tes = (tes_seqs, tes_labs)
print(min(item_cnt.keys()), max(item_cnt.keys()))

In [None]:
pickle.dump(tra, open(f'../{opt.save_dir}_64/train.txt', 'wb'))
pickle.dump(tes, open(f'../{opt.save_dir}_64/test.txt', 'wb'))
pickle.dump(len(item_cnt.keys()) + 1, open(f'../{opt.save_dir}_64/n_node.txt', 'wb'))

In [None]:
# Popularity dict
total_ctr = sum(item_cnt.values())
pop_dict = {key : (value / total_ctr) for key, value in item_cnt.items()}

pickle.dump(pop_dict, open(f"../{opt.save_dir}_64/pop_dict.pickle", "wb"))

In [None]:
# head tail dict 
sorted_item_cnt = sorted(item_cnt.items(), reverse=True, key=lambda item: item[1])
sorted_keys = np.array(sorted_item_cnt)[:, 0].astype(int)
sorted_values = np.array(sorted_item_cnt)[:, 1]

split_point = int(len(sorted_keys) * 0.2)
point_cnt_value = sorted_values[split_point]
split_idx = [i for i, cnt in enumerate(sorted_values) if cnt == (point_cnt_value-1)][0]

ht_dict = dict()
ht_dict['head'] = sorted_keys[:split_idx]
ht_dict['tail'] = sorted_keys[split_idx:]

print(f'# head items : {len(ht_dict["head"])}, # tail items : {len(ht_dict["tail"])}')
pickle.dump(ht_dict, open(f'../{opt.save_dir}_64/ht_dict.pickle', 'wb'))

In [1]:
ht_dict

NameError: name 'ht_dict' is not defined