In [4]:
import time
import csv
import pickle
import operator
import datetime
import os
from tqdm import tqdm
import pandas as pd
from collections import defaultdict
from sklearn import preprocessing

In [5]:
path = "./dataset"
start=time.time()
df=pd.read_csv(os.path.join(path,'user_session_items_df.csv'))
print("It takes {:.4f} seconds to read the data".format(time.time()-start))

It takes 3.4165 seconds to read the data


In [6]:
df.drop(["Unnamed: 0"],inplace=True,axis=1)
df.head(2)

Unnamed: 0,user_id,session_id,create_time,trip_id,item_id
0,fffe2dcb8a8381a5e71d05dd24159110,3057e0f6-d576-461d-ba23-7a5ecbbb55cf,2022-10-13 21:05:55.695,3eb1c6d3-ae10-3e0f-a6cd-3a03218092ce,82bf80fc-36d7-3199-aded-a4352493330a
1,fffe2dcb8a8381a5e71d05dd24159110,3057e0f6-d576-461d-ba23-7a5ecbbb55cf,2022-10-13 21:05:55.695,3eb1c6d3-ae10-3e0f-a6cd-3a03218092ce,c7aed8f5-fdcd-3089-9744-a59f5f6608fc


In [7]:
def convert_id_integer(df,column_name):
    all_id={*df[column_name].unique()}
    dict_id={}
    cnt=0
    for v in tqdm(all_id):
        if v not in dict_id:
            dict_id[v]=cnt
            cnt+=1
    return dict_id

user_dict=convert_id_integer(df,"user_id")
session_dict=convert_id_integer(df,"session_id")
item_dict=convert_id_integer(df,"item_id")

100%|██████████| 31877/31877 [00:00<00:00, 2659621.42it/s]
100%|██████████| 66820/66820 [00:00<00:00, 2447779.36it/s]
100%|██████████| 131109/131109 [00:00<00:00, 2159299.66it/s]


In [8]:
user_id=[]
session_id=[]
item_id=[]
timestamp=[]
for index,row in tqdm(df.iterrows(), total=df.shape[0]):
    user_id.append(user_dict[row["user_id"]])
    session_id.append(session_dict[row["session_id"]])
    item_id.append(item_dict[row["item_id"]])
    timestamp.append(row["create_time"])
    
df=pd.DataFrame({"user_id":user_id, "session_id":session_id, "item_id":item_id, "timestamp":timestamp})
df.head()

100%|██████████| 2119332/2119332 [02:44<00:00, 12845.60it/s]


Unnamed: 0,user_id,session_id,item_id,timestamp
0,20861,24615,71653,2022-10-13 21:05:55.695
1,20861,24615,105685,2022-10-13 21:05:55.695
2,20861,24615,103446,2022-10-13 21:05:55.695
3,20861,24615,50171,2022-10-13 21:05:55.695
4,20861,24615,84308,2022-10-13 21:05:55.695


In [31]:
df["timestamp"].min(), df["timestamp"].max()

('2022-02-06 17:22:25.284', '2022-11-30 21:45:03.365')

In [10]:
preprocess_output_log = dict()

sess_clicks = {}
sess_date = {}
ctr = 0
curid = -1
curdate = None
for index,row in tqdm(df.iterrows(), total=df.shape[0]):
    sessid = row['session_id']
    if curdate and not curid == sessid:
        date = time.mktime(time.strptime(curdate[:19], '%Y-%m-%d %H:%M:%S'))
        sess_date[curid] = date
    curid = sessid
    item=row["item_id"]
    curdate = row['timestamp']
    if sessid in sess_clicks:
        sess_clicks[sessid] += [item]
    else:
        sess_clicks[sessid] = [item]
    ctr += 1
    
date = time.mktime(time.strptime(curdate[:19], '%Y-%m-%d %H:%M:%S'))
sess_date[curid] = date

print("-- Reading data @ %ss" % datetime.datetime.now())
preprocess_output_log['start time'] = datetime.datetime.now()

100%|██████████| 2119332/2119332 [02:40<00:00, 13177.25it/s]

-- Reading data @ 2022-12-08 15:20:14.198626s





In [11]:
len(sess_clicks)

66820

In [12]:
len1_sessions_filtered_count = 0 
# Filter out length 1 sessions
for s in list(sess_clicks):
    if len(sess_clicks[s]) == 1:
        len1_sessions_filtered_count += 1
        del sess_clicks[s]
        del sess_date[s]
        
preprocess_output_log['length 1 sessions filtered'] = len1_sessions_filtered_count

In [14]:
len(sess_clicks)

66652

In [15]:
# Count number of times each item appears
iid_counts = {}
for s in sess_clicks:
    seq = sess_clicks[s]
    for iid in seq:
        if iid in iid_counts:
            iid_counts[iid] += 1
        else:
            iid_counts[iid] = 1

sorted_counts = sorted(iid_counts.items(), key=operator.itemgetter(1))

In [16]:
sorted_counts[-5:]

[(85493, 2751), (52688, 2924), (90248, 3028), (103344, 3166), (10508, 3231)]

In [19]:
list(sess_clicks)[:10]

[24615, 60636, 11592, 42835, 17682, 63083, 21798, 10242, 5632, 65298]

In [23]:
list(sess_clicks.keys())[0:10]

[24615, 60636, 11592, 42835, 17682, 63083, 21798, 10242, 5632, 65298]

In [24]:
length = len(sess_clicks)
for s in list(sess_clicks):
    curseq = sess_clicks[s]
    filseq = list(filter(lambda i: iid_counts[i] >= 5, curseq))
    if len(filseq) < 2:
        del sess_clicks[s]
        del sess_date[s]
    else:
        sess_clicks[s] = filseq
        
print(f"before filtering:\t {length}")
print(f"after filtering:\t {len(sess_clicks)}")

before filtering:	 66652
after filtering:	 65803


In [67]:
# Split out test set based on dates
dates = list(sess_date.items())
maxdate = dates[0][1]
mindate = dates[0][1]
for _, date in dates:
    if maxdate < date:
        maxdate = date
    if mindate > date:
        mindate = date
        
preprocess_output_log['max date'] = pd.to_datetime(int(maxdate), unit='s')
preprocess_output_log['min date'] = pd.to_datetime(int(mindate), unit='s')
preprocess_output_log

{'start time': datetime.datetime(2022, 12, 8, 15, 20, 14, 198703),
 'length 1 sessions filtered': 168,
 'max date': Timestamp('2022-11-30 21:45:03'),
 'amex split days before maxdate': 20,
 'min date': Timestamp('2022-02-06 17:22:25')}

In [30]:
unique_item=set()
for k,v in sess_clicks.items():
    for i in v:
        if i not in unique_item:
            unique_item.add(i)
len(unique_item)

36206

In [38]:
amex_splitdate = 20
preprocess_output_log['amex split days before maxdate'] = amex_splitdate
splitdate = maxdate - 86400 * amex_splitdate 

In [70]:
print("{:<25}{:}".format('Minimal date', pd.to_datetime(int(mindate), unit='s')))
print("{:<25}{:}".format('Splitting date', pd.to_datetime(int(splitdate), unit='s')))
print("{:<25}{:}".format('Maximal date', pd.to_datetime(int(maxdate), unit='s')))
print()
tra_sess = filter(lambda x: x[1] < splitdate, dates)
tes_sess = filter(lambda x: x[1] > splitdate, dates)
# Sort sessions by date
tra_sess = sorted(tra_sess, key=operator.itemgetter(1))     # [(sessionId, timestamp), (), ]
tes_sess = sorted(tes_sess, key=operator.itemgetter(1))     # [(sessionId, timestamp), (), ]
print("{:<25}{:<20,}{:<20.2%}".format("Training Set",len(tra_sess),len(tra_sess)/len(dates)) )  
print("{:<25}{:<20,}{:<20.2%}".format("Test Set",len(tes_sess),len(tes_sess)/len(dates)) ) 
print()
print(tra_sess[:3])
print(tes_sess[:3])


Minimal date             2022-02-06 17:22:25
Splitting date           2022-11-10 21:45:03
Maximal date             2022-11-30 21:45:03

Training Set             61,365              93.26%              
Test Set                 4,438               6.74%               

[(59595, 1644168145.0), (61662, 1644175010.0), (39385, 1644175087.0)]
[(42087, 1668116802.0), (14743, 1668116975.0), (7304, 1668117015.0)]


In [71]:
preprocess_output_log['len training sessions (tra_sess)'] = len(tra_sess)
preprocess_output_log['len test sessions (tes_sess)'] = len(tes_sess)

In [75]:
item_dict = {}
# Convert training sessions to sequences and renumber items to start from 1
def obtian_tra():
    train_ids = []
    train_seqs = []
    train_dates = []
    item_ctr = 1
    for s, date in tra_sess:
        seq = sess_clicks[s]
        outseq = []
        for i in seq:
            if i in item_dict:
                outseq += [item_dict[i]]
            else:
                outseq += [item_ctr]
                item_dict[i] = item_ctr
                item_ctr += 1
        if len(outseq) < 2:  # Doesn't occur
            continue
        train_ids += [s]
        train_dates += [date]
        train_seqs += [outseq]

    preprocess_output_log['item count:'] = item_ctr
    print("{:<20}{:<15,}".format("item count:",item_ctr) )    
    return train_ids, train_dates, train_seqs


# Convert test sessions to sequences, ignoring items that do not appear in training set
def obtian_tes():
    test_ids = []
    test_seqs = []
    test_dates = []
    for s, date in tes_sess:
        seq = sess_clicks[s]
        outseq = []
        for i in seq:
            if i in item_dict:  ### avoid cold start issue
                outseq += [item_dict[i]]
        if len(outseq) < 2:
            continue
        test_ids += [s]
        test_dates += [date]
        test_seqs += [outseq]
    return test_ids, test_dates, test_seqs


tra_ids, tra_dates, tra_seqs = obtian_tra()
tes_ids, tes_dates, tes_seqs = obtian_tes()

item count:         36,113         


In [78]:
def process_seqs(iseqs, idates):
    out_seqs = []
    out_dates = []
    labs = []
    ids = []
    for id, seq, date in zip(range(len(iseqs)), iseqs, idates):
        for i in range(1, len(seq)):
            tar = seq[-i]
            labs += [tar]
            out_seqs += [seq[:-i]]
            out_dates += [date]
            ids += [id]
    return out_seqs, out_dates, labs, ids


tr_seqs, tr_dates, tr_labs, tr_ids = process_seqs(tra_seqs, tra_dates)
te_seqs, te_dates, te_labs, te_ids = process_seqs(tes_seqs, tes_dates)
tra = (tr_seqs, tr_labs)
tes = (te_seqs, te_labs)
print("{:<35}{:<20,}".format("number of training sequences",len(tr_seqs))  )
print("{:<35}{:<20,}".format("number of test sequences",len(te_seqs))  )
print()
print(tr_seqs[:3], tr_dates[:3], tr_labs[:3])
# print(te_seqs[:3], te_dates[:3], te_labs[:3])
all = 0

preprocess_output_log['len training seqs (tr_seqs)'] = len(tr_seqs)
preprocess_output_log['len test seqs (te_seqs)'] = len(tr_seqs)

for seq in tra_seqs:
    all += len(seq)
for seq in tes_seqs:
    all += len(seq)
print()
print("{:<25}{:<20.2f}".format('avg length: ', all/(len(tra_seqs) + len(tes_seqs) * 1.0)))
print()
preprocess_output_log['avg length'] = all/(len(tra_seqs) + len(tes_seqs) * 1.0)
preprocess_output_log

number of training sequences       1,785,996           
number of test sequences           116,555             

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38]] [1644168145.0, 1644168145.0, 1644168145.0] [41, 40, 39]

avg length:              29.91               



{'start time': datetime.datetime(2022, 12, 8, 15, 20, 14, 198703),
 'length 1 sessions filtered': 168,
 'max date': Timestamp('2022-11-30 21:45:03'),
 'amex split days before maxdate': 20,
 'min date': Timestamp('2022-02-06 17:22:25'),
 'len training sessions (tra_sess)': 61365,
 'len test sessions (tes_sess)': 4438,
 'item count:': 36113,
 'len training seqs (tr_seqs)': 1785996,
 'len test seqs (te_seqs)': 1785996,
 'avg length': 29.912830722003555}

In [79]:
dir_path="./dataset"
if not os.path.exists(dir_path):
    os.makedirs(dir_path)
pickle.dump(tra, open(dir_path + '/train.txt', 'wb'))
pickle.dump(tes, open(dir_path + '/test.txt', 'wb'))
pickle.dump(tra_seqs, open(dir_path + '/all_train_seq.txt', 'wb'))
pickle.dump(tes_seqs, open(dir_path + '/all_test_seq.txt', 'wb'))