In [1]:
import os
from pathlib import Path

os.chdir('/content/drive/MyDrive/hnm')
DATA_PATH = Path.cwd() / 'data'
RAW = DATA_PATH / 'raw'
PROCESSED = DATA_PATH / 'processed'
SUBMISSION = DATA_PATH / 'submission'

import gzip
from collections import defaultdict
from datetime import datetime
import copy
import time
import datetime
import json
import numpy as np
import pandas as pd

from tqdm import tqdm

In [2]:
transactions = pd.read_csv(RAW / 'transactions_train.csv', parse_dates=['t_dat'])
transactions['unix_t_dat'] = transactions.t_dat.view(np.int64) // 10**9
transactions = transactions.reset_index()

In [3]:
transactions.head(2)

Unnamed: 0,index,t_dat,customer_id,article_id,price,sales_channel_id,unix_t_dat
0,0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,1537401600
1,1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,1537401600


In [4]:
print(f'No. of transactions: {transactions.shape[0]:,}')
print(f'No. of users: {transactions.customer_id.nunique():,}')
print(f'No. of articles: {transactions.article_id.nunique():,}')

No. of transactions: 31,788,324
No. of users: 1,362,281
No. of articles: 104,547


In [5]:
while(1):
  if transactions['article_id'].value_counts().values[-1] < 10:
    leastReviewedProducts = []
    counts = transactions['article_id'].value_counts()

    for val in list(transactions['article_id'].unique()):
      if counts[val] < 25:
        leastReviewedProducts.append(val)

    transactions = transactions[~(transactions['article_id'].isin(leastReviewedProducts))]
    print('Removed articles\t\t\t Remaining number of transactions: {:,}'.format(len(transactions)))

  elif transactions['customer_id'].value_counts().values[-1] < 10:
    leastReviewsBy = []
    counts = transactions['customer_id'].value_counts()

    for val in list(transactions['customer_id'].unique()):
      if counts[val] < 25:
        leastReviewsBy.append(val)

    transactions = transactions[~(transactions['customer_id'].isin(leastReviewsBy))]
    print('Removed customers\t\t\t Remaining number of transactions: {:,}'.format(len(transactions)))

  else:
    print('\n\n----')
    print('Obtained 25-core subset')
    break

Removed articles			 Remaining number of transactions: 31,473,449
Removed customers			 Remaining number of transactions: 23,962,677
Removed articles			 Remaining number of transactions: 23,856,203


----
Obtained 25-core subset


In [6]:
print(f'After taking 25-core subset:\n\n')
print(f'No. of transactions: {transactions.shape[0]:,}. Decreased by {100*(31788324 - transactions.shape[0])/31788324:.2f}%')
print(f'No. of users: {transactions.customer_id.nunique():,}. Decreased by {100*(1362281 - transactions.customer_id.nunique())/1362281:.2f}%')
print(f'No. of articles: {transactions.article_id.nunique():,}. Decreased by {100*(104547 - transactions.article_id.nunique())/104547:.2f}%')


After taking 25-core subset:


No. of transactions: 23,856,203. Decreased by 24.95%
No. of users: 365,193. Decreased by 73.19%
No. of articles: 63,768. Decreased by 39.01%


In [None]:
transactions.customer_id.value_counts().tail(2)

67a3dfe65332c2832212fbac37dbf8ebe92f037a9c56be6ba46475606fc71231    12
9b062e1f17b13e4d2ad2ec1fbcf6a3f6c489061eea66d34fa8a56cd9fd5caafd    12
Name: customer_id, dtype: int64

In [None]:
transactions.article_id.value_counts().tail(2)

553873033    25
652057001    25
Name: article_id, dtype: int64

In [None]:
countU = transactions.customer_id.value_counts().to_dict()
countP = transactions.article_id.value_counts().to_dict()


In [None]:
usermap = dict()
usernum = 1

itemmap = dict()
itemnum = 1

User = dict()

for index, one_interaction in tqdm(transactions.iterrows(), total=transactions.shape[0]):
    rev = one_interaction['customer_id']
    asin = one_interaction['article_id']
    time = float(one_interaction['unix_t_dat'])

    if countU[rev] < 10 or countP[asin] < 10:
        print('Error in taking 10-core')
        continue

    if rev in usermap:
        userid = usermap[rev]
    else:
        userid = usernum
        usermap[rev] = userid
        User[userid] = []
        usernum += 1

    if asin in itemmap:
        itemid = itemmap[asin]
    else:
        itemid = itemnum
        itemmap[asin] = itemid
        itemnum += 1
        
    User[userid].append([itemid, time])

100%|██████████| 23856203/23856203 [23:55<00:00, 16622.38it/s]


In [None]:
imap_file = str(f'{PROCESSED}')+'/imap2.json'
umap_file = str(f'{PROCESSED}')+'/umap2.json'

with open(imap_file, 'w') as f:
    json.dump(itemmap, f)

with open(umap_file, 'w') as f:
    json.dump(usermap, f)

In [None]:
for userid in User.keys():
    # sort User according to time
    User[userid].sort(key=lambda x: x[1])

In [None]:
user_train = {}
user_valid = {}
user_test = {}

for user in User:
    nfeedback = len(User[user])

    if nfeedback < 10:
        print('Error in taking 10-core')
        break
    else:
        user_train[user] = User[user][:-2]
        user_valid[user] = []
        user_valid[user].append(User[user][-2])
        user_test[user] = []
        user_test[user].append(User[user][-1])

In [None]:
train_file = str(f'{PROCESSED}')+'/train2.txt'
valid_file = str(f'{PROCESSED}')+'/valid2.txt'
test_file = str(f'{PROCESSED}')+'/test2.txt'

data_file = str(f'{PROCESSED}')+'/transactions_train_sequences2.txt'

def writetofile(data, dfile):
    with open(dfile, 'w') as f:
        for u, ilist in sorted(data.items()):
            for i, t in ilist:
                f.write(str(u) + '\t'+ str(i) + '\t' + str(t) + "\n")

def writetofile_v2(data, dfile):
    with open(dfile, 'w') as f:
        for u, ilist in sorted(data.items()):
            f.write(str(u))
            for i, t in ilist:
                f.write(' '+ str(i))
            f.write("\n")

writetofile_v2(User, data_file)

In [None]:
num_instances = sum([len(ilist) for _, ilist in User.items()])
print('total user: ', len(User))
print('total instances: ', num_instances)
print('avg length: ', num_instances / len(User))
print('total items: ', itemnum)
print('density: ', num_instances / (len(User) * itemnum))
print('valid #users: ', len(user_valid))
numvalid_instances = sum([len(ilist) for _, ilist in user_valid.items()])
print('valid instances: ', numvalid_instances)
numtest_instances = sum([len(ilist) for _, ilist in user_test.items()])
print('test #users: ', len(user_test))
print('test instances: ', numtest_instances)

total user:  365193
total instances:  23856203
avg length:  65.324918604683
total items:  63769
density:  0.001024399294401402
valid #users:  365193
valid instances:  365193
test #users:  365193
test instances:  365193
