In [1]:
# Environment check: print the working directory, the Python version and the directory listing.
!pwd
!python --version
!ls

/Users/z0g00mx/desktop/Intern_project
Python 3.7.3
README.md         als_pyspark.ipynb onboarding        query.sh
__pycache__       bpr.ipynb         papers            report
als.ipynb         experiment_data   preprocess.ipynb


In [2]:
import os
import sys
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import scipy.sparse as sp

from collections import defaultdict, Counter

# Describe the distribution of training/validation/test data

`train.txt`:
    * ‘2019-03-25’ ~ ‘2019-05-19’, 2 months
    * Number of transactions: 47938318
    * Size of textfile: 5.1G
`validation.txt`:
    * ‘2019-05-20’ ~ ‘2019-06-02’, 2 weeks
    * Number of transactions: 11546420
    * Size of textfile: 1.3G
`test.txt`:
    * ‘2019-06-03’ ~ ‘2019-06-09’, 1 week
    * Number of transactions: 6149672
    * Size of textfile: 666M

# Train

In [3]:
# Collect (cid, catalog_item_id) rows from the raw training transactions.
# Every line is split on whitespace; field 0 is taken as the customer id and
# field 4 as the catalog item id.
file_path = "../train.txt"

table = []
with open(file_path, 'r') as f:
    # Iterate over the file object itself: readlines() would first load the
    # whole multi-gigabyte file into a list before any parsing starts.
    for line in f:
        fields = line.split()
        if len(fields) < 5:
            # blank or truncated line: there is no item column to read
            continue
        user, item = fields[0], fields[4]
        # plain tuples are enough here; the single conversion below builds the array
        table.append((user, item))

# 2-D string array with one (user, item) row per transaction; later cells read
# each row as row[0] / row[-1].
table = np.asarray(table)

In [4]:
# Assign dense integer indices in order of first appearance in `table`:
#   cid             -> user_id in [0, N)
#   catalog_item_id -> item_id in [0, M)
user_map, item_map = {}, {}

for record in table:
    user, item = record[0], record[-1]
    # setdefault only inserts keys that are not present yet, and the current
    # size of the dict is exactly the next unused index.
    user_map.setdefault(user, len(user_map))
    item_map.setdefault(item, len(item_map))

N, M = len(user_map), len(item_map)
# next unused indices, i.e. the number of distinct users / items seen
user_id, item_id = N, M
N, M

(14929995, 3008951)

In [5]:
# Per-user and per-item transaction counts, keyed by the integer indices from
# user_map / item_map. The counts are tallied with Counter and then copied
# into defaultdict(int) so the resulting objects keep the same type.
user_dict = defaultdict(int, Counter(user_map[record[0]] for record in table))
item_dict = defaultdict(int, Counter(item_map[record[-1]] for record in table))

# Histograms of those counts: frequency -> number of users (items) having it.
user_count = Counter(user_dict.values())
item_count = Counter(item_dict.values())

In [6]:
# get the distribution of dataset
# "customer_buy_frequency.csv"
# NOTE: despite its name, sum_items is the total number of users: user_count
# maps a purchase frequency to the number of users with that frequency, so its
# values add up to the user total. It is the denominator of the "%" column in
# the (currently disabled) export below.
sum_items = sum(user_count.values())

# Disabled export: one CSV row per frequency, sorted by frequency.
# with open('./experiment_data/customer_buy_frequency.csv', 'w') as g:
#     g.write("Frequency,# of Customers,%\n")
#     for k, v in sorted(user_count.items()):
#         g.write("{},{},{:.7f}\n".format(k, v, v/sum_items))

# "item_bought_frequency.csv"
# NOTE: likewise, sum_users is the total number of items (sum over item_count).
sum_users = sum(item_count.values())

# Disabled export: one CSV row per frequency, sorted by frequency.
# with open('./experiment_data/item_bought_frequency.csv', 'w') as h:
#     h.write("Frequency,# of Items,%\n")
#     for k, v in sorted(item_count.items()):
#         h.write("{},{},{:.7f}\n".format(k, v, v/sum_users))

Based on the frequency distributions above, set thresholds to exclude outliers such as infrequent customers/items and resellers.

In [7]:
# Outlier filter; the cut-offs were chosen from the frequency distributions.
# A user is kept only when its transaction count lies strictly between the two
# user thresholds (the upper bound targets resellers); an item is kept only
# when its count is strictly above the item threshold.
lower_item_threshold = 150
lower_user_threshold, upper_user_threshold = 45, 250

ignored_user_set = {
    uid
    for uid, freq in user_dict.items()
    if not (lower_user_threshold < freq < upper_user_threshold)
}
ignored_item_set = {
    iid
    for iid, freq in item_dict.items()
    if not (freq > lower_item_threshold)
}

len(ignored_user_set), len(ignored_item_set)

(14895325, 2958922)

In [8]:
# Re-index after shrinking the data. A fresh dense id is handed out only when
# a transaction has BOTH a kept user and a kept item, in order of first
# appearance, so the new ids are contiguous.
train_user_map, train_item_map = {}, {}

for record in table:
    user, item = record[0], record[-1]
    if user_map[user] in ignored_user_set or item_map[item] in ignored_item_set:
        # the user or the item of this transaction was filtered out
        continue
    train_user_map.setdefault(user, len(train_user_map))
    train_item_map.setdefault(item, len(train_item_map))

# The notebook also carried a commented-out variant, labelled "fix bug: indices
# of user_map/item_map should be continuous", that walked user_map and item_map
# independently and indexed every non-ignored entry. It is not what runs here.

N, M = len(train_user_map), len(train_item_map)
# next unused ids, equal to the sizes of the two new mappings
new_u_id, new_i_id = N, M
N, M

(34619, 47880)

Convert the data table into a nested dictionary that maps `user_id -> item_id -> value`.

In [9]:
# Nested mapping user_id -> item_id -> value for the retained transactions.
# Values start out as purchase counts (a binary 0/1 variant would assign 1
# instead of accumulating) and are then normalized per user.
user_item_dict = defaultdict(lambda : defaultdict(float))

for record in table:
    user, item = record[0], record[-1]
    if user not in train_user_map or item not in train_item_map:
        # pair did not survive the filtering step
        continue
    user_id = train_user_map[user]
    item_id = train_item_map[item]
    user_item_dict[user_id][item_id] += 1

# Normalization: divide each user's counts by that user's total so a row sums
# to 1. (Author's note: non-normalized data is suspected to perform better.)
for purchases in user_item_dict.values():
    row_total = sum(purchases.values())
    for iid in purchases:
        purchases[iid] /= row_total

In [10]:
## calculate density of matrix
# The N x M user-item matrix is never materialized as a dense array; the
# number of non-empty cells is read off the nested dict instead.
matrix_size = N * M
interaction = sum(map(len, user_item_dict.values()))
density = 100 * interaction / matrix_size

print('matrix interaction: ', interaction)
print('matrix density: {:.4f}%'.format(density))

matrix interaction:  1180452
matrix density: 0.0712%


In [11]:
# write textfile
# (disabled) Dumps user_item_dict as tab-separated "uid<TAB>iid<TAB>value" lines,
# sorted by uid and then iid. The two `with open` lines are alternative output
# targets: uncomment exactly one of them together with the loop below.
# with open('../processed_train.txt', 'w') as m:
# with open('../processed_train_2.txt', 'w') as m:
#     for uid in sorted(user_item_dict):
#         for iid in sorted(user_item_dict[uid]):
#             m.write("{}\t{}\t{}\n".format(uid, iid, user_item_dict[uid][iid]))

# Count the lines of the previously written file (the writer emits one line per stored pair).
!wc -l ../processed_train.txt

 1180452 ../processed_train.txt


In [12]:
# store the mappings for users-based validation and test
%store train_user_map  # from cid to user_id
%store train_item_map  # from catalog_item_id to item_id

Stored 'train_user_map' (dict)
Stored 'train_item_map' (dict)


In [15]:
# get and store inverse mappings 
# user_id -> cid
# item_id -> catalog_item_id
cid_map = {v:k for k,v in train_user_map.items()}
catalog_item_id_map = {v:k for k,v in train_item_map.items()}

%store cid_map
%store catalog_item_id_map

Stored 'cid_map' (dict)
Stored 'catalog_item_id_map' (dict)


# Validation

In [16]:
# Preprocess validation data: keep only (user, item) pairs whose user AND item
# both have an index in the training mappings, count purchases per pair, then
# normalize each user's counts so they sum to 1.
file_path = '../validation.txt'

validation_dict = defaultdict(lambda : defaultdict(int))
eval_users, eval_items = set(), set()

with open(file_path, 'r') as v:
    # Stream line by line; readlines() would hold the whole file in memory first.
    for line in v:
        fields = line.split()
        if len(fields) < 5:
            # blank or truncated line: there is no item column to read
            continue
        user, item = fields[0], fields[4]
        if user in train_user_map and item in train_item_map:
            user_id, item_id = train_user_map[user], train_item_map[item]
            eval_users.add(user_id)
            eval_items.add(item_id)
            # validation_dict[user_id][item_id] = 1
            validation_dict[user_id][item_id] += 1

# normalize to 0~1: each value becomes the user's share of purchases for that item
for uid in validation_dict:
    user_total = sum(validation_dict[uid].values())
    for iid in validation_dict[uid]:
        validation_dict[uid][iid] /= user_total

v_total = sum(len(items) for items in validation_dict.values())
v_users, v_items = len(eval_users), len(eval_items)
# With no overlapping pair both counts are 0; report 0% instead of dividing by zero.
v_density = 100 * v_total / (v_users * v_items) if v_total else 0.0

print('validation size: {} users, {} items'.format(v_users, v_items))
print('validation density: {:.4f}%'.format(v_density))

# write validation file (disabled; uncomment exactly one `with open` line plus the loop)
# with open('../processed_validation.txt', 'w') as m:
# with open('../processed_validation_2.txt', 'w') as m:
#     for uid in sorted(validation_dict):
#         for iid in sorted(validation_dict[uid]):
#             m.write("{}\t{}\t{}\n".format(uid, iid, validation_dict[uid][iid]))

v_total

validation size: 25048 users, 34208 items
validation density: 0.0274%


234524

# Test

In [17]:
# Preprocess test data: keep only (user, item) pairs whose user AND item both
# have an index in the training mappings, count purchases per pair, then
# normalize each user's counts so they sum to 1.
file_path = '../test.txt'

test_dict = defaultdict(lambda : defaultdict(int))
test_users, test_items = set(), set()

with open(file_path, 'r') as t:
    # Stream line by line; readlines() would hold the whole file in memory first.
    for line in t:
        fields = line.split()
        if len(fields) < 5:
            # blank or truncated line: there is no item column to read
            continue
        user, item = fields[0], fields[4]
        if user in train_user_map and item in train_item_map:
            user_id, item_id = train_user_map[user], train_item_map[item]
            test_users.add(user_id)
            test_items.add(item_id)
            # test_dict[user_id][item_id] = 1
            test_dict[user_id][item_id] += 1

# normalize to 0~1: each value becomes the user's share of purchases for that item
for uid in test_dict:
    user_total = sum(test_dict[uid].values())
    for iid in test_dict[uid]:
        test_dict[uid][iid] /= user_total

t_total = sum(len(items) for items in test_dict.values())
t_users, t_items = len(test_users), len(test_items)
# With no overlapping pair both counts are 0; report 0% instead of dividing by zero.
t_density = 100 * t_total / (t_users * t_items) if t_total else 0.0

print('test size: {} users, {} items'.format(t_users, t_items))
print('test density: {:.4f}%'.format(t_density))

# write test file (disabled; uncomment exactly one `with open` line plus the loop)
# with open('../processed_test.txt', 'w') as m:
# with open('../processed_test_2.txt', 'w') as m:
#     for uid in sorted(test_dict):
#         for iid in sorted(test_dict[uid]):
#             m.write("{}\t{}\t{}\n".format(uid, iid, test_dict[uid][iid]))

t_total

test size: 18612 users, 27496 items
test density: 0.0246%


125816