In [96]:
import pandas as pd
import numpy as np
import tqdm
from sklearn.preprocessing import LabelEncoder
import json

# Register tqdm's pandas integration (adds `progress_apply` to pandas objects).
# NOTE(review): no `progress_apply` call is visible in this notebook — confirm
# this registration is still needed.
tqdm.tqdm.pandas()

In [97]:
# Load the two preprocessed inputs.
# Transactions keep the default integer index; recommendations are indexed by
# the (t, store_id) pair so they can later be looked up by that key.
transaction_df = pd.read_csv("./data/preprocessed/transaction.csv")
recommendation_df = pd.read_csv("./data/preprocessed/recommendation.csv", index_col=["t", "store_id"])

In [98]:
# Preview the transaction log before any id encoding is applied.
transaction_df.head()

Unnamed: 0,user_id,t,store_id,item_id
0,1130,1,31642,833715
1,1130,1,31642,1048462
2,98,1,337,878302
3,98,1,337,985911
4,1172,1,396,930917


In [99]:
# Preview the recommendation table; at this point `item_ids` is still a
# whitespace-separated string of raw item ids.
recommendation_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,item_ids
t,store_id,Unnamed: 2_level_1
9,288,822407 823704 824311 826249 826597 829722 8303...
9,292,819255 822785 823721 825365 825703 826249 8263...
9,296,819063 819304 822178 823704 823721 825365 8262...
9,298,821344 822346 822407 823704 826249 826597 8270...
9,306,819063 821200 822785 823704 823721 824311 8268...


In [100]:
# NOTE: this cell rewrites `transaction_df` and `recommendation_df` in place,
# so it must run exactly once per kernel session. Running it a second time
# would fit the encoders on already-encoded ids and call `.split()` on sets.

# One encoder per id space: raw ids are mapped to contiguous integer indices.
user_le = LabelEncoder().fit(transaction_df.user_id)
item_le = LabelEncoder().fit(transaction_df.item_id)

# Raw item ids that occur in the transaction log. Recommended items outside
# this set cannot be encoded and are dropped below.
item_ids = set(item_le.classes_)

# The encoders are already fitted above, so `transform` is sufficient;
# re-fitting with `fit_transform` would only repeat the same work.
transaction_df.user_id = user_le.transform(transaction_df.user_id)
transaction_df.item_id = item_le.transform(transaction_df.item_id)


def _encode_recommended_items(raw_ids):
    """Convert a whitespace-separated string of raw item ids to a set of encoded ids.

    Ids that never appear in the transaction log are filtered out first,
    because `item_le.transform` rejects labels it was not fitted on.
    """
    known = [item for item in map(int, raw_ids.split()) if item in item_ids]
    return set(item_le.transform(known))


# Single pass over the recommendation strings: parse, filter, encode.
recommendation_df.item_ids = recommendation_df.item_ids.apply(_encode_recommended_items)

In [101]:
# Sanity check: `user_id` and `item_id` should now hold label-encoded integers.
transaction_df.head()

Unnamed: 0,user_id,t,store_id,item_id
0,210,1,31642,71
1,210,1,31642,1020
2,19,1,337,284
3,19,1,337,754
4,218,1,396,520


In [102]:
# Sanity check: `item_ids` should now be sets of encoded item ids.
recommendation_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,item_ids
t,store_id,Unnamed: 2_level_1
9,288,"{520, 1044, 21, 1556, 1557, 24, 537, 1558, 155..."
9,292,"{1, 514, 519, 520, 521, 526, 529, 1555, 1044, ..."
9,296,"{0, 2, 514, 1029, 518, 520, 1035, 526, 529, 19..."
9,298,"{514, 515, 518, 520, 522, 11, 529, 531, 20, 21..."
9,306,"{0, 1025, 514, 1541, 518, 520, 9, 1035, 529, 1..."


In [103]:
# Length (in time steps) of each evaluation window used when the
# validation and test splits are cut from the timeline.
te = 8

# Last time step present in the transaction log.
td = transaction_df["t"].max()

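Target table layout, shared by the validation and test splits (`valid_data`, `test_data`); one row per user, values below are illustrative: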
| user_id | train_purchased_items | train_recommended_items | eval_target_items |
|---------|-----------------------|-------------------------|-------------------|
| 0       | [1, 2, 3]             | [1, 2, 4]               | [5, 2, 4]         |
| 1       | [2, 5, 10]            | [3, 6, 7]               | [3, 7, 8]         |
| 2       | [3, 6, 8]             | [4, 5, 8]               | [4, 6, 10]        |

In [104]:
# Plain dict keyed by the (t, store_id) index of `recommendation_df`, mapping to
# the set of encoded recommended item ids, for direct key lookups in the loop below.
rec_dict = recommendation_df.item_ids.to_dict()

In [105]:
# Inclusive [first_t, last_t] time windows for each purchase column.
# The test windows are the validation windows shifted forward by `te` steps.
purchase_windows = {
    "valid_train_purchased_items": (1, td - 2 * te),
    "valid_eval_purchased_items": (td - 2 * te + 1, td - te),
    "test_train_purchased_items": (te + 1, td - te),
    "test_eval_purchased_items": (td - te + 1, td),
}

data = {}

# Per user, the distinct items purchased inside each window (Series indexed by user_id).
for col_name, (tl, tr) in purchase_windows.items():
    in_window = (tl <= transaction_df.t) & (transaction_df.t <= tr)
    data[col_name] = (
        transaction_df[in_window]
        .groupby("user_id")["item_id"]
        .agg(set)  # de-duplicate repeat purchases of the same item
        .apply(list)
        .rename(col_name)
    )

# Per user, the union of the recommendation sets attached to every
# (t, store_id) at which the user has a transaction inside the training window.
# The windows are reused from above so the bounds cannot drift apart.
recommendation_windows = {
    "valid_train_recommended_items": purchase_windows["valid_train_purchased_items"],
    "test_train_recommended_items": purchase_windows["test_train_purchased_items"],
}
recommended = {col_name: {} for col_name in recommendation_windows}

# One row per distinct (user, t, store) visit; itertuples avoids building a
# Series per row the way iterrows does.
visits = transaction_df[["user_id", "t", "store_id"]].drop_duplicates()
for user_id, t, store_id in visits.itertuples(index=False, name=None):
    shown_items = rec_dict.get((t, store_id))
    for col_name, (tl, tr) in recommendation_windows.items():
        # Register every user, so users without any recommendation get [].
        user_items = recommended[col_name].setdefault(user_id, set())
        if shown_items is not None and tl <= t <= tr:
            user_items |= shown_items

# Store the results as Series indexed by user_id rather than positional lists,
# so `pd.DataFrame(data)` aligns them with the purchase columns by label
# instead of relying on both sides having the same length and order.
for col_name, per_user in recommended.items():
    data[col_name] = (
        pd.Series(
            {user_id: list(items) for user_id, items in per_user.items()},
            name=col_name,
            dtype=object,
        )
        .rename_axis("user_id")
        .sort_index()
    )

In [106]:
# Assemble all split columns into one frame with one row per user.
# Users with no purchases in a given window get NaN in that column.
df = pd.DataFrame(data)
df.head()

Unnamed: 0_level_0,valid_train_purchased_items,valid_eval_purchased_items,test_train_purchased_items,test_eval_purchased_items,valid_train_recommended_items,test_train_recommended_items
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,"[0, 1538, 515, 1029, 525, 1038, 1554, 1044, 53...","[740, 106, 657, 755, 986, 635]","[0, 1538, 515, 1029, 525, 1038, 1554, 1044, 53...","[899, 261, 139, 1419, 657, 1426, 1428, 533, 79...","[1, 2, 3, 4, 5, 6, 8, 10, 11, 14, 16, 17, 20, ...","[1, 2, 3, 4, 5, 6, 8, 10, 11, 14, 16, 17, 20, ..."
1,"[11, 14, 24, 25, 29, 31, 36, 37, 41, 43, 61, 6...","[1664, 261, 1419, 782, 1550, 1038, 785, 1554, ...","[11, 14, 24, 25, 29, 31, 36, 37, 41, 43, 61, 6...","[520, 20, 1044, 24, 28, 1054, 542, 36, 562, 10...","[0, 1, 2, 3, 4, 6, 7, 10, 11, 13, 14, 16, 17, ...","[0, 1, 2, 3, 4, 6, 7, 8, 10, 11, 13, 14, 16, 1..."
2,"[1538, 4, 1541, 389, 1543, 1160, 1292, 1421, 9...","[834, 612, 1541, 842, 1421, 1136, 849, 912, 15...","[1538, 4, 1541, 389, 1543, 1160, 1292, 1421, 9...","[576, 834, 612, 1541, 614, 325, 877, 912, 1136...","[0, 1, 3, 4, 5, 6, 7, 10, 11, 12, 14, 15, 16, ...","[0, 1, 3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 15, 1..."
3,"[0, 3, 517, 5, 1035, 15, 1047, 1050, 1063, 107...","[769, 4, 1425, 790, 1431, 1195, 1078, 1208, 19...","[0, 3, 4, 517, 5, 1035, 15, 1047, 1050, 1063, ...","[1152, 128, 1416, 268, 1425, 1426, 1183, 1317,...","[0, 1, 2, 3, 6, 7, 13, 14, 16, 17, 18, 20, 21,...","[0, 1, 2, 3, 6, 7, 13, 14, 16, 17, 18, 20, 21,..."
4,"[640, 771, 4, 774, 646, 9, 783, 657, 1042, 142...","[832, 834, 771, 1060, 1125, 710, 966, 840, 157...","[4, 9, 1550, 1042, 1564, 1570, 1060, 1573, 157...",,"[0, 1, 2, 3, 4, 6, 7, 9, 10, 14, 16, 17, 19, 2...","[0, 1, 2, 3, 4, 6, 7, 9, 10, 14, 16, 17, 19, 2..."


In [107]:
# Persist the splits; orient="index" writes one JSON object per index label
# (the user_id), each mapping column name to value.
df.to_json("./data/preprocessed/preprocessed.json", orient="index")

In [108]:
# Save the number of distinct users and items seen by the encoders.
params = dict(
    user_n=len(user_le.classes_),
    item_n=len(item_le.classes_),
)
with open("./data/preprocessed/param.json", "w") as param_file:
    param_file.write(json.dumps(params))