In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import json

In [2]:
# Purchase log: one row per (user_id, t, store_id, item_id).
transaction_df = pd.read_csv("../data/preprocessed/transaction.csv")

# Recommendation lists, indexed by the (t, store_id) pair they were issued for.
# The `item_ids` column holds whitespace-separated raw item ids.
recommendation_df = pd.read_csv(
    "../data/preprocessed/recommendation.csv",
    index_col=["t", "store_id"],
)

In [3]:
# Preview the first five rows of the transaction log as loaded from disk.
transaction_df.head(n=5)

Unnamed: 0,user_id,t,store_id,item_id
0,1193,9,334,942088
1,1193,9,334,985893
2,1193,9,334,1013928
3,1193,9,334,1115874
4,1193,9,334,5569230


In [4]:
# Preview the first five (t, store_id) recommendation lists as loaded from disk.
recommendation_df.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,item_ids
t,store_id,Unnamed: 2_level_1
9,288,822407 823704 824311 825665 826249 826597 8280...
9,289,822140 823704 824311 826249 826385 826597 8276...
9,292,819255 820752 822785 823721 825365 825665 8257...
9,293,819063 821890 822140 822407 825970 826249 8265...
9,295,821845 823704 825665 826249 826666 827047 8276...


In [5]:
# Fit one encoder per id space so that raw user / item ids are mapped to
# consecutive integer codes (0 .. n_classes - 1).
user_le = LabelEncoder().fit(transaction_df["user_id"])
item_le = LabelEncoder().fit(transaction_df["item_id"])

# Raw item ids seen in the transaction log; only these can be encoded by item_le.
item_ids = set(item_le.classes_)

# The encoders are already fitted above, so `transform` is enough here;
# calling `fit_transform` would refit them on the same data for no benefit.
# NOTE: this overwrites the raw ids in place, so this cell must not be re-run
# without first reloading the CSVs.
transaction_df["user_id"] = user_le.transform(transaction_df["user_id"])
transaction_df["item_id"] = item_le.transform(transaction_df["item_id"])


def _encode_recommended_items(raw_item_ids):
    """Turn a whitespace-separated string of raw item ids into a set of encoded ids.

    Items that never occur in the transaction log are dropped, because
    ``item_le`` has no code for them.
    """
    known_items = [
        item_id for item_id in map(int, raw_item_ids.split()) if item_id in item_ids
    ]
    return set(item_le.transform(known_items))


recommendation_df["item_ids"] = recommendation_df["item_ids"].apply(
    _encode_recommended_items
)

In [6]:
# Check the transaction log again now that user_id / item_id hold encoded values.
transaction_df.head(n=5)

Unnamed: 0,user_id,t,store_id,item_id
0,518,9,334,1188
1,518,9,334,1572
2,518,9,334,1827
3,518,9,334,2696
4,518,9,334,2972


In [7]:
# Check the recommendation lists again: item_ids is now a set of encoded item ids.
recommendation_df.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,item_ids
t,store_id,Unnamed: 2_level_1
9,288,"{1026, 1035, 2572, 2063, 2577, 2039, 532, 3091..."
9,289,"{2570, 1035, 2571, 2577, 1555, 1051, 1052, 156..."
9,292,"{2, 13, 2063, 2068, 2069, 2071, 2072, 2078, 37..."
9,293,"{1, 2051, 2064, 2068, 28, 30, 2079, 34, 2085, ..."
9,295,"{2049, 1542, 2571, 527, 2575, 2068, 2069, 2581..."


In [8]:
# Last time step present in the transaction log.
td = transaction_df["t"].max()
# Number of time steps in each evaluation window (the eval ranges below span `te` steps).
te = 8

Target table layout for `valid_data` and `test_data` (illustrative values):
| user_id | train_purchased_items | train_recommended_items | eval_target_items |
|---------|-----------------------|-------------------------|-------------------|
| 0       | [1, 2, 3]             | [1, 2, 4]               | [5, 2, 4]         |
| 1       | [2, 5, 10]            | [3, 6, 7]               | [3, 7, 8]         |
| 2       | [3, 6, 8]             | [4, 5, 8]               | [4, 6, 10]        |

In [9]:
# Plain dict for fast lookups: (t, store_id) -> set of encoded recommended item ids.
rec_dict = recommendation_df["item_ids"].to_dict()

In [10]:
# Build, for every user, the purchased and the recommended item lists in each
# of the four windows.  A window is an inclusive range (tl, tr) of time steps.
# The bounds are declared once so the purchased and recommended columns can
# never drift apart.
windows = {
    "valid_train": (1, td - 2 * te),
    "valid_eval": (td - 2 * te + 1, td - te),
    "test_train": (te + 1, td - te),
    "test_eval": (td - te + 1, td),
}

data = {}

# --- Purchased items -------------------------------------------------------
# One Series per window, indexed by user_id, holding the distinct items the
# user bought inside the window.  Users without purchases in a window are
# simply absent from that Series.
for split_name, (tl, tr) in windows.items():
    col_name = f"{split_name}_purchased_items"
    in_window = (tl <= transaction_df.t) & (transaction_df.t <= tr)
    data[col_name] = (
        transaction_df[in_window]
        .groupby("user_id")["item_id"]
        .agg(set)
        .apply(lambda items: list(items))
        .rename(col_name)
    )

# --- Recommended items -----------------------------------------------------
# Same windows, keyed by the recommended-items column names.
recommend_dict = {
    f"{split_name}_recommended_items": bounds
    for split_name, bounds in windows.items()
}

# column name -> {user_id -> list of recommended items}
recommended_by_user = {col_name: {} for col_name in recommend_dict}

# A user is exposed to the recommendation list of every (t, store_id) they
# shopped at, so only distinct visits matter.
visits = transaction_df.drop_duplicates(subset=["user_id", "t", "store_id"])

for user_id, user_df in visits.groupby("user_id")[["t", "store_id"]]:
    seen = {col_name: set() for col_name in recommend_dict}

    for t, store_id in user_df.itertuples(index=False, name=None):
        shown_items = rec_dict.get((t, store_id))
        if shown_items is None:
            # No recommendation list exists for this visit.
            continue
        # Windows overlap (train ranges of valid/test), so check each one.
        for col_name, (tl, tr) in recommend_dict.items():
            if tl <= t <= tr:
                seen[col_name] |= shown_items

    for col_name, items in seen.items():
        recommended_by_user[col_name][user_id] = list(items)

# Store the recommended columns as Series indexed by user_id (like the
# purchased columns) instead of bare positional lists.  pd.DataFrame(data)
# then aligns every column on user_id explicitly, rather than relying on the
# list order and length happening to match the union of the purchased indexes.
for col_name, per_user in recommended_by_user.items():
    data[col_name] = pd.Series(per_user, name=col_name, dtype=object).rename_axis(
        "user_id"
    )

In [11]:
# Assemble all per-user columns into one frame (rows: user_id, columns: the
# purchased / recommended item lists of each window) and preview it.
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0_level_0,valid_train_purchased_items,valid_eval_purchased_items,test_train_purchased_items,test_eval_purchased_items,valid_train_recommended_items,valid_eval_recommended_items,test_train_recommended_items,test_eval_recommended_items
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,"[3584, 2561, 3586, 3588, 3589, 3076, 522, 2061...","[2561, 516, 1157, 1159, 2061, 1038, 1171, 791,...","[3584, 2561, 3586, 3588, 3589, 3076, 516, 522,...","[2561, 3588, 3589, 2572, 2061, 2960, 277, 2972...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[2049, 1, 2, 4, 5, 9, 11, 12, 2059, 13, 18, 20...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[3, 4, 8, 10, 11, 16, 19, 32, 33, 34, 37, 42, ..."
1,"[0, 1, 3076, 3591, 2055, 1543, 2571, 2575, 206...","[1029, 1931, 2833, 2836, 2342, 1576, 303, 1969...","[0, 1, 3076, 1029, 3591, 2055, 1543, 2571, 257...","[130, 2055, 392, 137, 2824, 1931, 268, 1296, 2...","[0, 2, 3, 4, 5, 6, 7, 8, 10, 12, 13, 16, 19, 2...","[2048, 2049, 2, 5, 6, 2055, 9, 2059, 11, 13, 1...","[0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16...","[2048, 1, 2049, 3, 5, 6, 7, 8, 2058, 2059, 19,..."
2,"[5, 2055, 2057, 2058, 2060, 2065, 2067, 2068, ...","[5, 2577, 1554, 531, 2068, 2075, 1564, 2082, 1...","[5, 2055, 2057, 2058, 2060, 2065, 2067, 2068, ...","[517, 2058, 1035, 2065, 2577, 1043, 2068, 531,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[2, 5, 12, 13, 26, 27, 35, 42, 44, 47, 57, 58,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[2, 3, 7, 10, 11, 19, 26, 29, 30, 33, 34, 41, ..."
3,"[1536, 3073, 4, 2570, 2571, 2060, 524, 1551, 3...","[2304, 2695, 906, 2060, 2957, 2446, 527, 1295,...","[1536, 3073, 4, 2570, 2571, 2060, 524, 1551, 5...","[906, 2446, 21, 802, 1061, 936, 937, 45, 2222,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 16...","[2049, 1, 2, 5, 2054, 2055, 11, 12, 2064, 2067...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 16...","[2049, 3, 6, 2055, 2054, 2060, 2068, 2071, 25,..."
4,"[2571, 2069, 1564, 1054, 34, 2101, 2623, 2626,...","[1923, 3076, 1159, 2187, 1296, 1044, 1818, 924...","[3076, 2571, 1044, 2069, 1564, 1054, 34, 2101,...","[9, 143, 2071, 1176, 2331, 2086, 3116, 2222, 8...","[0, 6, 13, 26, 27, 29, 30, 31, 33, 34, 35, 38,...","[2049, 1029, 5, 1033, 1036, 2067, 2068, 2069, ...","[0, 5, 6, 13, 26, 27, 29, 30, 31, 33, 34, 35, ...","[1, 2049, 3, 1026, 6, 2055, 3078, 11, 2059, 30..."


In [12]:
# Persist the per-user table; orient="index" keys the JSON object by user_id.
df.to_json(
    "../data/preprocessed/preprocessed.json",
    orient="index",
)

In [13]:
# Record how many distinct users and items the encoders know about, so that
# consumers of preprocessed.json can size their id spaces.
params = dict(
    user_n=len(user_le.classes_),
    item_n=len(item_le.classes_),
)
with open("../data/preprocessed/param.json", "w") as param_file:
    json.dump(params, param_file)