In [1]:
import sys

import numpy as np
import pandas as pd
import torch

from rnn_dataset import SequenceDataset, essay_collate_fn
from rnn_trainer import train_rnn
from rnn_model import GRUMultiTask


In [2]:
IN_COLAB = 'google.colab' in sys.modules

# find gpu device
GPU_AVAILABLE = torch.cuda.is_available()
device = torch.device('cuda' if GPU_AVAILABLE else 'cpu')


RANDOM_SEED = 42

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Colab environment")
    d_path = "./drive/MyDrive/data/sequence-embeddings/"
    save_path = "./drive/MyDrive/data/sequence-embeddings/"
else:
    print("Local environment")
    d_path = "./data/"
    save_path = "./data/"

BATCH_SIZE = 2048
NUM_PRODUCTS = 49688 # number of products from the dataset

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Colab environment


In [3]:
# Load the data
# features = pd.read_csv(d_path + "features.csv")
labels = pd.read_csv(d_path + "labels.csv")
rnn_data = pd.read_parquet(d_path + "rnn_data.parquet").reset_index()


In [4]:
rnn_data = rnn_data.merge(labels[["user_id", "cls_days_to_next_order","cls_reorder_ratio"]], on="user_id", how="left")
rnn_data.describe()

Unnamed: 0,user_id,cls_days_to_next_order,cls_reorder_ratio
count,206209.0,206209.0,206209.0
mean,103105.0,1.568506,1.807394
std,59527.555167,1.12296,0.868746
min,1.0,0.0,0.0
25%,51553.0,1.0,1.0
50%,103105.0,2.0,2.0
75%,154657.0,3.0,2.0
max,206209.0,3.0,3.0


In [5]:
rnn_data

Unnamed: 0,user_id,product_id,norm_days_since_prior_order,cls_days_to_next_order,cls_reorder_ratio
0,1,"[196, 14084, 12427, 26088, 26405, 196, 10258, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 0.5, 0.5, 0.5, ...",1,3
1,2,"[32792, 47766, 20574, 12000, 48110, 22474, 165...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",3,1
2,3,"[9387, 17668, 15143, 16797, 39190, 47766, 2190...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",2,3
3,4,"[36606, 7350, 35469, 2707, 42329, 7160, 1200, ...","[1.0, 1.0, 1.0, 1.0, 0.6333333333333333, 0.633...",0,0
4,5,"[15349, 21413, 48775, 28289, 8518, 11777, 3171...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0,1
...,...,...,...,...,...
206204,206205,"[27845, 28745, 3896, 49235, 21137, 37067, 3873...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1,1
206205,206206,"[27086, 41387, 38530, 47011, 8021, 38530, 1152...","[0.13333333333333333, 0.1, 0.1, 0.1, 0.1, 0.03...",1,3
206206,206207,"[47766, 3397, 3469, 2450, 45965, 20583, 30233,...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",2,3
206207,206208,"[23644, 21461, 31506, 31404, 27845, 20995, 276...","[0.13333333333333333, 0.13333333333333333, 0.1...",1,2


In [6]:
# rnn_data["max_product_id"] = rnn_data["product_id"].apply(lambda x: max(x))
# rnn_data["min_product_id"] = rnn_data["product_id"].apply(lambda x: min(x))
# rnn_data["num_null"] = rnn_data["product_id"].apply(lambda x: len([e for e in x if not isinstance(e, int)]))
# rnn_data["min_product_id"].min()
# rnn_data["product_id"] = rnn_data["product_id"].apply(lambda x: [int(e) for e in x])

In [7]:
cls_weight_task_1 = rnn_data.cls_days_to_next_order.value_counts(normalize=True).max() / rnn_data.cls_days_to_next_order.value_counts(normalize=True).sort_index()
cls_weight_task_2 = rnn_data.cls_reorder_ratio.value_counts(normalize=True).max() / rnn_data.cls_reorder_ratio.value_counts(normalize=True).sort_index()

In [8]:
print(cls_weight_task_1)
print(cls_weight_task_2)

0    1.309691
1    1.029578
2    1.355644
3    1.000000
Name: cls_days_to_next_order, dtype: float64
0    5.714340
1    1.672118
2    1.000000
3    1.966235
Name: cls_reorder_ratio, dtype: float64


In [9]:
# Split the data into train and test
from sklearn.model_selection import train_test_split
rnn_data_train, rnn_data_test = train_test_split(rnn_data, test_size=0.1, random_state=RANDOM_SEED)

In [10]:
rnn_data_train, rnn_data_valid = train_test_split(rnn_data_train, test_size=0.05, random_state=RANDOM_SEED)

In [11]:
ds_train = SequenceDataset(
    rnn_data_train, "product_id", "norm_days_since_prior_order",
    ["cls_days_to_next_order","cls_reorder_ratio"]
    )
dl_train = torch.utils.data.DataLoader(
    ds_train, batch_size=BATCH_SIZE, 
    shuffle=True, collate_fn=essay_collate_fn)
ds_valid = SequenceDataset(
    rnn_data_valid, "product_id", "norm_days_since_prior_order",
    ["cls_days_to_next_order","cls_reorder_ratio"]
    )
dl_valid = torch.utils.data.DataLoader(
    ds_valid, batch_size=BATCH_SIZE, 
    shuffle=True, collate_fn=essay_collate_fn)

In [12]:
rnn_model = GRUMultiTask(num_embeddings=NUM_PRODUCTS+1, embedding_dim=32, gru_hidden_size=32, dropout=0.2)
best_model = train_rnn(dl_train, dl_valid, 
        model=rnn_model,
        model_name="initial_model",
        save_path=save_path,
        cls_weight_1 = cls_weight_task_1, 
        cls_weight_2 = cls_weight_task_2,
        opt=torch.optim.Adam(rnn_model.parameters(), lr=0.001),
        epochs=30)

device cuda
Epoch 0000, valid loss: 2.587223, best valid loss: 2.587223
Epoch 0001, valid loss: 2.505964, best valid loss: 2.505964
Epoch 0002, valid loss: 2.491995, best valid loss: 2.491995
Epoch 0003, valid loss: 2.484100, best valid loss: 2.484100
Epoch 0004, valid loss: 2.479378, best valid loss: 2.479378
Epoch 0005, valid loss: 2.474780, best valid loss: 2.474780
Epoch 0006, valid loss: 2.471274, best valid loss: 2.471274
Epoch 0007, valid loss: 2.466479, best valid loss: 2.466479
Epoch 0008, valid loss: 2.473715, best valid loss: 2.466479
Epoch 0009, valid loss: 2.475687, best valid loss: 2.466479
Epoch 0010, valid loss: 2.477807, best valid loss: 2.466479
Epoch 0011, valid loss: 2.501543, best valid loss: 2.466479
Epoch 0012, valid loss: 2.502166, best valid loss: 2.466479
Epoch 0013, valid loss: 2.514728, best valid loss: 2.466479
Epoch 0014, valid loss: 2.552108, best valid loss: 2.466479
Epoch 0015, valid loss: 2.574857, best valid loss: 2.466479
Epoch 0016, valid loss: 2.59

In [13]:
rnn_model = GRUMultiTask(num_embeddings=NUM_PRODUCTS+1, embedding_dim=32, gru_hidden_size=64, dropout=0.2)
best_model = train_rnn(dl_train, dl_valid, 
        model=rnn_model,
        model_name="gru64_lr0.01",
        save_path=save_path,
        cls_weight_1 = cls_weight_task_1, 
        cls_weight_2 = cls_weight_task_2,
        opt=torch.optim.Adam(rnn_model.parameters(), lr=0.01),
        epochs=30)

device cuda
Epoch 0000, valid loss: 2.494356, best valid loss: 2.494356
Epoch 0001, valid loss: 2.491824, best valid loss: 2.491824
Epoch 0002, valid loss: 2.497642, best valid loss: 2.491824
Epoch 0003, valid loss: 2.545732, best valid loss: 2.491824
Epoch 0004, valid loss: 2.662980, best valid loss: 2.491824
Epoch 0005, valid loss: 2.832839, best valid loss: 2.491824
Epoch 0006, valid loss: 2.921568, best valid loss: 2.491824
Epoch 0007, valid loss: 3.177166, best valid loss: 2.491824
Epoch 0008, valid loss: 3.394822, best valid loss: 2.491824
Epoch 0009, valid loss: 3.730756, best valid loss: 2.491824
Epoch 0010, valid loss: 3.700525, best valid loss: 2.491824
Epoch 0011, valid loss: 4.081199, best valid loss: 2.491824
Early stopping, best valid loss: 2.491824


In [16]:
best_initial_model = GRUMultiTask(num_embeddings=NUM_PRODUCTS+1, embedding_dim=32, gru_hidden_size=32, dropout=0.2)
best_initial_model.load_state_dict(torch.load(d_path + "best_initial_model.pt"))
best_initial_model.eval()


GRUMultiTask(
  (embeddings): Embedding(49689, 32, padding_idx=0)
  (gru): GRU(33, 32, num_layers=4, batch_first=True, dropout=0.2)
  (task1_mlp): Sequential(
    (0): Linear(in_features=128, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=128, bias=True)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=4, bias=True)
  )
  (task2_mlp): Sequential(
    (0): Linear(in_features=128, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=128, bias=True)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=4, bias=True)
  )
)

In [15]:
ds_test = SequenceDataset(
    rnn_data_test, "product_id", "norm_days_since_prior_order",
    ["cls_days_to_next_order","cls_reorder_ratio"]
    )
dl_test = torch.utils.data.DataLoader(
    ds_test, batch_size=BATCH_SIZE, 
    shuffle=False, collate_fn=essay_collate_fn)

In [18]:

y_pred_task1 = []
y_pred_task2 = []
best_initial_model.to(device)
with torch.no_grad():
    for (x1b, x2b, sb), yb in dl_test:
        x1b = x1b.to(device)
        x2b = x2b.to(device)
        sb = sb.to(device)
        yb = yb.to(device)
        yb_pred_1, yb_pred_2 = best_initial_model([x1b, x2b, sb])
        y_pred_task1.append(yb_pred_1)
        y_pred_task2.append(yb_pred_2)

In [19]:

y_pred_task1[0].shape

torch.Size([2048, 4])

In [33]:
y_pred_proba_1_np = torch.concat(y_pred_task1).cpu().numpy()
y_pred_proba_1_np = np.exp(y_pred_proba_1_np)/np.expand_dims(np.exp(y_pred_proba_1_np).sum(axis=1),1)
y_pred_proba_1_np

array([[0.3100659 , 0.31290126, 0.2308447 , 0.14618815],
       [0.1613381 , 0.17585288, 0.25485033, 0.40795866],
       [0.15288909, 0.26102057, 0.34558585, 0.24050444],
       ...,
       [0.14260565, 0.25013813, 0.33562815, 0.2716281 ],
       [0.60275805, 0.33198577, 0.05615537, 0.00910071],
       [0.23929557, 0.33022764, 0.28571635, 0.14476043]], dtype=float32)

In [34]:
y_pred_proba_2_np = torch.concat(y_pred_task2).cpu().numpy()
y_pred_proba_2_np = np.exp(y_pred_proba_2_np)/np.expand_dims(np.exp(y_pred_proba_2_np).sum(axis=1),1)
y_pred_proba_2_np

array([[0.2297949 , 0.30278087, 0.27492017, 0.19250412],
       [0.8473206 , 0.08099224, 0.02651683, 0.04517031],
       [0.04239007, 0.16434419, 0.45111993, 0.34214577],
       ...,
       [0.05373659, 0.30021158, 0.43800414, 0.20804776],
       [0.00380106, 0.02847211, 0.38175786, 0.585969  ],
       [0.0348279 , 0.1945189 , 0.48323214, 0.28742105]], dtype=float32)

In [41]:
y_test_pred_1 = np.argmax(y_pred_proba_1_np, axis=1)
y_test_pred_2 = np.argmax(y_pred_proba_2_np, axis=1)


In [42]:
from sklearn.metrics import classification_report
print(classification_report(y_test_pred_1, rnn_data_test["cls_days_to_next_order"]))

              precision    recall  f1-score   support

           0       0.43      0.44      0.43      4484
           1       0.27      0.36      0.31      4321
           2       0.21      0.28      0.24      3388
           3       0.66      0.46      0.54      8428

    accuracy                           0.40     20621
   macro avg       0.39      0.38      0.38     20621
weighted avg       0.45      0.40      0.42     20621



In [44]:
print(classification_report(y_test_pred_2, rnn_data_test["cls_reorder_ratio"]))

              precision    recall  f1-score   support

           0       0.57      0.21      0.31      4185
           1       0.41      0.42      0.41      5290
           2       0.36      0.58      0.45      5699
           3       0.48      0.41      0.44      5447

    accuracy                           0.42     20621
   macro avg       0.46      0.40      0.40     20621
weighted avg       0.45      0.42      0.41     20621

