In [1]:
from copy import copy

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

from recommender.data_processing import map_column, get_context
from recommender.models import Recommender
from recommender.training import Dataset

In [2]:
# Load the raw listening log and order it chronologically by listen timestamp,
# so that later order-preserving splits are time-based.
df = pd.read_csv('data/train.csv').sort_values(by='ts_listen')

In [3]:
# Work on a subset: the 1000 smallest user ids present in the log.
sample_id = np.sort(df["user_id"].unique())[:1000].tolist()

# Independent copy of the selected users' rows, so the mapping step below
# never touches `df`.
selected_rows = df[df["user_id"].isin(sample_id)].copy()

# map_column returns the frame plus a forward and an inverse lookup for
# "media_id" (presumably raw id <-> contiguous index — TODO confirm in
# recommender.data_processing).
sample_data, sample_mapping, sample_inverse_mapping = map_column(
    selected_rows, col_name="media_id"
)

# One group per user; consumed by the per-user split that follows.
gb = sample_data.groupby("user_id")

In [4]:
# Per-user hold-out: for every user, the first 80% of rows go to train and the
# last 20% to test. shuffle=False keeps the existing row order, which is the
# 'ts_listen' ordering applied when the data was loaded.
per_user_splits = [
    train_test_split(gb.get_group(user), test_size=0.2, random_state=0, shuffle=False)
    for user in gb.groups
]

train_list = [train_part for train_part, _ in per_user_splits]
test_list = [test_part for _, test_part in per_user_splits]

In [5]:
# Stitch the per-user pieces back together into a single train frame and a
# single test frame.
train, test = (pd.concat(parts) for parts in (train_list, test_list))

In [6]:
# Only rows flagged is_listened == 1 are used to build training sequences.
data_train = train[train["is_listened"] == 1].copy()

# Group the retained rows by user and keep the list of group keys (user ids);
# both are handed to the Dataset.
grp_by_train = data_train.groupby("user_id")
groups = list(grp_by_train.groups)

In [7]:
# Length of the item history passed to the Dataset
# (presumably the number of items per training sequence — TODO confirm).
history_size = 120

# Training dataset built from the per-user groups of listened rows.
train_data = Dataset(
    groups=groups, grp_by=grp_by_train, split="train", history_size=history_size
)

In [8]:
batch_size = 32

# Batches are served in dataset order (no shuffling) from the main process
# (no worker subprocesses).
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False, num_workers=0)

In [9]:
# Output locations for TensorBoard logs and model checkpoints.
log_dir = "recommender_logs"
model_dir = "recommender_models"
# model_dir = "recommender_models_cpu"

# Vocabulary is the number of mapped media ids plus two extra slots
# (presumably reserved for the PAD and MASK token ids — TODO confirm).
model = Recommender(vocab_size=len(sample_mapping) + 2, lr=1e-4, dropout=0.3)

logger = TensorBoardLogger(save_dir=log_dir)

# Keep the checkpoint with the lowest "valid_loss". The filename is left empty;
# the recorded run output shows checkpoints named like "epoch=91-step=368.ckpt".
checkpoint_callback = ModelCheckpoint(
    monitor="valid_loss", mode="min", dirpath=model_dir, filename=""
)

In [11]:
epochs = 100

# accelerator="auto" lets Lightning pick whatever hardware is present (CUDA,
# MPS or CPU). The previous hard-coded "gpu" raises on machines without a GPU,
# which made the notebook fail outside the original training host.
trainer = pl.Trainer(
    max_epochs=epochs,
    logger=logger,
    accelerator="auto",
    devices=1,
    callbacks=[checkpoint_callback],
)

# NOTE(review): the training loader is also passed as the validation loader, so
# the checkpoint monitor ("valid_loss") is presumably computed on training
# batches. Swap in a loader built from the held-out `test` frame once one exists.
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=train_loader)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

  | Name                | Type               | Params
-----------------------------------------------------------
0 | item_embeddings     | Embedding          | 23.6 M
1 | input_pos_embedding | Embedding          | 65.5 K
2 | encoder             | TransformerEncoder | 3.6 M 
3 | linear_out          | Linear             | 23.8 M
4 | do                  | Dropout            | 0     
-----------------------------------------------------------
51.1 M    Trainable params
0         Non-trainable params
51.1 M    Total params
204.309   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [12]:
# NOTE(review): `val_loader` is not created in any cell visible above (only
# `train_loader` is), so this cell fails with NameError on a fresh
# Restart & Run All — TODO build a validation loader from the held-out split.
# No model is passed to `test`; the recorded output shows the weights being
# restored from the best checkpoint before evaluation.
result_val = trainer.test(dataloaders=val_loader)

# Summarise the evaluation: the loss of the first (only) dataloader and the
# path of the best checkpoint tracked by the callback.
output_json = dict(
    val_loss=result_val[0]["test_loss"],
    best_model_path=checkpoint_callback.best_model_path,
)

print(output_json)


  rank_zero_warn(
Restoring states from the checkpoint path at /home/studio-lab-user/sagemaker-studiolab-notebooks/recommender_models/epoch=91-step=368.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /home/studio-lab-user/sagemaker-studiolab-notebooks/recommender_models/epoch=91-step=368.ckpt
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_accuracy                 0.0
        test_loss           11.085309982299805
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
{'val_loss': 11.085309982299805, 'best_model_path': '/home/studio-lab-user/sagemaker-studiolab-notebooks/recommender_models/epoch=91-step=368.ckpt'}


In [22]:
# Special token ids used when building model inputs.
PAD = 0
MASK = 1

# Single-item history: left-pad up to `history_size`, then the one known item,
# then a MASK token in the last slot for the model to fill in.
# The item index comes from `sample_mapping`, the mapping that sized the
# model's vocabulary; `mapping` is not defined at this point on a fresh run.
# A media_id outside the sampled users' data raises KeyError here.
ids = [PAD] * (history_size - 1 - 1) + [sample_mapping[206493]] + [MASK]

# Add a leading batch dimension of size 1.
src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)

In [24]:
# Switch to evaluation mode first: the model is built with dropout=0.3, and
# leaving it in training mode makes the predictions random from call to call.
model.eval()

# No gradients are needed for inference. The input is moved to whichever
# device the model currently lives on so the forward pass does not fail with a
# device mismatch after training on an accelerator.
with torch.no_grad():
    prediction = model(src.to(model.device))

In [29]:
# Scores for the last position of the first (only) sequence — the slot where
# `ids` holds the MASK token. `.cpu()` is a no-op for CPU tensors and makes the
# NumPy conversion work when the model ran on an accelerator.
masked_pred = prediction[0, -1].cpu().numpy()

# Candidate indices ordered from highest to lowest score.
sorted_predicted_ids = np.argsort(masked_pred)[::-1].tolist()

# Drop everything already present in the input (PAD, MASK and the seed item).
# A set keeps each membership check O(1) across the whole vocabulary.
seen_ids = set(ids)
sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in seen_ids]

# return [idx_to_movie[a] for a in sorted_predicted_ids[:30] if a in idx_to_movie]

In [2]:
# Builds a media_id mapping and its inverse via map_column.
# NOTE(review): `data` is not assigned in any cell visible above, and the
# recorded output of this cell is a NameError (it was run on a kernel without
# the imports) — TODO confirm which frame this was meant to map, or remove the
# cell in favour of `sample_mapping` / `sample_inverse_mapping`.
data, mapping, inverse_mapping = map_column(data, col_name="media_id")

NameError: name 'map_column' is not defined

In [1]:
# Displays the entry stored under key 206493 in `mapping`
# (presumably the mapped index of that media_id — TODO confirm).
# NOTE(review): the recorded output is a NameError because `mapping` did not
# exist in that kernel session.
mapping[206493]

NameError: name 'mapping' is not defined

In [5]:
# data.query(f"media_id=={inverse_mapping[36474]}")

# data.query(f"user_id==[88,11]").media_id_mapped.unique()


In [6]:
# sorted_predicted_ids[:30]