In [None]:
!pip install pytorch_lightning==1.2.8

In [48]:
import pandas as pd
import numpy as np
import time
import datetime
import random

import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import torch.nn as nn
from torch.nn import functional as F

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

from recommender import Recommender

# Load and preprocess data

In [3]:
def map_col(data, col):
    """Assign a contiguous integer id to every distinct value of ``data[col]``.

    Ids are handed out in sorted order of the distinct values and start at 2,
    so ids 0 and 1 are never produced by this function.

    Args:
        data: DataFrame containing ``col``. It is modified in place: a new
            column named ``f"{col}_mapped"`` holding the integer ids is added.
        col: Name of the column to encode.

    Returns:
        A tuple ``(data, mapping, inverse_mapping)`` where ``data`` is the same
        DataFrame object that was passed in, ``mapping`` maps original value to
        id and ``inverse_mapping`` maps id back to original value.
    """
    distinct_values = sorted(data[col].unique())

    mapping = {}
    inverse_mapping = {}
    for item_id, value in enumerate(distinct_values, start=2):
        mapping[value] = item_id
        inverse_mapping[item_id] = value

    data[f"{col}_mapped"] = data[col].map(mapping)

    return data, mapping, inverse_mapping

In [5]:
data_csv_path = 'ratings.csv'

# Read the ratings and order them by timestamp in a single chained expression.
data = pd.read_csv(data_csv_path).sort_values(by="timestamp")

# map_col adds the "movieId_mapped" column to `data` and returns that same frame,
# so `train_data` and `data` refer to one DataFrame from here on.
train_data, mapping, inverse_mapping = map_col(data, "movieId")

# One group per user; `groups` is the list of group keys (the user ids).
grp_by_train = data.groupby(by="userId")
groups = list(grp_by_train.groups.keys())

# Build Dataset

In [6]:
# Special token ids. map_col starts real item ids at 2, so 0 and 1 never collide
# with an actual movie.
PAD = 0   # filler used by pad_list to bring sequences up to a fixed length
MASK = 1  # placeholder put in the input where the model has to predict the item

In [7]:
def get_context(df: pd.DataFrame, split: str, context_size: int = 120, val_context_size: int = 5):
    """Select a window of at most ``context_size`` consecutive rows from ``df``.

    The window always ends at ``end_index`` (exclusive) and reaches back up to
    ``context_size`` rows:

    * ``split == "train"``: ``end_index`` is drawn uniformly from
      ``[10, len(df) - val_context_size]`` (both inclusive), so the last
      ``val_context_size`` rows are never part of a training window. When the
      frame is too short for that range (``len(df) - val_context_size < 10``)
      the window ends at the last row instead.
    * ``split in ("val", "test")``: the window ends at the last row.

    Args:
        df: Rows of one group. Presumably already in chronological order —
            the caller sorts by timestamp before grouping; confirm if reused.
        split: One of ``"train"``, ``"val"`` or ``"test"``.
        context_size: Maximum number of rows to return.
        val_context_size: Number of trailing rows kept out of training windows.

    Returns:
        The selected rows as a DataFrame (a positional slice of ``df``).

    Raises:
        ValueError: If ``split`` is not one of the supported names.
    """
    n_rows = df.shape[0]

    if split == "train":
        if n_rows - val_context_size < 10:
            end_index = n_rows
        else:
            # random.randint is inclusive on both ends.
            end_index = random.randint(10, n_rows - val_context_size)
    elif split in ["val", "test"]:
        end_index = n_rows
    else:
        raise ValueError(f"Unknown split {split!r}; expected 'train', 'val' or 'test'")

    start_index = max(0, end_index - context_size)

    # .iloc makes the positional intent explicit, independent of the index type.
    return df.iloc[start_index:end_index]

In [8]:
def pad_list(list_integers, history_size: int, pad_val: int = PAD, mode="left"):
    """Pad ``list_integers`` with ``pad_val`` up to ``history_size`` elements.

    Args:
        list_integers: List to pad.
        history_size: Target length. Lists that are already this long or longer
            are returned unchanged (they are not truncated).
        pad_val: Value used as filler.
        mode: ``"left"`` puts the filler in front of the list; any other value
            appends it at the end.

    Returns:
        The padded list, or the original list object when no padding is needed.
    """
    missing = history_size - len(list_integers)
    if missing <= 0:
        return list_integers

    filler = [pad_val] * missing
    return filler + list_integers if mode == "left" else list_integers + filler

In [9]:
def mask_list(l1, p=0.8):
    """Return a copy of ``l1`` in which elements are randomly replaced by ``MASK``.

    Each element is kept when a fresh ``random.random()`` draw is below ``p``
    and replaced by ``MASK`` otherwise, so ``p`` is the keep probability.
    """
    masked = []
    for item in l1:
        keep = random.random() < p
        masked.append(item if keep else MASK)
    return masked

def mask_last_elements_list(l1, val_context_size: int = 5):
    """Randomly mask only the last ``val_context_size`` elements of ``l1``.

    The leading part of the list is copied unchanged; each of the trailing
    ``val_context_size`` elements is passed through ``mask_list`` with a keep
    probability of 0.5.

    Args:
        l1: List of item ids.
        val_context_size: Number of trailing elements eligible for masking.
            Values larger than ``len(l1)`` make the whole list eligible; values
            of 0 or less leave the list unmasked.

    Returns:
        A new list of the same length as ``l1``.
    """
    # Compute the split point explicitly: with negative-index slicing,
    # val_context_size == 0 would turn `l1[:-0]` into an empty list and `l1[-0:]`
    # into the whole list, masking everything instead of nothing.
    split_at = max(len(l1) - val_context_size, 0)
    return l1[:split_at] + mask_list(l1[split_at:], p=0.5)

In [10]:
class Dataset(torch.utils.data.Dataset):
    """Per-group sequence dataset producing (masked input, target) id tensors.

    Each item corresponds to one key of ``groups``. Its rows are fetched from
    ``grp_by``, cut to a window with ``get_context``, masked, and padded to
    ``history_size``.

    Args:
        groups: Sequence of group keys; its length is the dataset length.
        grp_by: Object exposing ``get_group(key)`` that returns the rows of a group.
        split: Passed on to ``get_context``; ``"train"`` additionally selects
            random masking over the whole window, any other value masks only
            the trailing elements.
        history_size: Window size and padded sequence length.
        col: Base column name; ids are read from ``f"{col}_mapped"``.
    """

    def __init__(self, groups, grp_by, split, history_size=120, col='recipe_id'):
        self.groups = groups
        self.grp_by = grp_by
        self.split = split
        self.history_size = history_size
        self.col = col

    def __len__(self):
        """Number of groups, i.e. one sample per group key."""
        return len(self.groups)

    def __getitem__(self, idx):
        """Build the ``(src_items, trg_items)`` pair of long tensors for group ``idx``."""
        group_key = self.groups[idx]
        group_rows = self.grp_by.get_group(group_key)

        window = get_context(group_rows, split=self.split, context_size=self.history_size)
        target_ids = window[f"{self.col}_mapped"].tolist()

        # Training masks anywhere in the window; evaluation masks only the tail.
        if self.split == "train":
            source_ids = mask_list(target_ids)
        else:
            source_ids = mask_last_elements_list(target_ids)

        # The padding side is chosen at random, and shared by input and target so
        # that positions stay aligned.
        side = "right" if random.random() >= 0.5 else "left"
        padded_target = pad_list(target_ids, history_size=self.history_size, mode=side)
        padded_source = pad_list(source_ids, history_size=self.history_size, mode=side)

        return (
            torch.tensor(padded_source, dtype=torch.long),
            torch.tensor(padded_target, dtype=torch.long),
        )

In [11]:
# Sort through `data`, which is the same DataFrame object map_col returned as `train_data`.
# `train_data` is rebound to a Dataset in a later cell, so going through that name would
# make this cell fail when it is re-run afterwards.
data.sort_values(by='timestamp', inplace=True)

In [12]:
# Encode from `data` rather than `train_data`: map_col returns the very frame it is given,
# so the result is the same on a linear run, but the cell also survives a re-run after
# `train_data` has been rebound to a Dataset further down.
train_data, mapping, inverse_mapping = map_col(data, "movieId")

# One group per user; `groups` is the list of group keys (the user ids).
grp_by_train = train_data.groupby(by="userId")
groups = list(grp_by_train.groups)

In [13]:
# Window size and padded sequence length for every sample.
history_size = 120

# NOTE: from here on `train_data` is a Dataset, no longer the ratings DataFrame.
# Train and validation datasets share the same users and the same grouped frame;
# only `split` differs, which changes how get_context picks the window and which
# masking function is applied.
# NOTE(review): for users where len(history) - 5 < 10, get_context lets the training
# window run to the end of the history, so it overlaps the items masked for validation.
train_data = Dataset(groups=groups,
                     grp_by=grp_by_train,
                     split="train",
                     history_size=history_size,
                     col='movieId')

val_data = Dataset(groups=groups,
                   grp_by=grp_by_train,
                   split="val",
                   history_size=history_size,
                   col='movieId')

In [14]:
# Both datasets are built from the same `groups` list, so the two sizes are equal
# (one sample per user).
print("len(train_data)", len(train_data))
print("len(val_data)", len(val_data))

len(train_data) 6040
len(val_data) 6040


In [15]:
# Number of (src, trg) sequence pairs per batch.
BATCH_SIZE = 32

# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_data,
                          batch_size=BATCH_SIZE,
                          num_workers=2,
                          shuffle=True,)

# Validation keeps the dataset order. The samples themselves are still randomised,
# because Dataset.__getitem__ draws the masking and the padding side at random.
val_loader = DataLoader(val_data,
                        batch_size=BATCH_SIZE,
                        num_workers=2,
                        shuffle=False,)

# Build model and model trainer

In [16]:
def masked_ce(y_pred, y_true, mask):
    """Cross-entropy averaged over the positions selected by ``mask``.

    Args:
        y_pred: Logits accepted by ``F.cross_entropy`` together with ``y_true``.
        y_true: Target class indices.
        mask: Tensor multiplied element-wise with the per-position loss; positions
            where it is zero/False do not contribute.

    Returns:
        Scalar tensor: sum of the selected losses divided by ``mask.sum() + 1e-8``.
        The small constant keeps the result finite (zero) when nothing is selected.
    """
    per_position = F.cross_entropy(y_pred, y_true, reduction='none')
    selected_total = (per_position * mask).sum()
    n_selected = mask.sum() + 1e-8
    return selected_total / n_selected

In [17]:
def masked_accuracy(y_pred: torch.Tensor, y_true: torch.Tensor, mask: torch.Tensor):
    """Top-1 accuracy restricted to the positions where ``mask`` is True.

    Args:
        y_pred: Scores with the classes along dimension 1.
        y_true: Target class indices, one per row of ``y_pred``.
        mask: Boolean tensor selecting the positions to evaluate.

    Returns:
        Scalar float64 tensor with the fraction of selected positions whose
        highest-scoring class equals the target. It is NaN when ``mask`` selects
        nothing (mean of an empty tensor).
    """
    predicted = torch.max(y_pred, dim=1).indices

    selected_true = y_true.masked_select(mask)
    selected_pred = predicted.masked_select(mask)

    hits = selected_true.eq(selected_pred)
    return hits.to(torch.float64).mean()


In [19]:
class TrainingModule(pl.LightningModule):
  """Lightning wrapper that trains a ``Recommender`` to fill in masked items.

  The loss and accuracy are computed only at the positions where the input
  sequence holds the mask token id.

  Args:
    vocab_size: Passed to ``Recommender(vocab_size=...)``.
    mask: Token id marking the positions to predict.
    lr: Learning rate for the Adam optimizer.
  """

  def __init__(self, vocab_size, mask=1, lr=1e-4):
    super().__init__()

    self.mask = mask
    self.lr = lr
    self.vocab_size = vocab_size

    self.model = Recommender(vocab_size=vocab_size)

  def forward(self, src_items):
    """Run the wrapped recommender on a batch of input id sequences."""
    return self.model(src_items)

  def _shared_step(self, batch, stage):
    """Compute and log masked loss/accuracy for one batch.

    Args:
      batch: ``(src_items, y_true)`` pair as produced by the data loaders.
      stage: Metric name prefix; metrics are logged as ``f"{stage}_loss"`` and
        ``f"{stage}_accuracy"``.

    Returns:
      The masked cross-entropy loss.
    """
    src_items, y_true = batch

    y_pred = self(src_items)

    # The model output is 3-D; merge the first two dimensions so that every
    # sequence position becomes one row of scores, aligned with the flattened
    # targets and inputs.
    y_pred = y_pred.view(-1, y_pred.size(2))
    y_true = y_true.view(-1)

    # Only positions that were masked in the input are scored.
    mask = src_items.view(-1) == self.mask

    loss = masked_ce(y_pred=y_pred, y_true=y_true, mask=mask)
    accuracy = masked_accuracy(y_pred=y_pred, y_true=y_true, mask=mask)

    self.log(f"{stage}_loss", loss)
    self.log(f"{stage}_accuracy", accuracy)

    return loss

  def training_step(self, batch, batch_idx):
    """Log ``train_loss`` / ``train_accuracy`` and return the loss."""
    return self._shared_step(batch, "train")

  def validation_step(self, batch, batch_idx):
    """Log ``valid_loss`` / ``valid_accuracy`` and return the loss."""
    return self._shared_step(batch, "valid")

  def test_step(self, batch, batch_idx):
    """Log ``test_loss`` / ``test_accuracy`` and return the loss."""
    return self._shared_step(batch, "test")

  def configure_optimizers(self):
    """Adam plus a plateau scheduler driven by the logged ``valid_loss``."""
    optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, patience=10, factor=0.1
    )

    return {
        'optimizer': optimizer,
        'lr_scheduler': scheduler,
        # Must match the metric name logged during validation.
        'monitor': 'valid_loss'
    }

In [20]:
# map_col assigns ids 2..len(mapping)+1, and ids 0 (PAD) and 1 (MASK) are reserved,
# hence the vocabulary holds len(mapping) + 2 ids.
training_module = TrainingModule(vocab_size=len(mapping) + 2, lr=1e-4,)

In [21]:
# Output locations and training length.
LOG_DIR = './logger'
MODEL_DIR = './model'
EPOCHS = 100

# TensorBoard event files are written under LOG_DIR.
logger = TensorBoardLogger(
    save_dir=LOG_DIR,
)

# Keep the checkpoint with the lowest "valid_loss" (the name logged by
# TrainingModule.validation_step) as MODEL_DIR/recommender.
checkpoint_callback = ModelCheckpoint(
    monitor="valid_loss",
    mode="min",
    dirpath=MODEL_DIR,
    filename="recommender",
)

In [22]:
# gpus=0 keeps training on the CPU (the saved output reports no GPU in use).
trainer = pl.Trainer(
    max_epochs=EPOCHS,
    gpus=0,
    logger=logger,
    callbacks=[checkpoint_callback],
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


In [23]:
# Train on train_loader and validate on val_loader (same users, "val" split).
trainer.fit(training_module, train_loader, val_loader)


  | Name  | Type        | Params
--------------------------------------
0 | model | Recommender | 4.6 M 
--------------------------------------
4.6 M     Trainable params
0         Non-trainable params
4.6 M     Total params
18.307    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]



1

In [24]:
# NOTE(review): no separate test set is built in this notebook; the "test" metrics
# are computed on val_loader, the same loader used for validation during fit.
trainer.test(test_dataloaders=val_loader)

Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_accuracy': 0.0022183728870004416, 'test_loss': 7.696439743041992}
--------------------------------------------------------------------------------


[{'test_accuracy': 0.0022183728870004416, 'test_loss': 7.696439743041992}]

# Recommend movie

In [25]:
def predict(list_items, model, item_to_idx, idx_to_item, history_size=120, top_k=30):
    """Recommend items to follow a given history.

    The history is encoded as ids, left-padded with ``PAD`` and followed by a
    single ``MASK``; the model's scores at that last (masked) position are used
    to rank all ids.

    Args:
        list_items: Items the user has already seen, oldest first. Every entry
            must be a key of ``item_to_idx``.
        model: Callable taking a ``(1, seq_len)`` long tensor of ids and
            returning a tensor indexable as ``[0, -1]`` to get one score per id.
        item_to_idx: Maps an item to its integer id.
        idx_to_item: Maps an integer id back to its item.
        history_size: Length of the sequence fed to the model. If the history
            holds more than ``history_size - 1`` items, only the most recent
            ones are fed to the model.
        top_k: Maximum number of recommendations to return.

    Returns:
        Up to ``top_k`` items, best score first, excluding everything in
        ``list_items`` and any id that has no entry in ``idx_to_item``.

    Raises:
        KeyError: If an entry of ``list_items`` is not in ``item_to_idx``.
    """
    history_ids = [item_to_idx[a] for a in list_items]

    # Leave one slot for the trailing MASK and keep only the most recent items,
    # so the input never grows beyond history_size.
    max_items = max(history_size - 1, 0)
    recent_ids = history_ids[max(len(history_ids) - max_items, 0):]
    ids = [PAD] * (max_items - len(recent_ids)) + recent_ids + [MASK]

    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)

    # Score in eval mode so train-only layers (e.g. dropout) do not add noise,
    # then put the model back into whatever mode it was in.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            prediction = model(src)
    finally:
        if was_training:
            model.train()

    # Scores at the last position, i.e. where MASK was placed.
    masked_pred = prediction[0, -1].cpu().numpy()

    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]

    # Exclude the special tokens and everything the user supplied (including
    # items that were cut off by the truncation above).
    seen = set(history_ids)
    seen.update((PAD, MASK))

    # Filter before slicing so unknown ids do not use up top_k slots.
    candidates = [a for a in sorted_predicted_ids if a not in seen and a in idx_to_item]

    return [idx_to_item[a] for a in candidates[:top_k]]

In [26]:
# Movie metadata; the cells below use its `movieId` and `title` columns.
movies = pd.read_csv('movies.csv')

In [29]:
# Keep only the movies that occur in the ratings, i.e. those that received a model id.
movie_ids = data['movieId'].unique().tolist()
movies = movies[movies['movieId'].isin(movie_ids)]

In [32]:
# title -> model id (as produced by map_col on "movieId").
movie_to_idx = {a: mapping[b] for a, b in zip(movies.title.tolist(), movies.movieId.tolist()) if b in mapping}
# model id -> title, built directly from the movie table instead of inverting
# movie_to_idx: if two movies shared a title, the inversion would keep only one of
# their ids and the other could never be recommended.
idx_to_movie = {mapping[b]: a for a, b in zip(movies.title.tolist(), movies.movieId.tolist()) if b in mapping}

In [49]:
# Seed history for the first query. Titles must match the keys of movie_to_idx
# exactly, because predict looks each one up with item_to_idx[title].
list_movies = ['Snow White and the Seven Dwarfs (1937)',
               'Cinderella (1950)',
               'Beauty and the Beast (1991)',
               'Mulan (1998)'
               ]

# Titles ranked by the model's score at the masked position after the seed list.
top_movie = predict(list_movies, training_module, movie_to_idx, idx_to_movie)
top_movie

['Star Wars: Episode I - The Phantom Menace (1999)',
 'Saving Private Ryan (1998)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'American Beauty (1999)',
 'Annie Hall (1977)',
 'Being John Malkovich (1999)',
 'Pulp Fiction (1994)',
 'Braveheart (1995)',
 'Silence of the Lambs, The (1991)',
 'Forrest Gump (1994)',
 'Raiders of the Lost Ark (1981)',
 'E.T. the Extra-Terrestrial (1982)',
 'Total Recall (1990)',
 'Men in Black (1997)',
 'Lethal Weapon (1987)',
 'Jurassic Park (1993)',
 'Shawshank Redemption, The (1994)',
 'Gladiator (2000)',
 'Clerks (1994)',
 'Indiana Jones and the Last Crusade (1989)',
 'Princess Bride, The (1987)',
 'Ghostbusters (1984)',
 'Back to the Future (1985)',
 'Casablanca (1942)',
 'Terminator 2: Judgment Day (1991)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Monty Python and the Holy Grail (1974)',
 'North by Northwest (1959)',
 'Thelma & Louise (1991)',
 'As Good As It Gets (1997)']

In [51]:
# Seed history for the second query (Star Wars / Star Trek titles).
list_movies = ['Star Wars: Episode I - The Phantom Menace (1999)',
               'Star Trek: Generations (1994)',
               'Star Wars: Episode IV - A New Hope (1977)',
               'Star Trek III: The Search for Spock (1984)',
               'Star Trek IV: The Voyage Home (1986)',
               ]

# NOTE(review): the saved output of this cell is largely the same set of titles as the
# output for the previous seed list — worth confirming that the model's ranking really
# depends on the input history.
top_movie = predict(list_movies, training_module, movie_to_idx, idx_to_movie)
top_movie

['Saving Private Ryan (1998)',
 'Silence of the Lambs, The (1991)',
 'Braveheart (1995)',
 'Pulp Fiction (1994)',
 'American Beauty (1999)',
 'Ghostbusters (1984)',
 'Being John Malkovich (1999)',
 'Raiders of the Lost Ark (1981)',
 'Princess Bride, The (1987)',
 'E.T. the Extra-Terrestrial (1982)',
 'Annie Hall (1977)',
 'Shawshank Redemption, The (1994)',
 'Back to the Future (1985)',
 'Abyss, The (1989)',
 'Men in Black (1997)',
 'Clerks (1994)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Magnolia (1999)',
 'North by Northwest (1959)',
 'Alien (1979)',
 'Jurassic Park (1993)',
 'Lethal Weapon (1987)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Gladiator (2000)',
 'Clueless (1995)',
 'Thelma & Louise (1991)',
 'Terminator 2: Judgment Day (1991)',
 "Schindler's List (1993)",
 'Casablanca (1942)',
 'Titanic (1997)']