In [1]:
!pip install pytorch_lightning==1.2.8



In [2]:
import numpy as np
import pandas as pd
import time
import datetime
import random

import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import torch.nn as nn
from torch.nn import functional as F

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

from recommender import Recommender

In [3]:
# Reserved token ids. Real recipe ids are mapped to 2 and above (see map_col).
# PAD fills sequences up to the fixed history length.
PAD = 0
# MASK replaces the items the model has to predict.
MASK = 1

# Load and preprocess data

In [4]:
# Raw user-recipe interactions; the cells below use the columns
# 'user_id', 'recipe_id', 'rating' and 'date'.
data = pd.read_csv('interactions_train.csv')

Filter data:

Keep only interactions with rating >= 4.0 (the user liked the food).

Keep only recipes with more than 40 such ratings (removes rarely eaten food).

Keep only users with more than 10 remaining ratings.

In [5]:
# Keep only positive interactions (rating of 4.0 or higher).
data = data[data['rating'] >= 4.0]

In [6]:
# Recipe ids that have more than 40 rows left in `data`.
ratings_per_item = data.groupby(by='recipe_id').size()
item_group = ratings_per_item[ratings_per_item > 40].reset_index()['recipe_id']

In [7]:
# Drop interactions with recipes that did not pass the popularity threshold.
data = data[data['recipe_id'].isin(item_group)]

In [8]:
# User ids that have more than 10 rows left in `data`.
ratings_per_user = data.groupby(by='user_id').size()
user_group = ratings_per_user[ratings_per_user > 10].reset_index()['user_id']

In [9]:
# Drop interactions of users that did not pass the activity threshold.
data = data[data['user_id'].isin(user_group)]

In [10]:
# data.shape is a (rows, columns) tuple, so the whole tuple is printed.
print(f'Number of records: {data.shape}')

Number of records: (77057, 6)


In [11]:
# .unique().shape is a one-element tuple holding the number of distinct recipes.
print(f'Number of unique dishes: {data["recipe_id"].unique().shape}')

Number of unique dishes: (1413,)


In [12]:
def to_timestamp(date):
    """Convert a ``YYYY-MM-DD`` date string into an integer Unix timestamp.

    The string is parsed as a naive datetime (midnight of that day) and
    converted with ``time.mktime``, which interprets it in the local timezone
    of the machine running the notebook.
    """
    parsed = datetime.datetime.strptime(date, "%Y-%m-%d")
    seconds = time.mktime(parsed.timetuple())
    return int(seconds)

In [13]:
def map_col(data, col, first_index: int = 2):
    """Map the distinct values of ``data[col]`` to consecutive integer ids.

    Values are sorted and numbered starting at ``first_index``. The default of
    2 leaves ids 0 and 1 free for the PAD and MASK tokens.

    Args:
        data: DataFrame containing ``col``. It is modified in place: a new
            column named ``col + '_mapped'`` is added.
        col: name of the column to encode.
        first_index: id assigned to the smallest value.

    Returns:
        A tuple ``(data, mapping, inverse_mapping)`` where ``mapping`` goes
        from original value to id and ``inverse_mapping`` from id to value.
    """
    unique_values = sorted(data[col].unique())
    mapping = {value: first_index + position for position, value in enumerate(unique_values)}
    inverse_mapping = {index: value for value, index in mapping.items()}

    data[col + '_mapped'] = data[col].map(mapping)

    return data, mapping, inverse_mapping

In [14]:
# Build a new, chronologically sorted frame instead of assigning a column on
# a filtered slice and sorting in place (avoids SettingWithCopyWarning and
# keeps the cell free of in-place mutation).
data = (
    data
    .assign(timestamp=data['date'].apply(to_timestamp))
    .sort_values(by='timestamp')
)

In [15]:
# Encode recipe ids as consecutive integers; adds the 'recipe_id_mapped' column.
data, mapping, inverse_mapping = map_col(data, "recipe_id")

# One group per user; `groups` lists the group keys (user ids) for index lookup.
grp_by_train = data.groupby(by="user_id")
groups = list(grp_by_train.groups.keys())

# Build Dataset

In [16]:
def pad_list(list_integers, history_size: int, pad_val: int = PAD, mode="left"):
    """Pad ``list_integers`` with ``pad_val`` up to ``history_size`` elements.

    Args:
        list_integers: sequence of token ids.
        history_size: target length.
        pad_val: value used as filler.
        mode: ``"left"`` prepends the filler; any other value appends it.

    Returns:
        The padded list. A list that is already ``history_size`` long or
        longer is returned unchanged (it is not truncated).
    """
    missing = history_size - len(list_integers)
    if missing <= 0:
        return list_integers

    filler = [pad_val] * missing
    if mode == "left":
        return filler + list_integers
    return list_integers + filler

In [17]:
def mask_list(l1, p=0.8):
    """Return a copy of ``l1`` with some items replaced by ``MASK``.

    Each item is kept with probability ``p`` and replaced by ``MASK``
    otherwise, using one ``random.random()`` draw per item.
    """
    masked = []
    for item in l1:
        keep = random.random() < p
        masked.append(item if keep else MASK)
    return masked

def mask_last_elements_list(l1, val_context_size: int = 5):
    """Randomly mask items among the last ``val_context_size`` elements of ``l1``.

    The leading part of the list is left untouched; each of the trailing
    ``val_context_size`` items is replaced by the mask token with probability
    0.5 (see ``mask_list``). Lists shorter than ``val_context_size`` are
    treated as trailing context in full.

    Args:
        l1: list of token ids.
        val_context_size: number of trailing items eligible for masking.
            Zero (or a negative value) leaves the list unmasked.

    Returns:
        A new list of the same length as ``l1``.
    """
    # Split on an explicit index: with negative slicing, `l1[:-0]` is empty
    # and `l1[-0:]` is the whole list, so a size of 0 would mask everything.
    split_at = max(len(l1) - max(val_context_size, 0), 0)
    return l1[:split_at] + mask_list(l1[split_at:], p=0.5)

In [18]:
def get_context(df: pd.DataFrame, split: str, context_size: int = 120, val_context_size: int = 5):
    """Select the window of rows of ``df`` used as one sample.

    For ``split == "train"`` the window ends at a random row index drawn from
    ``[10, len(df) - val_context_size]`` (inclusive), unless fewer than 10 rows
    would remain after removing ``val_context_size`` rows, in which case the
    window ends at the last row. For ``"val"`` and ``"test"`` the window
    always ends at the last row.

    Args:
        df: rows of a single group.
        split: one of ``"train"``, ``"val"`` or ``"test"``.
        context_size: maximum number of rows in the returned window.
        val_context_size: number of trailing rows excluded from the random
            end position in training.

    Returns:
        The slice of ``df`` holding at most ``context_size`` rows up to the
        chosen end index.

    Raises:
        ValueError: if ``split`` is not one of the supported values.
    """
    if split not in ("train", "val", "test"):
        raise ValueError

    n_rows = df.shape[0]
    latest_train_end = n_rows - val_context_size

    if split == "train" and latest_train_end >= 10:
        end_index = random.randint(10, latest_train_end)
    else:
        end_index = n_rows

    start_index = max(0, end_index - context_size)
    return df[start_index:end_index]

In [19]:
class Dataset(torch.utils.data.Dataset):
    """Masked-item dataset yielding one ``(source, target)`` pair per group.

    Each sample is the mapped recipe-id sequence of one group: the target is
    the padded sequence itself, the source is the same sequence with some
    items replaced by the mask token.
    """

    def __init__(self, groups, grp_by, split, history_size=120):
        """Store the group keys, the groupby object, the split and the sequence length."""
        self.groups, self.grp_by = groups, grp_by
        self.split, self.history_size = split, history_size

    def __len__(self):
        """Number of groups, i.e. number of samples."""
        return len(self.groups)

    def __getitem__(self, idx):
        """Build the ``(src_items, trg_items)`` long tensors for group ``idx``."""
        group_rows = self.grp_by.get_group(self.groups[idx])
        context = get_context(group_rows, split=self.split, context_size=self.history_size)

        targets = context["recipe_id_mapped"].tolist()

        # Training masks anywhere in the sequence; other splits mask only the tail.
        masker = mask_list if self.split == "train" else mask_last_elements_list
        sources = masker(targets)

        # Padding side is drawn at random for every sample, for all splits.
        pad_mode = "right" if random.random() >= 0.5 else "left"
        padded_sources = pad_list(sources, history_size=self.history_size, mode=pad_mode)
        padded_targets = pad_list(targets, history_size=self.history_size, mode=pad_mode)

        return (
            torch.tensor(padded_sources, dtype=torch.long),
            torch.tensor(padded_targets, dtype=torch.long),
        )

In [20]:
history_size = 50

# Both datasets are built from the same groups; only the split (and therefore
# the windowing and masking strategy) differs.
dataset_kwargs = dict(groups=groups, grp_by=grp_by_train, history_size=history_size)

train_data = Dataset(split="train", **dataset_kwargs)
val_data = Dataset(split="val", **dataset_kwargs)

In [21]:
# Both datasets index the same `groups`, so the two lengths are equal.
print("len(train_data)", len(train_data))
print("len(val_data)", len(val_data))

len(train_data) 2855
len(val_data) 2855


In [22]:
BATCH_SIZE = 64

# Settings shared by both loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=2)

train_loader = DataLoader(train_data, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_data, shuffle=False, **loader_kwargs)

# Build model and model trainer

In [23]:
def masked_ce(y_pred, y_true, mask):
    """Cross-entropy averaged over the positions selected by ``mask``.

    Args:
        y_pred: logits accepted by ``F.cross_entropy``.
        y_true: target class indices.
        mask: per-position weights (boolean or 0/1); positions with a zero
            weight do not contribute to the loss.

    Returns:
        Sum of the selected per-position losses divided by the number of
        selected positions. A small epsilon in the denominator makes the
        result 0 instead of NaN when nothing is selected.
    """
    per_position = F.cross_entropy(y_pred, y_true, reduction='none')
    masked_total = (per_position * mask).sum()
    n_selected = mask.sum()
    return masked_total / (n_selected + 1e-8)

In [24]:
def masked_accuracy(y_pred: torch.Tensor, y_true: torch.Tensor, mask: torch.Tensor):
    """Top-1 accuracy restricted to the positions selected by ``mask``.

    Args:
        y_pred: scores with the class dimension at index 1.
        y_true: target class indices, one per row of ``y_pred``.
        mask: boolean selector with the same shape as ``y_true``.

    Returns:
        A scalar double tensor with the fraction of selected positions whose
        highest-scoring class equals the target. When no position is selected
        the result is 0.0 rather than NaN, so a batch without masked items
        cannot poison the logged metric.
    """
    _, predicted = torch.max(y_pred, 1)

    mask = mask.bool()  # masked_select requires a boolean mask
    hits = (torch.masked_select(y_true, mask) == torch.masked_select(predicted, mask)).double()

    if hits.numel() == 0:
        # The mean of an empty tensor is NaN; report zero accuracy instead.
        return hits.new_zeros(())

    return hits.mean()


In [25]:
class TrainingModule(pl.LightningModule):
    """Lightning wrapper that trains a ``Recommender`` on masked-item prediction.

    Loss and accuracy are computed only at positions where the source
    sequence holds the mask token.
    """

    def __init__(self, vocab_size, mask=1, lr=1e-4):
        """
        Args:
            vocab_size: passed to ``Recommender`` as its only argument.
            mask: token id that marks the positions to predict.
            lr: learning rate for Adam.
        """
        super().__init__()

        self.mask = mask
        self.lr = lr

        self.model = Recommender(vocab_size)

    def forward(self, x):
        """Run the wrapped recommender on a batch of source sequences."""
        return self.model(x)

    def _shared_step(self, batch, stage):
        """Compute and log the masked loss and accuracy for one batch.

        Args:
            batch: ``(src_items, y_true)`` pair of id tensors.
            stage: metric name prefix (``"train"``, ``"valid"`` or ``"test"``).

        Returns:
            The masked cross-entropy loss.
        """
        src_items, y_true = batch

        y_pred = self(src_items)

        # Flatten every dimension except the class one (index 2 of the model
        # output) so each position becomes one classification example.
        y_pred = y_pred.view(-1, y_pred.size(2))
        y_true = y_true.view(-1)

        # Only positions holding the mask token are scored.
        mask = src_items.view(-1) == self.mask

        loss = masked_ce(y_pred=y_pred, y_true=y_true, mask=mask)
        accuracy = masked_accuracy(y_pred=y_pred, y_true=y_true, mask=mask)

        self.log(f"{stage}_loss", loss)
        self.log(f"{stage}_accuracy", accuracy)

        return loss

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` / ``train_accuracy`` and return the loss."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """Log ``valid_loss`` / ``valid_accuracy`` and return the loss."""
        return self._shared_step(batch, "valid")

    def test_step(self, batch, batch_idx):
        """Log ``test_loss`` / ``test_accuracy`` and return the loss."""
        return self._shared_step(batch, "test")

    def configure_optimizers(self):
        """Adam with a plateau scheduler driven by ``valid_loss``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        # Cut the learning rate by 10x after 10 epochs without improvement.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=10, factor=0.1
        )

        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler,
            'monitor': 'valid_loss'
        }

In [26]:
# Recipe ids start at 2, so the vocabulary needs two extra slots for PAD and MASK.
training_module = TrainingModule(vocab_size=len(mapping) + 2, lr=1e-3)

In [27]:
# Output locations and maximum number of training epochs.
LOG_DIR = './logger'
MODEL_DIR = './model'
EPOCHS = 150

# TensorBoard logs are written under LOG_DIR.
logger = TensorBoardLogger(
    save_dir=LOG_DIR,
)

# Checkpoint selection tracks the lowest "valid_loss" (logged in
# validation_step); the file is saved in MODEL_DIR under the name "recommender".
checkpoint_callback = ModelCheckpoint(
    monitor="valid_loss",
    mode="min",
    dirpath=MODEL_DIR,
    filename="recommender",
)

In [28]:
# gpus=0 keeps training on the CPU; the recorded log below confirms this with
# "GPU available: True, used: False".
trainer = pl.Trainer(
    max_epochs=EPOCHS,
    gpus=0,
    logger=logger,
    callbacks=[checkpoint_callback],
)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores


In [None]:
# val_loader provides the "valid_loss" monitored by both the checkpoint
# callback and the learning-rate scheduler.
trainer.fit(training_module, train_loader, val_loader)


  | Name  | Type        | Params
--------------------------------------
0 | model | Recommender | 4.0 M 
--------------------------------------
4.0 M     Trainable params
0         Non-trainable params
4.0 M     Total params
15.949    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [None]:
  # Evaluation reuses val_loader: no separate test loader is defined in the
  # cells of this notebook.
  trainer.test(test_dataloaders=val_loader)

# Recommend food

In [None]:
def predict(list_items, model, item_to_idx, idx_to_item, history_size=120, top_k=30):
    """Recommend the items most likely to follow ``list_items``.

    The history is left-padded to ``history_size - 1`` ids, a MASK token is
    appended, and the model scores at that last position are ranked.

    Args:
        list_items: item keys in chronological order. Only the most recent
            ``history_size - 1`` entries are used.
        model: ``torch.nn.Module`` mapping a ``(1, history_size)`` long tensor
            to per-position scores over item ids.
        item_to_idx: mapping from item key to model id.
        idx_to_item: mapping from model id to item key.
        history_size: length of the sequence fed to the model.
        top_k: maximum number of recommendations to return.

    Returns:
        Up to ``top_k`` item keys ordered from highest to lowest score,
        excluding ids already present in the input sequence and ids without an
        entry in ``idx_to_item``.

    Raises:
        KeyError: if an element of ``list_items`` is missing from ``item_to_idx``.
    """
    # Reserve the last position for MASK and keep only the most recent items so
    # the sequence never grows past history_size.
    max_items = max(history_size - 1, 0)
    recent_items = list(list_items)[-max_items:] if max_items else []
    item_ids = [item_to_idx[a] for a in recent_items]

    ids = [PAD] * (max_items - len(item_ids)) + item_ids + [MASK]

    # Build the input on the same device as the model's parameters.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device("cpu")
    src = torch.tensor(ids, dtype=torch.long, device=device).unsqueeze(0)

    # Inference must run in eval mode (e.g. dropout disabled) to be
    # deterministic; the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            prediction = model(src)
    finally:
        model.train(was_training)

    # Scores at the MASK position (last element of the sequence).
    masked_pred = prediction[0, -1].cpu().numpy()

    sorted_predicted_ids = np.argsort(masked_pred)[::-1].tolist()

    # Filter before truncating so that special or unknown ids do not consume
    # slots of the top-k list.
    seen_ids = set(ids)
    candidates = [a for a in sorted_predicted_ids if a not in seen_ids and a in idx_to_item]

    return [idx_to_item[a] for a in candidates[:top_k]]

In [None]:
# Recipe metadata; the cells below use its 'id' and 'name' columns.
recipes = pd.read_csv('recipes.csv')

In [None]:
# Keep only the recipes that survived the interaction filtering above.
recipe_ids = data['recipe_id'].unique().tolist()
recipes = recipes[recipes['id'].isin(recipe_ids)]

In [None]:
# Recipe name -> model id. If two recipes share a name, the later row wins.
recipe_to_idx = {
    recipe_name: mapping[recipe_id]
    for recipe_name, recipe_id in zip(recipes['name'].tolist(), recipes['id'].tolist())
    if recipe_id in mapping
}
# Model id -> recipe name, the inverse of the dictionary above.
idx_to_recipe = {index: recipe_name for recipe_name, index in recipe_to_idx.items()}

In [None]:
list_recipes = ['bestest hamburger soup',]

# Feed sequences of the same length the model was trained on (history_size)
# instead of predict's default of 120.
top_movie = predict(
    list_recipes,
    training_module,
    recipe_to_idx,
    idx_to_recipe,
    history_size=history_size,
)
top_movie