In [2]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
from torch.nn import Linear
from torch.nn import functional as F

# Kaggle input locations: the interaction CSV and the trained checkpoint.
data_csv_path = "/kaggle/input/main-dataset-files/Apparel.csv"
model_path = "/kaggle/input/final-models-transformer-grid/recommender-apparal.ckpt"

# Raw table; later cells read its `product_id` column.
data = pd.read_csv(filepath_or_buffer=data_csv_path)

In [3]:
# Model.py
def masked_accuracy(y_pred: torch.Tensor, y_true: torch.Tensor, mask: torch.Tensor):
    """Fraction of positions selected by ``mask`` whose arg-max class equals the target.

    Args:
        y_pred: Scores with the classes along dimension 1.
        y_true: Target class indices, one per row of ``y_pred``.
        mask: Boolean tensor selecting which positions are scored.

    Returns:
        A float64 scalar tensor. It is NaN when ``mask`` selects nothing,
        because the mean of an empty tensor is NaN.
    """
    predicted_classes = y_pred.max(dim=1).indices

    kept_targets = y_true.masked_select(mask)
    kept_predictions = predicted_classes.masked_select(mask)

    hits = kept_targets.eq(kept_predictions)
    return hits.to(torch.float64).mean()


def masked_ce(y_pred, y_true, mask):
    """Mean cross-entropy over the positions selected by ``mask``.

    Args:
        y_pred: Class scores, passed straight to ``F.cross_entropy``.
        y_true: Target class indices.
        mask: Per-position weights (the steps in this file pass a boolean
            tensor); positions with weight 0 do not contribute.

    Returns:
        Scalar tensor: the masked loss sum divided by the mask sum. The small
        epsilon in the denominator makes the result 0 rather than NaN when the
        mask is empty.
    """
    per_position = F.cross_entropy(y_pred, y_true, reduction="none")

    kept_total = (per_position * mask).sum()
    n_kept = mask.sum()

    return kept_total / (n_kept + 1e-8)


class Recommender(pl.LightningModule):
    """Transformer-encoder sequence model that scores every vocabulary id per position.

    Item ids are embedded, summed with a learned positional embedding, passed
    through a stack of ``nn.TransformerEncoderLayer`` blocks and projected to
    ``vocab_size`` logits. During the train/validation/test steps, loss and
    accuracy are computed only at positions whose *input* id equals ``mask``.

    Args:
        vocab_size: Number of distinct ids (size of the embedding table and of
            the output projection).
        channels: Embedding / model width.
        cap: Stored on the instance; not read anywhere in this class.
        mask: Input id that marks the positions to be predicted.
        dropout: Dropout probability used by the encoder layers.
        lr: Learning rate for the Adam optimizer.

    Note:
        The positional table has 512 rows, so sequences longer than 512 cannot
        be embedded.
    """

    def __init__(
        self,
        vocab_size,
        channels=128,
        cap=0,
        mask=1,
        dropout=0.4,
        lr=1e-4,
    ):
        super().__init__()

        self.cap = cap
        self.mask = mask

        self.lr = lr
        self.dropout = dropout
        self.vocab_size = vocab_size

        # Sub-modules are created in the same order and under the same
        # attribute names as before so existing checkpoints keep loading.
        self.item_embeddings = torch.nn.Embedding(
            self.vocab_size, embedding_dim=channels
        )

        self.input_pos_embedding = torch.nn.Embedding(512, embedding_dim=channels)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=channels, nhead=4, dropout=self.dropout
        )

        self.encoder = torch.nn.TransformerEncoder(encoder_layer, num_layers=6)

        self.linear_out = Linear(channels, self.vocab_size)

        # Not applied anywhere in this class; kept so the attribute still exists.
        self.do = nn.Dropout(p=self.dropout)

    def encode_src(self, src_items):
        """Embed a ``(batch, seq)`` tensor of item ids and run the encoder.

        Returns:
            Tensor of shape ``(batch, seq, channels)``.
        """
        item_emb = self.item_embeddings(src_items)

        seq_len = item_emb.size(1)
        # One row of positions is enough: the (1, seq, channels) positional
        # embedding broadcasts over the batch dimension in the addition below.
        positions = torch.arange(0, seq_len, device=item_emb.device).unsqueeze(0)
        pos_emb = self.input_pos_embedding(positions)

        # Out-of-place add, so the embedding output is not mutated.
        src = item_emb + pos_emb

        # The encoder layer is built without batch_first, so it expects
        # (seq, batch, channels); permute in and back out.
        src = src.permute(1, 0, 2)
        src = self.encoder(src)

        return src.permute(1, 0, 2)

    def forward(self, src_items):
        """Return logits of shape ``(batch, seq, vocab_size)`` for ``src_items``."""
        src = self.encode_src(src_items)

        out = self.linear_out(src)

        return out

    def _shared_step(self, batch, stage):
        """Compute and log masked loss/accuracy for one batch.

        Args:
            batch: ``(src_items, y_true)`` pair of id tensors.
            stage: Metric-name prefix, e.g. ``"train"`` logs ``train_loss``
                and ``train_accuracy``.

        Returns:
            The masked cross-entropy loss.
        """
        src_items, y_true = batch

        y_pred = self(src_items)

        # Flatten batch and sequence so every position is one row.
        y_pred = y_pred.view(-1, y_pred.size(2))
        y_true = y_true.view(-1)

        # Only positions whose input token is the mask id are scored.
        mask = src_items.view(-1) == self.mask

        loss = masked_ce(y_pred=y_pred, y_true=y_true, mask=mask)
        accuracy = masked_accuracy(y_pred=y_pred, y_true=y_true, mask=mask)

        self.log(f"{stage}_loss", loss)
        self.log(f"{stage}_accuracy", accuracy)

        return loss

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` / ``train_accuracy`` and return the loss."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """Log ``valid_loss`` / ``valid_accuracy`` and return the loss."""
        return self._shared_step(batch, "valid")

    def test_step(self, batch, batch_idx):
        """Log ``test_loss`` / ``test_accuracy`` and return the loss."""
        return self._shared_step(batch, "test")

    def configure_optimizers(self):
        """Adam plus ReduceLROnPlateau, with the scheduler monitoring ``valid_loss``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=10, factor=0.1
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": scheduler,
            "monitor": "valid_loss",
        }


In [4]:
# Reserved token ids. PAD fills the left side of a short history and MASK is
# placed at the position to be predicted; real products are numbered from 2.
PAD, MASK = 0, 1

def map_column(df: pd.DataFrame, col_name: str):
    """Assign a dense integer id (starting at 2) to every distinct value of a column.

    The distinct values are sorted, so the numbering is deterministic for a
    given set of values. Ids 0 and 1 are left free for the PAD and MASK tokens.

    Args:
        df: Frame to annotate. It is modified in place: a new column named
            ``<col_name>_mapped`` is added.
        col_name: Column whose values are numbered.

    Returns:
        ``(df, mapping, inverse_mapping)`` where ``mapping`` goes value -> id
        and ``inverse_mapping`` goes id -> value. ``df`` is the same object
        that was passed in.
    """
    ordered_values = sorted(df[col_name].unique())

    mapping = {}
    inverse_mapping = {}
    for index, value in enumerate(ordered_values, start=2):
        mapping[value] = index
        inverse_mapping[index] = value

    df[f"{col_name}_mapped"] = df[col_name].map(mapping)

    return df, mapping, inverse_mapping

# Number the products; `data` gains a `product_id_mapped` column.
data, mapping, inverse_mapping = map_column(df=data, col_name="product_id")

In [33]:
model = Recommender(
    vocab_size=len(mapping) + 2,  # +2 for the reserved PAD and MASK ids
    lr=1e-4,
    dropout=0.3,
)
model.eval()  # inference only: disables dropout

# map_location="cpu" lets a checkpoint saved from a GPU run load on a CPU-only
# kernel; the prediction cell below runs on CPU tensors.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
checkpoint = torch.load(model_path, map_location="cpu")
model.load_state_dict(checkpoint["state_dict"])

<All keys matched successfully>

In [8]:
# product_to_idx = {a: mapping[b] for a, b in zip(data.product_title.tolist(), data.product_id.tolist()) if b in mapping}
# idx_to_product = {v: k for k, v in product_to_idx.items()}

In [24]:
# `mapping` already maps every product_id in `data` to its model id, so copy it
# instead of re-deriving the same pairs by walking every row of the frame.
id2mapid = dict(mapping)
# Reverse direction: model id -> product_id.
map2id = {mapped_id: product_id for product_id, mapped_id in id2mapid.items()}

In [15]:
# def predict(list_products, model, product_to_idx, idx_to_product):
    
#     ids = [PAD] * (120 - len(list_products) - 1) + [product_to_idx[a] for a in list_products] + [MASK]
    
#     src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    
#     with torch.no_grad():
#         prediction = model(src)
    
#     masked_pred = prediction[0, -1].numpy()
    
#     sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]
    
#     sorted_predicted_ids = [a for a in sorted_predicted_ids if a not in ids]
    
#     return [idx_to_product[a] for a in sorted_predicted_ids[:30] if a in idx_to_product]

In [25]:
def predictId(list_products, model, id2mapid, map2id, seq_len=120, top_k=30):
    """Recommend products to follow a history of product ids.

    The history is converted to model ids, left-padded with ``PAD`` to
    ``seq_len - 1`` entries and followed by a single ``MASK`` token; the
    model's scores at that final position rank the candidates.

    Args:
        list_products: Product ids, oldest first. Every id must be a key of
            ``id2mapid``.
        model: Callable taking a ``(1, seq_len)`` long tensor and returning
            scores of shape ``(1, seq_len, vocab)``.
        id2mapid: Product id -> model id.
        map2id: Model id -> product id.
        seq_len: Total input length including the trailing ``MASK``. Histories
            longer than ``seq_len - 1`` keep only their most recent items.
        top_k: Maximum number of recommendations to return.

    Returns:
        Up to ``top_k`` product ids, best first. Items from the history and
        ids without an entry in ``map2id`` (such as ``PAD`` / ``MASK``) are
        never returned.

    Raises:
        KeyError: If a product in ``list_products`` is not in ``id2mapid``.
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be at least 1")

    history_ids = [id2mapid[a] for a in list_products]

    # The last slot is reserved for MASK. Keep only the most recent items so
    # the input is always exactly seq_len long. The explicit guard is needed
    # because a [-0:] slice would return the whole list.
    max_history = seq_len - 1
    window = history_ids[-max_history:] if max_history > 0 else []
    ids = [PAD] * (max_history - len(window)) + window + [MASK]

    src = torch.tensor(ids, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        prediction = model(src)

    # Scores at the MASK position (last step of the only batch element).
    masked_pred = prediction[0, -1].numpy()

    sorted_predicted_ids = np.argsort(masked_pred).tolist()[::-1]

    # Set membership keeps the exclusion check O(1) per candidate. Include the
    # full history so items cut off by truncation are still not recommended.
    excluded = set(ids)
    excluded.update(history_ids)

    # Filter before limiting, so unmapped ids do not shrink the result.
    recommendations = []
    for candidate in sorted_predicted_ids:
        if len(recommendations) >= top_k:
            break
        if candidate in excluded or candidate not in map2id:
            continue
        recommendations.append(map2id[candidate])

    return recommendations

In [17]:
# list_products = ["Levi's Boys' 514 Straight Fit Jeans",
#                "Jockey Women's Underwear Supersoft Brief - 3 Pack",
#                "Jerzees Men's Super Sweats Crew Neck Sweatshirt",
#                "SEOBEAN Mens Low Rise Sexy Sport Swimwear Trunk Boxer Brief Bikini Swimsuit"]

# top_products = predict(list_products, model, product_to_idx, idx_to_product)
# top_products[:10]

[40, 21, 66, 9, 59, 51, 88, 79, 53, 91]

In [26]:
# Example purchase history, as product ids.
list_id_products = [
    "B01FWRXN0Y",
    "B01DXHX81O",
    "B01B3Q4Q0O",
    "B01ADDSL9U",
]

top_products = predictId(
    list_products=list_id_products,
    model=model,
    id2mapid=id2mapid,
    map2id=map2id,
)
top_products[:10]

['B014BP7JB2',
 'B014CZFIFK',
 'B014882XIG',
 'B019P1X5XI',
 'B015OHQNL8',
 'B014L79H8I',
 'B016AQNDM4',
 'B01ID3ZS5W',
 'B01G6MBEBY',
 'B014WBIGKI']

In [34]:
# Look up the title of one product id. Uses `data` (the frame loaded at the
# top of the notebook) and a boolean selection instead of a row-by-row loop
# that mixed label and positional indexing.
title_matches = data.loc[data["product_id"] == "B01DXHX81O", "product_title"]
if not title_matches.empty:
    print(title_matches.iloc[0])

Jockey Women's Underwear Supersoft Brief - 3 Pack
