# Create product embedding for instacart dataset

Idea: similar to word2vec, products in the same order are close to each other

## Load orders and product name

In [1]:
!ls data

aisles.csv		       order_products__train.csv.zip
aisles.csv.zip		       orders.csv
departments.csv.zip	       orders.csv.zip
__MACOSX		       products.csv
order_products__prior.csv      products.csv.zip
order_products__prior.csv.zip  sample_submission.csv
order_products__train.csv      sample_submission.csv.zip


In [2]:
import pandas as pd

product_df = pd.read_csv("data/products.csv")
order_df = pd.read_csv("data/orders.csv")
order_prior_df = pd.read_csv("data/order_products__prior.csv")
# print(product_df.head(10))
# print(order_df.head(10))
# print(order_prior_df.head(10))



In [3]:
print(f"Number of products: {len(product_df)}")
print(f"Number of orders: {len(order_prior_df['order_id'].value_counts())}")

Number of products: 49688
Number of orders: 3214874


In [4]:
# Filter our order with less than 2 products
df1 = order_prior_df[order_prior_df.groupby("order_id")['order_id'].transform('size') > 10]
print(f"Number of orders with at least 2 products: {len(df1['order_id'].value_counts())}")

# Get list of products in these orders
product_ids = df1["product_id"].unique().tolist()
product_df1 = product_df[product_df["product_id"].isin(product_ids)]
print(f"Number of reduced products: {len(product_ids)}")

Number of orders with at least 2 products: 1212743
Number of reduced products: 49371


In [5]:
# Build product mapping

product_ids = product_df1["product_id"]
product_names = product_df1["product_name"]
# map product id to indices starting from 0
product_id_to_ind = {product_id: i for i, product_id in enumerate(product_ids)}
product_ind_to_name = {i: product_name for i, product_name in enumerate(product_names)}
product_name_to_ind = {product_name: i for i, product_name in enumerate(product_names)}

In [6]:
import tqdm
def build_data(order_df, product_id_to_ind, window=2):
    unique_orders = order_df["order_id"].unique().tolist()
    order_to_ind = {
        order_id: i
        for i, order_id in enumerate(unique_orders)
    }
    # self.orders to store a list of orders, each order is represented by a list of products in that order
    orders = [[] for _ in unique_orders]
    for order_id, product_id in tqdm.tqdm(zip(order_df["order_id"], order_df["product_id"])):
        order_ind = order_to_ind[order_id]
        product_ind = product_id_to_ind[product_id]
        orders[order_ind].append(product_ind)
#     num_products = len(product_id_to_ind)        
    data = []
    for products in tqdm.tqdm(orders):
        for i, product in enumerate(products):
            start = max(0, i - window)
            end = min(len(products), i + window + 1)
            data.extend([
                (product, products[j])
                for j in range(start, end) if j !=i
            ])
    return data

data = build_data(df1, product_id_to_ind, window=2)

21404993it [00:07, 3049432.95it/s]
100%|██████████| 1212743/1212743 [00:16<00:00, 73832.83it/s]


In [13]:
# torch dataset
import torch
import random
from torch import nn
from torch.utils.data import DataLoader, Dataset
import tqdm

class ProductPairDataset(Dataset):
    """Dataset class that returns a pair of (context, target) product ids.
    
    The pair is a random combination of 2 products in the same order.
    
    """
    def __init__(self, data, num_products, num_noisy_products: int = 200, window=2):
        
        
        self.num_noisy_products = num_noisy_products
        self.num_products = num_products
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]
        # Get a random order and a random pair of products in that order
#         products = self.orders[index]
#         selected_products = random.sample(products, 2)
# #         print(selected_products)
# #         print(self.num_products)
#         context_product_ind = selected_products[0]
#         target_product_ind = selected_products[1]
#         context_product_ind, target_product_ind = self.data[index]
        # buid mask
#         mask = [0 for _ in range(self.num_products)]
#         random_ids = random.sample(range(self.num_products), self.num_noisy_products)
#         for i in random_ids + [context_product_ind]:
#             mask[i] = 1
            
        # convert target_product_ind to one_hot vector
#         target_product = [0 for _ in range(self.num_products)]
#         target_product[target_product_ind] = 1
#         return context_product_ind, target_product_ind, mask
        # return context_product_ind, target_product, mask
#         breakpoint()
#         return (
#             torch.tensor(context_product_ind, dtype=torch.int),
# #             torch.tensor(target_product, dtype=torch.float32),
#             torch.tensor(target_product_ind, dtype=torch.long),
# #             torch.tensor(mask, dtype=torch.float32)
#         )
        
training_data = ProductPairDataset(data, num_products=len(product_id_to_ind))
  

In [14]:
training_data[0]

(46548, 26277)

In [None]:

# batch_size = 8
# train_dataloader = DataLoader(
#     training_data, batch_size=batch_size, shuffle=True, num_workers=0
# )
import time

for i in range(5):
    batch_size = 2**(i+1)
    train_dataloader = DataLoader(
        training_data, batch_size=batch_size, shuffle=True, num_workers=0
    )
    t0 = time.time()
    for batch in train_dataloader:
        break
    
    print(f"batch_size = {batch_size}: execution time {time.time() - t0}")

In [10]:
data[1]


(46548, 39512)

In [11]:
# Build the pytorch lightning model
import pytorch_lightning as pl
import torch.multiprocessing
import torch.nn.functional as F
class SigmoidBCELoss(nn.Module):
    "BCEWithLogitLoss with masking on call."

    def __init__(self):
        super().__init__()

    def forward(self, inputs, target, mask=None):
#         breakpoint()
        out = nn.functional.binary_cross_entropy_with_logits(
            inputs, target, weight=mask, reduction="mean")
#         breakpoint()
        return out #.mean(dim=1)

loss_fn = SigmoidBCELoss()
loss2 = torch.nn.CrossEntropyLoss()

class Prod2VecModel(pl.LightningModule):
    def __init__(self, num_products, embed_size: int = 50):
        super().__init__()
        self.embed_size = embed_size
        self.embedding = nn.Embedding(num_products, self.embed_size)
        self.hidden = nn.Linear(self.embed_size, num_products, bias=True)


    def forward(self, contexts):
#         breakpoint()
        hid = self.embedding(contexts)
        pre_sigmoid = self.hidden(hid)
        return pre_sigmoid

    def training_step(self, batch, batch_idx):
        contexts, targets, masks = batch
        output = self.forward(contexts)
        loss = loss_fn(output, targets, masks)
#         breakpoint()
#         loss = loss2(output, targets)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(
            self.parameters(), lr=0.5, momentum=0.1, weight_decay=1e-3
        )  # learning rate
        return optimizer

In [12]:
num_products = len(product_id_to_ind)
embed_size = 20
model = Prod2VecModel(num_products, embed_size)
trainer = pl.Trainer(gpus=1, max_epochs=10)
trainer.fit(model, train_dataloader, train_dataloader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 987 K 
1 | hidden    | Linear    | 1.0 M 
----------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
8.097     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

ValueError: not enough values to unpack (expected 3, got 2)

In [None]:
torch.save(model.state_dict(), 'model.pt')

In [None]:
import torch
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(input, target)
output.backward()

In [None]:
output

In [None]:
input

In [None]:
target

In [None]:
a = [[] for i in range(3)]
print(a)

In [None]:
a[0].append(1)
a[1].append(2)
print(a)

In [None]:
product_df1.iloc[0]["product_name"]