# Import Libraries and Data

In [1]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
from pytorch_lightning.callbacks.progress import TQDMProgressBar
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from typing import Union

# Seed every RNG this notebook draws from so that a fresh run is repeatable:
# NumPy's global generator is used for negative sampling and evaluation
# sampling, torch's generator initialises the embedding / linear weights.
np.random.seed(123)
torch.manual_seed(123)

# Disable pandas' chained-assignment (SettingWithCopy) warning globally.
pd.options.mode.chained_assignment = None

In [2]:
# Column names are supplied via `names=`, so every line of the CSV is read as data.
cols = ['cust_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv("data/Amazon_Electronics_Ratings.csv", names=cols).assign(
    # timestamps are stored as epoch seconds
    timestamp=lambda raw: pd.to_datetime(raw['timestamp'], unit='s')
)
# Keep a snapshot of the loaded frame so later cells can start over from it.
df_copy = df.copy()
df.shape

(7824482, 4)

In [3]:
# Preview the first rows of the loaded ratings frame.
df.head()

Unnamed: 0,cust_id,item_id,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,2013-04-13
1,A2CX7LUOHB2NDG,321732944,5.0,2012-07-01
2,A2NWSAGRHCP8N5,439886341,1.0,2013-04-29
3,A2WNBOD3WNDNKT,439886341,3.0,2013-07-22
4,A1GI0U4ZRJA8WN,439886341,1.0,2012-04-18


In [4]:
def sparsity(df: pd.DataFrame, decimal: int) -> None:
    """Print customer / product counts and the sparsity of their interactions.

    Sparsity is computed as ``1 - len(df) / (n_custs * n_items)``: the share
    of customer/product pairs that have no row in ``df``.

    Args:
        df: Frame with ``cust_id`` and ``item_id`` columns, one row per
            interaction.
        decimal: Number of decimal places the sparsity *fraction* is rounded
            to before it is converted to a percentage for printing.

    Returns:
        None. Results are written to stdout only.
    """
    n_custs = df.cust_id.nunique()
    n_items = df.item_id.nunique()
    print(f"Number of customers: {n_custs}")
    print(f"Number of products: {n_items}")

    n_cells = n_custs * n_items
    if n_cells == 0:
        # No customers or no items means there is no interaction matrix;
        # report that instead of failing with ZeroDivisionError.
        print("The sparsity level of the original df is undefined (no customers or items)")
        return

    # Named differently from the function so the local does not shadow it.
    sparsity_level = round(1 - len(df) / n_cells, decimal)
    print(f"The sparsity level of the original df is {sparsity_level*100}%")

In [5]:
# Start again from the raw snapshot and report its sparsity
# (fraction rounded to 6 decimal places).
df = df_copy.copy(deep=True)
sparsity(df, decimal=6)

Number of customers: 4201696
Number of products: 476002
The sparsity level of the original df is 99.9996%


# Data Preparation

In [6]:
# Count rows per customer and per item, ordered from most to least frequent.
cust_counts = df.groupby(["cust_id"]).size().sort_values(ascending=False)
item_counts = df.groupby(["item_id"]).size().sort_values(ascending=False)

# The 1,400,000 most frequent customers and the 5,000 most frequent items.
top_custs = cust_counts.index[:1400000].to_list()
top_items = item_counts.index[:5000].to_list()

# Keep only rows whose customer AND item are both in the top lists.
keep_rows = df.cust_id.isin(top_custs) & df.item_id.isin(top_items)
df = df[keep_rows]
df_reduced = df.copy()

In [7]:
# Report sparsity of the filtered frame (fraction rounded to 5 decimal places).
df = df_reduced.copy(deep=True)
sparsity(df, decimal=5)

Number of customers: 1015362
Number of products: 5000
The sparsity level of the original df is 99.959%


In [8]:
df = df_reduced.copy()
# Replace the raw customer / item identifiers with integer codes,
# using a separate encoder for each column.
for id_col in ('cust_id', 'item_id'):
    df[id_col] = LabelEncoder().fit_transform(df[id_col].values)
# Turn explicit ratings into implicit feedback: every row becomes a
# purchase flag of 1, and the column is renamed accordingly.
df = df.assign(rating=1).rename(columns={'rating': 'purchase'})
df_cleaned = df.copy()
df.head()

Unnamed: 0,cust_id,item_id,purchase,timestamp
184,993279,0,1,2014-07-13
185,600478,0,1,2014-07-18
186,759467,0,1,2014-07-22
187,266867,0,1,2014-07-12
188,936876,0,1,2011-04-16


In [9]:
# Show five randomly chosen rows of the cleaned frame.
df.sample(5)

Unnamed: 0,cust_id,item_id,purchase,timestamp
7594684,822015,4904,1,2014-01-16
5079596,322875,3306,1,2014-01-27
7115290,18000,4629,1,2014-01-11
6723680,308061,4377,1,2014-06-04
1732750,561151,1059,1,2010-10-10


In [27]:
# Write the cleaned frame to disk (without the index) so the train/test
# section can reload it from "data/df_clean.csv".
df.to_csv("data/df_clean.csv", index=False)

# Train Test Split

In [2]:
# Reload the cleaned interactions; the CSV stores timestamps as text, so
# convert them back to datetimes.
df = pd.read_csv("data/df_clean.csv").assign(
    timestamp=lambda loaded: pd.to_datetime(loaded['timestamp'])
)
df.shape

(2062552, 4)

In [3]:
def split_train_test(df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Leave-one-out split: each customer's most recent row becomes the test row.

    The input frame is not modified; the recency rank is kept in a separate
    Series instead of being written back as a ``rank_latest`` column.

    Args:
        df: Interactions with at least ``cust_id`` and ``timestamp`` columns.

    Returns:
        ``(train, test)`` where ``test`` holds the latest row per customer and
        ``train`` holds every other row. Both frames have the ``timestamp``
        column (and any pre-existing ``rank_latest`` column) removed.
    """
    # Rank rows within each customer from newest (1) to oldest;
    # method="first" breaks timestamp ties by order of appearance, so
    # exactly one row per customer gets rank 1.
    recency_rank = df.groupby(["cust_id"])["timestamp"].rank(
        method="first", ascending=False
    )
    is_latest = recency_rank == 1

    # keep only useful columns; drop() returns new frames, so neither the
    # caller's frame nor a slice of it is mutated in place
    helper_cols = ['timestamp', 'rank_latest']
    train = df.loc[~is_latest].drop(columns=helper_cols, errors="ignore")
    test = df.loc[is_latest].drop(columns=helper_cols, errors="ignore")

    print(f'Shape of the train set: {train.shape}')
    print(f'Shape of the test set: {test.shape}')

    return train, test

In [4]:
# `df` here is the frame reloaded from "data/df_clean.csv"; when running in the
# same session as the data-preparation cells, `df = df_cleaned.copy()` can be
# used instead.
df_train, df_test = split_train_test(df)

Shape of the train set: (1047190, 3)
Shape of the test set: (1015362, 3)


# Define PyTorch Dataset

In [5]:
class BuildDataset(Dataset):
    """PyTorch dataset of (customer, item, label) triples with sampled negatives.

    Every distinct ``(cust_id, item_id)`` pair in ``df`` is a positive sample
    (label 1); for each positive, ``num_neg`` items the customer has no row
    for in ``df`` are drawn at random as negatives (label 0).

    Args:
        df (pd.DataFrame): Interactions with ``cust_id`` and ``item_id`` columns
        all_item_ids (array-like): Item ids that negatives are drawn from
        num_neg (int): Negatives sampled per positive. Defaults to 4.

    """

    def __init__(self, df: pd.DataFrame, all_item_ids, num_neg: int = 4):
        """
        Generate the samples and store them as three aligned tensors
        """
        self.users, self.items, self.labels = self.generate_neg(
            df, all_item_ids, num_neg
        )

    def __len__(self):
        """
        Returns the number of samples (positives plus negatives)
        """
        return len(self.users)

    def __getitem__(self, idx):
        """
        Returns the (user, item, label) triple at the given index.
        """
        return self.users[idx], self.items[idx], self.labels[idx]

    def generate_neg(self, df, all_item_ids, num_neg=4):
        """
        Generate ``num_neg`` random negative interactions for
        every purchase (positive interaction)

        Raises:
            ValueError: if ``num_neg`` is negative, or if negatives are
                requested for a user who already has every candidate item
                (rejection sampling could never finish for that user).
        """
        if num_neg < 0:
            raise ValueError(f"num_neg must be non-negative, got {num_neg}")

        # training data placeholders
        users, items, labels = [], [], []
        # set of (user, item) pairs present in df
        user_item_set = set(zip(df["cust_id"], df["item_id"]))

        if num_neg > 0:
            # Fail fast instead of looping forever: count, per user, how many
            # of the candidate items are already positives.
            candidate_items = set(np.asarray(all_item_ids).tolist())
            n_candidates = len(candidate_items)
            n_taken = {}
            for user, pos_item in user_item_set:
                if pos_item in candidate_items:
                    n_taken[user] = n_taken.get(user, 0) + 1
            saturated = [u for u, count in n_taken.items() if count >= n_candidates]
            if saturated:
                raise ValueError(
                    f"{len(saturated)} user(s) have interacted with every candidate "
                    f"item, so no negative sample exists for them (e.g. user {saturated[0]})"
                )

        for user, pos_item in user_item_set:
            users.append(user)
            items.append(pos_item)
            labels.append(1)  # items that the user has interacted with are positive
            for _ in range(num_neg):
                # randomly select an item
                neg_item = np.random.choice(all_item_ids)
                # redraw until the user has not interacted with this item
                while (user, neg_item) in user_item_set:
                    neg_item = np.random.choice(all_item_ids)
                users.append(user)
                items.append(neg_item)
                labels.append(0)  # items not interacted with are negative

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

# Define NCF Model

In [6]:
class NCF(pl.LightningModule):
    """Neural Collaborative Filtering (NCF)

    Two 8-dimensional embeddings (customer, item) are concatenated and passed
    through two ReLU dense layers and a sigmoid output unit.

    Args:
        n_custs (int): Number of rows in the customer embedding table
        n_items (int): Number of rows in the item embedding table
        df (pd.DataFrame): Training interactions, handed to ``BuildDataset``
        all_item_ids (array-like): Item ids handed to ``BuildDataset`` for
            negative sampling
        batch_size (int): Training batch size. Defaults to 512.
        num_workers (int): DataLoader worker processes. Defaults to 4.
    """

    def __init__(self, n_custs, n_items, df, all_item_ids, batch_size=512, num_workers=4):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=n_custs, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=n_items, embedding_dim=8)
        # 16 = the two 8-dimensional embeddings concatenated
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.df = df
        self.all_item_ids = all_item_ids
        self.batch_size = batch_size
        self.num_workers = num_workers

    def forward(self, train_user, train_item):
        """Return the sigmoid score for each (user, item) index pair."""
        # Pass through embedding layers
        user_embedded = self.user_embedding(train_user)
        item_embedded = self.item_embedding(train_item)

        # Concat the two embedding layers
        x = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layers (functional activations: no module is
        # re-created on every call)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))

        # Output layer
        return torch.sigmoid(self.output(x))

    def training_step(self, batch, batch_idx):
        """Binary cross-entropy between predicted scores and the 0/1 labels."""
        train_user, train_item, labels = batch
        predicted_labels = self(train_user, train_item)
        # labels are reshaped to a column to match the (batch, 1) predictions
        return nn.functional.binary_cross_entropy(
            predicted_labels, labels.view(-1, 1).float()
        )

    def configure_optimizers(self):
        """Adam with its default hyper-parameters."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build a fresh dataset (new negative samples) and wrap it in a loader."""
        return DataLoader(
            BuildDataset(self.df, self.all_item_ids),
            batch_size=self.batch_size,
            # BuildDataset emits each positive immediately followed by its
            # negatives; shuffle so batches are not built in that order.
            shuffle=True,
            num_workers=self.num_workers,
        )

In [7]:
class TQDMProgressBar(TQDMProgressBar):
    """Subclass of the imported ``TQDMProgressBar`` that shadows its name.

    Only the construction of the training bar is customised.
    """

    def init_train_tqdm(self):
        """Build the parent's training bar and set ``refresh_rate = 50`` on it."""
        train_bar = super().init_train_tqdm()
        # NOTE(review): this sets an attribute on the returned bar object;
        # confirm the bar actually reads `refresh_rate`.
        setattr(train_bar, "refresh_rate", 50)
        return train_bar

In [8]:
# Embedding table sizes: number of distinct ids plus one extra row.
n_custs = df["cust_id"].nunique() + 1
n_items = df["item_id"].nunique() + 1
# Every distinct item id in the full (train + test) frame
all_item_ids = df["item_id"].unique()

model = NCF(
    n_custs=n_custs,
    n_items=n_items,
    df=df_train,
    all_item_ids=all_item_ids,
)

# One epoch, no GPU, no logger or checkpoints; dataloaders are rebuilt each
# epoch and progress is reported through the custom progress bar.
trainer_options = dict(
    max_epochs=1,
    gpus=None,
    reload_dataloaders_every_n_epochs=1,
    logger=False,
    callbacks=[TQDMProgressBar()],
    enable_checkpointing=False,
)
trainer = pl.Trainer(**trainer_options)

trainer.fit(model)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 8.1 M 
1 | item_embedding | Embedding | 40.0 K
2 | fc1            | Linear    | 1.1 K 
3 | fc2            | Linear    | 2.1 K 
4 | output         | Linear    | 33    
---------------------------------------------
8.2 M     Trainable params
0         Non-trainable params
8.2 M     Total params
32.664    Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




In [9]:
# Persist the trained weights (the state dict only, not the module object).
model_path = 'model/ncf.pt'
trained_weights = model.state_dict()
torch.save(trained_weights, model_path)

# Evaluation - Hit Ratio @ 10

In [25]:
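# To reuse previously saved weights instead of retraining, rebuild the model
# and load the state dict stored at `model_path`:
# model = NCF(n_custs, n_items, df_train, all_item_ids)
# model.load_state_dict(torch.load(model_path))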

<All keys matched successfully>

In [27]:
%%time
# cust-item pairs for testing
test_cust_item_set = set(zip(df_test['cust_id'], df_test['item_id']))

# Dict of all items that are interacted with by each cust
cust_interacted_items = df.groupby('cust_id')['item_id'].apply(list).to_dict()

buy = []
for (u,i) in test_cust_item_set:
    # For each customer, randomly select 99 items that the customer has not interacted with.
    interacted_items = cust_interacted_items[u]
    not_interacted_items = set(all_item_ids) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    # Combine these 99 items with the test item:
    # the actual item that the customer last interacted with.
    # The final test item list has 100 items.
    test_items = selected_not_interacted + [i]
    
    # Run the model on these 100 items
    predicted_labels = np.squeeze(model(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    # Select the top 10 items from the list of 100 items. 
    top10_labels = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    # If the test item is present within the top 10 items, 
    # then we say that this is a hit.
    if i in top10_labels:
        buy.append(1)
    else:
        buy.append(0)
        
print(f"The Hit Ratio @ 10 is {np.average(buy):.2f}")

The Hit Ratio @ 10 is 0.36
CPU times: user 6h 37min 29s, sys: 15.2 s, total: 6h 37min 45s
Wall time: 50min 15s


In [47]:
# with open('predicted_labels.npy', 'wb') as f:
#     np.save(f, predicted_labels)

In [48]:
# with open('predicted_labels.npy', 'rb') as f:
#     predicted_labels = np.load(f)