In [1]:
!pwd

/content


In [2]:
!cd /sample_data

/bin/bash: line 0: cd: /sample_data: No such file or directory


In [3]:
!ls

kaggle.json  MovieLens20M  sample_data


In [4]:
!cd sample_data

In [5]:
!mkdir MovieLens20M

mkdir: cannot create directory ‘MovieLens20M’: File exists


In [6]:
!pip install -q kaggle

In [7]:
from google.colab import files

files.upload()

Saving kaggle.json to kaggle (1).json


{'kaggle.json': b'{"username":"irtnm8210","key":"6183b3e8fb1fdf9bc81a966e07e83c1b"}'}

In [8]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [9]:
!cp kaggle.json ~/.kaggle

In [10]:
!chmod 600 ~/.kaggle/kaggle.json

In [11]:
!kaggle datasets download -d grouplens/movielens-20m-dataset -p MovieLens20M

movielens-20m-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [12]:
!pip install pytorch-lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# IMPORTS

In [13]:
import pandas as pd
import pytorch_lightning as pl

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from pathlib import Path
import requests
import zipfile

import numpy as np

from tqdm import tqdm

# DEVICE SETUP

In [14]:
# Setup device-agnostic code
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

# EXTRACTING THE ZIP FILE

In [15]:
DATA_PATH = Path("/content/MovieLens20M")
CSV_PATH = DATA_PATH / "data"
# Unzipping data from zip file
with zipfile.ZipFile(DATA_PATH / "movielens-20m-dataset.zip", "r") as zip_ref:
  print("Unzipping csv files...")
  zip_ref.extractall(CSV_PATH)

Unzipping csv files...


In [16]:
ratings = pd.read_csv("/content/MovieLens20M/data/rating.csv", parse_dates=['timestamp'])

In [17]:
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
5,1,112,3.5,2004-09-10 03:09:00
6,1,151,4.0,2004-09-10 03:08:54
7,1,223,4.0,2005-04-02 23:46:13
8,1,253,4.0,2005-04-02 23:35:40
9,1,260,4.0,2005-04-02 23:33:46


In [18]:
userIds = np.random.choice(ratings['userId'].unique(),
                           size=int(len(ratings['userId'].unique()) * 0.4),
                           replace=False)

ratings = ratings.loc[ratings['userId'].isin(userIds)]

print(f"There are {len(ratings)} rows of {len(userIds)} users")

There are 7965324 rows of 55397 users


# SPLIT DATA INTO TRAINING AND TEST SET

For the model to achieve the best quality, we will sort the data in time order, then choose the latest record to test our model.

In [19]:
# Sort the data in time order
ratings['rank_latest'] = ratings.groupby(ratings['userId'])['timestamp'].rank(method='first', ascending=False)

# Choose the latest data for testing
training_df = ratings[ratings['rank_latest'] != 1]
test_df = ratings[ratings['rank_latest'] == 1]

KEPT_COLUMN = ['userId', 'movieId', 'rating']

# Keep the important columns
training_df = training_df[KEPT_COLUMN]
test_df = test_df[KEPT_COLUMN]

# TURN EXPLICIT FEEDBACK (RATINGS) INTO IMPLICIT FEEDBACK (INTEREST OR NOT)

Explicit feedback such as thumb-ups and thumb-downs from `Youtube` video, `Facebook` reactions are harder to find than implicit feedback. For example, you rarely press the like button for a video than using the search bar for content on `Youtube`. The same applies to many other E-Commerce and Social Platform such as `Facebook`, `Amazon`, `Netflix`, etc. In this implementation, we will treat the ratings from the users as interaction as the number of item clicks.

In [20]:
training_df.loc[:, 'rating'] = 1

training_df.sample(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0,userId,movieId,rating
3295566,22507,37727,1
16813183,116302,45,1
7347862,50695,540,1
9874584,68223,5989,1
9121646,63071,53161,1
17289451,119542,1036,1
865171,5784,454,1
7599186,52371,468,1
1408356,9544,1017,1
14806330,102262,6615,1


# NEGATIVE SAMPLING

The best to choose the ratio for negative sampling is to balance the ratio between classes

In [21]:
movieIds = ratings['movieId'].unique()
print(f"Number of movies in the dataset: {len(userIds)}")

23204

In [22]:
userIds = ratings['userId'].unique()
print(f"Number of users in the dataset: {len(userIds)}")

55397

In [23]:
user_item_set = set(zip(training_df['userId'], training_df['movieId']))
print(f"User-Item pair: {len(user_item_set)}")

7909927

In [24]:
# Array to keep records of (user, item, label) pair
users, items, labels = [], [], []

# Ratio for negative sampling, tune it
NUM_NEGATIVES = 4

for (user, item) in tqdm(user_item_set):
  users.append(user)
  items.append(item)
  # If user interact with the item, then user interested with the item, then label as 1
  labels.append(1)
  for _ in range(NUM_NEGATIVES):
    # Choose random item, then check if the pair occurs in the user_item_set
    negative_item = np.random.choice(movieIds)
    # If the pair exist, find til the negative one
    while (user, negative_item) in user_item_set:
      negative_item = np.random.choice(movieIds)
    # If the while loop ends, then we've found the negative sample
    users.append(user)
    items.append(negative_item)
    labels.append(0)

100%|██████████| 7909927/7909927 [08:47<00:00, 14989.15it/s]


# Custom Pytorch Dataset for MovieLens20M

In [25]:
class MovieLensTrainingDataset(Dataset):
  """MovieLens Pytorch Dataset for training

  Args:
    ratings (pd.Dataframe): Pandas dataframe containing movie ratings
    movieIds (list): Python List storing all unique movie ids
  """
  def __init__(self, ratings: pd.DataFrame, movieIds: list):
    self.users, self.items, self.labels = self.init_dataset(ratings, movieIds)
  
  def __len__(self):
    return len(self.users)

  def __getitem__(self, idx):
    return self.users[idx], self.items[idx], self.labels[idx] 

  def init_dataset(self, ratings: pd.DataFrame, movieIds: list):
    # Pairing users with items into a set
    user_item_set = set(zip(ratings['userId'], ratings['movieId']))
    # Array to keep records of (user, item, label) pair
    users, items, labels = [], [], []
    # Ratio for negative sampling, tune it
    NUM_NEGATIVES = 4

    for (user, item) in tqdm(user_item_set):
      users.append(user)
      items.append(item)
      # If user interact with the item, then user interested with the item, then label as 1
      labels.append(1)
      for _ in range(NUM_NEGATIVES):
        # Choose random item, then check if the pair occurs in the user_item_set
        negative_item = np.random.choice(movieIds)
        # If the pair exist, find til the negative one
        while (user, negative_item) in user_item_set:
          negative_item = np.random.choice(movieIds)
        # If the while loop ends, then we've found the negative sample
        users.append(user)
        items.append(negative_item)
        labels.append(0)
    return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

# Neural Collaborative Filtering Implementation

In [26]:
class NCF(pl.LightningModule):
  """Neural Collaborative Filtering (NCF)
  Original: https://arxiv.org/pdf/1708.05031.pdf

  Args:
    num_users (int): Number of users in the dataset
    num_items (int): Number of items in the dataset
    ratings (pd.DataFrame): Pandas dataframe storing ratings
    movieIds (list): Python List storing all movie ids
  """

  def __init__(self, num_users, num_items, ratings, movieIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_movieIds = movieIds
        
  def forward(self, user_input, item_input):
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layer
        vector = nn.SELU()(self.fc1(vector))
        vector = nn.SELU()(self.fc2(vector))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred
    
  def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

  def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

  def train_dataloader(self):
        return DataLoader(MovieLensTrainingDataset(self.ratings, self.all_movieIds),
                          batch_size=512, num_workers=2)

Define the model

In [27]:
num_users = ratings['userId'].max() + 1
num_items = ratings['movieId'].max() + 1
movieIds = ratings['movieId'].unique()

model = NCF(num_users=num_users, num_items=num_items, ratings=training_df, movieIds=movieIds)

We will train the model in 3 epochs, then measure its performance using `Hit Ratio @ k`

In [28]:
trainer = pl.Trainer(max_epochs=3, gpus=1, logger=False)

trainer.fit(model)

  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 1.1 M 
1 | item_embedding | Embedding | 1.1 M 
2 | fc1            | Linear    | 1.1 K 
3 | fc2            | Linear    | 2.1 K 
4 | output         | Linear    | 33    
---------------------------------------------
2.2 M     Trainable params
0         Non-trainable params
2.2 M     Total params
8.645     Total estimated model params size (MB)
100%|██████████| 7909927/7909927 [09:00<00:00, 14647.47it/

Training: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


# TEST METHODOLOGY

We define the test methodology using the `Hit Ratio @ k`.\
The `Hit Ratio @ k` is defined as: For the test dataset, suppose that user A buys item B. We want to measure if we sample 100 items for example, what is the probability that items like B appears at the top `k` items if we sort the sample of 100 items according to the metric of the output from the model. For example, we took 10 item, and 7 out of 10 are the items that user A interested, then we say `Hit Ratio @ k = 0.7`


In [None]:
#user-item pairs for testing
test_user_item_set = set(zip(test_df['userId'], test_df['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(movieIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(model(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print(f"\nThe Hit Ratio @ 10 is {np.average(hits)}")