In [7]:
from src.api import DataApi
from src.config import DATA
from src.preprocess import DataCleansing, weighted_rating

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

import warnings
warnings.filterwarnings(action='ignore')
np.random.seed(123)

In [12]:
MOVIE_DB = pd.read_csv(DATA + "movies.csv")
RATE_DB = pd.read_csv(DATA + "ratings.csv", parse_dates=['timestamp'])

In [13]:
# to keep memory usage, use data from 30% of the users in this datasets.
SIZE_PERCENTAGE = 0.3
rand_userIds = np.random.choice(RATE_DB['userId'].unique(),
                                size = int(len(RATE_DB['userId'].unique()) * SIZE_PERCENTAGE),
                                replace=False)

ratings = RATE_DB.loc[RATE_DB['userId'].isin(rand_userIds)]
print(f"Get {len(ratings)} rows of data from {len(rand_userIds)} users.")

Get 7550809 rows of data from 48762 users.


In [7]:
# Train test split by leave-one-out model
ratings['rank_latest'] = (ratings.groupby(['userId'])['timestamp']
                          .rank(method = 'first', ascending = False))

train_ratings = ratings[ratings['rank_latest'] != 1]
test_ratings = ratings[ratings['rank_latest'] == 1]

# drop columns
train_ratings = train_ratings[['userId', 'movieId', 'rating']]
test_ratings = test_ratings[['userId', 'movieId', 'rating']]

In [13]:
# Convert Explicit data to implicit feedback dataset
train_ratings.loc[:, 'rating'] = 1
train_ratings.sample(5)

# Get a 4 negative(0) samples
all_movieIds = ratings['movieId'].unique()

# Placeholders
users, items, labels = [], [], []

user_item_set = set(zip(train_ratings['userId'], train_ratings['movieId']))

# 4:1 ratio of negative to positive samples
num_negatives = 4

for (u, i) in tqdm(user_item_set):
    users.append(u)
    items.append(i)
    labels.append(1)

    for _ in range(num_negatives):
        negative_item = np.random.choice(all_movieIds)
        # check user has not interated with item
        while (u, negative_item) in user_item_set:
            negative_item = np.random.choice(all_movieIds)
            users.append(u)
            items.append(negative_item)
            labels.append(0)

  0%|          | 0/7443866 [00:00<?, ?it/s]

In [8]:
class MovieLensTrainDaset(Dataset):
    """MovieLens Pytorch Dataset for Trainig

    Args:
    ratings (pd.DataFrame) : DataFrame containing the movie ratings
    all_movieIds (list) : List containing all movieIds
    """
    def __init__(self, ratings, all_movieIds):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds):
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['userId'], ratings['movieId']))

        num_negatives = 4
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [9]:
class NCF(pl.LightningModule):
    """Neural Collaborative Filtering (NCF)

    Args:
    num_users (int) : Number of unique users
    num_items (int) : Number of unique items
    ratings (pd.DataFrame) : DataFrame containing the movie ratings for training
    all_movieIds (list) : List containing all movieIds (train + test)
    """

    def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()

        self.user_embedding = nn.Embedding(num_embeddings = num_users, embedding_dim = 8)
        self.item_embedding = nn.Embedding(num_embeddings = num_items, embedding_dim = 8)
        self.fc1 = nn.Linear(in_features = 16, out_features = 64)
        self.fc2 = nn.Linear(in_features = 64, out_features = 32)
        self.output = nn.Linear(in_features = 32, out_features = 1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds

    def forward(self, user_input, item_input):

        # pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # concate embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # pass through dense layer
        vector = nn.LeakyReLU()(self.fc1(vector))
        vector = nn.LeakyReLU()(self.fc2(vector))

        # output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred

    def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)

        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())

        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        return DataLoader(MovieLensTrainDaset(self.ratings, self.all_movieIds),
                          batch_size = 512, num_workers = 8, pin_memory=True)

# num_users = ratings['userId'].max() + 1
# num_items = ratings['movieId'].max() + 1

# all_movieIds = ratings['movieId'].unique()

# model = NCF(num_users, num_items, train_ratings, all_movieIds)

In [25]:
trainer = pl.Trainer(max_epochs = 5,
                     accelerator='gpu',
                     enable_model_summary=True,
                     enable_progress_bar=True,
                     logger = False)
trainer.fit(model)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 1.3 M 
1 | item_embedding | Embedding | 1.7 M 
2 | fc1            | Linear    | 1.1 K 
3 | fc2            | Linear    | 2.1 K 
4 | output         | Linear    | 33    
---------------------------------------------
3.0 M     Trainable params
0         Non-trainable params
3.0 M     Total params
11.907    Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


In [27]:
trainer.save_checkpoint("./models/DNNCF.ckpt")

In [26]:
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]

    predicted_labels = np.squeeze(model(torch.tensor([u]*100),
                                        torch.tensor(test_items)).detach().numpy())

    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]

    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)

print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

  0%|          | 0/48762 [00:00<?, ?it/s]

The Hit Ratio @ 10 is 0.93


In [66]:
def recommends(model, query_id):
    interacted_items = ratings[ratings['userId'] == query_id].movieId
    not_interacted_items = list(set(all_movieIds) - set(interacted_items))

    predicted_labels = np.squeeze(model(torch.tensor([query_id]*len(not_interacted_items)), torch.tensor(not_interacted_items)).detach().numpy())
    top10_items = [all_movieIds[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    return top10_items

#recommends(model, 129655)

[8965, 31878, 106487, 176, 1252, 88125, 994, 75927, 176731, 98595]

In [73]:
# torch.save(model, "./models/DNNCF.pt")

In [18]:
load_model = torch.load("./models/DNNCF.pt")
all_movieIds = ratings['movieId'].unique()

ghost_user_id = 0
ghost_interacted_items = ratings.movieId.sample(10).values
not_interacted_items = list(set(all_movieIds) - set(ghost_interacted_items))

predicted_labels = np.squeeze(load_model(torch.tensor([ghost_user_id]*len(not_interacted_items)), torch.tensor(not_interacted_items)).detach().numpy())
top10_items = [all_movieIds[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]

ghost_interacted_items_df = MOVIE_DB[MOVIE_DB['movieId'].isin(ghost_interacted_items)]
predicted_interacted_items_df = MOVIE_DB[MOVIE_DB['movieId'].isin(ratings.iloc[top10_items].movieId.values)]

In [19]:
ghost_interacted_items_df

Unnamed: 0,movieId,title,genres
191,193,Showgirls (1995),Drama
534,539,Sleepless in Seattle (1993),Comedy|Drama|Romance
585,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
1013,1036,Die Hard (1988),Action|Crime|Thriller
1063,1090,Platoon (1986),Drama|War
1834,1923,There's Something About Mary (1998),Comedy|Romance
3006,3099,Shampoo (1975),Comedy|Drama|Romance
10684,44204,Tsotsi (2005),Crime|Drama
11124,48516,"Departed, The (2006)",Crime|Drama|Thriller
13575,70286,District 9 (2009),Mystery|Sci-Fi|Thriller


In [20]:
predicted_interacted_items_df

Unnamed: 0,movieId,title,genres
452,457,"Fugitive, The (1993)",Thriller
898,919,"Wizard of Oz, The (1939)",Adventure|Children|Fantasy|Musical
1175,1206,"Clockwork Orange, A (1971)",Crime|Drama|Sci-Fi|Thriller
2867,2959,Fight Club (1999),Action|Crime|Drama|Thriller
3713,3814,Love and Death (1975),Comedy
4887,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
5371,5479,K-19: The Widowmaker (2002),Action|Adventure|Drama|Thriller
6751,6874,Kill Bill: Vol. 1 (2003),Action|Crime|Thriller
19956,103539,The Spectacular Now (2013),Comedy|Drama|Romance
