# Import Libraries and data

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

pd.options.mode.chained_assignment = None

In [2]:
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'data/ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

In [3]:
n_users = ratings.user_id.unique().shape[0]
n_movies = ratings.movie_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))

Number of users = 943 | Number of movies = 1682


In [4]:
train_data, test_data = train_test_split(ratings, test_size=0.25)

# Collaborative Filtering

In [5]:
train_data.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
49447,488,50,4,891293974
35988,295,427,4,879517412
55917,593,385,4,886194041
25044,130,246,4,874953698
57304,798,365,3,875639656


### Create User-Item Matrices

In [6]:
# create two user-item matrices, one for training and another for testing
train_data_UI_matrix = np.zeros((n_users, n_movies))
for line in train_data.itertuples():
    train_data_UI_matrix[line[1]-1, line[2]-1] = line[3]
    
test_data_UI_matrix = np.zeros((n_users, n_movies))
for line in test_data.itertuples():
    test_data_UI_matrix[line[1]-1, line[2]-1] = line[3]

## Memeory-Based CF by Computing Cosine Similarity

In [7]:
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances

### Calculate Cosine Similarity
Note, the output will range from 0 to 1 since the ratings are all positive.

In [8]:
user_similarity = pairwise_distances(train_data_UI_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_UI_matrix.T, metric='cosine')
print(user_similarity.shape)
print(item_similarity.shape)

(943, 943)
(1682, 1682)


In [9]:
user_similarity

array([[0.        , 0.88924119, 0.97547817, ..., 0.91464761, 0.85205611,
        0.68931606],
       [0.88924119, 0.        , 0.85084855, ..., 0.82651205, 0.85730583,
        0.92076344],
       [0.97547817, 0.85084855, 0.        , ..., 0.92489063, 0.87113324,
        0.97955186],
       ...,
       [0.91464761, 0.82651205, 0.92489063, ..., 0.        , 0.89592051,
        0.97144285],
       [0.85205611, 0.85730583, 0.87113324, ..., 0.89592051, 0.        ,
        0.89657686],
       [0.68931606, 0.92076344, 0.97955186, ..., 0.97144285, 0.89657686,
        0.        ]])

### Prediction

In [10]:
def predict(ratings, similarity, type="user"):
    if type == "user":
        mean_user_rating = ratings.mean(axis=1)
        # you use np.newaxis so that mean_user_rating has same format as ratings
        rating_diff = ratings - mean_user_rating[:, np.newaxis]
#         print((mean_user_rating[:, np.newaxis]).shape)
#         print(similarity.shape)
#         print(rating_diff.shape)
#         print((similarity.dot(rating_diff)).shape)
#         print((np.array([np.abs(similarity).sum(axis=0)])).shape)
#         print((np.array([np.abs(similarity).sum(axis=1)])).shape)
        pred = (
            mean_user_rating[:, np.newaxis]
            + similarity.dot(rating_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
        )
#         print(pred.shape)
    elif type == "item":
#         print(ratings.shape)
#         print(similarity.shape)
#         print((np.array([np.abs(similarity).sum(axis=0)])).shape)
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
#         print(pred.shape)
    return pred

In [11]:
item_prediction = predict(train_data_UI_matrix, item_similarity, type='item')
user_prediction = predict(train_data_UI_matrix, user_similarity, type='user')
print(item_prediction.shape)
print(user_prediction.shape)

(943, 1682)
(943, 1682)


### Evaluation
Using the most popular metric: RMSE

In [12]:
def rmse(prediction, ground_truth):
#     print(prediction.shape)
#     print(ground_truth.shape)
#     print(ground_truth.nonzero())
#     print(prediction[ground_truth.nonzero()])
#     print(prediction[ground_truth.nonzero()].flatten())
#     print(ground_truth[ground_truth.nonzero()].flatten())
    prediction = prediction[ground_truth.nonzero()]#.flatten()
    ground_truth = ground_truth[ground_truth.nonzero()]#.flatten()
    return round(sqrt(mean_squared_error(prediction, ground_truth)), 4)

In [13]:
print('User-base CF RMSE: ' + str(rmse(user_prediction, test_data_UI_matrix)))
print('Item-base CF RMSE: ' + str(rmse(item_prediction, test_data_UI_matrix)))

User-base CF RMSE: 3.122
Item-base CF RMSE: 3.4476


## Model-based CF by Using Sigular Value Decomposition (SVD)
based on matrix factorization (MF)

![Matrix Factorization(MF)](https://developers.google.com/machine-learning/recommendation/images/Matrixfactor.svg)

![](https://miro.medium.com/max/91/0*qZBT80xpE9OYAuEr)

Given `m x n` matrix X:<br>
* `U` is an `(m x r)` orthogonal matrix
* `S` is an `(r x r)` diagonal matrix with non-negative real numbers on the diagonal
* `V^T` is an `(r x n)` orthogonal matrix <br>
Elements on the diagonal in `S` are known as *singular values* of `X`.<br>
Matrix `X` can be factorized to `U`, `S` and `V`. The `U` matrix represents the feature vectors corresponding to the users in the hidden feature space and the `V` matrix represents the feature vectors corresponding to the items in the hidden feature space.<br>
Now you can make a prediction by taking product of `U`, `S` and `V^T`.

In [14]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

In [15]:
sparsity = round(1.-len(ratings)/(n_users*n_movies), 3)
print(f'The sparsity level of MovieLen100K is {str(sparsity*100)}%')

The sparsity level of MovieLen100K is 93.7%


### Create SVD components

In [16]:
# Get SVD components from the train matrix. Choose k.
u, s, vt = svds(train_data_UI_matrix, k=20)
s_diag_matrix = np.diag(s)

### Prediction

In [17]:
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

### Evaluation
Using the most popular metric: RMSE

In [18]:
print('User-base CF RMSE: ' + str(rmse(X_pred, test_data_UI_matrix)))

User-base CF RMSE: 2.7185


# Deep Learning Based

In [19]:
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

np.random.seed(123)

In [20]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


## Neural Collaborative Filtering (NCF) <br>  - Simple Version with Implicit Feeback

### Train Test Split

In [21]:
ratings['rank_latest'] = ratings.groupby(['user_id'])['unix_timestamp'].rank(method='first', ascending=False)
train_ratings = ratings[ratings['rank_latest'] != 1]
test_ratings = ratings[ratings['rank_latest'] == 1]

train_ratings.drop(columns=['unix_timestamp', 'rank_latest'], inplace=True)
test_ratings.drop(columns=['unix_timestamp', 'rank_latest'], inplace=True)

In [22]:
# Converting the dataset into an implicit feedback dataset
train_ratings.loc[:, 'rating'] = 1
train_ratings.sample(5)

Unnamed: 0,user_id,movie_id,rating
60348,786,7,1
98185,486,244,1
46874,592,338,1
50089,627,58,1
33401,234,417,1


We do have a problem now though. After binarizing our dataset, we see that every sample in the dataset now belongs to the positive class. However, we also require negative samples to train our models, to indicate movies that the user has not interacted with. We assume that such movies are those that the user are not interested in — even though this is a sweeping assumption that may not be true, it usually works out rather well in practice. <br>
The code below generates 4 negative samples for each row of data. In other words, the ratio of negative to positive samples is 4:1. This ratio is chosen arbitrarily but I found that it works rather well in practice(feel free to find the best ratio yourself!).

### Define PyTroch Dataset

In [23]:
class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch Dataset for Training
    
    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings
        all_movieIds (list): List containing all movieIds
    
    """

    def __init__(self, ratings, all_movieIds):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds)

    def __len__(self):
        return len(self.users)
  
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds):
        # Placeholders that will hold the training data
        users, items, labels = [], [], []
        # This is the set of items that each user has interaction with
        user_item_set = set(zip(ratings['user_id'], ratings['movie_id']))
        
        # 4:1 ratio of negative to positive samples
        num_negatives = 4
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1) # items that the user has interacted with are positive
            for _ in range(num_negatives):
                # randomly select an item
                negative_item = np.random.choice(all_movieIds)
                # check that the user has not interacted with this item
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0) # items not interacted with are negative

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

### Define NCF Model

In [24]:
class NCF(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF)
    
        Args:
            num_users (int): Number of unique users
            num_items (int): Number of unique items
            ratings (pd.DataFrame): Dataframe containing the movie ratings for training
            all_movieIds (list): List containing all movieIds (train + test)
    """
    
    def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds
        
    def forward(self, user_input, item_input):
        
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)
        
        # Pass through dense layer
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred
    
    def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        return DataLoader(MovieLensTrainDataset(self.ratings, self.all_movieIds),
                          batch_size=512, num_workers=4)

### Train NCF Model

In [25]:
num_users = ratings.user_id.nunique() + 1 # + bias term (?) (if RNN: to take into account the zero padding)
num_items = ratings.movie_id.nunique() + 1 # + bias term (?) (if RNN: to take into account the zero padding)
# Get a list of all movie IDs
all_movieIds = ratings['movie_id'].unique()

model = NCF(num_users, num_items, train_ratings, all_movieIds)

trainer = pl.Trainer(max_epochs=5, gpus=0, reload_dataloaders_every_epoch=True,
                     progress_bar_refresh_rate=50, logger=False, checkpoint_callback=False)

trainer.fit(model)

  f"Setting `Trainer(checkpoint_callback={checkpoint_callback})` is deprecated in v1.5 and will "
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
  "`reload_dataloaders_every_epoch` is deprecated in v1.4 and will be removed in v1.6."
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 7.6 K 
1 | item_embedding | Embedding | 13.5 K
2 | fc1            | Linear    | 1.1 K 
3 | fc2            | Linear    | 2.1 K 
4 | output         | Linear    | 33    
---------------------------------------------
24.2 K    Trainable params
0         Non-trainable params
24.2 K    Total params
0.097     Total estimated model params size (MB)





### Evaluation - Hit Ratio @ 10

In [31]:
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['user_id'], test_ratings['movie_id']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('user_id')['movie_id'].apply(list).to_dict()


buy = []
for (u,i) in test_user_item_set:
    # For each user, randomly select 99 items that the user has not interacted with.
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    # Combine these 99 items with the test item (the actual item that 
    # the user last interacted with). We now have 100 items.
    test_items = selected_not_interacted + [i]
    
    # Run the model on these 100 items
    predicted_labels = np.squeeze(model(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    # Select the top 10 items from the list of 100 items. 
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    # If the test item is present within the top 10 items, 
    # then we say that this is a hit.
    if i in top10_items:
        buy.append(1)
    else:
        buy.append(0)
        
print("The Hit Ratio @ 10 is {:.2f}".format(np.average(buy)))

The Hit Ratio @ 10 is 0.41


In [27]:
!pip freeze | grep tqdm

tqdm==4.40.0


In [28]:
!pip freeze | grep tqdm

tqdm==4.40.0


In [29]:
!pip freeze | grep pytorch-lightning

pytorch-lightning==1.5.0


In [30]:
!pip freeze | grep tensorflow

tensorflow==2.6.2
tensorflow-estimator==2.6.0


In [None]:
!pip freeze | grep tensorflow