# Neural Collaborative filtering
Previously, we introduced the basics of collaborative filtering using item-based and user-based CF. <br>
However, traditional CF cannot capture more general patterns due to the simplicity of its method.<br>
Therefore, people resort to neural networks to learn more complex, fine-grained information from user-item interactions. <br>
In this lecture, we will implement two classic NN-based CF models: **BPR-MF** and **NeuMF**.

## Data preparation
Here we will use one of the most widely used datasets, MovieLens, which contains user-movie interactions.

In [None]:
# Install required packages.
import os
import torch
# Export the installed torch version as the TORCH environment variable so that
# the `${TORCH}` placeholder in the wheel-index URLs below expands to it.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# NOTE: lines starting with `!` are IPython/Jupyter shell escapes, not Python;
# this cell only runs inside a notebook kernel.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
!pip install -q git+https://github.com/snap-stanford/deepsnap.git
!pip install -U -q PyDrive

In [None]:
# import required modules
import random
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, Tensor
import torch.nn.functional as F
from torch_geometric.data import download_url, extract_zip
from torch_geometric.utils import structured_negative_sampling

In [None]:
# download the dataset
url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
# Fetch the archive into the current directory ('.') and unpack it there.
extract_zip(download_url(url, '.'), '.')
# CSV files read by the loading helpers in the following cells.
movie_path = './ml-latest-small/movies.csv'
rating_path = './ml-latest-small/ratings.csv'

In [None]:
# load user and movie nodes
def load_mapping(path, index_col):
    """Build a lookup from raw ids in a csv column to contiguous indices.

    Args:
        path (str): path to csv file
        index_col (str): name of the column holding the raw ids

    Returns:
        dict: raw id -> 0-based index, numbered in order of first appearance
    """
    frame = pd.read_csv(path, index_col=index_col)

    # de-duplicate while keeping first-appearance order, then number 0..n-1
    unique_ids = frame.index.unique()
    return dict(zip(unique_ids, range(len(unique_ids))))


# Raw MovieLens ids -> contiguous 0-based indices. User ids are taken from the
# ratings file, movie ids from the movies file.
user_mapping = load_mapping(rating_path, index_col='userId')
movie_mapping = load_mapping(movie_path, index_col='movieId')

In [None]:
# load edges between users and movies
def load_interaction(path, src_index_col, src_mapping, dst_index_col, dst_mapping, link_index_col, rating_threshold=4):
    """Load the positive user-item interactions from a csv file.

    A row counts as a positive interaction when its rating is greater than or
    equal to ``rating_threshold``. Ratings are compared as stored (no
    truncation), so fractional thresholds such as 3.5 behave as expected.

    Args:
        path (str): path to csv file
        src_index_col (str): column name of users
        src_mapping (dict): mapping from raw user id to user index
        dst_index_col (str): column name of items
        dst_mapping (dict): mapping from raw item id to item index
        link_index_col (str): column name of the rating
        rating_threshold (int, optional): minimum rating for a positive
            interaction. Defaults to 4.

    Returns:
        torch.Tensor: N by 2 long tensor of (user index, item index) pairs,
            in file order. Shape is (0, 2) when no row passes the threshold.

    Raises:
        KeyError: if a positive row references an id missing from a mapping.
    """
    df = pd.read_csv(path)

    # select positive rows in one vectorised comparison
    is_positive = df[link_index_col].to_numpy() >= rating_threshold
    positive = df.loc[is_positive, [src_index_col, dst_index_col]]

    src = [src_mapping[index] for index in positive[src_index_col]]
    dst = [dst_mapping[index] for index in positive[dst_index_col]]

    if not src:
        # keep the documented N x 2 shape even when nothing qualifies
        return torch.empty((0, 2), dtype=torch.long)

    return torch.tensor(list(zip(src, dst)), dtype=torch.long)


# All (user index, movie index) pairs whose rating is at least 4; these are
# the positive interactions used by the rest of the notebook.
interactions = load_interaction(
    rating_path,
    src_index_col='userId',
    src_mapping=user_mapping,
    dst_index_col='movieId',
    dst_mapping=movie_mapping,
    link_index_col='rating',
    rating_threshold=4,
)

In [None]:
# split the edges of the graph using a 80/10/10 train/validation/test split
num_users, num_movies = len(user_mapping), len(movie_mapping)
num_interactions = interactions.shape[0]
all_indices = [i for i in range(num_interactions)]

# First hold out 20% of the interactions ...
train_indices, test_indices = train_test_split(
    all_indices, test_size=0.2, random_state=1)
# ... then cut the held-out part in half: 10% validation, 10% test.
val_indices, test_indices = train_test_split(
    test_indices, test_size=0.5, random_state=1)

# Row-index the N x 2 interaction tensor with each index list.
train_interactions = interactions[train_indices,:]
val_interactions = interactions[val_indices,:]
test_interactions = interactions[test_indices,:]

## The BPR-MF model
BPR-MF is a matrix factorization model that factorizes the user-item interaction matrix into a user embedding matrix and an item embedding matrix.
![](https://i.imgur.com/FM1w89a.png)

## Loss Function



We utilize a Bayesian Personalized Ranking (BPR) loss, a pairwise objective which encourages the predictions of positive samples to be higher than negative samples for each user.

\begin{equation}
L_{BPR} = -\sum_{u = 1}^M \sum_{i \in N_u} \sum_{j \notin N_u} \ln{\sigma(\hat{y}_{ui} - \hat{y}_{uj})} + \lambda ||E^{(0)}||^2 
\end{equation}

$\hat{y}_{ui}$: predicted score of a positive sample

$\hat{y}_{uj}$: predicted score of a negative sample

$\lambda$: hyperparameter which controls the L2 regularization strength

In [None]:
class BPR(nn.Module):
    """Matrix-factorization recommender trained with the BPR pairwise loss.

    A user-item score is the dot product of the user and item embeddings.
    ``forward`` returns the training loss directly rather than the scores.

    Args:
        user_size (int): number of rows in the user embedding table
        item_size (int): number of rows in the item embedding table
        dim (int): embedding dimension
    """

    def __init__(self, user_size, item_size, dim):
        super().__init__()
        self.user_embedding = nn.Embedding(user_size, dim)
        self.item_embedding = nn.Embedding(item_size, dim)

    def forward(self, u, i, j):
        """Compute the mean BPR loss for a batch of (user, pos, neg) triplets.

        Args:
            u (torch.Tensor): 1-D tensor of user indices
            i (torch.Tensor): 1-D tensor of positive item indices
            j (torch.Tensor): 1-D tensor of negative item indices

        Returns:
            torch.Tensor: scalar, mean of -log(sigmoid(x_ui - x_uj))
        """
        user_vectors = self.user_embedding(u)
        x_ui = torch.mul(user_vectors, self.item_embedding(i)).sum(dim=1)
        x_uj = torch.mul(user_vectors, self.item_embedding(j)).sum(dim=1)
        x_uij = x_ui - x_uj
        # logsigmoid is the numerically stable form of log(sigmoid(x)): the
        # naive version underflows to log(0) = -inf for large negative x.
        bpr_loss = -F.logsigmoid(x_uij).mean()
        return bpr_loss

In [None]:
def bpr_loss(user_embedding, pos_embedding, neg_embedding):
    """Bayesian Personalized Ranking Loss as described in https://arxiv.org/abs/1205.2618

    Computes the mean of -log(sigmoid(pos_score - neg_score)), where each
    score is the dot product of a user embedding with an item embedding.

    Args:
        user_embedding (torch.Tensor): user embedding
        pos_embedding (torch.Tensor): embedding of positive items
        neg_embedding (torch.Tensor): embedding of negative items

    Returns:
        torch.Tensor: scalar bpr loss value (non-negative)
    """
    # dot products along the last (embedding) dimension
    pos_scores = torch.sum(torch.mul(user_embedding, pos_embedding), dim=-1)
    neg_scores = torch.sum(torch.mul(user_embedding, neg_embedding), dim=-1)

    # The loss needs the log of the sigmoid; logsigmoid computes it stably.
    loss = -torch.mean(F.logsigmoid(pos_scores - neg_scores))

    return loss

In [None]:
from torch.utils.data import Dataset, DataLoader

class TripletUniformPair(Dataset):
    """Dataset of (user, positive item, negative item) triplets for BPR.

    Every access draws a random positive pair and then a random item the user
    has not interacted with, so the ``idx`` passed to ``__getitem__`` is
    ignored and no shuffling is needed in the DataLoader.

    Args:
        num_item (int): number of items; negatives are drawn from
            ``range(num_item)``
        user_list (dict or sequence): positive items of each user, indexable
            by user index
        pair (sequence): rows of (user index, positive item index)
    """

    def __init__(self, num_item, user_list, pair):
        self.num_item = num_item
        self.user_list = user_list
        self.pair = pair
        # Sets give O(1) membership tests inside the rejection-sampling loop;
        # testing against the raw lists is O(len(list)) per draw.
        if isinstance(user_list, dict):
            self._positives = {user: set(items) for user, items in user_list.items()}
        else:
            self._positives = [set(items) for items in user_list]

    def __getitem__(self, idx):
        # idx is deliberately overwritten: sampling is uniform over all pairs
        idx = np.random.randint(len(self.pair))
        u = self.pair[idx][0]
        i = self.pair[idx][1]
        positives = self._positives[u]
        # rejection-sample until we hit an item the user has not interacted with
        # NOTE: this never terminates if the user has interacted with every item
        j = np.random.randint(self.num_item)
        while j in positives:
            j = np.random.randint(self.num_item)
        return u, i, j

    def __len__(self):
        return len(self.pair)

# Evaluation Metrics

We evaluate our model using the following metrics:

\begin{equation}
\text{Recall} = \frac{TP}{TP + FN}
\end{equation}

\begin{equation}
\text{Precision} = \frac{TP}{TP + FP}
\end{equation}

**Discounted Cumulative Gain (DCG)** at rank position p is defined as:

\begin{equation}
\text{DCG}_\text{p} = \sum_{i = 1}^p \frac{2^{rel_i} - 1}{\log_2{(i + 1)}}
\end{equation}

p: a particular rank position

$rel_i \in \{0, 1\}$ : graded relevance of the result at position $i$

**Idealised Discounted Cumulative Gain (IDCG)**, namely the maximum possible DCG, at rank position $p$ is defined as:

\begin{equation}
\text{IDCG}_\text{p} = \sum_{i = 1}^{|REL_p|} \frac{2^{rel_i} - 1}{\log_2{(i + 1)}}
\end{equation}

$REL_p$ : list of relevant items ordered by their relevance up to position p; $|REL_p|$ is the length of that list

**Normalized Discounted Cumulative Gain (NDCG)** at rank position $p$ is defined as:

\begin{equation}
\text{nDCG}_\text{p} = \frac{\text{DCG}_p}{\text{IDCG}_p}
\end{equation}

Specifically, we use the metrics recall@K, precision@K, and NDCG@K. @K indicates that these metrics are computed on the top K recommendations.

In [None]:
# helper function to get N_u
def get_user_positive_items(interactions):
    """Group the item column of an interaction table by its user column.

    Args:
        interactions (torch.Tensor): N by 2 table of (user, item) rows

    Returns:
        dict: user -> list of that user's items, in row order
    """
    user_pos_items = {}
    # one bulk conversion to Python numbers instead of two .item() calls per row
    for row in interactions.tolist():
        user_pos_items.setdefault(row[0], []).append(row[1])
    return user_pos_items

In [None]:
# computes recall@K and precision@K
def RecallPrecision_ATk(groundTruth, r, k):
    """Compute recall @ k and precision @ k, averaged over users.

    Args:
        groundTruth (list): list of lists containing highly rated items of each user
        r (torch.Tensor): one row per user with one 0/1 entry per recommended
            item, marking whether that recommendation is a ground-truth item
        k (int): number of recommendations per user

    Returns:
        tuple: recall @ k, precision @ k
    """
    # hits among each user's recommendations
    hits_per_user = r.sum(dim=-1)
    # size of each user's ground-truth set
    liked_counts = torch.Tensor([len(items) for items in groundTruth])

    recall = (hits_per_user / liked_counts).mean()
    precision = hits_per_user.mean() / k
    return recall.item(), precision.item()

In [None]:
# computes NDCG@K
def NDCGatK_r(groundTruth, r, k):
    """Compute Normalized Discounted Cumulative Gain (NDCG) @ k, averaged over users.

    Args:
        groundTruth (list): list of lists containing highly rated items of each user
        r (torch.Tensor): one row per user with one 0/1 entry per recommended
            item, marking whether that recommendation is a ground-truth item
        k (int): number of recommendations per user

    Returns:
        float: ndcg @ k
    """
    assert len(r) == len(groundTruth)

    # position discount 1 / log2(rank + 1) for ranks 1..k
    discounts = 1. / torch.log2(torch.arange(2, k + 2))

    # ideal ranking: all of a user's relevant items (at most k) come first
    ideal_hits = torch.zeros((len(r), k))
    for row, items in enumerate(groundTruth):
        ideal_hits[row, :min(len(items), k)] = 1

    idcg = (ideal_hits * discounts).sum(dim=1)
    dcg = (r * discounts).sum(dim=1)

    # avoid dividing by zero for users without relevant items
    idcg[idcg == 0.] = 1.
    ndcg = dcg / idcg
    ndcg[torch.isnan(ndcg)] = 0.
    return ndcg.mean().item()

In [None]:
# wrapper function to get evaluation metrics
def get_metrics(model, interactions, exclude_interactions, k):
    """Compute recall, precision, and ndcg @ k on one split.

    Scores every user against every item with a dot product of the model's
    embedding tables, masks out interactions that must not be recommended,
    and compares each user's top-k items with that user's items in
    ``interactions``.

    Args:
        model: object exposing ``user_embedding.weight`` and
            ``item_embedding.weight``
        interactions (torch.Tensor): N by 2 (user, item) rows of the split
            being evaluated
        exclude_interactions (list): list of N by 2 (user, item) tensors whose
            pairs are removed from the candidates (e.g. the training split)
        k (int): number of recommendations per user

    Returns:
        tuple: recall @ k, precision @ k, ndcg @ k
    """
    # Evaluation needs no gradients; no_grad avoids building an autograd graph
    # for the full user x item score matrix.
    with torch.no_grad():
        user_embedding = model.user_embedding.weight
        item_embedding = model.item_embedding.weight

        # get ratings between every user and item - shape is num users x num movies
        rating = torch.matmul(user_embedding, item_embedding.T)

        for exclude_record in exclude_interactions:
            exclude_record = torch.as_tensor(
                exclude_record, dtype=torch.long, device=rating.device)
            if exclude_record.numel() == 0:
                continue
            # set ratings of excluded edges to large negative value
            rating[exclude_record[:, 0], exclude_record[:, 1]] = -(1 << 10)

        # get the top k recommended items for each user
        _, top_K_items = torch.topk(rating, k=k)

    # one bulk transfer to Python ints instead of per-element tensor compares
    top_K_items = top_K_items.cpu().tolist()

    # get all unique users in evaluated split
    users = interactions[:, 0].unique().tolist()

    test_user_pos_items = get_user_positive_items(interactions)

    # convert test user pos items dictionary into a list
    test_user_pos_items_list = [test_user_pos_items[user] for user in users]

    # determine the correctness of topk predictions
    r = []
    for user, ground_truth_items in zip(users, test_user_pos_items_list):
        truth = set(ground_truth_items)
        r.append([item in truth for item in top_K_items[user]])
    r = torch.tensor(r, dtype=torch.float)

    recall, precision = RecallPrecision_ATk(test_user_pos_items_list, r, k)
    ndcg = NDCGatK_r(test_user_pos_items_list, r, k)

    return recall, precision, ndcg

In [None]:
# wrapper function to evaluate model
def evaluation(model, edge_index, exclude_edge_indices, k):
    """Evaluate ranking metrics (recall, precision, ndcg @ k) on one split.

    Thin wrapper around ``get_metrics`` that runs without gradient tracking.
    No loss is computed here, so no negative samples are drawn.

    Args:
        model: model exposing ``user_embedding`` and ``item_embedding``
        edge_index (torch.Tensor): N by 2 (user, item) rows of the split to
            evaluate
        exclude_edge_indices (list): list of N by 2 tensors whose pairs are
            excluded from the recommendation candidates
        k (int): number of recommendations per user

    Returns:
        tuple: recall @ k, precision @ k, ndcg @ k
    """
    with torch.no_grad():
        recall, precision, ndcg = get_metrics(
            model, edge_index, exclude_edge_indices, k)

    return recall, precision, ndcg

# Training

In [None]:
# define constants
ITERATIONS = 500  # number of passes of the outer training loop
BATCH_SIZE = 1024  # triplets per DataLoader batch
LR = 1e-3  # Adam learning rate
ITERS_PER_EVAL = 100  # evaluate on the validation split every this many iterations
ITERS_PER_LR_DECAY = 200  # step the LR scheduler every this many iterations
K = 20  # cutoff for recall@K / precision@K / ndcg@K
LAMBDA = 1e-6  # passed to Adam as weight_decay (L2 regularization strength)
DIM = 32  # embedding dimension

In [None]:
# setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device {device}.")

model = BPR(num_users, num_movies, DIM)
model = model.to(device)
model.train()

# initialize parameters
# (only tensors with more than one dimension, i.e. the embedding tables here)
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

# weight_decay applies the L2 penalty weighted by LAMBDA
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=LAMBDA)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

# per-user positive items, used to reject already-seen items when sampling negatives
user_interaction_list = get_user_positive_items(train_interactions)
train_dataset = TripletUniformPair(num_movies, user_interaction_list, train_interactions.numpy())
# no shuffle flag: the dataset itself draws a random pair on every access
train_loader = DataLoader(train_dataset,batch_size=BATCH_SIZE,pin_memory=True,num_workers=8)

In [None]:
from tqdm.autonotebook import tqdm, trange

In [None]:
# training loop
# The loop variable is named `iteration` so it does not shadow the builtin iter().
# Each iteration is one full pass over train_loader.
for iteration in trange(ITERATIONS):
    for batch in train_loader:
        batch = [tensor.to(device) for tensor in batch]
        # the model's forward returns the BPR loss for the (u, i, j) batch
        train_loss = model(*batch)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

    if iteration % ITERS_PER_EVAL == 0:
        model.eval()
        # training interactions are excluded from the recommendation candidates
        recall, precision, ndcg = evaluation(model, val_interactions, [train_interactions], K)
        print(f"[Iteration {iteration}/{ITERATIONS}] val_recall@{K}: {recall:.4f}, val_precision@{K}: {precision:.4f}, val_ndcg@{K}: {ndcg:.4f}")
        model.train()

    if iteration % ITERS_PER_LR_DECAY == 0 and iteration != 0:
        scheduler.step()

In [None]:
# evaluate on test set
model.eval()

# both training and validation interactions are excluded from the candidates
test_recall, test_precision, test_ndcg = evaluation(
            model, test_interactions, [train_interactions, val_interactions], K)

print(f"test_recall@{K}: {round(test_recall, 5)}, test_precision@{K}: {round(test_precision, 5)}, test_ndcg@{K}: {round(test_ndcg, 5)}")

# Make New Recommendations for a Given User

In [None]:
model.eval()
df = pd.read_csv(movie_path)
# raw movieId -> title / genres lookups used when printing recommendations
movieid_title = pd.Series(df.title.values,index=df.movieId).to_dict()
movieid_genres = pd.Series(df.genres.values,index=df.movieId).to_dict()

# positive items of every user over ALL interactions (train + val + test)
user_pos_items = get_user_positive_items(interactions)

In [None]:
def make_predictions(user_id, num_recs):
    """Print movies a user rated highly and movies the model suggests.

    Scores every movie for the user with the trained embeddings, takes the
    top ``len(liked) + num_recs`` movies, and prints up to ``num_recs`` of
    them that the user already rated highly, followed by up to ``num_recs``
    that the user has not. Fewer lines are printed when fewer movies qualify.

    Reads the module-level ``model``, ``user_mapping``, ``movie_mapping``,
    ``user_pos_items``, ``movieid_title`` and ``movieid_genres``.

    Args:
        user_id: raw user id (a key of ``user_mapping``)
        num_recs (int): maximum number of movies to print per section

    Raises:
        KeyError: if ``user_id`` is not in ``user_mapping``.
    """
    user = user_mapping[user_id]
    liked_items = user_pos_items.get(user, [])
    liked_set = set(liked_items)

    with torch.no_grad():
        e_u = model.user_embedding.weight[user]
        scores = model.item_embedding.weight @ e_u
        # topk raises if k exceeds the number of items, so clamp it
        k = min(len(liked_items) + num_recs, scores.numel())
        _, indices = torch.topk(scores, k=k)
    ranked = indices.cpu().tolist()

    # invert movie_mapping once instead of a linear search per movie
    index_to_movie_id = {index: movie_id for movie_id, index in movie_mapping.items()}

    def print_movies(movies):
        # iterate over what was actually found; there may be fewer than num_recs
        for movie in movies:
            movie_id = index_to_movie_id[movie]
            print(f"title: {movieid_title[movie_id]}, genres: {movieid_genres[movie_id]} ")

    print(f"Here are some movies that user {user_id} rated highly")
    print_movies([movie for movie in ranked if movie in liked_set][:num_recs])

    print()

    print(f"Here are some suggested movies for user {user_id}")
    print_movies([movie for movie in ranked if movie not in liked_set][:num_recs])

In [None]:
USER_ID = 30  # raw MovieLens user id (looked up through user_mapping)
NUM_RECS = 10  # number of movies to print per section

make_predictions(USER_ID, NUM_RECS)

# Collaborative filtering with NeuMF
Neural Collaborative Filtering (NCF) replaces the user-item inner product with a neural architecture. By doing so, NCF tries to achieve the following:

* NCF tries to express and generalize MF under its framework.
* NCF tries to learn User-item interactions through a multi-layer perceptron.

## Generalized Matrix Factorization
The predicted output of the GMF can be expressed as

![](https://miro.medium.com/max/1196/1*oLGMj-8x7WLectRAk20ZVA.png)
where
* $a_{out}$: activation function
* $h$: weights of the output layer
![](https://miro.medium.com/max/1400/1*dwXGkFZCbzhRR0blfgQkVA.png)

As you can see from the table above, GMF with an identity activation function and all edge weights set to 1 is exactly MF. The other two variations are extensions of generic MF. The last variation, GMF with a sigmoid activation, is the one used in NCF.

In [None]:
class GMF(nn.Module):
    """Generalized Matrix Factorization scorer.

    The element-wise product of a user embedding and an item embedding is
    mapped to a single score by a bias-free linear layer.

    Args:
        user_size (int): number of rows in the user embedding table
        item_size (int): number of rows in the item embedding table
        dim (int): embedding dimension
    """

    def __init__(self, user_size, item_size, dim):
        super().__init__()
        self.user_embedding = nn.Embedding(user_size, dim)
        self.item_embedding = nn.Embedding(item_size, dim)
        self.output_layer = nn.Linear(dim, 1, bias=False)

    def forward(self, user_id, item_id):
        """Return one score per (user_id, item_id) pair as a flat tensor."""
        # element-wise user/item interaction
        pairwise = torch.mul(self.user_embedding(user_id),
                             self.item_embedding(item_id))
        # weighted sum over the embedding dimension, then drop the size-1 axis
        return self.output_layer(pairwise).flatten()

In [None]:
import scipy.sparse as sp
# load ratings as a dok matrix
# One entry per training interaction; used by NCFData.ng_sample() to reject
# (user, item) pairs that are already positives when drawing negatives.
train_mat = sp.dok_matrix((num_users, num_movies), dtype=np.float32)
for x in train_interactions:
    train_mat[x[0], x[1]] = 1.0

In [None]:
class NCFData(Dataset):
    """Dataset of (user, item, label) samples for pointwise NCF training.

    In training mode, call ``ng_sample()`` before iterating (typically once
    per epoch): it draws ``num_ng`` negative items per positive pair and
    builds the labelled sample list. In test mode the positive pairs are
    served as-is with a placeholder label of 0.

    Args:
        features (list): list of [user, item] positive pairs
        num_item (int): number of items; negatives are drawn from
            ``range(num_item)``
        train_mat: container supporting ``(user, item) in train_mat``, used
            to reject known positives when sampling negatives
        num_ng (int): negatives drawn per positive pair
        is_training (bool): whether to serve negative-sampled training data
    """

    def __init__(self, features,
                num_item, train_mat=None, num_ng=0, is_training=None):
        super().__init__()
        # Note that the labels are only useful when training, we thus
        # add them in the ng_sample() function.
        self.features_ps = features
        self.num_item = num_item
        self.train_mat = train_mat
        self.num_ng = num_ng
        self.is_training = is_training
        self.labels = [0 for _ in range(len(features))]
        # filled in by ng_sample(); None means "not sampled yet"
        self.features_ng = None
        self.features_fill = None
        self.labels_fill = None

    def ng_sample(self):
        """Draw fresh negatives and rebuild the training samples and labels."""
        assert self.is_training, 'no need to sampling when testing'

        self.features_ng = []
        for x in self.features_ps:
            u = x[0]
            for _ in range(self.num_ng):
                # rejection-sample an item that is not a known positive of u
                j = np.random.randint(self.num_item)
                while (u, j) in self.train_mat:
                    j = np.random.randint(self.num_item)
                self.features_ng.append([u, j])

        labels_ps = [1 for _ in range(len(self.features_ps))]
        labels_ng = [0 for _ in range(len(self.features_ng))]

        # positives first, then all negatives
        self.features_fill = self.features_ps + self.features_ng
        self.labels_fill = labels_ps + labels_ng

    def __len__(self):
        return (self.num_ng + 1) * len(self.labels)

    def __getitem__(self, idx):
        if self.is_training:
            if self.features_fill is None:
                raise RuntimeError(
                    'call ng_sample() before reading training samples')
            features, labels = self.features_fill, self.labels_fill
        else:
            features, labels = self.features_ps, self.labels

        user = features[idx][0]
        item = features[idx][1]
        label = labels[idx]
        return user, item ,label

In [None]:
def hit(gt_item, pred_items):
    """Return 1 if the ground-truth item is among the predictions, else 0."""
    return 1 if gt_item in pred_items else 0


def ndcg(gt_item, pred_items):
    """Return 1 / log2(rank + 2) for the ground-truth item's 0-based rank.

    Returns 0 when the item is not in ``pred_items``.
    """
    if gt_item not in pred_items:
        return 0
    rank = pred_items.index(gt_item)
    return np.reciprocal(np.log2(rank + 2))


def _model_device(model):
    """Return the device of the model's first parameter.

    Falls back to CUDA when available (else CPU) for models that expose no
    parameters.
    """
    parameters = getattr(model, 'parameters', None)
    if parameters is not None:
        first = next(iter(parameters()), None)
        if first is not None:
            return first.device
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def metrics(model, test_loader, top_k):
    """Compute mean hit ratio and NDCG @ top_k over the batches of a loader.

    For every batch the model scores all (user, item) pairs, the ``top_k``
    highest-scoring items are taken as recommendations, and the first item of
    the batch is treated as the ground truth.

    NOTE(review): this assumes each batch starts with the ground-truth item
    followed by its candidate items; verify against how ``test_loader`` is
    built.

    Args:
        model: callable ``model(user, item)`` returning one score per pair
        test_loader: iterable of (user, item, label) batches
        top_k (int): number of recommendations kept per batch

    Returns:
        tuple: mean hit ratio, mean ndcg
    """
    HR, NDCG = [], []

    # follow the model's device instead of assuming a GPU is present
    device = _model_device(model)

    # scoring only; no gradients needed
    with torch.no_grad():
        for user, item, label in test_loader:
            user = user.to(device)
            item = item.to(device)

            predictions = model(user, item)
            # the last batch may hold fewer than top_k candidates
            k = min(top_k, predictions.numel())
            _, indices = torch.topk(predictions, k)
            recommends = torch.take(
                    item, indices).cpu().numpy().tolist()

            gt_item = item[0].item()
            HR.append(hit(gt_item, recommends))
            NDCG.append(ndcg(gt_item, recommends))

    return np.mean(HR), np.mean(NDCG)

In [None]:
# construct the train and test datasets
# NOTE: train_loader is built with this BATCH_SIZE; reassigning BATCH_SIZE in a
# later cell does not change the already-constructed loader.
BATCH_SIZE = 64
# training set: one sampled negative per positive (num_ng=1)
train_dataset = NCFData(
    train_interactions.tolist(), num_movies, train_mat, 1, True)
# test set: positives only, no negative sampling (num_ng=0, is_training=False)
test_dataset = NCFData(
        test_interactions.tolist(), num_movies, train_mat, 0, False)
train_loader = DataLoader(train_dataset,
        batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset,
        batch_size=100, shuffle=False, num_workers=0)

In [None]:
# define constants
ITERATIONS = 20  # number of training epochs
# NOTE(review): train_loader was already constructed with its own batch size,
# so this reassignment has no effect unless the loader is rebuilt.
BATCH_SIZE = 256
LR = 1e-3  # Adam learning rate
K = 20  # cutoff for HR@K / NDCG@K
LAMBDA = 0  # passed to Adam as weight_decay (0 disables it)
DIM = 16  # embedding dimension

# setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device {device}.")

model = GMF(num_users, num_movies, DIM)
model = model.to(device)
model.train()

# initialize parameters
# (only tensors with more than one dimension)
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=LAMBDA)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
# the model outputs raw logits; the sigmoid is applied inside the loss
objective_function = nn.BCEWithLogitsLoss()

In [None]:
# training loop
# The loop variable is named `epoch` so it does not shadow the builtin iter().
for epoch in trange(ITERATIONS):
    # the evaluation at the end of each epoch leaves the model in eval mode,
    # so switch back to train mode before every training pass
    model.train()
    # draw fresh negative samples for this epoch
    train_loader.dataset.ng_sample()
    for batch in train_loader:
        batch = [tensor.to(device) for tensor in batch]
        user, item, label = batch
        predictions = model(user, item)
        # labels arrive as integers; BCEWithLogitsLoss needs float targets
        loss = objective_function(predictions, label.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # evaluation
    # NOTE(review): `scheduler` is created in the setup cell but never stepped here.
    model.eval()
    HR, NDCG = metrics(model, test_loader, K)
    print("HR@{}: {:.3f}  NDCG@{}: {:.3f}".format(K, HR, K, NDCG))

## MLP in NCF
NCF is an example of multimodal deep learning, as it combines data from two pathways, namely users and items. The most intuitive way to combine them is by concatenation. However, a simple vector concatenation does not account for user-item interactions and is insufficient to model the collaborative filtering effect. To address this, NCF adds hidden layers on top of the concatenated user-item vectors (the MLP framework) to learn user-item interactions. This endows the model with a lot of flexibility and non-linearity, and is an upgrade over MF, which uses a fixed element-wise product of the two vectors. More precisely, the MLP alters Equation 1 as follows:
![](https://miro.medium.com/max/1252/1*tIQfBeTur0gaKObfvDXmpw.png)
![](https://miro.medium.com/max/1400/1*aP-Mx266ExwoWZPSdHtYpA.png)

In [None]:
class MLP(nn.Module):
    """MLP branch of NCF: scores a user-item pair from concatenated embeddings.

    User and item embeddings of width ``2 * dim`` are concatenated and passed
    through a ReLU tower of widths ``4*dim -> 2*dim -> dim -> 1``.

    Args:
        user_size (int): number of rows in the user embedding table
        item_size (int): number of rows in the item embedding table
        dim (int): width of the last hidden layer
    """

    def __init__(self, user_size, item_size, dim):
        super().__init__()
        embed_dim = dim * 2
        self.user_embedding = nn.Embedding(user_size, embed_dim)
        self.item_embedding = nn.Embedding(item_size, embed_dim)

        # hidden tower: each Linear is followed by a ReLU ...
        widths = [embed_dim * 2, embed_dim, dim]
        layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # ... and the final Linear produces the raw score
        layers.append(nn.Linear(dim, 1))
        self.MLP = nn.Sequential(*layers)

    def forward(self, user_id, item_id):
        """Return one score per (user_id, item_id) pair as a flat tensor."""
        # concatenate user and item vectors along the feature axis
        joint = torch.cat(
            [self.user_embedding(user_id), self.item_embedding(item_id)],
            dim=-1,
        )
        return self.MLP(joint).flatten()

In [None]:
# define constants
ITERATIONS = 50  # number of training epochs
LR = 1e-4  # Adam learning rate
K = 20  # cutoff for HR@K / NDCG@K
LAMBDA = 0  # passed to Adam as weight_decay (0 disables it)
DIM = 16  # width of the MLP's last hidden layer; embeddings are 2 * DIM wide

# setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device {device}.")

model = MLP(num_users, num_movies, DIM)
model = model.to(device)
model.train()

# initialize parameters
# (only tensors with more than one dimension; biases keep their default init)
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=LAMBDA)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
# the model outputs raw logits; the sigmoid is applied inside the loss
objective_function = nn.BCEWithLogitsLoss()

In [None]:
# training loop
# The loop variable is named `epoch` so it does not shadow the builtin iter().
for epoch in trange(ITERATIONS):
    # the evaluation at the end of each epoch leaves the model in eval mode,
    # so switch back to train mode before every training pass
    model.train()
    # draw fresh negative samples for this epoch
    train_loader.dataset.ng_sample()
    for batch in train_loader:
        batch = [tensor.to(device) for tensor in batch]
        user, item, label = batch
        predictions = model(user, item)
        # labels arrive as integers; BCEWithLogitsLoss needs float targets
        loss = objective_function(predictions, label.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # evaluation
    # NOTE(review): `scheduler` is created in the setup cell but never stepped here.
    model.eval()
    HR, NDCG = metrics(model, test_loader, K)
    print("HR@{}: {:.3f}  NDCG@{}: {:.3f}".format(K, HR, K, NDCG))

# Practice: implementing NeuMF that combines GMF and MLP

![](https://miro.medium.com/max/1400/1*Tqk7Q2q7wsr6MLF8Xl-emg.png)

In [None]:
class NeuMF(nn.Module):
    """Exercise scaffold for NeuMF, which fuses a GMF branch and an MLP branch.

    The TODO sections are intentionally left blank. Until they are filled in,
    ``forward`` fails because both branch features are still ``None`` when
    they are concatenated.
    """

    def __init__(self, user_size, item_size, dim):
        super().__init__()
        ############################################################################
        # TODO: Your code here!
        # create embeddings and layers from GMF and MLP

        ############################################################################
        # an output layer that transform concat vector to prediction
        # It takes dim*2 input features, so the two branch features
        # concatenated in forward() must add up to dim*2 columns.
        self.output_layer = nn.Linear(dim*2,1,bias=False)

    def forward(self, user_id, item_id):
        # placeholders to be overwritten by the exercise code below
        MLP_feature = None
        GMF_feature = None
        ############################################################################
        # TODO: Your code here!
        # please obtain the embedding from MLP and GMF
        # store the embedding in MLP_feature and GMF_feature, respectively.
        
        ############################################################################
        
        # predictions
        # fuse both branches along the feature axis, then map to one score per pair
        features = torch.cat([MLP_feature,GMF_feature],dim=-1)
        predictions = self.output_layer(features)
        
        return predictions.flatten()

In [None]:
# define constants
# NOTE: nothing is defined here; DIM, LR and LAMBDA used below keep whatever
# values the previously executed cells assigned.


# setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device {device}.")

model = NeuMF(num_users, num_movies, DIM).to(device)
model.train()

# initialize parameters
# (only tensors with more than one dimension)
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=LAMBDA)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
# the model outputs raw logits; the sigmoid is applied inside the loss
objective_function = nn.BCEWithLogitsLoss()

## Training!
Let's see whether NeuMF outperforms GMF and MLP.

In [None]:
# training loop
# The loop variable is named `epoch` so it does not shadow the builtin iter().
for epoch in trange(ITERATIONS):
    # the evaluation at the end of each epoch leaves the model in eval mode,
    # so switch back to train mode before every training pass
    model.train()
    # draw fresh negative samples for this epoch
    train_loader.dataset.ng_sample()
    for batch in train_loader:
        batch = [tensor.to(device) for tensor in batch]
        user, item, label = batch
        predictions = model(user, item)
        # labels arrive as integers; BCEWithLogitsLoss needs float targets
        loss = objective_function(predictions, label.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # evaluation
    # NOTE(review): `scheduler` is created in the setup cell but never stepped here.
    model.eval()
    HR, NDCG = metrics(model, test_loader, K)
    print("HR@{}: {:.3f}  NDCG@{}: {:.3f}".format(K, HR, K, NDCG))