# **NeuMF for Collaborative Filtering Analysis**
This file utilizes Neural Matrix Factorization (NeuMF) for collaborative filtering on the Netflix Prize Dataset. By leveraging matrix factorization with deep learning, the model learns user and movie embeddings to generate personalized movie recommendations.

## **Data Preprocessing and Loading**

In [1]:
import pandas as pd

from d2l import mxnet as d2l
from mxnet import gluon, np, npx

npx.set_np()

def read_data(filename):
    """Reads data from a CSV file and returns the data, number of unique customers, and number of unique movies."""
    
    data = pd.read_csv(filename, sep=';')

    num_customers = data["CustomerID"].unique().shape[0]
    num_movies = data["MovieID"].unique().shape[0]

    return data, num_customers, num_movies

def split_data(data, test_ratio=0.2):
    """Splits the data into training and testing sets based on the given test ratio."""
    
    mask = [True if x == 1 else False for x in np.random.uniform(0, 1, (len(data))) < 1 - test_ratio]
    neg_mask = [not x for x in mask]
    train_data, test_data = data[mask], data[neg_mask]

    return train_data, test_data

def load_data(data):
    """Loads customer, movie, and rating data from a DataFrame into NumPy arrays."""
    
    customers, movies, rates = [], [], []

    for line in data.itertuples():
        customer_index, movie_index, rate = int(line[1] - 1), int(line[2] - 1), int(line[3])
        customers.append(customer_index)
        movies.append(movie_index)
        rates.append(rate)

    return np.array(customers), np.array(movies), np.array(rates)

def split_and_load(filename, test_ratio=0.2, batch_size=256):
    """Combines data reading, splitting, and loading into a single function, returning data loaders and metadata."""
    
    data, num_customers, num_movies = read_data(filename)
    train_data, test_data = split_data(data, test_ratio)
    train_customers, train_movies, train_rates = load_data(train_data)
    test_customers, test_movies, test_rates = load_data(test_data)

    train_set = gluon.data.ArrayDataset(train_customers, train_movies, train_rates)
    test_set = gluon.data.ArrayDataset(test_customers, test_movies, test_rates)

    train_iter = gluon.data.DataLoader(train_set, shuffle=True, batch_size=batch_size)
    test_iter = gluon.data.DataLoader(test_set, batch_size=batch_size)

    return num_customers, num_movies, train_iter, test_iter

filename, test_ratio, batch_size = '../data/random_100k_sample.csv', 0.2, 256
num_customers, num_movies, train_iter, test_iter = split_and_load(filename, test_ratio, batch_size)

## **Defining the Matrix Factorization Class**

In [2]:
from mxnet.gluon import nn

class MatrixFactorization(nn.Block):
    """Defines a matrix factorization model using embedding layers for customers and movies."""
    
    def __init__(self, num_customers, num_movies, num_factors, **kwargs):
        super(MatrixFactorization, self).__init__(**kwargs)
        
        self.customer_embedding = nn.Embedding(num_customers, num_factors)
        self.movie_embedding = nn.Embedding(num_movies, num_factors)
        self.customer_bias = nn.Embedding(num_customers, 1)
        self.movie_bias = nn.Embedding(num_movies, 1)

    def forward(self, customers, movies):
        """Forward pass of the model, calculating predicted ratings."""
        
        customer_vecs = self.customer_embedding(customers)
        movie_vecs = self.movie_embedding(movies)
        
        preds = (
            (customer_vecs * movie_vecs).sum(axis=1) +
            self.customer_bias(customers).squeeze() +
            self.movie_bias(movies).squeeze()
        )

        return preds

## **Defining the Training Function**

In [3]:
from mxnet import autograd, nd, init
from mxnet.gluon import Trainer, loss as gloss

def train_recommender(net, train_iter, num_epochs, lr, wd, ctx):
    """Trains a recommender model using the given network, data iterators, and hyperparameters."""
    
    net.initialize(ctx=ctx, force_reinit=True, init=init.Normal(0.01))
    
    trainer = Trainer(net.collect_params(), 'adam', {'learning_rate': lr, 'wd': wd})
    loss = gloss.L2Loss()

    for epoch in range(num_epochs):
        metric = d2l.Accumulator(3)

        for i, (customers, movies, rates) in enumerate(train_iter):
            customers, movies, rates = customers.as_in_ctx(ctx), movies.as_in_ctx(ctx), rates.as_in_ctx(ctx)

            with autograd.record():
                preds = net(customers, movies)
                l = loss(preds, rates)
                
            l.backward()
            trainer.step(batch_size=rates.shape[0])
            metric.add(l.sum().item(), rates.size, 1)

        train_rmse = nd.sqrt(nd.array([metric[0]]) / nd.array([metric[1]]))
        print(f'epoch {epoch + 1}, train RMSE: {train_rmse.asscalar():.4f}')

## **Matrix Factorization Model Training**

In [4]:
ctx = d2l.try_gpu()
num_factors = 20
net = MatrixFactorization(num_customers, num_movies, num_factors)
num_epochs, lr, wd, batch_size = 20, 0.005, 0, 256
train_recommender(net, train_iter, num_epochs, lr, wd, ctx)

epoch 1, train RMSE: 1.4812
epoch 2, train RMSE: 0.8118
epoch 3, train RMSE: 0.7567
epoch 4, train RMSE: 0.7383
epoch 5, train RMSE: 0.7281
epoch 6, train RMSE: 0.7210
epoch 7, train RMSE: 0.7151
epoch 8, train RMSE: 0.7097
epoch 9, train RMSE: 0.7058
epoch 10, train RMSE: 0.7017
epoch 11, train RMSE: 0.6986
epoch 12, train RMSE: 0.6961
epoch 13, train RMSE: 0.6938
epoch 14, train RMSE: 0.6915
epoch 15, train RMSE: 0.6898
epoch 16, train RMSE: 0.6882
epoch 17, train RMSE: 0.6870
epoch 18, train RMSE: 0.6861
epoch 19, train RMSE: 0.6851
epoch 20, train RMSE: 0.6843


## **RMSE Calculation for Model Performance**

In [5]:
from mxnet import nd

def evaluate_rmse(net, data_iter, ctx):
    """Evaluates the Root Mean Squared Error (RMSE) of a given network on a dataset."""
    
    loss = gluon.loss.L2Loss()
    metric = d2l.Accumulator(2)

    for i, (customers, movies, rates) in enumerate(data_iter):
        customers, movies, rates = customers.as_in_ctx(ctx), movies.as_in_ctx(ctx), rates.as_in_ctx(ctx)
        preds = net(customers, movies)
        l = loss(preds, rates)
        metric.add(l.sum().item(), rates.size)

    rmse = nd.sqrt(nd.array(metric[0]) / nd.array(metric[1]))
    return rmse.asscalar()

test_rmse = evaluate_rmse(net, test_iter, ctx)
print(f'Test RMSE: {test_rmse:.4f}')

Test RMSE: 0.7563


## **Movie Recommendations**

In [6]:
import numpy as np
import random
import mxnet as mx

def recommend_movies(net, num_customers, num_movies, ctx, num_recommendations=10):
    """Generates movie recommendations for a random customer based on the trained model."""
    
    customer_id = random.randint(0, num_customers - 1)

    movie_ids = np.arange(num_movies)
    customers = np.full_like(movie_ids, customer_id)
    movies = movie_ids

    customers = mx.np.array(customers, ctx=ctx)
    movies = mx.np.array(movies, ctx=ctx)

    preds = net(customers, movies)
    pred_with_movie_id = list(zip(preds.asnumpy(), movie_ids))
    pred_with_movie_id.sort(reverse=True, key=lambda x: x[0])

    recommended_movies = [movie_id for _, movie_id in pred_with_movie_id[:num_recommendations]]
    return customer_id, recommended_movies

customer_id, recommended_movies = recommend_movies(net, num_customers, num_movies, ctx, 10)
print(f"Recommended movies for customer {customer_id}: {recommended_movies}")

Recommended movies for customer 44285: [9300, 3997, 5976, 6543, 1330, 1993, 7627, 469, 8001, 249]
