In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

**Contents**

- [Matrix factorization methods](#1.-Matrix-factorization-methods)
- [Incorporating user and item biases](#2.-Incorporating-user-and-item-biases)

## 1. Matrix factorization methods

In [None]:
def matrix_factorization(R, k, learning_rate, n_epochs, random_state=None):
    """Approximate a partially observed matrix as ``R ~ U.dot(V.T)``.

    Runs full-batch gradient descent on the squared error over the observed
    (non-NaN) entries of ``R``; missing entries contribute nothing.

    Parameters
    ----------
    R : array-like, shape (m, n)
        Ratings matrix; ``np.nan`` marks a missing rating.
    k : int
        Number of latent factors (columns of ``U`` and ``V``).
    learning_rate : float
        Gradient-descent step size.
    n_epochs : int
        Number of gradient-descent steps.
    random_state : int or None, default None
        Seed for the random initialization of ``U`` and ``V``. ``None`` draws
        from NumPy's global random state, so ``np.random.seed`` still applies.

    Returns
    -------
    U : ndarray, shape (m, k)
    V : ndarray, shape (n, k)
    mean_error : ndarray, shape (n_epochs,)
        Root-mean-squared error over the observed ratings, measured at each
        epoch *before* that epoch's update. Becomes ``inf``/``nan`` if the
        iteration diverges (e.g. the learning rate is too large).

    Raises
    ------
    ValueError
        If ``R`` contains no observed ratings.
    """
    R = np.asarray(R, dtype=float)
    m, n = R.shape

    # mask of observed ratings, computed once from R itself
    observed = ~np.isnan(R)
    n_observed = np.sum(observed)
    if n_observed == 0:
        raise ValueError("R has no observed (non-NaN) ratings")

    # initialization (U is drawn before V, as with the global generator)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    U = rng.randn(m, k)
    V = rng.randn(n, k)

    mean_error = np.zeros(n_epochs)

    # gradient descent steps
    for i in range(n_epochs):

        # error matrix: residual on observed entries, 0 on missing ones.
        # Masking with `observed` (rather than with NaNs found in the residual)
        # keeps NaNs produced by a diverging iteration visible in mean_error.
        E = np.where(observed, R - U.dot(V.T), 0.0)

        # simultaneous update: both right-hand sides use the old U and V
        U, V = U + learning_rate*E.dot(V), V + learning_rate*E.T.dot(U)

        # root-mean-squared error over the observed ratings
        mean_error[i] = np.sqrt(np.sum(E**2)/n_observed)

    return U, V, mean_error

**Simple example**

In [None]:
# toy ratings matrix; np.nan marks a missing rating
R = np.array([
    [2, np.nan, 3, np.nan],
    [np.nan, 5, np.nan, 4],
    [1, 3, np.nan, 5],
    [np.nan, 4, 2, np.nan]
])
R

In [None]:
# seed NumPy's global RNG so the random initialization of U and V inside
# matrix_factorization (and every output derived from it) is reproducible
np.random.seed(0)
U, V, mean_error = matrix_factorization(R,
                                        k = 2,
                                        learning_rate = 0.05,
                                        n_epochs = 50)

In [None]:
# learning curve: RMSE over the observed ratings at each epoch
plt.plot(mean_error)
plt.xlabel('epoch')
plt.ylabel('RMSE on observed ratings')
plt.title('Matrix factorization: training error (toy example)');

In [None]:
# show the original ratings again, to compare with the approximation in the next cell
R

In [None]:
# reconstruct R from the learned factors, rounded to 2 decimals
np.round(U @ V.T, 2)

**MovieLens dataset example**

In [None]:
# load the ratings table; the pivot in the next cell uses its
# 'title', 'userId' and 'rating' columns
ratings = pd.read_csv('ratings_large.csv')
ratings.head()

In [None]:
# ratings matrix: one row per movie title, one column per userId,
# NaN where that user has not rated that movie.
# index/columns/values are passed by keyword because pandas 2.0+ no longer
# accepts positional arguments to DataFrame.pivot.
ratings_matrix = ratings.pivot(index='title', columns='userId', values='rating')
ratings_matrix

In [None]:
# load Javier's fake user (or your fake user, if you created one)
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Theory/master/Data/fake_user.csv'
# read_csv's `squeeze=True` option was removed in pandas 2.0; squeezing the
# single remaining column afterwards yields the same title-indexed Series
user = pd.read_csv(url, index_col='title').squeeze('columns')
user

In [None]:
# add fake user to the ratings matrix as a column named 'user'
# (pandas aligns the Series to the matrix rows by title; titles missing
# from the Series become NaN)
ratings_matrix['user'] = user
ratings_matrix

In [None]:
# NumPy array of the ratings: rows are titles, columns are users, and the
# fake user added in the previous cell is the last column
R = ratings_matrix.to_numpy()

In [None]:
# seed NumPy's global RNG so the random initialization inside
# matrix_factorization, and hence the recommendations below, is reproducible
np.random.seed(0)
U, V, mean_error = matrix_factorization(R,
                                        k = 50,
                                        learning_rate = .00005,
                                        n_epochs = 50)

In [None]:
# learning curve: RMSE over the observed ratings at each epoch
plt.plot(mean_error)
plt.xlabel('epoch')
plt.ylabel('RMSE on observed ratings')
plt.title('Matrix factorization: training error (MovieLens)');

In [None]:
# predictions
R_pred = U.dot(V.T)
# The fake user is the last column of R. The rows of R come from
# ratings_matrix, so label the predictions with ratings_matrix.index rather
# than user.index: the labels stay correct even if the two differ in order.
user_pred = pd.Series(R_pred[:,-1], index=ratings_matrix.index)
user_pred

In [None]:
# 20 highest predicted ratings among the movies the fake user has not rated
user_pred.loc[user.isna()].sort_values(ascending=False).iloc[:20]

In [None]:
# 20 lowest predicted ratings among the movies the fake user has not rated
user_pred.loc[user.isna()].sort_values(ascending=True).iloc[:20]

In [None]:
# ratings the fake user actually gave (missing entries dropped)
user.dropna()

In [None]:
# model predictions for the movies the fake user did rate
user_pred.loc[user.notna()]

## 2. Incorporating user and item biases

In [None]:
def matrix_factorization_with_biases(R, k, learning_rate, n_epochs, random_state=None):
    """Approximate ``R ~ U.dot(V.T)`` with ``k`` latent factors plus biases.

    ``U`` and ``V`` each get two extra columns beyond the ``k`` latent ones:

    * column ``k``: ``U[:, k]`` is a learned per-row bias, paired with
      ``V[:, k]`` held fixed at 1;
    * column ``k+1``: ``V[:, k+1]`` is a learned per-column bias, paired with
      ``U[:, k+1]`` held fixed at 1.

    So ``U.dot(V.T)[i, j]`` equals the latent dot product plus the bias of
    row ``i`` plus the bias of column ``j``. Training is full-batch gradient
    descent on the squared error over the observed (non-NaN) entries of ``R``.

    Parameters
    ----------
    R : array-like, shape (m, n)
        Ratings matrix; ``np.nan`` marks a missing rating.
    k : int
        Number of latent factors (excluding the two bias columns).
    learning_rate : float
        Gradient-descent step size.
    n_epochs : int
        Number of gradient-descent steps.
    random_state : int or None, default None
        Seed for the random initialization of ``U`` and ``V``. ``None`` draws
        from NumPy's global random state, so ``np.random.seed`` still applies.

    Returns
    -------
    U : ndarray, shape (m, k+2)
    V : ndarray, shape (n, k+2)
    mean_error : ndarray, shape (n_epochs,)
        Root-mean-squared error over the observed ratings, measured at each
        epoch *before* that epoch's update. Becomes ``inf``/``nan`` if the
        iteration diverges (e.g. the learning rate is too large).

    Raises
    ------
    ValueError
        If ``R`` contains no observed ratings.
    """
    R = np.asarray(R, dtype=float)
    m, n = R.shape

    # mask of observed ratings, computed once from R itself
    observed = ~np.isnan(R)
    n_observed = np.sum(observed)
    if n_observed == 0:
        raise ValueError("R has no observed (non-NaN) ratings")

    # initialization (U is drawn before V, as with the global generator)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    U = rng.randn(m, k+2)
    U[:, k+1] = 1  # (k+2)th column of U is fixed at 1
    V = rng.randn(n, k+2)
    V[:, k] = 1  # (k+1)th column of V is fixed at 1

    mean_error = np.zeros(n_epochs)

    # gradient descent steps
    for i in range(n_epochs):

        # error matrix: residual on observed entries, 0 on missing ones.
        # Masking with `observed` (rather than with NaNs found in the residual)
        # keeps NaNs produced by a diverging iteration visible in mean_error.
        E = np.where(observed, R - U.dot(V.T), 0.0)

        # simultaneous update: both right-hand sides use the old U and V
        U, V = U + learning_rate*E.dot(V), V + learning_rate*E.T.dot(U)
        # the gradient step also moved the fixed columns; restore the ones
        U[:, k+1] = 1
        V[:, k] = 1

        # root-mean-squared error over the observed ratings
        mean_error[i] = np.sqrt(np.sum(E**2)/n_observed)

    return U, V, mean_error

**MovieLens dataset example**

In [None]:
# seed NumPy's global RNG so the random initialization inside
# matrix_factorization_with_biases, and hence the recommendations below,
# is reproducible
np.random.seed(0)
U, V, mean_error = matrix_factorization_with_biases(R,
                                                    k = 20,
                                                    learning_rate = .00005,
                                                    n_epochs = 50)

In [None]:
# learning curve: RMSE over the observed ratings at each epoch
plt.plot(mean_error)
plt.xlabel('epoch')
plt.ylabel('RMSE on observed ratings')
plt.title('Matrix factorization with biases: training error (MovieLens)');

In [None]:
# predictions
R_pred = U.dot(V.T)
# The fake user is the last column of R. The rows of R come from
# ratings_matrix, so label the predictions with ratings_matrix.index rather
# than user.index: the labels stay correct even if the two differ in order.
user_pred = pd.Series(R_pred[:,-1], index=ratings_matrix.index)

In [None]:
# 20 highest predicted ratings among the movies the fake user has not rated
user_pred.loc[user.isna()].sort_values(ascending=False).iloc[:20]

In [None]:
# 20 lowest predicted ratings among the movies the fake user has not rated
user_pred.loc[user.isna()].sort_values(ascending=True).iloc[:20]