In [1]:
import pickle

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF

### Load prepared data

In [2]:
ratings = pd.read_csv("data/ratings_prepared.csv")
movies = pd.read_csv("data/movies_prepared.csv")

In [3]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,0,4.0,964982703
1,0,1,4.0,964981247
2,0,2,4.0,964982224
3,0,3,5.0,964983815
4,0,4,5.0,964982931


In [4]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,Grumpier Old Men (1995),Comedy|Romance
2,2,Heat (1995),Action|Crime|Thriller
3,3,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,4,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


### Initialize a sparse user-item rating matrix

In [5]:
# (data, (row_ind, col_ind))
R = csr_matrix((ratings["rating"], (ratings["user_id"], ratings["movie_id"])))

In [6]:
df_r = pd.DataFrame(R.todense())
df_r

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
0,4.0,4.0,4.0,5.0,5.0,3.0,5.0,4.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,2.5,0.0,0.0,3.0,4.5,4.0,0.0,3.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
606,4.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,2.5,2.0,0.0,4.5,4.5,3.0,0.0,4.0,0.0,3.0,...,0.0,0.5,0.0,0.0,4.5,0.0,2.5,1.0,1.5,0.0
608,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create a model and set the hyperparameters

In [7]:
run_model = False

if run_model:
    # model assumes R ~ PQ'
    model = NMF(n_components=1000, max_iter=1000)

    model.fit(R)

    # Save model
    with open('./data/model_nmf.pkl', 'wb') as file:
        pickle.dump(model, file)

else:
    # Load model
    with open('./data/model_nmf.pkl', 'rb') as file:
        model = pickle.load(file)

In [8]:
model.reconstruction_err_  # reconstruction error

8.551120578323198

## Recommender function

In [9]:
def recommended_movies(query, model, ratings, movies, k=10):
    """
    Filters and recommends the top k movies for any given input query
    based on a trained NMF model.
    Returns a list of k movie ids and corresponding movie titles.
    """

    # Create user vector
    df_new_user = pd.DataFrame(query, columns=movies["movie_id"], index=["new_user"])

    # Fill missing values
    df_new_user_filled = df_new_user.fillna(0)

    # Create user-feature matrix P for new user
    P_new_user_matrix = model.transform(df_new_user_filled)

    # get as dataframe for a better visualizarion
    P_new_user = pd.DataFrame(
        P_new_user_matrix,
        columns=model.get_feature_names_out(),
        index=["new_user"],
    )

    # Reconstruct the user-movie(item) matrix/dataframe for the new user
    Q_matrix = model.components_
    Q = pd.DataFrame(Q_matrix)
    R_hat_new_user_matrix = np.dot(P_new_user, Q)
    R_hat_new_user = pd.DataFrame(R_hat_new_user_matrix, index=["new_user"])

    # Get a list of k-top rated movie to recommend to the new user
    ranked = R_hat_new_user.T.sort_values("new_user", ascending=False)
    recommended = ranked[~ranked.index.isin(query)].reset_index()
    recommended.columns = ["movie_id", "score"]

    # Get movie ids of k best rated movies
    movie_ids = recommended.iloc[:k]["movie_id"]

    # Get corresponding titles in the same order
    titles = [movies.loc[id]["title"] for id in movie_ids]

    return movie_ids, titles

In [10]:
# movie_id: rating
user_query = {
    10: 4,  # Billy Madison (1995)
    100: 3, # Bambi (1942)
    555: 3.5,  # Mortal Kombat (1995)
    756: 2,  # Inside Man (2006)
    1224: 5,  # Babe: Pig in the City (1998)
}

recommended_movies(user_query, model, ratings, movies, k=5)


(0    839
 1    250
 2    276
 3     15
 4     26
 Name: movie_id, dtype: int64,
 ['101 Dalmatians (1996)',
  'English Patient, The (1996)',
  'Life Is Beautiful (La Vita è bella) (1997)',
  'Pulp Fiction (1994)',
  "Schindler's List (1993)"])

In [11]:
# movie_id: rating
user_query = {
    10: 4,  # Billy Madison (1995)
    100: 3, # Bambi (1942)
    555: 3.5,  # Mortal Kombat (1995)
    756: 2,  # Inside Man (2006)
    1224: 5,  # Babe: Pig in the City (1998)
}

movie_ids, titles = recommended_movies(user_query, model, ratings, movies, k=5)

print("Recommended movies:\n")
for i, title in enumerate(titles):
    print(f"{i+1}. {title}")


Recommended movies:

1. 101 Dalmatians (1996)
2. English Patient, The (1996)
3. Life Is Beautiful (La Vita è bella) (1997)
4. Pulp Fiction (1994)
5. Schindler's List (1993)


### Build a simple recommender

In [12]:
def recommend_popular(query, ratings, k=10):
    """
    Function that returns a list of k unseen, most pupular movies.
    """
    # Create a list of movies by popularity
    df_popularity = (
        ratings.groupby("movie_id")
        .agg(mean=("rating", "mean"))
        .sort_values("mean", ascending=False)
        .reset_index()
        .copy()
    )

    # Filter out movie_ids the user has seen (rated)
    df_popularity = df_popularity[df_popularity["movie_id"].isin(query)]

    return df_popularity["movie_id"].head(k).to_list()

In [13]:
query = ratings[ratings["user_id"] == 4][["movie_id", "rating"]]
query = query.set_index("movie_id").to_dict()["rating"]

recommened_ids = recommend_popular(query, ratings, k=10)

recommended_movies = movies[movies["movie_id"].isin(recommened_ids)]
recommended_movies[["movie_id", "title"]]

Unnamed: 0,movie_id,title
4,4,"Usual Suspects, The (1995)"
7,7,Braveheart (1995)
15,15,Pulp Fiction (1994)
23,23,"Fugitive, The (1993)"
26,26,Schindler's List (1993)
34,34,Fargo (1996)
180,180,"Shawshank Redemption, The (1994)"
221,221,"Postman, The (Postino, Il) (1994)"
230,230,In the Name of the Father (1993)
322,322,Terminator 2: Judgment Day (1991)
