In [1]:
import numpy as np
import pandas as pd
import sklearn

In [3]:
ratings = pd.read_csv('datasets/movielens_original/ratings.csv')
movies = pd.read_csv('datasets/movielens_original/movies.csv')

In [4]:
from scipy.sparse import csr_matrix

def create_X(df):
    """
    Generates a sparse matrix from ratings dataframe.
    
    Args:
        df: pandas dataframe
    
    Returns:
        X: sparse matrix
        user_mapper: dict that maps user id's to user indices
        user_inv_mapper: dict that maps user indices to user id's
        movie_mapper: dict that maps movie id's to movie indices
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    N = df['userId'].nunique()
    M = df['movieId'].nunique()

    user_mapper = dict(zip(np.unique(df["userId"]), list(range(N))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(M))))
    
    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(M)), np.unique(df["movieId"])))
    
    user_index = [user_mapper[i] for i in df['userId']]
    movie_index = [movie_mapper[i] for i in df['movieId']]

    X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))
    
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [5]:
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings)

In [6]:
sparsity = X.count_nonzero()/(X.shape[0]*X.shape[1])

print(f"Matrix sparsity: {round(sparsity*100,2)}%")

Matrix sparsity: 0.26%


## Create helper functions

In [11]:
# pip install fuzzywuzzy
# pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.20.9-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.20.9
  Downloading Levenshtein-0.20.9-cp39-cp39-win_amd64.whl (101 kB)
     -------------------------------------- 101.3/101.3 kB 5.7 MB/s eta 0:00:00
Collecting rapidfuzz<3.0.0,>=2.3.0
  Downloading rapidfuzz-2.13.7-cp39-cp39-win_amd64.whl (1.0 MB)
     ---------------------------------------- 1.0/1.0 MB 21.8 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.20.9 python-Levenshtein-0.20.9 rapidfuzz-2.13.7
Note: you may need to restart the kernel to use updated packages.


In [12]:
from fuzzywuzzy import process

def movie_finder(title):
    all_titles = movies['title'].tolist()
    closest_match = process.extractOne(title,all_titles)
    return closest_match[0]

movie_title_mapper = dict(zip(movies['title'], movies['movieId']))
movie_title_inv_mapper = dict(zip(movies['movieId'], movies['title']))

def get_movie_index(title):
    fuzzy_title = movie_finder(title)
    movie_id = movie_title_mapper[fuzzy_title]
    movie_idx = movie_mapper[movie_id]
    return movie_idx

def get_movie_title(movie_idx): 
    movie_id = movie_inv_mapper[movie_idx]
    title = movie_title_inv_mapper[movie_id]
    return title 

In [14]:
# pip install implicit

Collecting implicit
  Downloading implicit-0.6.2-cp39-cp39-win_amd64.whl (647 kB)
     ------------------------------------- 647.2/647.2 kB 20.5 MB/s eta 0:00:00
Installing collected packages: implicit
Successfully installed implicit-0.6.2
Note: you may need to restart the kernel to use updated packages.


In [50]:
import implicit
model = implicit.als.AlternatingLeastSquares(factors=50)

In [49]:
model.fit(X)

  0%|          | 0/15 [00:00<?, ?it/s]

In [25]:
movie_of_interest = 'forrest gump'

movie_index = get_movie_index(movie_of_interest)
related = model.similar_items(movie_index)
related

(array([   351,  14133,  75541,  42795,  94058,  99496,  95990, 107148,
        114131,  65354]),
 array([0.9999999 , 0.8141073 , 0.78864574, 0.78839755, 0.78799903,
        0.77834475, 0.7742283 , 0.7629699 , 0.7585686 , 0.75593567],
       dtype=float32))

In [46]:
movies.iloc[351]

movieId                         356
title           Forrest Gump (1994)
genres     Comedy|Drama|Romance|War
Name: 351, dtype: object

In [26]:
print(f"Because you watched {movie_finder(movie_of_interest)}...")
for r in related:
    recommended_title = get_movie_title(r[0])
    if recommended_title != movie_finder(movie_of_interest):
        print(recommended_title)

Because you watched Forrest Gump (1994)...


KeyError: 0.9999999

In [47]:
X_t = X.T.tocsr()

user_idx = user_mapper[373]
recommendations = model.recommend(user_idx, X_t)
recommendations

ValueError: user_items must contain 1 row for every user in userids