In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Create a function named `myIBCF`:

- Input: `new_user`, a 3706-by-1 vector (denoted as $w$) containing ratings for the 3,706 movies from a new user. Many entries in this vector will be zero. The order of the movies in this vector should match the rating matrix $R$.

- Inside the function: Upon receiving this input, your function should download the similarity matrix and use it to compute predictions for movies that have not been rated by this new user yet. Use the following formula to compute the prediction for movie $l$:

$$
\frac{1}{\sum_{i \in S(l)} S_{li} \textbf{1}_{\{w_i \neq NA\}}} \displaystyle \sum_{i \in S(l)} S_{li}w_{i}
$$

where $S(l)$ denotes the set of movies in the 30-nearest neighborhood of movie $l$. Again NA values may occur.

- Output: Based on your predictions, recommend the top 10 movies to this new user, using the column names of the rating matrix $R$. Explain what your code should do if fewer than 10 predictions are non-NA. Provide a method to suggest additional movies that have not been rated by this user.

In [3]:
# Similarity matrix and rating matrix; the first CSV column supplies the row labels.
S = pd.read_csv('Top_Smat_2.csv', index_col=0)
R = pd.read_csv('Rmat.csv', index_col=0)

# Movie catalogue: '::'-delimited with no header row, so the column names are
# supplied here. The multi-character separator needs the Python parsing engine.
M = pd.read_csv(
    './movies.dat',
    sep='::',
    engine='python',
    encoding="ISO-8859-1",
    header=None,
    names=['MovieID', 'Title', 'Genres'],
)

In [4]:
def _predict_unrated(new_user, S):
    """Predict item-based CF ratings for every movie the user has not rated.

    For an unrated movie ``l`` the prediction is the similarity-weighted mean
    of the user's ratings, taken over the movies ``i`` that have a non-NaN
    similarity ``S.loc[l, i]`` and a non-NaN rating ``new_user[i]``::

        sum_i S[l, i] * w[i]  /  sum_i S[l, i]

    Parameters
    ----------
    new_user : pd.Series
        Ratings indexed by movie id; NaN marks a movie the user has not rated.
    S : pd.DataFrame
        Similarity matrix with movie ids as row and column labels; NaN marks
        a pair with no stored similarity.

    Returns
    -------
    pd.Series
        One entry per unrated movie, named like ``new_user``. The value is NaN
        when no prediction can be made (no rated neighbour, or the neighbour
        similarities sum to zero).
    """
    rated = new_user[new_user.notna()].index

    # Begin with "no prediction" everywhere, keeping the user's movie order so
    # that ranking sees the predictions in the same order as the rating vector.
    predictions = pd.Series(
        np.nan,
        index=new_user.index.union(S.index, sort=False),
        dtype=float,
        name=new_user.name,
    )

    for movie in S.index:
        if movie in rated:
            continue

        neighbours = S.loc[movie].dropna()             # movies with a stored similarity
        usable = neighbours.index.intersection(rated)  # ...that the user also rated
        weights = neighbours[usable]

        total_weight = np.sum(weights)
        # An empty neighbourhood sums to 0; leave the prediction as NaN then.
        if total_weight != 0:
            predictions[movie] = np.sum(weights * new_user[usable]) / total_weight

    return predictions.drop(rated)


def _top_recommendations(predicted, top_n=10, fallback=None):
    """Rank predictions and return the best ``top_n`` as a Series.

    NaN predictions are discarded before ranking, so they can never occupy a
    slot in the list. If that leaves fewer than ``top_n`` movies and
    ``fallback`` is given, the list is padded with the first movies from
    ``fallback`` that are unrated (i.e. present in ``predicted.index``) and
    not already recommended. Padded entries carry a NaN score to mark them as
    non-personalised suggestions.

    Parameters
    ----------
    predicted : pd.Series
        Predicted ratings for the unrated movies (NaN = no prediction).
    top_n : int
        Number of movies to return.
    fallback : iterable of movie ids, optional
        Movies in order of preference, used only for padding.
    """
    ranked = predicted.dropna().sort_values(ascending=False).head(top_n)

    shortfall = top_n - len(ranked)
    if shortfall > 0 and fallback is not None:
        taken = set(ranked.index)
        extras = []
        for movie in fallback:
            if len(extras) >= shortfall:
                break
            if movie in predicted.index and movie not in taken:
                extras.append(movie)
                taken.add(movie)
        if extras:
            # Reindexing onto the extended label list appends the extras with NaN scores.
            ranked = ranked.reindex(ranked.index.tolist() + extras)

    return ranked


def myIBCF(new_user, S, top_n=10, fallback=None):
    """Recommend ``top_n`` unrated movies for ``new_user`` with item-based CF.

    Parameters
    ----------
    new_user : pd.Series
        Ratings indexed by movie id; NaN marks a movie the user has not rated.
    S : pd.DataFrame
        Movie-by-movie similarity matrix (NaN = no stored similarity).
    top_n : int
        Number of recommendations to return.
    fallback : iterable of movie ids, optional
        Ordered list used to pad the result when fewer than ``top_n``
        predictions are available.

    Returns
    -------
    pd.Series
        Movie ids (index) with predicted ratings (values), best first.
        Entries taken from ``fallback`` have a NaN rating.
    """
    return _top_recommendations(_predict_unrated(new_user, S), top_n=top_n, fallback=fallback)


w = R.loc['u1181']
all_movies = S.index

rated_movies = w[w.notna()].index

# Fallback ranking for users with too few computable predictions:
# movies ordered by how many users rated them (non-NaN entries per column of R).
popular_movies = R.count().sort_values(ascending=False).index

# Computed in two steps (rather than via myIBCF) so the full prediction vector
# stays available without running the prediction loop twice.
predicted_ratings = _predict_unrated(w, S)
final_recommendation = _top_recommendations(predicted_ratings, top_n=10, fallback=popular_movies)

In [5]:
# Show the recommended movie ids (index) together with their predicted ratings (values).
final_recommendation

m3732    5.000000
m749     4.526559
m3899    4.526066
m3789    4.000000
m504     4.000000
m1235    4.000000
m249     4.000000
m1914    4.000000
m1253    4.000000
m2793    4.000000
Name: u1181, dtype: float64