In [85]:
import pandas as pd

In [86]:
ratings = pd.read_csv('data/ratings.dat', sep='::', engine = 'python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
movies = pd.read_csv('data/movies.dat', sep='::', engine = 'python',
                     encoding="ISO-8859-1", header = None)
movies.columns = ['MovieID', 'Title', 'Genres']

In [87]:
ratings['UserID'].nunique()

6040

In [88]:
movies['MovieID'].nunique()

3883

In [89]:
item_feature_matrix = ratings.pivot_table(index='MovieID', columns='UserID', values='Rating')
item_feature_matrix.head()

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,4.0,5.0,5.0,...,,4.0,,,4.0,,,,,3.0
2,,,,,,,,,,5.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,3.0,,,...,,,,,2.0,2.0,,,,
5,,,,,,,,,,,...,,,,,1.0,,,,,


In [90]:
mean_ratings = item_feature_matrix.mean(axis=1, skipna=True)
mean_ratings.head()

MovieID
1    4.146846
2    3.201141
3    3.016736
4    2.729412
5    3.006757
dtype: float64

In [91]:
normalized_ratings = item_feature_matrix.sub(mean_ratings, axis=0)
normalized_ratings.fillna(0, inplace=True)
normalized_ratings.head()

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.853154,0.0,0.0,0.0,0.0,-0.146846,0.0,-0.146846,0.853154,0.853154,...,0.0,-0.146846,0.0,0.0,-0.146846,0.0,0.0,0.0,0.0,-1.146846
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.798859,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-2.016736,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.270588,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.729412,-0.729412,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-2.006757,0.0,0.0,0.0,0.0,0.0


In [121]:
import numpy as np
from joblib import Parallel, delayed
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
def compute_similarity_matrix(centered_rating_matrix, top_n=None, n_jobs=-1):
    """
    Compute the item-item similarity matrix for a given matrix of centered ratings using sparse matrices and parallel computation.

    Parameters:
    - centered_rating_matrix: pd.DataFrame, a DataFrame where rows represent movies,
      columns represent users, and values represent centered ratings.
    - top_n: int, the number of most similar items to keep for each item.
    - n_jobs: int, the number of jobs to run in parallel. -1 means using all processors.

    Returns:
    - pd.DataFrame, the similarity matrix with movies as both rows and columns, containing top N similarities.
    """
    # Convert the centered rating matrix to a sparse CSR matrix
    sparse_rating_matrix = csr_matrix(centered_rating_matrix.fillna(0))

    # Compute the cosine similarity matrix, which will also be in sparse format
    cosine_sim_sparse = cosine_similarity(sparse_rating_matrix, dense_output=False)

    # Scale the similarity values to be between 0 and 1 directly on the sparse matrix
    cosine_sim_sparse.data = 0.5 + 0.5 * cosine_sim_sparse.data

    # Convert the sparse similarity matrix to a dense DataFrame
    similarity_matrix_dense = pd.DataFrame(cosine_sim_sparse.todense(),
                                           index=centered_rating_matrix.index,
                                           columns=centered_rating_matrix.index)

    # Create a binary user-movie matrix
    user_movie_matrix = (centered_rating_matrix != 0).astype(int)

    # Compute the common ratings mask
    common_ratings_mask = user_movie_matrix.T.dot(user_movie_matrix) >= 3

    # Apply the mask to set values to NaN where the condition is not met
    similarity_matrix_dense = similarity_matrix_dense.where(common_ratings_mask, np.nan)

    if top_n is None:
        return similarity_matrix_dense

    # Define function to be parallelized for extracting the top N similar movies
    def extract_top_n(similarity_series):
        non_nan_series = similarity_series.dropna()
        if len(non_nan_series) < top_n:
            return non_nan_series.index.tolist() + [np.nan] * (top_n - len(non_nan_series))
        return non_nan_series.nlargest(top_n).tolist()

    # Extract the top N similar movies for each movie using parallel computation
    results = Parallel(n_jobs=n_jobs)(delayed(extract_top_n)(similarity_matrix_dense.loc[movie_id])
                                      for movie_id in similarity_matrix_dense.index)

    # Construct the final DataFrame with the top N similarities for each movie
    similarity_matrix_top_n = pd.DataFrame(results, index=centered_rating_matrix.index, columns=range(top_n))

    return similarity_matrix_top_n


In [122]:
top_30_similarity_matrix = compute_similarity_matrix(normalized_ratings, top_n=30)

## Display the pairwise similarity values from the S
## matrix with movie ID 1, 10, 100, 1510, 260, and 3212

In [123]:

specified_movies = [1, 10, 100, 1510, 260, 3212]
top_30_similarity_matrix.loc[specified_movies].round(7)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.67965,0.631565,0.620909,0.608463,0.598782,0.58841,0.5847,0.583637,0.578165,...,0.564597,0.563764,0.563369,0.560699,0.560594,0.560391,0.559671,0.559486,0.559485,0.558081
10,1.0,0.722866,0.669208,0.62708,0.61975,0.61742,0.609215,0.608969,0.607427,0.607246,...,0.593867,0.593278,0.593148,0.590649,0.590347,0.590138,0.590091,0.589431,0.588898,0.58771
100,1.0,0.635303,0.615785,0.609417,0.607869,0.594638,0.594057,0.581703,0.581676,0.581057,...,0.572211,0.571587,0.570265,0.57002,0.568915,0.568709,0.568583,0.567979,0.567846,0.567522
1510,1.0,0.99099,0.933013,0.933013,0.933013,0.853553,0.826797,0.788675,0.788675,0.780224,...,0.718089,0.714835,0.713497,0.702777,0.702777,0.702031,0.700033,0.693649,0.686052,0.679605
260,1.0,0.734181,0.686621,0.637557,0.603056,0.583446,0.578629,0.57565,0.575426,0.575278,...,0.559066,0.55752,0.553382,0.551599,0.551414,0.551269,0.549415,0.549279,0.548992,0.548812
3212,1.0,1.0,0.806186,0.788675,0.75,0.75,0.75,0.75,0.644338,0.639525,...,0.601827,0.594641,0.594611,0.594501,0.59417,0.592205,0.590139,0.589009,0.586704,0.586613


In [178]:
similarity_matrix = compute_similarity_matrix(normalized_ratings)

In [179]:
# convert 0 to nan
similarity_matrix = pd.DataFrame(np.where(similarity_matrix == 0, np.nan, similarity_matrix),
                                 index=similarity_matrix.index,
                                 columns=similarity_matrix.columns)

In [201]:
similarity_matrix.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.537607,0.522879,0.536191,0.521466,0.509518,,0.511407,0.498038,0.527814,...,0.493622,0.500337,0.508483,0.505667,0.5162,0.522014,0.516212,0.506648,0.505805,0.514216
2,0.537607,1.0,0.517747,0.505575,0.543893,0.513145,0.531335,0.528447,0.536256,0.552714,...,0.493723,0.49569,0.507821,0.505402,0.49702,0.52579,0.503044,0.493918,0.496796,0.502759
3,0.522879,0.517747,1.0,0.52768,0.565521,0.522627,0.535027,0.516557,0.513039,0.53621,...,0.501359,0.505299,0.492101,0.507501,0.508046,0.529197,0.500322,0.51309,0.508881,0.507372
4,0.536191,0.505575,0.52768,1.0,0.562948,,0.508397,0.503695,0.475853,0.499224,...,,,0.504792,0.507342,0.497206,,0.473614,,0.494245,
5,0.521466,0.543893,0.565521,0.562948,1.0,0.508942,0.541223,0.525349,0.542284,0.526204,...,0.491023,0.51372,0.50014,0.507307,0.514429,0.53387,0.491736,0.506181,0.500081,0.498425


In [211]:
def recommend_movies(new_user_ratings, similarity, n_recommendations=10):



    # set new user ratings to nan if it is 0
    new_user_ratings = np.where(new_user_ratings == 0, np.nan, new_user_ratings)

    # Convert the user ratings array into a Series with the movie index
    user_ratings_series = pd.Series(new_user_ratings, index=similarity.columns)
    # Initialize a list to hold the similarity scores for unrated movies
    similarity_scores = []

    # Iterate through the similarity matrix
    for movie_id, similar_movies in similarity.iterrows():
        # Skip movies the user has already rated
        if user_ratings_series[movie_id] > 0:
            continue

        # Retrieve the user ratings for the similar movies
        user_ratings = user_ratings_series[similar_movies.index].values
        # Calculate the weighted score (similarity score * user rating)
        weighted_scores = similar_movies.values * user_ratings
        # Sum the weighted scores and normalize by the sum of the similarities for rated movies
        sum_similarity = np.nansum(similar_movies.values)
        if sum_similarity > 0:
            weighted_score = np.nansum(weighted_scores) / sum_similarity
            similarity_scores.append((movie_id, weighted_score))

    # Sort the movies based on the weighted score and select the top recommendations
    similarity_scores.sort(key=lambda x: x[1], reverse=True)
    top_movie_ids = [movie_id for movie_id, _ in similarity_scores[:n_recommendations]]
    # print(similarity_scores)
    return top_movie_ids

In [221]:
# generate a vector 1 x 3706, which represents the rating of user 1 to all movies
user_1 = ratings[ratings['UserID'] == 1]
user_1_ratings = user_1.set_index('MovieID')['Rating']
user_1_ratings = user_1_ratings.reindex(range(1, 3707), fill_value=0)
user_1_ratings.head()

MovieID
1    5
2    0
3    0
4    0
5    0
Name: Rating, dtype: int64

In [222]:
user_1_ratings.values.reshape(1,-1).shape

(1, 3706)

In [223]:
recommend_movies(user_1_ratings.values, similarity_matrix)

[1039, 3485, 3373, 889, 706, 3762, 1555, 3603, 404, 3899]

In [224]:
# get the movie titles by movie ids
def get_movie_titles(movie_ids):
    return movies[movies['MovieID'].isin(movie_ids)]

In [225]:
user_1_recommend = get_movie_titles(recommend_movies(user_1_ratings.values, similarity_matrix, n_recommendations=10))
user_1_recommend

Unnamed: 0,MovieID,Title,Genres
400,404,Brother Minister: The Assassination of Malcolm...,Documentary
697,706,Sunset Park (1996),Drama
877,889,1-900 (1994),Romance
1026,1039,Synthetic Pleasures (1995),Documentary
1516,1555,"To Have, or Not (1995)",Drama
3304,3373,Buck and the Preacher (1972),Western
3416,3485,Autopsy (Macchie Solari) (1975),Horror
3534,3603,"Gay Deceivers, The (1969)",Comedy
3693,3762,Daughter of Dr. Jeckyll (1957),Horror
3829,3899,Circus (2000),Comedy


In [226]:
# get 10 movies are rated by user 1, sorted by rating
user_1_watched = user_1[user_1['MovieID'].isin(range(1, 3707))].sort_values(by='Rating', ascending=False)
# get movies info
user_1_movie = get_movie_titles(user_1_watched['MovieID'].values)
user_1_movie

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
47,48,Pocahontas (1995),Animation|Children's|Musical|Romance
148,150,Apollo 13 (1995),Drama
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
523,527,Schindler's List (1993),Drama|War
527,531,"Secret Garden, The (1993)",Children's|Drama
584,588,Aladdin (1992),Animation|Children's|Comedy|Musical
590,594,Snow White and the Seven Dwarfs (1937),Animation|Children's|Musical
591,595,Beauty and the Beast (1991),Animation|Children's|Musical
604,608,Fargo (1996),Crime|Drama|Thriller
