In [86]:
import torch
import pandas as pd


In [87]:
ratings = pd.read_csv('data/ratings.dat', sep='::', engine = 'python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
movies = pd.read_csv('data/movies.dat', sep='::', engine = 'python',
                     encoding="ISO-8859-1", header = None)
movies.columns = ['MovieID', 'Title', 'Genres']

In [88]:
movies

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [89]:
item_feature_matrix = ratings.pivot_table(index='MovieID', columns='UserID', values='Rating')
item_feature_matrix

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,4.0,5.0,5.0,...,,4.0,,,4.0,,,,,3.0
2,,,,,,,,,,5.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,3.0,,,...,,,,,2.0,2.0,,,,
5,,,,,,,,,,,...,,,,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,,,,,,,,,3.0,4.0,...,,,,,,,,,,
3949,,,,,,,,,,,...,,,,,,,,,,
3950,,,,,,,,,,,...,,,,,,,,,,
3951,,,,,,,,,,,...,,,,,,,,,,


In [90]:
item_feature_matrix

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,4.0,5.0,5.0,...,,4.0,,,4.0,,,,,3.0
2,,,,,,,,,,5.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,3.0,,,...,,,,,2.0,2.0,,,,
5,,,,,,,,,,,...,,,,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,,,,,,,,,3.0,4.0,...,,,,,,,,,,
3949,,,,,,,,,,,...,,,,,,,,,,
3950,,,,,,,,,,,...,,,,,,,,,,
3951,,,,,,,,,,,...,,,,,,,,,,


In [91]:
mean_ratings = item_feature_matrix.mean(axis=0, skipna=True)
mean_ratings.shape

(6040,)

In [92]:
normalized_ratings = item_feature_matrix.sub(mean_ratings, axis=1)
# if the rating is negative, set it to 0
# normalized_ratings = normalized_ratings.clip(lower=0)
normalized_ratings.head()

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.811321,,,,,0.098592,,0.115108,1.264151,0.885287,...,,-0.134615,,,1.389286,,,,,-0.577713
2,,,,,,,,,,0.885287,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,-1.610714,,,,,
4,,,,,,,,-0.884892,,,...,,,,,-0.610714,-1.302928,,,,
5,,,,,,,,,,,...,,,,,-1.610714,,,,,


In [93]:
# count number of not null values for 1510
not_null_count = normalized_ratings.loc[1510].notnull()
normalized_ratings.loc[1510, not_null_count]

UserID
1470   -0.694366
4169   -0.551858
Name: 1510, dtype: float64

In [94]:
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed

def compute_similarity_matrix(rating, top_n=None, n_jobs=-1):
    """
    Compute the item-item similarity matrix for a given matrix of centered ratings using sparse matrices and parallel computation.

    Parameters:
    - centered_rating_matrix: pd.DataFrame, a DataFrame where rows represent movies,
      columns represent users, and values represent centered ratings.
    - top_n: int, the number of most similar items to keep for each item.
    - n_jobs: int, the number of jobs to run in parallel. -1 means using all processors.

    Returns:
    - pd.DataFrame or scipy.sparse matrix, the similarity matrix with movies as both rows and columns, containing top N similarities.
    """
    # for each movie, if user has rated it, set it to 1, otherwise set it to 0
    rating = rating.fillna(0)
    binary_rating = pd.DataFrame(np.where(rating > 0, 1, 0), index=rating.index, columns=rating.columns)
    
    binary_tensor = torch.tensor(binary_rating.values, dtype=torch.float32)
    #binary_tensor = binary_tensor.cuda()

    rating_count = binary_tensor.matmul(binary_tensor.t())
    rating_count = rating_count.cpu().numpy()

    rating_count = np.where(rating_count < 3, 0, 1)
    cosine_sim_df = pd.DataFrame(index=rating.index, columns=rating.index)

    def compute_similarity(i, j, rating_matrix, rating_count):
        if rating_count[i, j]:
            vec_i = rating_matrix.iloc[i]
            vec_j = rating_matrix.iloc[j]
            similarity = np.dot(vec_i, vec_j) / (np.linalg.norm(vec_i) * np.linalg.norm(vec_j))
        else:
            similarity = 0
        return i, j, similarity

    # Parallel computation of cosine similarity
    results = Parallel(n_jobs=-1)(delayed(compute_similarity)(i, j, rating, rating_count)
                                  for i in range(len(rating))
                                  for j in range(i, len(rating)))

    # results = []
    # # don't use parallel to compute cosine similarity
    # for i in range(len(rating)):
    #      for j in range(i, len(rating)):
    #         results.append(compute_similarity(i, j, rating, rating_count))

    # Fill the DataFrame with the computed similarities
    for i, j, similarity in results:
        cosine_sim_df.iloc[i, j] = similarity
        cosine_sim_df.iloc[j, i] = similarity  # symmetry
    if top_n is None:
        return cosine_sim_df

    # # Convert the sparse similarity matrix to a dense DataFrame for further processing
    # similarity_matrix_dense = pd.DataFrame(cosine_sim_sparse.todense(),
    #                                        index=centered_rating_matrix.index,
    #                                        columns=centered_rating_matrix.index)
    #
    # # Create a binary user-movie matrix
    # user_movie_matrix = (centered_rating_matrix != 0).astype(int)
    #
    # # Compute the common ratings mask
    # common_ratings_mask = user_movie_matrix.T.dot(user_movie_matrix) >= 3
    #
    # # Apply the mask to set values to NaN where the condition is not met
    # similarity_matrix_dense.where(common_ratings_mask, np.nan, inplace=True)
    #
    # # Define function to be parallelized for extracting the top N similar movies
    # def extract_top_n(similarity_series):
    #     non_nan_series = similarity_series.dropna()
    #     if len(non_nan_series) < top_n:
    #         return non_nan_series.index.tolist() + [np.nan] * (top_n - len(non_nan_series))
    #     return non_nan_series.nlargest(top_n).tolist()
    #
    # # Extract the top N similar movies for each movie using parallel computation
    # results = Parallel(n_jobs=n_jobs)(delayed(extract_top_n)(similarity_matrix_dense.loc[movie_id])
    #                                   for movie_id in similarity_matrix_dense.index)
    #
    # # Construct the final DataFrame with the top N similarities for each movie
    # similarity_matrix_top_n = pd.DataFrame(results, index=centered_rating_matrix.index, columns=range(top_n))

    # return similarity_matrix_top_n


In [95]:
# top_30_similarity_matrix = compute_similarity_matrix(normalized_ratings, top_n=30)

## Display the pairwise similarity values from the S
## matrix with movie ID 1, 10, 100, 1510, 260, and 3212

In [96]:

specified_movies = [1, 10, 100, 1510, 260, 3212]
# top_30_similarity_matrix.loc[specified_movies].round(7)

In [97]:
normalized_ratings = normalized_ratings
normalized_ratings.shape

(3706, 6040)

In [98]:
normalized_ratings.index

Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
       ...
       3943, 3944, 3945, 3946, 3947, 3948, 3949, 3950, 3951, 3952],
      dtype='int64', name='MovieID', length=3706)

In [99]:
similarity_matrix = compute_similarity_matrix(normalized_ratings)

KeyboardInterrupt: 

In [100]:
similarity_matrix = similarity_matrix.replace(0, np.nan)
similarity_matrix = 0.5 + 0.5 * similarity_matrix

In [None]:
# similarity_matrix = (similarity_matrix - 0.5) / similarity_matrix

In [101]:
np.linalg.norm([-3, 4], ord=2)

5.0

In [102]:
# convert to dense matrix
# replace 0 to nan
specified_movies = [1, 10, 100, 1510, 260, 3212]
similarity_matrix.loc[specified_movies, specified_movies].round(7)

MovieID,1,10,100,1510,260,3212
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.0,0.752754,0.738755,,0.820829,
10,0.752754,1.0,0.754089,,0.758474,
100,0.738755,0.754089,1.0,,0.735644,
1510,,,,,,
260,0.820829,0.758474,0.735644,,1.0,
3212,,,,,,


In [103]:
# # save sparse matrix
# from scipy.sparse import save_npz
# save_npz('data/similarity_matrix.npz', similarity_matrix)

In [150]:
import numpy as np
from scipy.sparse import csr_matrix

def recommend_movies(new_user_ratings, similarity_sparse, n_recommendations=10):
    """
    Generate movie recommendations based on new user ratings and a sparse similarity matrix.

    Parameters:
    - new_user_ratings: np.array, user's ratings for movies; 0 indicates the movie hasn't been rated.
    - similarity_sparse: scipy.sparse matrix, item-item similarity matrix in sparse format.
    - n_recommendations: int, the number of recommendations to return.

    Returns:
    - List of movie indices representing the top N recommendations.
    """

    # Validate the shape of new_user_ratings
    if new_user_ratings.shape[0] != similarity_sparse.shape[0]:
        raise ValueError("The length of new_user_ratings must match the size of similarity matrix.")

    # Convert new user ratings to NaN if 0 (user hasn't rated the movie)
    user_ratings = np.where(new_user_ratings == 0, np.nan, new_user_ratings)
    
    # Filter out movies the user has already rated
    unrated_movies_mask = np.isnan(user_ratings)

    unrated_similarity = similarity_sparse.iloc[unrated_movies_mask, :]

    # Calculate the weighted scores using matrix multiplication
    weighted_scores = unrated_similarity.dot(user_ratings)
    print(weighted_scores.shape)
    print(np.sum(np.isnan(weighted_scores)))

    # Normalize by the sum of the similarities for rated movies
    sum_similarity = unrated_similarity.sum(axis=1).values  # Convert to 1D array
    valid_mask = sum_similarity > 0
    normalized_scores = np.divide(weighted_scores, sum_similarity, where=valid_mask)

    # Select top N recommendations
    top_movie_indices = np.argsort(-normalized_scores)[:n_recommendations]

    return top_movie_indices.tolist()

recommend_movies(user_1_ratings.values, similarity_matrix)


(2260,)
2260


  return bound(*args, **kwds)


[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]

In [137]:
# generate a vector 1 x 3706, which represents the rating of user 1 to all movies
user_1 = ratings[ratings['UserID'] == 1181]
user_1_ratings = user_1.set_index('MovieID')['Rating']
user_1_ratings = user_1_ratings.reindex(range(1, 3707), fill_value=0)
print(user_1_ratings.head())
print(type(user_1_ratings))

MovieID
1    3
2    1
3    1
4    0
5    0
Name: Rating, dtype: int64
<class 'pandas.core.series.Series'>


In [121]:
user_1_ratings.values.shape

(3706,)

In [122]:

recommend_movies(user_1_ratings.values, similarity_matrix)

  return bound(*args, **kwds)


[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]

In [123]:
# get the movie titles by movie ids
def get_movie_titles(movie_ids):
    return movies[movies['MovieID'].isin(movie_ids)]

In [124]:
user_1_recommend = get_movie_titles(recommend_movies(user_1_ratings.values, similarity_matrix, n_recommendations=50))
user_1_recommend

  return bound(*args, **kwds)


Unnamed: 0,MovieID,Title,Genres
169,171,Jeffrey (1995),Comedy
170,172,Johnny Mnemonic (1995),Action|Sci-Fi|Thriller
171,173,Judge Dredd (1995),Action|Adventure|Sci-Fi


In [125]:
# get 10 movies are rated by user 1, sorted by rating
user_1_watched = user_1[user_1['MovieID'].isin(range(1, 3707))].sort_values(by='Rating', ascending=False)
# get movies info
user_1_movie = get_movie_titles(user_1_watched['MovieID'].values)
user_1_movie

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
...,...,...,...
3633,3702,Mad Max (1979),Action|Sci-Fi
3634,3703,Mad Max 2 (a.k.a. The Road Warrior) (1981),Action|Sci-Fi
3635,3704,Mad Max Beyond Thunderdome (1985),Action|Sci-Fi
3636,3705,Bird on a Wire (1990),Action|Adventure|Romance|Thriller
