In [61]:
import pandas as pd

In [62]:
ratings = pd.read_csv('data/ratings.dat', sep='::', engine = 'python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
movies = pd.read_csv(r'D:\UIUC\CSE_598_Statistical_Learning\MP\CS-598-Statistical_Learning\CS_598_Project_4\data\movies.dat', sep='::', engine = 'python',
                     encoding="ISO-8859-1", header = None)
movies.columns = ['MovieID', 'Title', 'Genres']

In [63]:
movies

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [64]:
item_feature_matrix = ratings.pivot_table(index='MovieID', columns='UserID', values='Rating')
item_feature_matrix

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,4.0,5.0,5.0,...,,4.0,,,4.0,,,,,3.0
2,,,,,,,,,,5.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,3.0,,,...,,,,,2.0,2.0,,,,
5,,,,,,,,,,,...,,,,,1.0,,,,,


In [84]:
item_feature_matrix

(3706,)

In [85]:
mean_ratings = item_feature_matrix.mean(axis=0, skipna=True)
mean_ratings.shape

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.853154,,,,,0.121277,,0.985294,2.343137,1.459459,...,,,,,,,,,,
2,,,,,,,,,,1.459459,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,-0.014706,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [67]:
normalized_ratings = item_feature_matrix.sub(mean_ratings, axis=1)
# if the rating is negative, set it to 0
# normalized_ratings = normalized_ratings.clip(lower=0)
normalized_ratings.head()

1470   -2.0
Name: 1510, dtype: float64

In [68]:
# count number of not null values for 1510
not_null_count = normalized_ratings.loc[1510].notnull()
normalized_ratings.loc[1510, not_null_count]

In [69]:
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed

def compute_similarity_matrix(rating, top_n=None, n_jobs=-1):
    """
    Compute the item-item similarity matrix for a given matrix of centered ratings using sparse matrices and parallel computation.

    Parameters:
    - centered_rating_matrix: pd.DataFrame, a DataFrame where rows represent movies,
      columns represent users, and values represent centered ratings.
    - top_n: int, the number of most similar items to keep for each item.
    - n_jobs: int, the number of jobs to run in parallel. -1 means using all processors.

    Returns:
    - pd.DataFrame or scipy.sparse matrix, the similarity matrix with movies as both rows and columns, containing top N similarities.
    """
    # for each movie, if user has rated it, set it to 1, otherwise set it to 0
    rating = rating.fillna(0)
    binary_rating = pd.DataFrame(np.where(rating > 0, 1, 0), index=rating.index, columns=rating.columns)
    import torch
    binary_tensor = torch.tensor(binary_rating.values, dtype=torch.float32)
    binary_tensor = binary_tensor.cuda()

    rating_count = binary_tensor.matmul(binary_tensor.t())
    rating_count = rating_count.cpu().numpy()

    rating_count = np.where(rating_count < 3, 0, 1)
    cosine_sim_df = pd.DataFrame(index=rating.index, columns=rating.index)

    def compute_similarity(i, j, rating_matrix, rating_count):
        if rating_count[i, j]:
            vec_i = rating_matrix.iloc[i]
            vec_j = rating_matrix.iloc[j]
            similarity = np.dot(vec_i, vec_j) / (np.linalg.norm(vec_i) * np.linalg.norm(vec_j))
        else:
            similarity = 0
        return i, j, similarity

    # Parallel computation of cosine similarity
    results = Parallel(n_jobs=-1)(delayed(compute_similarity)(i, j, rating, rating_count)
                                  for i in range(len(rating))
                                  for j in range(i, len(rating)))

    # Fill the DataFrame with the computed similarities
    for i, j, similarity in results:
        cosine_sim_df.iloc[i, j] = similarity
        cosine_sim_df.iloc[j, i] = similarity  # symmetry
    if top_n is None:
        return cosine_sim_df

    # # Convert the sparse similarity matrix to a dense DataFrame for further processing
    # similarity_matrix_dense = pd.DataFrame(cosine_sim_sparse.todense(),
    #                                        index=centered_rating_matrix.index,
    #                                        columns=centered_rating_matrix.index)
    #
    # # Create a binary user-movie matrix
    # user_movie_matrix = (centered_rating_matrix != 0).astype(int)
    #
    # # Compute the common ratings mask
    # common_ratings_mask = user_movie_matrix.T.dot(user_movie_matrix) >= 3
    #
    # # Apply the mask to set values to NaN where the condition is not met
    # similarity_matrix_dense.where(common_ratings_mask, np.nan, inplace=True)
    #
    # # Define function to be parallelized for extracting the top N similar movies
    # def extract_top_n(similarity_series):
    #     non_nan_series = similarity_series.dropna()
    #     if len(non_nan_series) < top_n:
    #         return non_nan_series.index.tolist() + [np.nan] * (top_n - len(non_nan_series))
    #     return non_nan_series.nlargest(top_n).tolist()
    #
    # # Extract the top N similar movies for each movie using parallel computation
    # results = Parallel(n_jobs=n_jobs)(delayed(extract_top_n)(similarity_matrix_dense.loc[movie_id])
    #                                   for movie_id in similarity_matrix_dense.index)
    #
    # # Construct the final DataFrame with the top N similarities for each movie
    # similarity_matrix_top_n = pd.DataFrame(results, index=centered_rating_matrix.index, columns=range(top_n))

    # return similarity_matrix_top_n


In [84]:
# top_30_similarity_matrix = compute_similarity_matrix(normalized_ratings, top_n=30)

## Display the pairwise similarity values from the S
## matrix with movie ID 1, 10, 100, 1510, 260, and 3212

In [71]:

specified_movies = [1, 10, 100, 1510, 260, 3212]
# top_30_similarity_matrix.loc[specified_movies].round(7)

(3706, 6040)

In [72]:
normalized_ratings = normalized_ratings
normalized_ratings.shape

Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
       ...
       3943, 3944, 3945, 3946, 3947, 3948, 3949, 3950, 3951, 3952],
      dtype='int64', name='MovieID', length=3706)

In [73]:
normalized_ratings.index

In [74]:
similarity_matrix = compute_similarity_matrix(normalized_ratings)

In [75]:
similarity_matrix = similarity_matrix.replace(0, np.nan)
similarity_matrix = 0.5 + 0.5 * similarity_matrix

In [76]:
# similarity_matrix = (similarity_matrix - 0.5) / similarity_matrix

MovieID,1,10,100,1510,260,3212
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.0,0.602455,0.515104,,0.707566,
10,0.602455,1.0,0.550467,,0.612781,
100,0.515104,0.550467,1.0,,0.497183,
1510,,,,,,
260,0.707566,0.612781,0.497183,,1.0,
3212,,,,,,


In [77]:
np.linalg.norm([-3, 4], ord=2)

In [78]:
# convert to dense matrix
# replace 0 to nan
specified_movies = [1, 10, 100, 1510, 260, 3212]
similarity_matrix.loc[specified_movies, specified_movies].round(7)

In [79]:
# # save sparse matrix
# from scipy.sparse import save_npz
# save_npz('data/similarity_matrix.npz', similarity_matrix)

MovieID
1    5
2    0
3    0
4    0
5    0
Name: Rating, dtype: int64

In [80]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

def recommend_movies(new_user_ratings, similarity_sparse, n_recommendations=10):
    """
    Generate movie recommendations based on new user ratings and a sparse similarity matrix.

    Parameters:
    - new_user_ratings: np.array, user's ratings for movies; 0 indicates the movie hasn't been rated.
    - similarity_sparse: scipy.sparse matrix, item-item similarity matrix in sparse format.
    - n_recommendations: int, the number of recommendations to return.

    Returns:
    - List of movie indices representing the top N recommendations.
    """

    # Validate the shape of new_user_ratings
    if new_user_ratings.shape[0] != similarity_sparse.shape[0]:
        raise ValueError("The length of new_user_ratings must match the size of similarity matrix.")

    # Convert new user ratings to NaN if 0 (user hasn't rated the movie)
    user_ratings = np.where(new_user_ratings == 0, np.nan, new_user_ratings)

    # Filter out movies the user has already rated
    unrated_movies_mask = np.isnan(user_ratings)

    # Extract the similarity scores for unrated movies
    unrated_similarity = similarity_sparse[unrated_movies_mask, :]

    # Calculate the weighted scores using matrix multiplication
    weighted_scores = unrated_similarity.dot(user_ratings)

    # Normalize by the sum of the similarities for rated movies
    sum_similarity = unrated_similarity.sum(axis=1).A1  # Convert to 1D array
    valid_mask = sum_similarity > 0
    normalized_scores = np.divide(weighted_scores, sum_similarity, where=valid_mask)

    # Select top N recommendations
    top_movie_indices = np.argsort(-normalized_scores)[:n_recommendations]

    return top_movie_indices.tolist()


(3706,)

In [81]:
# generate a vector 1 x 3706, which represents the rating of user 1 to all movies
user_1 = ratings[ratings['UserID'] == 1]
user_1_ratings = user_1.set_index('MovieID')['Rating']
user_1_ratings = user_1_ratings.reindex(range(1, 3707), fill_value=0)
user_1_ratings.head()

InvalidIndexError: (array([False,  True,  True, ...,  True,  True,  True]), slice(None, None, None))

In [None]:
user_1_ratings.values.shape

In [None]:
recommend_movies(user_1_ratings.values, similarity_matrix)

In [None]:
# get the movie titles by movie ids
def get_movie_titles(movie_ids):
    return movies[movies['MovieID'].isin(movie_ids)]

In [None]:
user_1_recommend = get_movie_titles(recommend_movies(user_1_ratings.values, similarity_matrix, n_recommendations=50))
user_1_recommend

In [None]:
# get 10 movies are rated by user 1, sorted by rating
user_1_watched = user_1[user_1['MovieID'].isin(range(1, 3707))].sort_values(by='Rating', ascending=False)
# get movies info
user_1_movie = get_movie_titles(user_1_watched['MovieID'].values)
user_1_movie