In [2]:
import pandas as pd

In [8]:
ratings = pd.read_csv('data/ratings.dat', sep='::', engine = 'python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
movies = pd.read_csv(r'D:\UIUC\CSE_598_Statistical_Learning\MP\CS-598-Statistical_Learning\CS_598_Project_4\data\movies.dat', sep='::', engine = 'python',
                     encoding="ISO-8859-1", header = None)
movies.columns = ['MovieID', 'Title', 'Genres']

In [9]:
movies

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [4]:
ratings['UserID'].nunique()

6040

In [5]:
movies['MovieID'].nunique()

3883

In [29]:
item_feature_matrix = ratings.pivot_table(index='MovieID', columns='UserID', values='Rating')
item_feature_matrix.head()

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,4.0,5.0,5.0,...,,4.0,,,4.0,,,,,3.0
2,,,,,,,,,,5.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,3.0,,,...,,,,,2.0,2.0,,,,
5,,,,,,,,,,,...,,,,,1.0,,,,,


In [30]:
mean_ratings = item_feature_matrix.mean(axis=1, skipna=True)
mean_ratings.head()

MovieID
1    4.146846
2    3.201141
3    3.016736
4    2.729412
5    3.006757
dtype: float64

In [31]:
normalized_ratings = item_feature_matrix.sub(mean_ratings, axis=0)
normalized_ratings.head()

UserID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.853154,,,,,-0.146846,,-0.146846,0.853154,0.853154,...,,-0.146846,,,-0.146846,,,,,-1.146846
2,,,,,,,,,,1.798859,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,-2.016736,,,,,
4,,,,,,,,0.270588,,,...,,,,,-0.729412,-0.729412,,,,
5,,,,,,,,,,,...,,,,,-2.006757,,,,,


In [32]:
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity_matrix(centered_rating_matrix, top_n=None, n_jobs=-1):
    """
    Compute the item-item similarity matrix for a given matrix of centered ratings using sparse matrices and parallel computation.

    Parameters:
    - centered_rating_matrix: pd.DataFrame, a DataFrame where rows represent movies,
      columns represent users, and values represent centered ratings.
    - top_n: int, the number of most similar items to keep for each item.
    - n_jobs: int, the number of jobs to run in parallel. -1 means using all processors.

    Returns:
    - pd.DataFrame or scipy.sparse matrix, the similarity matrix with movies as both rows and columns, containing top N similarities.
    """
    # Convert the centered rating matrix to a sparse CSR matrix
    sparse_rating_matrix = csr_matrix(centered_rating_matrix.fillna(0))

    # Compute the cosine similarity matrix, which will also be in sparse format
    cosine_sim_sparse = cosine_similarity(sparse_rating_matrix, dense_output=False)

    # Scale the similarity values to be between 0 and 1 directly on the sparse matrix
    cosine_sim_sparse.data = 0.5 + 0.5 * cosine_sim_sparse.data

    # If top_n is None, return the sparse similarity matrix as is
    if top_n is None:
        return cosine_sim_sparse

    # Convert the sparse similarity matrix to a dense DataFrame for further processing
    similarity_matrix_dense = pd.DataFrame(cosine_sim_sparse.todense(),
                                           index=centered_rating_matrix.index,
                                           columns=centered_rating_matrix.index)

    # Create a binary user-movie matrix
    user_movie_matrix = (centered_rating_matrix != 0).astype(int)

    # Compute the common ratings mask
    common_ratings_mask = user_movie_matrix.T.dot(user_movie_matrix) >= 3

    # Apply the mask to set values to NaN where the condition is not met
    similarity_matrix_dense.where(common_ratings_mask, np.nan, inplace=True)

    # Define function to be parallelized for extracting the top N similar movies
    def extract_top_n(similarity_series):
        non_nan_series = similarity_series.dropna()
        if len(non_nan_series) < top_n:
            return non_nan_series.index.tolist() + [np.nan] * (top_n - len(non_nan_series))
        return non_nan_series.nlargest(top_n).tolist()

    # Extract the top N similar movies for each movie using parallel computation
    results = Parallel(n_jobs=n_jobs)(delayed(extract_top_n)(similarity_matrix_dense.loc[movie_id])
                                      for movie_id in similarity_matrix_dense.index)

    # Construct the final DataFrame with the top N similarities for each movie
    similarity_matrix_top_n = pd.DataFrame(results, index=centered_rating_matrix.index, columns=range(top_n))

    return similarity_matrix_top_n


In [33]:
top_30_similarity_matrix = compute_similarity_matrix(normalized_ratings, top_n=30)

## Display the pairwise similarity values from the S
## matrix with movie ID 1, 10, 100, 1510, 260, and 3212

In [34]:

specified_movies = [1, 10, 100, 1510, 260, 3212]
top_30_similarity_matrix.loc[specified_movies].round(7)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.67965,0.631565,0.620909,0.608463,0.602423,0.598782,0.591427,0.58841,0.586603,...,0.570234,0.568234,0.568071,0.567276,0.566814,0.564763,0.564627,0.564597,0.563764,0.563369
10,1.0,0.722866,0.669208,0.62708,0.61975,0.61742,0.609215,0.608969,0.607427,0.607246,...,0.593867,0.593278,0.593148,0.590649,0.590347,0.590138,0.590091,0.589431,0.588898,0.58771
100,1.0,0.635303,0.615785,0.609417,0.607869,0.596375,0.594638,0.594057,0.581703,0.581676,...,0.57347,0.572971,0.572868,0.572211,0.571587,0.570265,0.57002,0.568915,0.568709,0.568583
1510,1.0,0.99099,0.933013,0.933013,0.933013,0.853553,0.826797,0.788675,0.788675,0.788675,...,0.75,0.75,0.75,0.75,0.75,0.75,0.75,0.75,0.75,0.75
260,1.0,0.734181,0.686621,0.637557,0.603056,0.583446,0.579363,0.578629,0.57565,0.575426,...,0.560232,0.559597,0.559066,0.55752,0.553382,0.551599,0.551414,0.551269,0.549415,0.549279
3212,1.0,1.0,0.806186,0.788675,0.755155,0.75,0.75,0.75,0.75,0.75,...,0.625,0.621028,0.619737,0.61547,0.610138,0.609272,0.609109,0.602142,0.602062,0.601827


In [35]:
similarity_matrix = compute_similarity_matrix(normalized_ratings)

In [36]:
# save sparse matrix
from scipy.sparse import save_npz
save_npz('data/similarity_matrix.npz', similarity_matrix)

In [37]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

def recommend_movies(new_user_ratings, similarity_sparse, n_recommendations=10):
    """
    Generate movie recommendations based on new user ratings and a sparse similarity matrix.

    Parameters:
    - new_user_ratings: np.array, user's ratings for movies; 0 indicates the movie hasn't been rated.
    - similarity_sparse: scipy.sparse matrix, item-item similarity matrix in sparse format.
    - n_recommendations: int, the number of recommendations to return.

    Returns:
    - List of movie indices representing the top N recommendations.
    """

    # Validate the shape of new_user_ratings
    if new_user_ratings.shape[0] != similarity_sparse.shape[0]:
        raise ValueError("The length of new_user_ratings must match the size of similarity matrix.")

    # Convert new user ratings to NaN if 0 (user hasn't rated the movie)
    user_ratings = np.where(new_user_ratings == 0, np.nan, new_user_ratings)

    # Filter out movies the user has already rated
    unrated_movies_mask = np.isnan(user_ratings)

    # Extract the similarity scores for unrated movies
    unrated_similarity = similarity_sparse[unrated_movies_mask, :]

    # Calculate the weighted scores using matrix multiplication
    weighted_scores = unrated_similarity.dot(user_ratings)

    # Normalize by the sum of the similarities for rated movies
    sum_similarity = unrated_similarity.sum(axis=1).A1  # Convert to 1D array
    valid_mask = sum_similarity > 0
    normalized_scores = np.divide(weighted_scores, sum_similarity, where=valid_mask)

    # Select top N recommendations
    top_movie_indices = np.argsort(-normalized_scores)[:n_recommendations]

    return top_movie_indices.tolist()


In [38]:
# generate a vector 1 x 3706, which represents the rating of user 1 to all movies
user_1 = ratings[ratings['UserID'] == 1]
user_1_ratings = user_1.set_index('MovieID')['Rating']
user_1_ratings = user_1_ratings.reindex(range(1, 3707), fill_value=0)
user_1_ratings.head()

MovieID
1    5
2    0
3    0
4    0
5    0
Name: Rating, dtype: int64

In [39]:
user_1_ratings.values.shape

(3706,)

In [40]:
recommend_movies(user_1_ratings.values, similarity_matrix)

[1826, 381, 383, 385, 1468, 3245, 2254, 3208, 1420, 3173]

In [41]:
# get the movie titles by movie ids
def get_movie_titles(movie_ids):
    return movies[movies['MovieID'].isin(movie_ids)]

In [42]:
user_1_recommend = get_movie_titles(recommend_movies(user_1_ratings.values, similarity_matrix, n_recommendations=50))
user_1_recommend

Unnamed: 0,MovieID,Title,Genres
214,216,Billy Madison (1995),Comedy
377,381,When a Man Loves a Woman (1994),Drama
379,383,Wyatt Earp (1994),Western
381,385,"Man of No Importance, A (1994)",Drama
521,525,"Saint of Fort Washington, The (1993)",Drama
552,556,"War Room, The (1993)",Documentary
555,559,"Paris, France (1993)",Comedy
558,562,Welcome to the Dollhouse (1995),Comedy|Drama
560,564,Chasers (1994),Comedy
607,611,Hellraiser: Bloodline (1996),Action|Horror|Sci-Fi


In [43]:
# get 10 movies are rated by user 1, sorted by rating
user_1_watched = user_1[user_1['MovieID'].isin(range(1, 3707))].sort_values(by='Rating', ascending=False)
# get movies info
user_1_movie = get_movie_titles(user_1_watched['MovieID'].values)
user_1_movie

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
47,48,Pocahontas (1995),Animation|Children's|Musical|Romance
148,150,Apollo 13 (1995),Drama
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
523,527,Schindler's List (1993),Drama|War
527,531,"Secret Garden, The (1993)",Children's|Drama
584,588,Aladdin (1992),Animation|Children's|Comedy|Musical
590,594,Snow White and the Seven Dwarfs (1937),Animation|Children's|Musical
591,595,Beauty and the Beast (1991),Animation|Children's|Musical
604,608,Fargo (1996),Crime|Drama|Thriller
