In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Function to normalize the rating matrix by centering each row
def normalize_ratings_by_row_mean(matrix):
    row_means = matrix.mean(axis=1, skipna=True)  # Calculate row means excluding NaN values
    normalized_matrix = matrix.sub(row_means, axis=0)
    return normalized_matrix

In [3]:
# Read the R-matrix
Rmat = pd.read_csv('Rmat.csv', index_col=0)

# Normalize R-matrix
Rmat = normalize_ratings_by_row_mean(Rmat)

Rmat

Unnamed: 0,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,...,m3943,m3944,m3945,m3946,m3947,m3948,m3949,m3950,m3951,m3952
u1,0.811321,,,,,,,,,,...,,,,,,,,,,
u2,,,,,,,,,,,...,,,,,,,,,,
u3,,,,,,,,,,,...,,,,,,,,,,
u4,,,,,,,,,,,...,,,,,,,,,,
u5,,,,,,-1.146465,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
u6036,,,,-1.302928,,-0.302928,,,,,...,,,,,,,,,,
u6037,,,,,,,,,,,...,,,,,,,,,,
u6038,,,,,,,,,,,...,,,,,,,,,,
u6039,,,,,,,,,,,...,,,,,,,,,,


In [4]:
import time
def compute_similarity(matrix):
    matrix_values = matrix.values
    num_movies = len(matrix.columns)
    similarity_matrix = np.ones((num_movies, num_movies))*np.nan

    for i in range(num_movies):
        start_time = time.time()
        for j in range(i+1, num_movies):
            common_users = np.where(~np.isnan(matrix_values[:, i]) & ~np.isnan(matrix_values[:, j]))[0]
            if len(common_users) > 2:
                ratings_i = matrix_values[common_users, i]
                ratings_j = matrix_values[common_users, j]

                numerator = np.sum(ratings_i * ratings_j)
                denominator = np.sqrt(np.sum(ratings_i**2)) * np.sqrt(np.sum(ratings_j**2))

                if denominator != 0:
                    similarity = 0.5 + 0.5 * (numerator / denominator)
                    similarity_matrix[i, j] = similarity
                    similarity_matrix[j, i] = similarity
        if (i+1)%50==0 or i+1==num_movies:
            finish_time = time.time()
            run_time = finish_time-start_time
            print(f'Progress: {i+1}/{num_movies} (Run Time: {round(run_time, 2)})')
    return similarity_matrix

In [5]:
Smat = compute_similarity(Rmat)

Progress: 50/3706 (Run Time: 0.09)
Progress: 100/3706 (Run Time: 0.06)
Progress: 150/3706 (Run Time: 0.07)
Progress: 200/3706 (Run Time: 0.07)
Progress: 250/3706 (Run Time: 0.07)
Progress: 300/3706 (Run Time: 0.07)
Progress: 350/3706 (Run Time: 0.05)
Progress: 400/3706 (Run Time: 0.07)
Progress: 450/3706 (Run Time: 0.06)
Progress: 500/3706 (Run Time: 0.06)
Progress: 550/3706 (Run Time: 0.04)
Progress: 600/3706 (Run Time: 0.04)
Progress: 650/3706 (Run Time: 0.06)
Progress: 700/3706 (Run Time: 0.07)
Progress: 750/3706 (Run Time: 0.02)
Progress: 800/3706 (Run Time: 0.06)
Progress: 850/3706 (Run Time: 0.06)
Progress: 900/3706 (Run Time: 0.04)
Progress: 950/3706 (Run Time: 0.05)
Progress: 1000/3706 (Run Time: 0.04)
Progress: 1050/3706 (Run Time: 0.05)
Progress: 1100/3706 (Run Time: 0.05)
Progress: 1150/3706 (Run Time: 0.04)
Progress: 1200/3706 (Run Time: 0.06)
Progress: 1250/3706 (Run Time: 0.05)
Progress: 1300/3706 (Run Time: 0.04)
Progress: 1350/3706 (Run Time: 0.04)
Progress: 1400/3706 (

In [6]:
Smat_df = pd.DataFrame(Smat, index=Rmat.columns, columns=Rmat.columns)
Smat_df

Unnamed: 0,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,...,m3943,m3944,m3945,m3946,m3947,m3948,m3949,m3950,m3951,m3952
m1,,0.442636,0.410766,0.404020,0.362457,0.579168,0.475257,0.303379,0.188966,0.512106,...,0.381834,0.089488,0.227803,0.270004,0.649773,0.548122,0.702972,0.555231,0.703109,0.600869
m2,0.442636,,0.546743,0.460495,0.648002,0.412014,0.571488,0.488686,0.689619,0.541504,...,0.398421,,0.441406,0.685420,0.261454,0.553910,0.323460,0.316563,0.223420,0.421348
m3,0.410766,0.546743,,0.640756,0.677383,0.437453,0.547960,0.669407,0.587030,0.568835,...,0.439756,0.458236,0.385446,0.703825,0.503654,0.528168,0.284015,0.515971,0.490683,0.427316
m4,0.404020,0.460495,0.640756,,0.710145,0.318943,0.513088,0.638414,0.494078,0.369146,...,0.764084,,,0.907509,0.348647,0.457428,0.168765,0.621891,0.660954,0.449424
m5,0.362457,0.648002,0.677383,0.710145,,0.388942,0.587943,0.608642,0.758438,0.503592,...,0.131195,0.509458,0.601690,0.855912,0.644852,0.590165,0.239982,0.517104,,0.347755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m3948,0.548122,0.553910,0.528168,0.457428,0.590165,0.534482,0.477317,0.323261,0.441275,0.574412,...,0.465711,0.536067,0.448532,0.515220,0.369859,,0.491852,0.425333,0.307334,0.506530
m3949,0.702972,0.323460,0.284015,0.168765,0.239982,0.615329,0.355397,0.110612,0.213767,0.439866,...,0.563115,0.301218,0.252580,0.295247,0.657607,0.491852,,0.594070,0.656516,0.566232
m3950,0.555231,0.316563,0.515971,0.621891,0.517104,0.419051,0.649063,0.796771,,0.589969,...,0.480608,0.233652,0.020660,0.502494,0.532813,0.425333,0.594070,,0.662502,0.573728
m3951,0.703109,0.223420,0.490683,0.660954,,0.380572,0.676682,0.131729,,0.022272,...,0.426416,0.387007,0.173395,0.398000,,0.307334,0.656516,0.662502,,0.690165


In [7]:
Smat_df.loc[["m1", "m10", "m100", "m1510", 'm260', 'm3212'], ["m1", "m10", "m100", "m1510", 'm260', 'm3212']]

Unnamed: 0,m1,m10,m100,m1510,m260,m3212
m1,,0.512106,0.392,,0.741148,
m10,0.512106,,0.547458,,0.534334,
m100,0.392,0.547458,,,0.329694,
m1510,,,,,,
m260,0.741148,0.534334,0.329694,,,
m3212,,,,,,


In [8]:
Smat_df.to_csv('Smat.csv')

Compute the Cosine similarity among the 3,706 movies. For movies $i$ and $j$, let $I_{ij}$ denote the set of users who rated both movies $i$ and $j$. We decide to ignore similarities computed based on less than three user ratings. Thus, define the similarity between movie $i$ and $j$ as follows, when the cardinality of $I_{ij}$ is bigger than two,

$$
S_{ij} = \frac{1}{2} + \frac{1}{2}\frac{\sum_{l \in I_{ij}}R_{li}R_{lj}}{\sqrt{\sum_{l \in I_{ij}}R^2_{li}} \sqrt{\sum_{l \in I_{ij}}R^2_{lj}}}
$$

This transformation (1 + cos)/2 ensures that similarity measures are between 0 and 1. NA values may occur when 1) the set $I_{ij}$ has a cardinality less than or equal to two (i.e., this pair of movies have been rated by only zero, one, or two users) or 2) one of the denominators is zero.