In [17]:
import torch
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')



In [4]:
# Check if the notebook is running in Google Colab
def is_running_in_colab():
    try:
        import google.colab
        return True
    except ImportError:
        return False

root = ""

# Adjust the path as per Google Drive directory structure
if is_running_in_colab():
    from google.colab import drive
    drive.mount('/content/drive')
    root = "/content/drive/My Drive/"
else:
    print("The notebook is running in a local environment.")
    



The notebook is running in a local environment.


In [5]:
# load the data
ratings_path = root + 'data/ratings.dat'
movies_path = root + 'data/movies.dat'

ratings = pd.read_csv(ratings_path, sep='::', engine='python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']

movies = pd.read_csv(movies_path, sep='::', engine='python', encoding="ISO-8859-1", header=None)
movies.columns = ['MovieID', 'Title', 'Genres']

In [443]:
# define user and movie rating matrix
item_feature_matrix = ratings.pivot_table(index='UserID', columns='MovieID', values='Rating')

# get mean ratings per row (user)
mean_ratings = np.array(item_feature_matrix.mean(axis=1, skipna=True)).reshape(-1, 1)

# normalize ratings by subtracting mean rating per user
normalized_ratings = item_feature_matrix - mean_ratings



Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
       ...
       6031, 6032, 6033, 6034, 6035, 6036, 6037, 6038, 6039, 6040],
      dtype='int64', name='UserID', length=6040)


In [14]:
# get intersection of two tensors
def intersect1d_torch(tensor1, tensor2):
    # Find unique elements in each tensor
    unique1 = torch.unique(tensor1)
    unique2 = torch.unique(tensor2)

    # Find common elements
    common = unique1[torch.isin(unique1, unique2)]

    return common

# compute similarity matrix by tensors
def compute_similarity_matrix_tensor(rating, min_common_users = 3, top_n = 10):
    min_common_users = min_common_users
    top_n = top_n

    # Check if CUDA (GPU support) is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # Convert DataFrame to tensors
    rating_0_tensor = torch.tensor(rating.fillna(0).values, dtype=torch.float32)
    rating_binary_tensor = (rating_0_tensor != 0).float()

    # Compute common ratings count
    common_ratings_count = torch.mm(rating_binary_tensor.T, rating_binary_tensor)

    # Prepare the similarity matrix
    num_movies = rating_0_tensor.shape[1]
    #similarity_matrix = torch.full((num_movies, num_movies), float('nan'))
    similarity_matrix = torch.full((num_movies, num_movies), float('nan')).to(device)


    for i in range(num_movies):
        for j in range(i + 1, num_movies):
            if common_ratings_count[i, j] >= min_common_users:
                # Filter ratings to include only common users
                user_0 = torch.where(rating_binary_tensor[:, i] == 1)[0]
                user_1 = torch.where(rating_binary_tensor[:, j] == 1)[0]
                common_users = intersect1d_torch(user_0, user_1)
                ratings_i = rating_0_tensor[common_users, i]
                ratings_j = rating_0_tensor[common_users, j]

                # Compute dot product and norms for the filtered ratings
                dot_product = torch.dot(ratings_i, ratings_j)
                norm_i = torch.norm(ratings_i)
                norm_j = torch.norm(ratings_j)

                if norm_i > 0 and norm_j > 0:
                    similarity = 0.5 + 0.5 * (dot_product / (norm_i * norm_j))
                else:
                    similarity = 0

                similarity_matrix[i, j] = similarity
                similarity_matrix[j, i] = similarity

    # Convert the similarity matrix to a DataFrame
    similarity_matrix_df = pd.DataFrame(similarity_matrix.cpu().numpy(),
                                       index=rating.columns,
                                       columns=rating.columns)
    
    # Filter the top_n most similar movies for each movie (row)
    if top_n is not None:
        for i in range(similarity_matrix_df.shape[0]):
            row_indices = np.argsort(-similarity_matrix_df.iloc[i])[:top_n]
            mask = np.ones(similarity_matrix_df.shape[1], dtype=bool)
            mask[row_indices] = False
            similarity_matrix_df.iloc[i, mask] = np.nan

    return similarity_matrix_df


In [18]:
specified_movies = [1, 10, 100]
r_n_matrix = normalized_ratings.iloc[:500, :100]
reduced_similarity_matrix = compute_similarity_matrix_tensor(r_n_matrix, min_common_users=3, top_n=30)
#print(reduced_similarity_matrix.shape)
#print(reduced_similarity_matrix)
print(reduced_similarity_matrix.loc[specified_movies, specified_movies].round(7))

cpu
MovieID       1         10   100
MovieID                         
1             NaN  0.615281  NaN
10       0.615281       NaN  NaN
100           NaN       NaN  NaN


In [None]:
specified_movies = [1, 10, 100, 260]
r_n_matrix = normalized_ratings.iloc[:500, :300]
reduced_similarity_matrix = compute_similarity_matrix_tensor(r_n_matrix, 3, 30)
#print(reduced_similarity_matrix.shape)
#print(reduced_similarity_matrix)
print(reduced_similarity_matrix.loc[specified_movies, specified_movies].round(7))

In [256]:
# get the movie titles by movie ids
# keep the order of movie ids
def get_movie_titles(movie_ids, movies):
    # Create a dictionary to map MovieId to rows
    movie_dict = movies.set_index('MovieID').T.to_dict('list')

    # Use list comprehension to preserve the order of movie_ids
    ordered_data = [movie_dict[movie_id] for movie_id in movie_ids if movie_id in movie_dict]

    # Convert the list of data back to a DataFrame
    sub_df = pd.DataFrame(ordered_data, columns=movies.columns[1:], index=movie_ids)
    return sub_df

print(movies.head())

   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


In [240]:
# load similarity matrix from local npz file
data = np.load('data/similarity_matrix_raw.npz')
similarity_matrix_raw = data['matrix']

# convert similarity matrix to dataframe
similarity_matrix_raw_df = pd.DataFrame(similarity_matrix_raw,
                                          index=item_feature_matrix.columns,
                                          columns=item_feature_matrix.columns)

#print(similarity_matrix_raw_df)

specified_movies = [1, 10, 100, 1510, 260, 3212]
print(similarity_matrix_raw_df.loc[specified_movies, specified_movies].round(7))

MovieID      1         10        100   1510      260   3212
MovieID                                                    
1             NaN  0.512106  0.392000   NaN  0.741597   NaN
10       0.512106       NaN  0.547458   NaN  0.534349   NaN
100      0.392000  0.547458       NaN   NaN  0.329694   NaN
1510          NaN       NaN       NaN   NaN       NaN   NaN
260      0.741597  0.534349  0.329694   NaN       NaN   NaN
3212          NaN       NaN       NaN   NaN       NaN   NaN


In [237]:
# Filter the top_n most similar movies for each movie (row)
def keep_top_n(similarity_matrix, top_n=30):
    similarity_matrix_top_n = similarity_matrix.copy()
    for i in range(similarity_matrix.shape[0]):
        row_indices = np.argsort(-similarity_matrix.iloc[i])[:top_n]
        mask = np.ones(similarity_matrix.shape[1], dtype=bool)
        mask[row_indices] = False
        similarity_matrix_top_n.iloc[i, mask] = np.nan

    return similarity_matrix_top_n


# construct a 8 by 8 matrix
sample_matrix = pd.DataFrame(np.array(
                         [[0, 0, 0, 0.3, 0.2, 0.4, 0, 0],
                          [0, 0, 0.8, 0.9, 0, 0.2, 0, 0],
                          [0, 0, 0, 0.1, 0.3, 0.5, 0, 0],
                          [0.3, 0.2, 0.1, 0, 0, 0, 0.1, 0.2],
                          [0.2, 0.1, 0.3, 0, 0, 0, 0.2, 0.1],
                          [0.4, 0.3, 0.5, 0, 0, 0, 0.3, 0.4],
                          [0, 0, 0, 0.1, 0.2, 0.3, 0, 0],
                          [0, 0, 0, 0.2, 0.1, 0.4, 0, 0]]))
keep_top_n(sample_matrix, top_n=2)

Unnamed: 0,0,1,2,3,4,5,6,7
0,,,,0.3,,0.4,,
1,,,0.8,0.9,,,,
2,,,,,0.3,0.5,,
3,0.3,0.2,,,,,,
4,0.2,,0.3,,,,,
5,0.4,,0.5,,,,,
6,,,,,0.2,0.3,,
7,,,,0.2,,0.4,,


In [241]:
similarity_matrix_top30_df = keep_top_n(similarity_matrix_raw_df, top_n=30)
print(similarity_matrix_top30_df.loc[specified_movies, specified_movies].round(7))

MovieID  1     10    100   1510  260   3212
MovieID                                    
1         NaN   NaN   NaN   NaN   NaN   NaN
10        NaN   NaN   NaN   NaN   NaN   NaN
100       NaN   NaN   NaN   NaN   NaN   NaN
1510      NaN   NaN   NaN   NaN   NaN   NaN
260       NaN   NaN   NaN   NaN   NaN   NaN
3212      NaN   NaN   NaN   NaN   NaN   NaN


In [423]:
def recommend_movies(user_rating, s_matrix, n_recommendations = 10):
    """
    Generate movie recommendations based on new user ratings and a sparse similarity matrix.

    Parameters:
    - user_ratings: np.array, user's ratings for movies; 0 indicates the movie hasn't been rated.
    - s_matrix: item-item similarity matrix.
    - n_recommendations: int, the number of recommendations to return.

    Returns:
    - List of movie indices representing the top N recommendations.
    """

    # number of movies
    n_movies = len(user_rating)
    
    # initial rated movies
    user_rating_full = np.full(shape=len(user_rating), fill_value = -1.0)
    
    # iterate through each movie and calculate the weighted rating for user
    for l in range(n_movies):
        #print("\nloop :", l)
        
        if user_rating[l] == 0:
            # get the similarity score for moviel l
            l_s_scores = np.nan_to_num(s_matrix.iloc[l,:])
            #print('l_s_scores ', l_s_scores)
            
            # get the weighted sum for movie l
            #print('user_rating ', user_rating)
            weighted_sum = np.dot(l_s_scores, user_rating)
            #print('weighted_sum ', weighted_sum)

            # get the normalization factor
            mask = (user_rating != 0).astype(int)
            #print('mask ', mask)
            norm_factor = np.dot(l_s_scores, mask)
            #print('norm_factor ', norm_factor)

            # get the weighted average for movie l
            weighted_avg = (weighted_sum) / norm_factor if norm_factor != 0 else 0
            #print('weighted_avg ', weighted_avg)

            #print(l, " ", weighted_sum, " ", norm_factor, " ", weighted_avg)
            user_rating_full[l] = weighted_avg
        #else:
            #user_rating[l] = -1

    
    # get the top N recommendations index
    top_n = np.argsort(user_rating_full)[-(n_recommendations):][::-1]
    
    # get top N movie ID
    top_n_id = s_matrix.index[top_n]
    print(top_n_id)
    
    top_n_score = user_rating_full[top_n]
    #print(top_n_score)
        

    return top_n_id, top_n_score
    #return top_n, top_n_score



In [413]:
# construct a 8 by 8 matrix
sample_matrix = pd.DataFrame(np.array(
                         [[0, 0.1, 0, 0.3, 0.2, 0.4, 0, 0.1],
                          [0.1, 0, 0.8, 0.9, 0, 0.2, 0, 0],
                          [0, 0.8, 0, 0, 0.4, 0.1, 0.3, 0.5],
                          [0.3, 0.9, 0, 0, 0, 0.1, 0.0, 0.2],
                          [0.2, 0, 0.4, 0, 0, 0.1, 0.2, 0.1],
                          [0.4, 0.2, 0.1, 0.3, 0.1, 0, 0, 0.1],
                          [0, 0.1, 0.3, 0, 0.2, 0, 0, 0],
                          [0.1, 0, 0.5, 0.2, 0.1, 0.1, 0, 0]]))

sample_matrix_top3_df = keep_top_n(sample_matrix, top_n=3)
#print(sample_matrix_top3_df)

sample_user_ratings = np.array([2, 0, 0, 0, 4, 0, 0, 5])

movie_ids, scores = recommend_movies(sample_user_ratings, sample_matrix_top3_df, 10)

Index([2, 6, 3, 5, 1, 7, 4, 0], dtype='int64')


In [447]:

# get user rating for user 1351
user_ratings = item_feature_matrix.loc[1351].fillna(0)

#create a user rating vector 3707 x 1, populate with 0
user_ratings = np.zeros(3706)
user_ratings[1481] = 5
user_ratings[1600] = 4

# get movie index from similarity matrix
movie_index = similarity_matrix_top30_df.index.get_loc(46)

#movie 3732 s_matrix
movie_s_matrix = similarity_matrix_top30_df.iloc[movie_index, :]

# weighted sum
weighted_sum = np.dot(user_ratings, movie_s_matrix)
print(f'weighted_sum: {weighted_sum:.2f}')

# normalization factor
mask = (user_ratings != 0).astype(int)
norm_factor = np.dot(mask, movie_s_matrix)
print(f'norm_factor: {norm_factor:.2f}')

# weighted average
weighted_avg = (weighted_sum) / norm_factor if norm_factor != 0 else 0

print(f'weighted_avg: {weighted_avg:.2f}')


weighted_sum: nan
norm_factor: nan
weighted_avg: nan


In [453]:
# get user rating for user 1
user_ratings = item_feature_matrix.loc[1811].fillna(0)


# get movie index from similarity matrix
movie_index = similarity_matrix_top30_df.index.get_loc(2000)


#movie 3732 s_matrix
movie_s_matrix = (similarity_matrix_top30_df.iloc[movie_index, :])


# weighted sum
#weighted_sum = np.dot(user_ratings, movie_s_matrix)
weighted_sum = 0
for i in range(len(user_ratings)):
    weighted_sum += user_ratings.iloc[i] * movie_s_matrix.iloc[i]
print(f'weighted_sum: {weighted_sum:.2f}')

# normalization factor

weighted_norm = 0
for i in range(len(user_ratings)):
    weighted_norm += user_ratings.iloc[i] if movie_s_matrix.iloc[i] != np.nan else 0
print(f'norm_factor: {norm_factor:.2f}')

# weighted average
weighted_avg = (weighted_sum) / norm_factor if norm_factor != 0 else 0

print(f'weighted_avg: {weighted_avg:.2f}')

weighted_sum: nan
norm_factor: nan
weighted_avg: nan


In [425]:
print(similarity_matrix_top30_df.index)

index_position = similarity_matrix_top30_df.columns.get_loc(3732)
print(index_position)





Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
       ...
       3943, 3944, 3945, 3946, 3947, 3948, 3949, 3950, 3951, 3952],
      dtype='int64', name='MovieID', length=3706)
3490


In [410]:
# hypothetical user testcase

print(similarity_matrix_top30_df.index.get_loc(1613))
print(similarity_matrix_top30_df.index.get_loc(1755))

#create a user rating vector 3707 x 1, populate with 0
user_hypo_ratings = np.zeros(3706)
user_hypo_ratings[1481] = 5
user_hypo_ratings[1600] = 4

movie_ids, scores = recommend_movies(user_hypo_ratings, similarity_matrix_top30_df, 10)

for i in range(len(movie_ids)):
    print(f'm{movie_ids[i]} ({scores[i]:.2f})')
get_movie_titles(movie_ids, movies)

1481
1600
Index([466, 231, 2, 1936, 1997, 2000, 1086, 2881, 1897, 2590], dtype='int64', name='MovieID')
m466 (5.00)
m231 (5.00)
m2 (5.00)
m1936 (5.00)
m1997 (5.00)
m2000 (5.00)
m1086 (5.00)
m2881 (5.00)
m1897 (5.00)
m2590 (5.00)


Unnamed: 0_level_0,Title,Genres
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
466,Hot Shots! Part Deux (1993),Action|Comedy|War
231,Dumb & Dumber (1994),Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
1936,Mrs. Miniver (1942),Drama|War
1997,"Exorcist, The (1973)",Horror
2000,Lethal Weapon (1987),Action|Comedy|Crime|Drama
1086,Dial M for Murder (1954),Mystery|Thriller
2881,Double Jeopardy (1999),Action|Thriller
1897,High Art (1998),Drama|Romance
2590,Hideous Kinky (1998),Drama


In [454]:
# get user rating for user 1351
user_1351_ratings = item_feature_matrix.loc[1351].fillna(0)
# count number of 0 in user rating
print(user_1351_ratings[user_1351_ratings == 0].count())

movie_ids, scores = recommend_movies(user_1351_ratings.values, similarity_matrix_top30_df, 10)

for i in range(len(movie_ids)):
    print(f'm{movie_ids[i]} ({scores[i]:.2f})')
get_movie_titles(movie_ids, movies)

3686
Index([2098, 923, 2846, 2283, 833, 1711, 879, 2801, 2326, 2339], dtype='int64', name='MovieID')
m2098 (5.00)
m923 (5.00)
m2846 (5.00)
m2283 (5.00)
m833 (5.00)
m1711 (5.00)
m879 (5.00)
m2801 (5.00)
m2326 (5.00)
m2339 (5.00)


Unnamed: 0_level_0,Title,Genres
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
2098,Son of Flubber (1963),Children's|Comedy
923,Citizen Kane (1941),Drama
2846,"Adventures of Milo and Otis, The (1986)",Children's
2283,"Sheltering Sky, The (1990)",Drama
833,High School High (1996),Comedy
1711,Midnight in the Garden of Good and Evil (1997),Comedy|Crime|Drama|Mystery
879,"Relic, The (1997)",Horror
2801,Oscar and Lucinda (a.k.a. Oscar & Lucinda) (1997),Drama|Romance
2326,Shattered Image (1998),Drama|Thriller
2339,I'll Be Home For Christmas (1998),Comedy|Romance


In [424]:
# get user rating for user 1181
user_1181_ratings = item_feature_matrix.loc[1181].fillna(0)
# count number of 0 in user rating
#print(user_1181_ratings[user_1181_ratings == 0].count())
print(user_1181_ratings)

movie_ids, scores = recommend_movies(user_1181_ratings.values, similarity_matrix_top30_df, 10)

for i in range(len(movie_ids)):
    print(f'm{movie_ids[i]} ({scores[i]:.2f})')
get_movie_titles(movie_ids, movies)

MovieID
1       3.0
2       1.0
3       1.0
4       0.0
5       0.0
       ... 
3948    3.0
3949    4.0
3950    0.0
3951    0.0
3952    0.0
Name: 1181, Length: 3706, dtype: float64
Index([3184, 3581, 560, 2765, 2201, 3642, 3276, 2843, 389, 1817], dtype='int64', name='MovieID')
m3184 (5.00)
m3581 (5.00)
m560 (4.43)
m2765 (4.42)
m2201 (4.23)
m3642 (4.21)
m3276 (4.16)
m2843 (4.12)
m389 (4.11)
m1817 (4.11)


Unnamed: 0_level_0,Title,Genres
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
3184,Montana (1998),Action|Comedy|Crime|Drama
3581,Human Traffic (1999),Drama
560,"Beans of Egypt, Maine, The (1994)",Drama
2765,"Acid House, The (1998)",Comedy|Drama
2201,"Paradine Case, The (1947)",Drama
3642,In Old California (1942),Western
3276,Gun Shy (2000),Comedy
2843,"Black Cat, White Cat (Crna macka, beli macor) ...",Comedy|Romance
389,"Colonel Chabert, Le (1994)",Drama|Romance|War
1817,No Looking Back (1998),Comedy|Drama|Romance


In [421]:
user_1181_ratings = ratings[ratings['UserID'] == 1181]
#print(user_1181_ratings)

#print(user_1181_ratings.set_index('MovieID'))
#print(user_1181_ratings.set_index('MovieID')['Rating'])

# 3707 by 1 vector


user_1181_ratings = user_1181_ratings.set_index('MovieID')['Rating']

print(user_1181_ratings.sort_index(inplace=False))

MovieID
1       3
2       1
3       1
6       3
7       2
       ..
3943    4
3946    3
3947    3
3948    3
3949    4
Name: Rating, Length: 1521, dtype: int64
