In [59]:
import torch
import pandas as pd
import numpy as np


In [62]:
# Check if the notebook is running in Google Colab
def is_running_in_colab():
    try:
        import google.colab
        return True
    except ImportError:
        return False

root = ""

# Adjust the path as per Google Drive directory structure
if is_running_in_colab():
    from google.colab import drive
    drive.mount('/content/drive')
    root = "/content/drive/My Drive/"
else:
    print("The notebook is running in a local environment.")
    



The notebook is running in a local environment.


In [None]:
# load the data
ratings_path = root + 'data/ratings.dat'
movies_path = root + 'data/movies.dat'

ratings = pd.read_csv(ratings_path, sep='::', engine='python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']

movies = pd.read_csv(movies_path, sep='::', engine='python', encoding="ISO-8859-1", header=None)
movies.columns = ['MovieID', 'Title', 'Genres']

In [63]:
# define user and movie rating matrix
item_feature_matrix = ratings.pivot_table(index='UserID', columns='MovieID', values='Rating')

# get mean ratings per row (user)
mean_ratings = np.array(item_feature_matrix.mean(axis=1, skipna=True)).reshape(-1, 1)

# normalize ratings by subtracting mean rating per user
normalized_ratings = item_feature_matrix - mean_ratings



In [64]:
# get intersection of two tensors
def intersect1d_torch(tensor1, tensor2):
    # Find unique elements in each tensor
    unique1 = torch.unique(tensor1)
    unique2 = torch.unique(tensor2)

    # Find common elements
    common = unique1[torch.isin(unique1, unique2)]

    return common

# compute similarity matrix by tensors
def compute_similarity_matrix_tensor(rating, min_common_users = 3, top_n = 10):
    min_common_users = min_common_users
    top_n = top_n

    # Check if CUDA (GPU support) is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # Convert DataFrame to tensors
    rating_0_tensor = torch.tensor(rating.fillna(0).values, dtype=torch.float32)
    rating_binary_tensor = (rating_0_tensor != 0).float()

    # Compute common ratings count
    common_ratings_count = torch.mm(rating_binary_tensor.T, rating_binary_tensor)

    # Prepare the similarity matrix
    num_movies = rating_0_tensor.shape[1]
    #similarity_matrix = torch.full((num_movies, num_movies), float('nan'))
    similarity_matrix = torch.full((num_movies, num_movies), float('nan')).to(device)


    for i in range(num_movies):
        for j in range(i + 1, num_movies):
            if common_ratings_count[i, j] >= min_common_users:
                # Filter ratings to include only common users
                user_0 = torch.where(rating_binary_tensor[:, i] == 1)[0]
                user_1 = torch.where(rating_binary_tensor[:, j] == 1)[0]
                common_users = intersect1d_torch(user_0, user_1)
                ratings_i = rating_0_tensor[common_users, i]
                ratings_j = rating_0_tensor[common_users, j]

                # Compute dot product and norms for the filtered ratings
                dot_product = torch.dot(ratings_i, ratings_j)
                norm_i = torch.norm(ratings_i)
                norm_j = torch.norm(ratings_j)

                if norm_i > 0 and norm_j > 0:
                    similarity = 0.5 + 0.5 * (dot_product / (norm_i * norm_j))
                else:
                    similarity = 0

                similarity_matrix[i, j] = similarity
                similarity_matrix[j, i] = similarity

    # Convert the similarity matrix to a DataFrame
    similarity_matrix_df = pd.DataFrame(similarity_matrix.cpu().numpy(),
                                       index=rating.columns,
                                       columns=rating.columns)
    
    # Get top N similar movies
    
    return similarity_matrix_df


In [68]:
specified_movies = [1, 10, 100]
r_n_matrix = normalized_ratings.iloc[:500, :100]
reduced_similarity_matrix = compute_similarity_matrix_tensor(r_n_matrix)
#print(reduced_similarity_matrix.shape)
#print(reduced_similarity_matrix)
print(reduced_similarity_matrix.loc[specified_movies, specified_movies].round(7))

cpu
MovieID       1         10        100
MovieID                              
1             NaN  0.615281  0.210864
10       0.615281       NaN  0.798482
100      0.210864  0.798482       NaN


In [None]:
specified_movies = [1, 10, 100, 1510, 260, 3212]
full_similarity_matrix = compute_similarity_matrix_tensor(normalized_ratings)
print(reduced_similarity_matrix.shape)
#print(reduced_similarity_matrix)
print(full_similarity_matrix.loc[specified_movies, specified_movies].round(7))

In [69]:
# get the movie titles by movie ids
def get_movie_titles(movie_ids):
    return movies[movies['MovieID'].isin(movie_ids)]

In [70]:
def recommend_movies(user_rating, s_matrix, n_recommendations = 10):
    """
    Generate movie recommendations based on new user ratings and a sparse similarity matrix.

    Parameters:
    - user_ratings: np.array, user's ratings for movies; 0 indicates the movie hasn't been rated.
    - s_matrix: item-item similarity matrix.
    - n_recommendations: int, the number of recommendations to return.

    Returns:
    - List of movie indices representing the top N recommendations.
    """

    # number of movies
    n_movies = len(user_rating)

    # iterate through each movie and calculate the weighted rating for user
    for l in range(n_movies):
        if user_rating[l] != 0:
            # get the similarity score for moviel l
            l_s_scores = np.nan_to_num(s_matrix.iloc[l])
            
            # get the weighted sum for movie l
            weighted_sum = np.sum(user_rating * l_s_scores)

            # get the normalization factor
            norm_factor = np.sum(l_s_scores * (user_rating != 0))

            # get the weighted average for movie l
            weighted_avg = np.sum(weighted_sum) / norm_factor if norm_factor != 0 else 0
            #print(l, " ", weighted_sum, " ", norm_factor, " ", weighted_avg)
            user_rating[l] = weighted_avg

    # get the top N recommendations
    top_n = np.argsort(user_rating)[-n_recommendations:]

    return top_n

In [72]:
# generate a vector 1 x 3706, which represents the rating of user 1 to all movies
user_1 = ratings[ratings['UserID'] == 1181]
user_1_ratings = user_1.set_index('MovieID')['Rating']
user_1_ratings = user_1_ratings.reindex(range(1, 3707), fill_value=0)

movie_ids = recommend_movies(user_1_ratings.values, reduced_similarity_matrix, 10)

ValueError: operands could not be broadcast together with shapes (3706,) (100,) 

In [75]:
# get the movie titles by movie ids
def get_movie_titles(movie_ids):
    return movies[movies['MovieID'].isin(movie_ids)]

get_movie_titles(movie_ids)

Unnamed: 0,MovieID,Title,Genres
536,540,Sliver (1993),Thriller
867,878,Bye-Bye (1995),Drama
910,922,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Film-Noir
1568,1610,"Hunt for Red October, The (1990)",Action|Thriller
1571,1613,Star Maps (1997),Drama
1573,1615,"Edge, The (1997)",Adventure|Thriller
1576,1619,Seven Years in Tibet (1997),Drama|War
2252,2321,Pleasantville (1998),Comedy
2782,2851,Saturn 3 (1979),Adventure|Sci-Fi|Thriller


In [None]:
# sketch work


rating_0 = normalized_ratings.fillna(0)

# take the first 100 usrs and 500 movies
r_n_matrix = rating_0.iloc[:100, :500]

rating_binary = (r_n_matrix != 0).astype(int)
common_ratings_count = np.array(rating_binary.T.dot(rating_binary)) # fix 1
print(common_ratings_count)

# find users who have rated both movie 0
user_0 = rating_binary[rating_binary.iloc[:, 0] == 1].index - 1 # index starts from 1, fix 2
print(user_0)

# find users who have rated both movie 2
user_2 = rating_binary[rating_binary.iloc[:, 2] == 1].index - 1 # index starts from 1
print(user_2)

# # print user 25's rating for movie 0
# print(r_n_matrix.iloc[25, 0])

# # print user 44's rating for movie 0
# print(r_n_matrix.iloc[44, 0])

# # print user 25's rating for movie 2
# print(r_n_matrix.iloc[25, 2])

# # print user 44's rating for movie 2
# print(r_n_matrix.iloc[44, 2])

# find common users between user_0 and user_2
common_users = np.intersect1d(user_0, user_2) # fix 3
print(common_users)

# find ratings of common users for movie 0
movie_0_ratings = r_n_matrix.iloc[common_users, 0].values # fix 4
movie_0_ratings = r_n_matrix.iloc[common_users, 0].values # fix 4
print(movie_0_ratings)

# find ratings of common users for movie 2
movie_2_ratings = r_n_matrix.iloc[common_users, 2].values # fix 5
print(movie_2_ratings)

# calculate the cosine similarity between movie 0 and movie 2
product = np.dot(movie_0_ratings, movie_2_ratings)
norm_0 = np.linalg.norm(movie_0_ratings)
norm_2 = np.linalg.norm(movie_2_ratings)

score = product / (norm_0 * norm_2)

print(score)
print(0.5 + 0.5 * score)
