In [17]:
import torch
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')



In [4]:
# Check if the notebook is running in Google Colab
def is_running_in_colab():
    try:
        import google.colab
        return True
    except ImportError:
        return False

root = ""

# Adjust the path as per Google Drive directory structure
if is_running_in_colab():
    from google.colab import drive
    drive.mount('/content/drive')
    root = "/content/drive/My Drive/"
else:
    print("The notebook is running in a local environment.")
    



The notebook is running in a local environment.


In [5]:
# load the data
ratings_path = root + 'data/ratings.dat'
movies_path = root + 'data/movies.dat'

ratings = pd.read_csv(ratings_path, sep='::', engine='python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']

movies = pd.read_csv(movies_path, sep='::', engine='python', encoding="ISO-8859-1", header=None)
movies.columns = ['MovieID', 'Title', 'Genres']

In [6]:
# define user and movie rating matrix
item_feature_matrix = ratings.pivot_table(index='UserID', columns='MovieID', values='Rating')

# get mean ratings per row (user)
mean_ratings = np.array(item_feature_matrix.mean(axis=1, skipna=True)).reshape(-1, 1)

# normalize ratings by subtracting mean rating per user
normalized_ratings = item_feature_matrix - mean_ratings



In [14]:
# get intersection of two tensors
def intersect1d_torch(tensor1, tensor2):
    # Find unique elements in each tensor
    unique1 = torch.unique(tensor1)
    unique2 = torch.unique(tensor2)

    # Find common elements
    common = unique1[torch.isin(unique1, unique2)]

    return common

# compute similarity matrix by tensors
def compute_similarity_matrix_tensor(rating, min_common_users = 3, top_n = 10):
    min_common_users = min_common_users
    top_n = top_n

    # Check if CUDA (GPU support) is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # Convert DataFrame to tensors
    rating_0_tensor = torch.tensor(rating.fillna(0).values, dtype=torch.float32)
    rating_binary_tensor = (rating_0_tensor != 0).float()

    # Compute common ratings count
    common_ratings_count = torch.mm(rating_binary_tensor.T, rating_binary_tensor)

    # Prepare the similarity matrix
    num_movies = rating_0_tensor.shape[1]
    #similarity_matrix = torch.full((num_movies, num_movies), float('nan'))
    similarity_matrix = torch.full((num_movies, num_movies), float('nan')).to(device)


    for i in range(num_movies):
        for j in range(i + 1, num_movies):
            if common_ratings_count[i, j] >= min_common_users:
                # Filter ratings to include only common users
                user_0 = torch.where(rating_binary_tensor[:, i] == 1)[0]
                user_1 = torch.where(rating_binary_tensor[:, j] == 1)[0]
                common_users = intersect1d_torch(user_0, user_1)
                ratings_i = rating_0_tensor[common_users, i]
                ratings_j = rating_0_tensor[common_users, j]

                # Compute dot product and norms for the filtered ratings
                dot_product = torch.dot(ratings_i, ratings_j)
                norm_i = torch.norm(ratings_i)
                norm_j = torch.norm(ratings_j)

                if norm_i > 0 and norm_j > 0:
                    similarity = 0.5 + 0.5 * (dot_product / (norm_i * norm_j))
                else:
                    similarity = 0

                similarity_matrix[i, j] = similarity
                similarity_matrix[j, i] = similarity

    # Convert the similarity matrix to a DataFrame
    similarity_matrix_df = pd.DataFrame(similarity_matrix.cpu().numpy(),
                                       index=rating.columns,
                                       columns=rating.columns)
    
    # Filter the top_n most similar movies for each movie (row)
    if top_n is not None:
        for i in range(similarity_matrix_df.shape[0]):
            row_indices = np.argsort(-similarity_matrix_df.iloc[i])[:top_n]
            mask = np.ones(similarity_matrix_df.shape[1], dtype=bool)
            mask[row_indices] = False
            similarity_matrix_df.iloc[i, mask] = np.nan

    return similarity_matrix_df


In [18]:
specified_movies = [1, 10, 100]
r_n_matrix = normalized_ratings.iloc[:500, :100]
reduced_similarity_matrix = compute_similarity_matrix_tensor(r_n_matrix, min_common_users=3, top_n=30)
#print(reduced_similarity_matrix.shape)
#print(reduced_similarity_matrix)
print(reduced_similarity_matrix.loc[specified_movies, specified_movies].round(7))

cpu
MovieID       1         10   100
MovieID                         
1             NaN  0.615281  NaN
10       0.615281       NaN  NaN
100           NaN       NaN  NaN


In [None]:
specified_movies = [1, 10, 100, 260]
r_n_matrix = normalized_ratings.iloc[:500, :300]
reduced_similarity_matrix = compute_similarity_matrix_tensor(r_n_matrix, 3, 30)
#print(reduced_similarity_matrix.shape)
#print(reduced_similarity_matrix)
print(reduced_similarity_matrix.loc[specified_movies, specified_movies].round(7))

In [55]:
# get the movie titles by movie ids
def get_movie_titles(movie_ids):
    return movies[movies['MovieID'].isin(movie_ids)].reset_index(drop=True)

In [156]:
# load similarity matrix from local npz file
data = np.load('data/similarity_matrix_raw.npz')
similarity_matrix_raw = data['matrix']

# convert similarity matrix to dataframe
similarity_matrix_raw_df = pd.DataFrame(similarity_matrix_raw,
                                          index=item_feature_matrix.columns,
                                          columns=item_feature_matrix.columns)

#print(similarity_matrix_raw_df)

specified_movies = [1, 10, 100, 1510, 260, 3212]
print(similarity_matrix_raw_df.loc[specified_movies, specified_movies].round(7))

MovieID      1         10        100   1510      260   3212
MovieID                                                    
1             NaN  0.512106  0.392000   NaN  0.741597   NaN
10       0.512106       NaN  0.547458   NaN  0.534349   NaN
100      0.392000  0.547458       NaN   NaN  0.329694   NaN
1510          NaN       NaN       NaN   NaN       NaN   NaN
260      0.741597  0.534349  0.329694   NaN       NaN   NaN
3212          NaN       NaN       NaN   NaN       NaN   NaN


In [159]:
# Filter the top_n most similar movies for each movie (row)
def keep_top_n(similarity_matrix, top_n=30):
    for i in range(similarity_matrix.shape[0]):
        row_indices = np.argsort(-similarity_matrix.iloc[i])[:top_n]
        mask = np.ones(similarity_matrix.shape[1], dtype=bool)
        mask[row_indices] = False
        similarity_matrix.iloc[i, mask] = np.nan

    return similarity_matrix


# construct a 8 by 8 matrix
sample_matrix = pd.DataFrame(np.array(
                         [[0, 0, 0, 0.3, 0.2, 0.4, 0, 0],
                          [0, 0, 0.8, 0.9, 0, 0.2, 0, 0],
                          [0, 0, 0, 0.1, 0.3, 0.5, 0, 0],
                          [0.3, 0.2, 0.1, 0, 0, 0, 0.1, 0.2],
                          [0.2, 0.1, 0.3, 0, 0, 0, 0.2, 0.1],
                          [0.4, 0.3, 0.5, 0, 0, 0, 0.3, 0.4],
                          [0, 0, 0, 0.1, 0.2, 0.3, 0, 0],
                          [0, 0, 0, 0.2, 0.1, 0.4, 0, 0]]))
keep_top_n(sample_matrix, top_n=2)

Unnamed: 0,0,1,2,3,4,5,6,7
0,,,,0.3,,0.4,,
1,,,0.8,0.9,,,,
2,,,,,0.3,0.5,,
3,0.3,0.2,,,,,,
4,0.2,,0.3,,,,,
5,0.4,,0.5,,,,,
6,,,,,0.2,0.3,,
7,,,,0.2,,0.4,,


In [None]:
similarity_matrix_top30_df = keep_top_n(similarity_matrix_raw_df, top_n=30)
print(similarity_matrix_top30_df.loc[specified_movies, specified_movies].round(7))

In [172]:
def recommend_movies(user_rating, s_matrix, n_recommendations = 10):
    """
    Generate movie recommendations based on new user ratings and a sparse similarity matrix.

    Parameters:
    - user_ratings: np.array, user's ratings for movies; 0 indicates the movie hasn't been rated.
    - s_matrix: item-item similarity matrix.
    - n_recommendations: int, the number of recommendations to return.

    Returns:
    - List of movie indices representing the top N recommendations.
    """

    # number of movies
    n_movies = len(user_rating)

    # initial rated movies
    ini_movies = set(np.where(user_rating != 0)[0])
    #print(ini_movies)

    # iterate through each movie and calculate the weighted rating for user
    for l in range(n_movies):
        print("loop :", l)
        if user_rating[l] == 0:
            # get the similarity score for moviel l
            l_s_scores = np.nan_to_num(s_matrix.iloc[l])
            print(l_s_scores)
            
            # get the weighted sum for movie l
            print(user_rating)
            weighted_sum = np.dot(user_rating, l_s_scores)
            print(weighted_sum)

            # get the normalization factor
            mask = (user_rating != 0).astype(int)
            print(mask)
            norm_factor = np.dot(mask, l_s_scores)
            print(norm_factor)

            # get the weighted average for movie l
            weighted_avg = (weighted_sum) / norm_factor if norm_factor != 0 else 0
            print(f'{l}: {weighted_avg}') if l == 3663 else None

            #print(l, " ", weighted_sum, " ", norm_factor, " ", weighted_avg)
            user_rating[l] = weighted_avg
        #else:
            #user_rating[l] = -1

    
    # get the top N recommendations index
    top_n = np.argsort(user_rating)[-(n_recommendations):][::-1]
    
    # get top N movie ID
    top_n_id = s_matrix.columns[top_n]
    
    top_n_score = user_rating[top_n]
    #print(top_n_score)
        

    return top_n_id, top_n_score

# construct a 8 by 8 matrix
sample_matrix = pd.DataFrame(np.array(
                         [[0, 0.1, 0, 0.3, 0.2, 0.4, 0, 0.1],
                          [0.1, 0, 0.8, 0.9, 0, 0.2, 0, 0],
                          [0, 0.8, 0, 0, 0.4, 0.1, 0.3, 0.5],
                          [0.3, 0.9, 0, 0, 0.4, 0.1, 0.3, 0.5],
                          [0.2, 0, 0.4, 0, 0, 0.1, 0.2, 0.1],
                          [0.4, 0.2, 0.1, 0.3, 0.1, 0, 0, 0.1],
                          [0, 0.1, 0.3, 0, 0.2, 0, 0, 0],
                          [0.1, 0, 0.5, 0.2, 0.1, 0.1, 0, 0]]))

sample_matrix_top3_df = keep_top_n(sample_matrix, top_n=3)
print(sample_matrix_top3_df)

sample_user_ratings = np.array([2, 0, 0, 0, 4, 0, 0, 5])

movie_ids, scores = recommend_movies(sample_user_ratings, sample_matrix_top3_df, 10)

for i in range(len(movie_ids)):
    print(f'm{movie_ids[i]} ({scores[i]:.2f})')

     0    1    2    3    4    5    6    7
0  NaN  NaN  NaN  0.3  0.2  0.4  NaN  NaN
1  NaN  NaN  0.8  0.9  NaN  0.2  NaN  NaN
2  NaN  0.8  NaN  NaN  0.4  NaN  NaN  0.5
3  NaN  0.9  NaN  NaN  0.4  NaN  NaN  0.5
4  0.2  NaN  0.4  NaN  NaN  NaN  0.2  NaN
5  0.4  0.2  NaN  0.3  NaN  NaN  NaN  NaN
6  NaN  0.1  0.3  NaN  0.2  NaN  NaN  NaN
7  0.1  NaN  0.5  0.2  NaN  NaN  NaN  NaN
loop : 0
loop : 1
[0.  0.  0.8 0.9 0.  0.2 0.  0. ]
[2 0 0 0 4 0 0 5]
0.0
[1 0 0 0 1 0 0 1]
0.0
loop : 2
[0.  0.8 0.  0.  0.4 0.  0.  0.5]
[2 0 0 0 4 0 0 5]
4.1
[1 0 0 0 1 0 0 1]
0.9
loop : 3
[0.  0.9 0.  0.  0.4 0.  0.  0.5]
[2 0 4 0 4 0 0 5]
4.1
[1 0 1 0 1 0 0 1]
0.9
loop : 4
loop : 5
[0.4 0.2 0.  0.3 0.  0.  0.  0. ]
[2 0 4 4 4 0 0 5]
2.0
[1 0 1 1 1 0 0 1]
0.7
loop : 6
[0.  0.1 0.3 0.  0.2 0.  0.  0. ]
[2 0 4 4 4 2 0 5]
2.0
[1 0 1 1 1 1 0 1]
0.5
loop : 7
m7 (5.00)
m6 (4.00)
m4 (4.00)
m3 (4.00)
m2 (4.00)
m5 (2.00)
m0 (2.00)
m1 (0.00)


In [161]:
# generate a vector 1 x 300, which represents the rating of user 1 to all movies


user_1181_ratings = ratings[ratings['UserID'] == 1181]
user_1181_ratings = user_1181_ratings.set_index('MovieID')['Rating']
user_1181_ratings = user_1181_ratings.reindex(range(1, 3707), fill_value=0.0)

movie_ids, scores = recommend_movies(user_1181_ratings.values, similarity_matrix_top30_df, 10)

for i in range(len(movie_ids)):
    print(f'm{movie_ids[i]} ({scores[i]:.2f})')
get_movie_titles(movie_ids)

MovieID
1            NaN
2            NaN
3            NaN
4            NaN
5            NaN
          ...   
3948         NaN
3949         NaN
3950         NaN
3951         NaN
3952    0.449424
Name: 4, Length: 3706, dtype: float32
[3. 1. 1. ... 3. 1. 3.]
nan
[1 1 1 ... 1 1 1]
nan
MovieID
1            NaN
2            NaN
3            NaN
4            NaN
5            NaN
          ...   
3948         NaN
3949         NaN
3950         NaN
3951         NaN
3952    0.347755
Name: 5, Length: 3706, dtype: float32
[3. 1. 1. ... 3. 1. 3.]
nan
[1 1 1 ... 1 1 1]
nan
MovieID
1            NaN
2            NaN
3            NaN
4            NaN
5            NaN
          ...   
3948         NaN
3949         NaN
3950         NaN
3951         NaN
3952    0.258232
Name: 8, Length: 3706, dtype: float32
[3. 1. 1. ... 3. 1. 3.]
nan
[1 1 1 ... 1 1 1]
nan
MovieID
1            NaN
2            NaN
3            NaN
4            NaN
5            NaN
          ...   
3948         NaN
3949         NaN
3950   

Unnamed: 0,MovieID,Title,Genres
0,682,Tigrero: A Film That Was Never Made (1994),Documentary|Drama
1,2033,"Black Cauldron, The (1985)",Animation|Children's
2,2167,Blade (1998),Action|Adventure|Horror
3,2168,Dance with Me (1998),Drama|Romance
4,2169,Dead Man on Campus (1998),Comedy
5,2170,Wrongfully Accused (1998),Action|Comedy
6,2171,"Next Stop, Wonderland (1998)",Comedy|Drama|Romance
7,2172,"Strike! (a.k.a. All I Wanna Do, The Hairy Bird...",Comedy
8,2173,"Navigator: A Mediaeval Odyssey, The (1988)",Adventure|Fantasy|Sci-Fi
9,2175,Déjà Vu (1997),Drama|Romance


In [54]:
# get the movie titles by movie ids
def get_movie_titles(movie_ids):
    return movies[movies['MovieID'].isin(movie_ids)]

get_movie_titles(movie_ids)

Unnamed: 0,MovieID,Title,Genres
24,25,Leaving Las Vegas (1995),Drama|Romance
25,26,Othello (1995),Drama
65,66,Lawnmower Man 2: Beyond Cyberspace (1996),Sci-Fi|Thriller
68,69,Friday (1995),Comedy
81,82,Antonia's Line (Antonia) (1995),Drama
82,83,Once Upon a Time... When We Were Colored (1995),Drama
84,85,Angels and Insects (1995),Drama|Romance
85,86,White Squall (1996),Adventure|Drama
86,87,Dunston Checks In (1996),Children's|Comedy
87,88,Black Sheep (1996),Comedy


In [119]:
print(movies.loc[movies['MovieID'] == 3732])

      MovieID             Title  Genres
3663     3732  Fury, The (1978)  Horror


m2 (4.00)
m6 (3.00)
m5 (3.00)
m3 (2.00)
m1 (0.00)
