<a href="https://colab.research.google.com/github/yanglianglu/CS-598-Statistical_Learning/blob/project4_yan/cs598_project4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project 4

## Student Names and IDs
- Zeyu Liao(zeyu9, 667691486, MCS)
- Lu,Yangliang (yl164 661963604 MCS)
- Yan, Zexi (zexiyan2, 651826615 MCS)

## System I

System I recommend movies based on movie's ranking in user's selected genre. Here is the definition of "ranking score":
- score = 1/3 of popularity + 2/3 of weighted rating
- popularity is determined by the number of rating a movie has received.
- weighted rating is a scaled rating so that .....

-------------------------------

## System II

System II recommend movies by the Item-Based Collaborative Filtering Method.

### Setup environment

In [2]:
import torch
import pandas as pd
import numpy as np

def is_running_in_colab():
    try:
        import google.colab
        return True
    except ImportError:
        return False

root = ""

# Usage
if is_running_in_colab():
    print("The notebook is running in a Google Colab environment.")
    from google.colab import drive
    drive.mount('/content/drive')
    root = "/content/drive/My Drive/"
else:
    print("The notebook is running in a local environment.")


# Adjust the path as per your Google Drive directory structure
ratings_path = '/content/drive/My Drive/data/ratings.dat'
movies_path = '/content/drive/My Drive/data/movies.dat'

The notebook is running in a Google Colab environment.
Mounted at /content/drive


### Load Data and Initialization

In [3]:
ratings = pd.read_csv(ratings_path, sep='::', engine='python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']

movies = pd.read_csv(movies_path, sep='::', engine='python', encoding="ISO-8859-1", header=None)
movies.columns = ['MovieID', 'Title', 'Genres']

item_feature_matrix = ratings.pivot_table(index='UserID', columns='MovieID', values='Rating')

mean_ratings = np.array(item_feature_matrix.mean(axis=1, skipna=True)).reshape(-1, 1)

normalized_ratings = item_feature_matrix - mean_ratings

### Calculate Similarity Matrix for movies

In [4]:
# get intersection of two tensors
def intersect1d_torch(tensor1, tensor2):
    # Find unique elements in each tensor
    unique1 = torch.unique(tensor1)
    unique2 = torch.unique(tensor2)

    # Find common elements
    common = unique1[torch.isin(unique1, unique2)]

    return common

In [5]:
# compute similarity matrix by tensors
def compute_similarity_matrix_tensor(rating, min_common_users = 3, top_n = 10):
    min_common_users = min_common_users
    top_n = top_n

    # Check if CUDA (GPU support) is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # Convert DataFrame to tensors
    rating_0_tensor = torch.tensor(rating.fillna(0).values, dtype=torch.float32)
    rating_binary_tensor = (rating_0_tensor != 0).float()

    # Compute common ratings count
    common_ratings_count = torch.mm(rating_binary_tensor.T, rating_binary_tensor)

    # Prepare the similarity matrix
    num_movies = rating_0_tensor.shape[1]
    #similarity_matrix = torch.full((num_movies, num_movies), float('nan'))
    similarity_matrix = torch.full((num_movies, num_movies), float('nan')).to(device)


    for i in range(num_movies):
        for j in range(i + 1, num_movies):
            if common_ratings_count[i, j] >= min_common_users:
                # Filter ratings to include only common users
                user_0 = torch.where(rating_binary_tensor[:, i] == 1)[0]
                user_1 = torch.where(rating_binary_tensor[:, j] == 1)[0]
                common_users = intersect1d_torch(user_0, user_1)
                ratings_i = rating_0_tensor[common_users, i]
                ratings_j = rating_0_tensor[common_users, j]

                # Compute dot product and norms for the filtered ratings
                dot_product = torch.dot(ratings_i, ratings_j)
                norm_i = torch.norm(ratings_i)
                norm_j = torch.norm(ratings_j)

                if norm_i > 0 and norm_j > 0:
                    similarity = 0.5 + 0.5 * (dot_product / (norm_i * norm_j))
                else:
                    similarity = 0

                similarity_matrix[i, j] = similarity
                similarity_matrix[j, i] = similarity

    # Convert the similarity matrix to a DataFrame
    similarity_matrix_df = pd.DataFrame(similarity_matrix.cpu().numpy(),
                                       index=rating.columns,
                                       columns=rating.columns)

    return similarity_matrix_df

In [6]:
specified_movies = [1, 10, 100, 1510, 260, 3212]
full_similarity_matrix = compute_similarity_matrix_tensor(normalized_ratings)
print(full_similarity_matrix.shape)
#print(reduced_similarity_matrix)
print(full_similarity_matrix.loc[specified_movies, specified_movies].round(7))

cuda
(3706, 3706)
MovieID      1         10        100   1510      260   3212
MovieID                                                    
1             NaN  0.512106  0.392000   NaN  0.741597   NaN
10       0.512106       NaN  0.547458   NaN  0.534349   NaN
100      0.392000  0.547458       NaN   NaN  0.329694   NaN
1510          NaN       NaN       NaN   NaN       NaN   NaN
260      0.741597  0.534349  0.329694   NaN       NaN   NaN
3212          NaN       NaN       NaN   NaN       NaN   NaN


In [7]:
### Save the Similarity Matrix online

In [8]:
np.savez('similarity_matrix_raw.npz', matrix=full_similarity_matrix)

### Take the top 30 neighbors of each movie, the rest is changed to NaN

In [11]:
# Filter the top_n most similar movies for each movie (row)
top_n = 30
similarity_matrix_top30_df = full_similarity_matrix.copy()
for i in range(similarity_matrix_top30_df.shape[0]):
        row_indices = np.argsort(-similarity_matrix_top30_df.iloc[i])[:top_n]
        mask = np.ones(similarity_matrix_top30_df.shape[1], dtype=bool)
        mask[row_indices] = False
        similarity_matrix_top30_df.iloc[i, mask] = np.nan

print(similarity_matrix_top30_df.loc[specified_movies, specified_movies].round(7))

MovieID  1     10    100   1510  260   3212
MovieID                                    
1         NaN   NaN   NaN   NaN   NaN   NaN
10        NaN   NaN   NaN   NaN   NaN   NaN
100       NaN   NaN   NaN   NaN   NaN   NaN
1510      NaN   NaN   NaN   NaN   NaN   NaN
260       NaN   NaN   NaN   NaN   NaN   NaN
3212      NaN   NaN   NaN   NaN   NaN   NaN


### Save the top-30 Similarity Matrix online

In [12]:
np.savez('similarity_matrix_top30.npz', matrix=similarity_matrix_top30_df)

### Recommend Movies using IBCF method by taking the Similarity Matrix and user's existing ratings

In [13]:
def recommend_movies(user_rating, s_matrix, n_recommendations = 10):
    """
    Generate movie recommendations based on new user ratings and a sparse similarity matrix.

    Parameters:
    - user_ratings: np.array, user's ratings for movies; 0 indicates the movie hasn't been rated.
    - s_matrix: item-item similarity matrix.
    - n_recommendations: int, the number of recommendations to return.

    Returns:
    - List of movie IDs representing the top N recommendations.
    - List of corresponding rating score predictions for the recommended movies.
    """

    # number of movies
    n_movies = len(user_rating)

    # initial rated movies
    user_rating_full = np.full(shape=len(user_rating), fill_value = -1.0)

    # iterate through each movie and calculate the weighted rating for user
    for l in range(n_movies):
        #print("\nloop :", l)

        if user_rating[l] == 0:
            # get the similarity score for moviel l
            l_s_scores = np.nan_to_num(s_matrix.iloc[l,:])

            # get the weighted sum for movie l
            weighted_sum = np.dot(l_s_scores, user_rating)

            # get the normalization factor
            mask = (user_rating != 0).astype(int)
            norm_factor = np.dot(l_s_scores, mask)

            # get the weighted average for movie l
            weighted_avg = (weighted_sum) / norm_factor if norm_factor != 0 else 0

            user_rating_full[l] = weighted_avg


    # get the top N recommendations index
    top_n = np.argsort(user_rating_full)[-(n_recommendations):][::-1]

    # get top N movie ID by mapping index to S matrix index
    top_n_id = s_matrix.index[top_n]

    top_n_score = user_rating_full[top_n]

    return top_n_id, top_n_score



In [14]:
# get the movie titles by movie ids
# keep the order of movie ids
def get_movie_titles(movie_ids, movies):
    # Create a dictionary to map MovieId to rows
    movie_dict = movies.set_index('MovieID').T.to_dict('list')

    # Use list comprehension to preserve the order of movie_ids
    ordered_data = [movie_dict[movie_id] for movie_id in movie_ids if movie_id in movie_dict]

    # Convert the list of data back to a DataFrame
    sub_df = pd.DataFrame(ordered_data, columns=movies.columns[1:], index=movie_ids)
    return sub_df

### Recommend movies for user `u1181`

In [15]:
# get user rating for user 1181
user_1181_ratings = item_feature_matrix.loc[1181].fillna(0)
# count number of 0 in user rating
#print(user_1181_ratings[user_1181_ratings == 0].count())
#print(user_1181_ratings)

movie_ids, scores = recommend_movies(user_1181_ratings.values, similarity_matrix_top30_df, 10)

for i in range(len(movie_ids)):
    print(f'm{movie_ids[i]} ({scores[i]:.2f})')
get_movie_titles(movie_ids, movies)

m3184 (5.00)
m3581 (5.00)
m560 (4.43)
m2765 (4.42)
m2201 (4.23)
m3642 (4.21)
m3276 (4.16)
m3644 (4.15)
m2843 (4.12)
m389 (4.11)


Unnamed: 0_level_0,Title,Genres
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
3184,Montana (1998),Action|Comedy|Crime|Drama
3581,Human Traffic (1999),Drama
560,"Beans of Egypt, Maine, The (1994)",Drama
2765,"Acid House, The (1998)",Comedy|Drama
2201,"Paradine Case, The (1947)",Drama
3642,In Old California (1942),Western
3276,Gun Shy (2000),Comedy
3644,Dark Command (1940),Western
2843,"Black Cat, White Cat (Crna macka, beli macor) ...",Comedy|Romance
389,"Colonel Chabert, Le (1994)",Drama|Romance|War


Recommend movies for user `1351`

In [16]:
# get user rating for user 1351
user_1351_ratings = item_feature_matrix.loc[1351].fillna(0)

movie_ids, scores = recommend_movies(user_1351_ratings.values, similarity_matrix_top30_df, 10)

for i in range(len(movie_ids)):
    print(f'm{movie_ids[i]} ({scores[i]:.2f})')

get_movie_titles(movie_ids, movies)

m2098 (5.00)
m923 (5.00)
m2846 (5.00)
m2283 (5.00)
m833 (5.00)
m1711 (5.00)
m879 (5.00)
m2801 (5.00)
m2326 (5.00)
m2339 (5.00)


Unnamed: 0_level_0,Title,Genres
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
2098,Son of Flubber (1963),Children's|Comedy
923,Citizen Kane (1941),Drama
2846,"Adventures of Milo and Otis, The (1986)",Children's
2283,"Sheltering Sky, The (1990)",Drama
833,High School High (1996),Comedy
1711,Midnight in the Garden of Good and Evil (1997),Comedy|Crime|Drama|Mystery
879,"Relic, The (1997)",Horror
2801,Oscar and Lucinda (a.k.a. Oscar & Lucinda) (1997),Drama|Romance
2326,Shattered Image (1998),Drama|Thriller
2339,I'll Be Home For Christmas (1998),Comedy|Romance


Recommend movies for A hypothetical user who rates movie `m1613` with `5` and movie `m1755` with `4`.

In [17]:
#create a user rating vector 3707 x 1, populate with 0
user_hypo_ratings = np.zeros(3706)
user_hypo_ratings[1481] = 5
user_hypo_ratings[1600] = 4

movie_ids, scores = recommend_movies(user_hypo_ratings, similarity_matrix_top30_df, 10)

for i in range(len(movie_ids)):
    print(f'm{movie_ids[i]} ({scores[i]:.2f})')

get_movie_titles(movie_ids, movies)

m466 (5.00)
m231 (5.00)
m2 (5.00)
m1936 (5.00)
m1997 (5.00)
m2000 (5.00)
m1086 (5.00)
m2881 (5.00)
m1897 (5.00)
m2590 (5.00)


Unnamed: 0_level_0,Title,Genres
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
466,Hot Shots! Part Deux (1993),Action|Comedy|War
231,Dumb & Dumber (1994),Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
1936,Mrs. Miniver (1942),Drama|War
1997,"Exorcist, The (1973)",Horror
2000,Lethal Weapon (1987),Action|Comedy|Crime|Drama
1086,Dial M for Murder (1954),Mystery|Thriller
2881,Double Jeopardy (1999),Action|Thriller
1897,High Art (1998),Drama|Romance
2590,Hideous Kinky (1998),Drama
