***
### User-Based Collaborative Filter

Victor Agaba, Cheryl Chen, Garrett Lee, Evan Li
***

#### Qn 3

Part a-b)

In [1]:
import numpy as np
import pandas as pd
# Never truncate DataFrame displays: show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Load the ratings table; the first CSV column becomes the row index (movie names).
movie_user_df = pd.read_csv('ubcf.csv', index_col=0)

# Row labels (movies) and column labels (user IDs, parsed as integers).
movies = np.array(movie_user_df.index)
users = np.array(movie_user_df.columns).astype(int)

# Raw ratings as a 2D array: one row per movie, one column per user.
movie_user_array = movie_user_df.values

movie_user_df.iloc[:10, :6] # Note: NaNs are missing ratings

Unnamed: 0,1648,5136,918,2824,3867,860
11: Star Wars IV A New Hope (1977),,4.5,5.0,4.5,4.0,4.0
12: Finding Nemo (2003),,5.0,5.0,,4.0,4.0
13: Forrest Gump (1994),,5.0,4.5,5.0,4.5,4.5
14: American Beauty (1999),,4.0,,,,
22: Pirates of Caribbean: (2003),4.0,5.0,3.0,4.5,4.0,2.5
24: Kill Bill: Vol. 1 (2003),3.0,5.0,,4.0,3.0,3.0
38: Eternal Sunshine Spotless Mind (2004),,5.0,5.0,,,
63: Twelve Monkeys (1995),,3.0,,,,4.0
77: Memento (2000),,,5.0,5.0,,4.5
85: Raiders of the Lost Ark (1981),,5.0,,,4.5,


Part c)

In [3]:
def correlate(array: np.ndarray, col1: int, col2: int) -> float:
    '''
    Find the Pearson correlation between two columns of a 2D array.

    Rows where either of the two columns is NaN are dropped first, so only
    rows that have a value in both columns contribute.

    Parameters
    ----------
    array : 2D array; NaN marks a missing value.
    col1, col2 : positional indices of the two columns to compare.

    Returns
    -------
    The correlation coefficient, or NaN when it is undefined: fewer than two
    rows have values in both columns, or one of the two columns is constant
    over those rows (zero variance).
    '''
    # get the two columns
    pair = array[:, [col1, col2]]

    # keep only the rows where both columns are present
    pair = pair[~np.isnan(pair).any(axis=1)]

    # Undefined cases: np.corrcoef would also produce NaN here, but only after
    # emitting RuntimeWarnings, so answer explicitly instead.
    if pair.shape[0] < 2 or (pair.max(axis=0) == pair.min(axis=0)).any():
        return np.nan

    # calculate correlation (off-diagonal entry of the 2x2 matrix)
    return np.corrcoef(pair.T)[1, 0]


def corr_mat(array: np.ndarray) -> np.ndarray:
    '''
    Build the symmetric column-by-column correlation matrix of a 2D array.

    The diagonal is fixed at 1; every off-diagonal entry comes from
    `correlate`, computed once per unordered column pair and mirrored.
    '''
    num_cols = array.shape[1]

    # start from the identity so each column has correlation 1 with itself
    result = np.identity(num_cols)

    # visit each unordered pair (a, b) with a < b exactly once
    pairs = [(a, b) for a in range(num_cols) for b in range(a + 1, num_cols)]
    for a, b in pairs:
        value = correlate(array, a, b)
        result[a, b] = value
        result[b, a] = value

    return result


# Pairwise correlations between the user columns of the ratings array.
user_user_corr = corr_mat(array=movie_user_array)

print("Displaying first 7*7 of user-user correlation matrix:")
pd.DataFrame(data=user_user_corr, index=users, columns=users).iloc[:7, :7]

Displaying first 7*7 of user-user correlation matrix:


Unnamed: 0,1648,5136,918,2824,3867,860,3712
1648,1.0,0.40298,-0.142206,0.51762,0.3002,0.480537,-0.312412
5136,0.40298,1.0,0.118979,0.057916,0.341734,0.241377,0.131398
918,-0.142206,0.118979,1.0,-0.317063,0.294558,0.468333,0.092037
2824,0.51762,0.057916,-0.317063,1.0,-0.060913,-0.008066,0.46291
3867,0.3002,0.341734,0.294558,-0.060913,1.0,0.282497,0.400275
860,0.480537,0.241377,0.468333,-0.008066,0.282497,1.0,0.171151
3712,-0.312412,0.131398,0.092037,0.46291,0.400275,0.171151,1.0


Part d)

In [4]:
def top_k_neighbors(user_id: int, k: int, corr_matrix: np.ndarray,
                    users: np.ndarray = None) -> tuple:
    '''
    Find the top k neighbors for a user.

    Neighbors are the other users ranked by descending correlation with
    `user_id`. The user itself and any user whose correlation is NaN are
    never returned, so fewer than k neighbors may come back.

    Parameters
    ----------
    user_id : label of the target user; must be an entry of `users`.
    k : maximum number of neighbors to return.
    corr_matrix : square correlation matrix whose row/column order matches `users`.
    users : labels for the rows of `corr_matrix`; defaults to the positional
        indices 0..n-1.

    Returns
    -------
    (neighbors, correlations) : the neighbor labels and their correlations
        with `user_id`, both in descending order of correlation.

    Raises
    ------
    ValueError : if `user_id` does not appear in `users`.
    '''
    if users is None:
        users = np.arange(corr_matrix.shape[0])
    users = np.asarray(users)

    # locate the user's row
    is_self = users == user_id
    if not is_self.any():
        raise ValueError(f"user_id {user_id!r} not found in users")
    user_row = corr_matrix[is_self, :][0]

    # indices in descending order of correlation; np.argsort places NaN last,
    # so after reversing, NaN entries come first and must be filtered out
    order = np.argsort(user_row)[::-1]

    # drop the user itself explicitly (it is not guaranteed to sort first when
    # there are NaNs or ties at 1.0) along with undefined correlations
    keep = ~is_self[order] & ~np.isnan(user_row[order])
    top_k_ind = order[keep][:k]

    return users[top_k_ind], user_row[top_k_ind]

test_user = 3712
user1, user2 = 3867, 89

# Five nearest neighbors (by correlation) for each of the two users of interest.
ans1, ans1_corr = top_k_neighbors(user1, 5, user_user_corr, users=users)
ans2, ans2_corr = top_k_neighbors(user2, 5, user_user_corr, users=users)

print(f"Top 5 neighbors for user {user1}: {ans1}\n\twith correlations: {ans1_corr} respectively.\n")
print(f"Top 5 neighbors for user {user2}: {ans2}\n\twith correlations: {ans2_corr} respectively.")

Top 5 neighbors for user 3867: [2492 3853 2486 3712 2288]
	with correlations: [0.47668328 0.46411015 0.43899155 0.4002745  0.37985627] respectively.

Top 5 neighbors for user 89: [4809 5136  860 5062 3525]
	with correlations: [0.66851595 0.56244874 0.53906585 0.52599044 0.47549485] respectively.


Part e)

In [16]:
def corr_weighted_avg(item_user_array: np.ndarray, k: int, corr_matrix: np.ndarray,
                      normalize: bool = False) -> np.ndarray:
    '''
    Use top k neighbors to predict all ratings per user.

    For every user (column) the k neighbors are looked up with
    `top_k_neighbors(user, k, corr_matrix)`, and each item's prediction is the
    correlation-weighted average of the ratings those neighbors gave it.
    Neighbors who did not rate the item (NaN) are left out of that average.

    Parameters
    ----------
    item_user_array : 2D ratings array (items x users); NaN marks a missing rating.
    k : number of neighbors to use per user.
    corr_matrix : user-user correlation matrix, in the same order as the
        columns of `item_user_array`.
    normalize : if True, each neighbor's ratings are centered on that
        neighbor's mean rating before averaging, and the target user's own
        mean is added back to the result.

    Returns
    -------
    Float array with the same shape as `item_user_array` holding a prediction
    for every (item, user) cell. When no usable neighbor rating exists for a
    cell, or the usable weights sum to zero, the weighted part is 0, so the
    prediction is the user's mean when normalizing and 0 otherwise.

    Notes
    -----
    The denominator is the signed sum of the weights, so negative
    correlations among the neighbors reduce (or flip the sign of) it.
    '''
    n_items, n_users = item_user_array.shape
    output = np.full((n_items, n_users), np.nan)

    def predict(ratings, weights): # helper function
        '''
        Predict a rating given arrays of neighbor ratings and weights.
        Neighbors with a NaN rating or a NaN weight are left out.
        Returns 0 if nothing usable remains or the weights sum to zero.
        '''
        usable = ~(np.isnan(ratings) | np.isnan(weights))
        ratings, weights = ratings[usable], weights[usable]
        if len(weights) == 0:
            return 0

        total_weight = np.sum(weights)
        if total_weight == 0:
            # dividing would give inf/NaN; fall back like the no-neighbor case
            return 0

        return np.dot(ratings, weights) / total_weight

    def get_means(ratings): # helper function
        '''
        Get each user's mean over non-NaN ratings (0 for users with no ratings).
        All zeros when not normalizing, so centering becomes a no-op.
        '''
        means = np.zeros(ratings.shape[1])
        if not normalize:
            return means

        for user in range(ratings.shape[1]):
            user_ratings = ratings[:, user]
            user_ratings = user_ratings[~np.isnan(user_ratings)]
            if len(user_ratings) > 0:
                means[user] = np.mean(user_ratings)

        return means

    means = get_means(item_user_array)
    for user in range(n_users):
        top_k, top_k_corr = top_k_neighbors(user, k, corr_matrix)

        # Neighbor ratings for every item, centered on each neighbor's mean.
        # Fancy indexing makes a copy, so the input array is never modified.
        neighbor_ratings = item_user_array[:, top_k] - means[top_k]

        for item in range(n_items):
            output[item, user] = predict(neighbor_ratings[item], top_k_corr)

        # shift back onto the target user's own rating scale
        output[:, user] += means[user]

    return output

k = 5

# Predict every (movie, user) rating from each user's k nearest neighbors.
movie_user_predictions = corr_weighted_avg(movie_user_array, k, user_user_corr)
prediction_df = pd.DataFrame(data=movie_user_predictions, index=movies, columns=users)

print("Displaying first 20 rows of predictions:")
prediction_df.loc[:, [user1, user2]].head(20)

Displaying first 20 rows of predictions:


Unnamed: 0,3867,89
11: Star Wars IV A New Hope (1977),4.020581,4.133725
12: Finding Nemo (2003),3.347734,4.267451
13: Forrest Gump (1994),3.749478,4.60147
14: American Beauty (1999),3.804172,3.861582
22: Pirates of Caribbean: (2003),3.345121,3.98083
24: Kill Bill: Vol. 1 (2003),4.17262,3.898067
38: Eternal Sunshine Spotless Mind (2004),4.027814,4.551396
63: Twelve Monkeys (1995),4.091132,3.492586
77: Memento (2000),4.472487,4.329285
85: Raiders of the Lost Ark (1981),4.179766,4.359637


Part f)

In [6]:
# top 3 movie IDs and their predicted ratings per user
def top_k_items(ratings: pd.DataFrame, k: int, d: int = 3, mode: str = 'movie') -> pd.DataFrame:
    '''
    Find the top k items for each user.
    Round ratings to d decimal places.

    Parameters
    ----------
    ratings : DataFrame of ratings, one row per movie and one column per user.
        Row labels must look like "<id>: <name>"; column labels must be
        convertible to int. NaN marks a missing rating.
    k : number of top items to return per user.
    d : number of decimal places ratings are rounded to before ranking.
    mode : 'movie' to return the movie IDs of the top items, 'rating' to
        return their (rounded) ratings.

    Returns
    -------
    DataFrame with rows 1..k (rank) and one column per user.

    Raises
    ------
    ValueError : if `mode` is neither 'movie' nor 'rating'.

    Notes
    -----
    Missing ratings rank last. In 'rating' mode they are reported as NaN; in
    'movie' mode a user with fewer than k ratings still gets k movie IDs, the
    extra ones belonging to unrated movies. The order among tied ratings is
    whatever np.argsort produces.
    '''
    if mode not in ('movie', 'rating'):
        raise ValueError(f"mode must be 'movie' or 'rating', got {mode!r}")

    def extract_ids(movies: np.ndarray) -> np.ndarray:
        '''
        Extract movie IDs from movie names.
        '''
        return np.array([int(movie.split(':')[0]) for movie in movies])

    # np.round returns a new array, so the caller's DataFrame is left untouched
    ratings_array = np.round(ratings.values, d)
    ratings_array[np.isnan(ratings_array)] = -np.inf  # for sorting
    movies = np.array(ratings.index)
    movies = extract_ids(movies)
    users = ratings.columns.astype(int)

    # get row indices of top k items
    base = np.argsort(ratings_array, axis=0)[::-1, :][:k, :]

    if mode == 'movie':
        output = movies[base]
    else:  # mode == 'rating'
        output = ratings_array[base, np.arange(ratings_array.shape[1])]
        output[output == -np.inf] = np.nan  # revert

    return pd.DataFrame(output, index=np.arange(1, k+1), columns=users)

In [7]:
# Highest-ranked predicted movies (by ID) for every user.
print("Top 3 movie IDs/names per user (predictions):")
top_k_items(ratings=prediction_df, k=3, mode='movie')

Top 3 movie IDs/names per user (predictions):


Unnamed: 0,1648,5136,918,2824,3867,860,3712,2968,3525,4323,3617,4360,2756,89,442,3556,5261,2492,5062,2486,4942,2267,4809,3853,2288
1,238,641,238,807,1891,807,641,274,194,550,424,601,120,238,105,238,38,120,155,275,568,275,629,807,1572
2,8358,238,275,122,155,77,603,197,38,424,13,1597,121,278,24,275,238,121,194,120,105,393,38,752,120
3,13,24,38,8358,122,238,105,63,238,275,568,629,38,807,274,568,77,807,105,121,146,641,1572,77,122


In [8]:
# Predicted ratings that go with the top-ranked movies for every user.
print("Top 3 ratings per user (predictions):")
top_k_items(ratings=prediction_df, k=3, mode='rating')

Top 3 ratings per user (predictions):


Unnamed: 0,1648,5136,918,2824,3867,860,3712,2968,3525,4323,3617,4360,2756,89,442,3556,5261,2492,5062,2486,4942,2267,4809,3853,2288
1,5.0,5.0,4.902,5.0,4.76,4.846,5.0,5.0,5.0,4.741,5.0,5.0,5.0,4.894,4.892,4.874,5.0,5.0,4.82,5.0,5.0,5.0,5.0,4.86,5.0
2,5.0,4.779,4.752,5.0,4.551,4.846,4.856,5.0,5.0,4.728,4.815,5.0,5.0,4.882,4.888,4.838,5.0,5.0,4.763,4.894,5.0,4.855,5.0,4.853,5.0
3,4.794,4.776,4.645,5.0,4.508,4.791,4.739,5.0,5.0,4.701,4.781,5.0,5.0,4.774,4.798,4.753,4.84,5.0,4.678,4.894,5.0,4.853,5.0,4.793,5.0


In [9]:
# Same ranking, applied to the observed ratings instead of the predictions.
print("Top 3 movie IDs/names per user (actual):")
top_k_items(ratings=movie_user_df, k=3, mode='movie')

Top 3 movie IDs/names per user (actual):


Unnamed: 0,1648,5136,918,2824,3867,860,3712,2968,3525,4323,3617,4360,2756,89,442,3556,5261,2492,5062,2486,4942,2267,4809,3853,2288
1,9802,238,11,77,1572,857,1597,11,278,602,278,4327,12,857,603,24,105,98,122,194,604,857,238,38,585
2,272,568,1894,278,120,238,3049,581,424,597,98,1422,857,550,105,275,77,1891,550,77,77,680,22,24,550
3,155,550,585,13,680,680,585,808,629,414,13,393,98,585,604,424,807,1572,105,807,1422,550,597,238,197


In [10]:
# Observed ratings that go with the top-ranked movies for every user.
print("Top 3 ratings per user (actual):")
top_k_items(ratings=movie_user_df, k=3, mode='rating')

Top 3 ratings per user (actual):


Unnamed: 0,1648,5136,918,2824,3867,860,3712,2968,3525,4323,3617,4360,2756,89,442,3556,5261,2492,5062,2486,4942,2267,4809,3853,2288
1,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
2,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.5,5.0,5.0,5.0,5.0,5.0
3,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.5,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.5,4.5,5.0,5.0,5.0,5.0,5.0


#### Qn 4

Part a)

In [11]:
# Re-run the predictor with normalize=True: neighbor ratings are centered on
# each neighbor's mean and the target user's mean is added back.
normalized_movie_user_predictions = corr_weighted_avg(
    movie_user_array, k, user_user_corr, normalize=True
)
normalized_prediction_df = pd.DataFrame(
    data=normalized_movie_user_predictions, index=movies, columns=users
)

print("Displaying first 20 rows of normalized predictions:")
normalized_prediction_df.loc[:, [user1, user2]].head(20)

Displaying first 20 rows of normalized predictions:


Unnamed: 0,3867,89
11: Star Wars IV A New Hope (1977),4.5058,4.686099
12: Finding Nemo (2003),3.477161,4.819825
13: Forrest Gump (1994),4.054794,5.049074
14: American Beauty (1999),3.886764,4.240812
22: Pirates of Caribbean: (2003),3.773592,4.428435
24: Kill Bill: Vol. 1 (2003),4.477936,4.450441
38: Eternal Sunshine Spotless Mind (2004),4.383088,4.930626
63: Twelve Monkeys (1995),4.396449,4.051834
77: Memento (2000),4.777803,4.97764
85: Raiders of the Lost Ark (1981),4.485082,4.850521


Part b)

In [12]:
# Top-ranked movies from the normalized predictions, for the two users of interest.
print("Top 3 movie IDs/names for 2 users (normalized predictions):")
top_k_items(ratings=normalized_prediction_df, k=3, mode='movie').loc[:, [user1, user2]]

Top 3 movie IDs/names for 2 users (normalized predictions):


Unnamed: 0,3867,89
1,1891,238
2,155,278
3,77,275


In [13]:
# Normalized predicted ratings for those top-ranked movies.
print("Top 3 ratings for 2 users (normalized predictions):")
top_k_items(ratings=normalized_prediction_df, k=3, mode='rating').loc[:, [user1, user2]]

Top 3 ratings for 2 users (normalized predictions):


Unnamed: 0,3867,89
1,5.246,5.322
2,4.857,5.261
3,4.778,5.241


In [14]:
# Top-ranked movies from the observed ratings, for the two users of interest.
print("Top 3 movie IDs/names for 2 users (actual):")
top_k_items(ratings=movie_user_df, k=3, mode='movie').loc[:, [user1, user2]]

Top 3 movie IDs/names for 2 users (actual):


Unnamed: 0,3867,89
1,1572,857
2,120,550
3,680,585


In [15]:
# Observed ratings for those top-ranked movies.
print("Top 3 ratings for 2 users (actual):")
top_k_items(ratings=movie_user_df, k=3, mode='rating').loc[:, [user1, user2]]

Top 3 ratings for 2 users (actual):


Unnamed: 0,3867,89
1,5.0,5.0
2,5.0,5.0
3,5.0,5.0


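Note: The predicted top-3 movies do not match the actual top-3 movies, largely because many movies are tied at a 5.0 rating in the original data, so the "actual" top 3 shown for each user is an arbitrary pick among those ties.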