# H&M RECO test C2W2

Prepare

In [1]:
import pandas as pd
import numpy as np

Define util functions

In [2]:
def get_top_users(uu_corr, target_user, n=5):
    """Return the ``n`` users most correlated with ``target_user``.

    Args:
        uu_corr: User-user correlation DataFrame whose row and column labels
            are user ids (in this notebook, the result of ``DataFrame.corr()``).
        target_user: Label of the user whose neighbours are wanted.
        n: Maximum number of neighbours to return (default 5).

    Returns:
        A Series of correlation values indexed by neighbour id, ordered from
        the most to the least correlated. The target user is never included,
        and NaN correlations are skipped, so fewer than ``n`` entries may be
        returned.

    Raises:
        KeyError: If ``target_user`` is not a row label of ``uu_corr``.
    """
    # Row of correlations between the target user and every user.
    target_corr = uu_corr.loc[target_user]

    # Remove the target user by label instead of assuming the self-correlation
    # is the first of the n+1 largest values. That assumption breaks when
    # another user ties at 1.0 and appears earlier in the index, or when the
    # self-correlation is NaN (nlargest skips NaN), in which case a genuine
    # neighbour would be discarded instead.
    candidates = target_corr.drop(labels=target_user, errors="ignore")

    # nlargest() returns the n largest values in descending order.
    return candidates.nlargest(n)

In [3]:
def get_user_movie_score(target_movie, target_user):
    """Predict a user's score for one movie from neighbour ratings.

    The prediction is the correlation-weighted average of the ratings that the
    target user's top neighbours (``get_top_users`` on the module-level
    ``uu_corr``) gave to the movie.

    Args:
        target_movie: One movie's ratings, indexable by user id (in this
            notebook, a row passed in by ``DataFrame.apply(axis=1)``).
        target_user: Id of the user to predict for.

    Returns:
        The weighted average rating, or 0 when the accumulated weight is zero
        (for example, when no neighbour has rated the movie).
    """
    neighbours = get_top_users(uu_corr, target_user)

    weighted_total = 0
    weight_total = 0
    for neighbour_id, weight in neighbours.items():
        rating = target_movie[neighbour_id]
        # Neighbours who have not rated this movie contribute nothing.
        if np.isnan(rating):
            continue
        weighted_total += rating * weight
        weight_total += weight

    # Guard against dividing by zero when nothing was accumulated.
    return 0 if weight_total == 0 else weighted_total / weight_total

In [4]:
def get_user_movie_score_normalized(target_movie, target_user):
    """Predict a user's score for one movie using mean-centred neighbour ratings.

    Each neighbour's rating is offset by that neighbour's own mean rating
    (column mean in the module-level ``soucre_df``), the offsets are averaged
    with the neighbour correlations as weights, and the target user's mean
    rating is added back.

    Args:
        target_movie: One movie's ratings, indexable by user id (in this
            notebook, a row passed in by ``DataFrame.apply(axis=1)``).
        target_user: Id of the user to predict for.

    Returns:
        The normalized prediction, or 0 when the accumulated weight is zero
        (for example, when no neighbour has rated the movie).
    """
    neighbours = get_top_users(uu_corr, target_user)

    # Mean of all ratings given by the target user.
    target_mean = soucre_df.loc[:, target_user].mean()

    offset_total = 0
    weight_total = 0
    for neighbour_id, weight in neighbours.items():
        rating = target_movie[neighbour_id]
        # Neighbours who have not rated this movie contribute nothing.
        if np.isnan(rating):
            continue

        # Centre the neighbour's rating on that neighbour's own mean.
        neighbour_mean = soucre_df.loc[:, neighbour_id].mean()
        offset_total += (rating - neighbour_mean) * weight
        weight_total += weight

    # Guard against dividing by zero when nothing was accumulated.
    if weight_total == 0:
        return 0

    return target_mean + offset_total / weight_total

In [5]:
def print_prediction_results(target_user_id, soucre_df, calculation_function, top_n=3):
    """Build a summary of the highest predicted scores for a user.

    Despite its name, this function does not print anything: it returns the
    message so that the notebook displays it as the cell output.

    Args:
        target_user_id: Id of the user to predict for; forwarded to
            ``calculation_function`` as its second argument.
        soucre_df: Ratings DataFrame; ``calculation_function`` is applied to
            each of its rows (``axis=1``).
        calculation_function: Callable ``f(row, target_user_id)`` returning
            the predicted score for that row.
        top_n: How many of the highest-scoring rows to include (default 3).

    Returns:
        A string containing the target user id followed by the text
        representation of the ``top_n`` highest predictions.
    """
    predicted_scores = soucre_df.apply(
        calculation_function,
        axis=1,
        args=(target_user_id,),
    )

    # head() is always positional, whereas slicing with [:k] can be interpreted
    # as label-based for some index types.
    final_result = predicted_scores.sort_values(ascending=False).head(top_n)
    return f"For tagert user {target_user_id}, the predict results as below: {final_result}"

Load data

In [6]:
# Load the ratings spreadsheet, using its first column as the row index.
# In the preview below, rows are movies ("id: title") and columns are user ids;
# empty cells are missing ratings.
soucre_df = pd.read_excel('data/uucf_source_spreadsheet.xls', index_col=0)
soucre_df.head()

Unnamed: 0,1648,5136,918,2824,3867,860,3712,2968,3525,4323,...,3556,5261,2492,5062,2486,4942,2267,4809,3853,2288
11: Star Wars: Episode IV - A New Hope (1977),,4.5,5.0,4.5,4.0,4.0,,5.0,4.0,5.0,...,4.0,,4.5,4.0,3.5,,,,,
12: Finding Nemo (2003),,5.0,5.0,,4.0,4.0,4.5,4.5,4.0,5.0,...,4.0,,3.5,4.0,2.0,3.5,,,,3.5
13: Forrest Gump (1994),,5.0,4.5,5.0,4.5,4.5,,5.0,4.5,5.0,...,4.0,5.0,3.5,4.5,4.5,4.0,3.5,4.5,3.5,3.5
14: American Beauty (1999),,4.0,,,,,4.5,2.0,3.5,5.0,...,4.0,,3.5,4.5,3.5,4.0,,3.5,,
22: Pirates of the Caribbean: The Curse of the Black Pearl (2003),4.0,5.0,3.0,4.5,4.0,2.5,,5.0,3.0,4.0,...,3.0,1.5,4.0,4.0,2.5,3.5,,5.0,,3.5


In [7]:
# Compute pairwise correlation of columns, excluding NA/null values.
# The columns are user ids, so this is the user-user correlation matrix that
# the scoring functions pass to get_top_users() to look up a user's neighbours.
uu_corr = soucre_df.corr()

Part 1 - Without Normalization

In [8]:
# Highest predictions for user 3867 using the plain weighted average of neighbour ratings.
print_prediction_results(3867, soucre_df, get_user_movie_score)

'For tagert user 3867, the predict results as below: 1891: Star Wars: Episode V - The Empire Strikes Back (1980)    4.760291\n155: The Dark Knight (2008)                                    4.551454\n122: The Lord of the Rings: The Return of the King (2003)      4.507637\ndtype: float64'

In [9]:
# Highest predictions for user 89 using the plain weighted average of neighbour ratings.
print_prediction_results(89, soucre_df, get_user_movie_score)

'For tagert user 89, the predict results as below: 238: The Godfather (1972)               4.894124\n278: The Shawshank Redemption (1994)    4.882194\n807: Seven (a.k.a. Se7en) (1995)        4.774093\ndtype: float64'

Part 2 - With Normalization

In [10]:
# Highest predictions for user 3867 using mean-centred (normalized) neighbour ratings.
print_prediction_results(3867, soucre_df, get_user_movie_score_normalized)

'For tagert user 3867, the predict results as below: 1891: Star Wars: Episode V - The Empire Strikes Back (1980)    5.245509\n155: The Dark Knight (2008)                                    4.856770\n77: Memento (2000)                                             4.777803\ndtype: float64'

In [11]:
# Highest predictions for user 89 using mean-centred (normalized) neighbour ratings.
print_prediction_results(89, soucre_df, get_user_movie_score_normalized)

'For tagert user 89, the predict results as below: 238: The Godfather (1972)               5.322015\n278: The Shawshank Redemption (1994)    5.261424\n275: Fargo (1996)                       5.241111\ndtype: float64'