In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
cols = ['UserID', 'MovieID', 'Rating', 'Timestamp']
# load data
file = 'data/ratings.dat'
df = pd.read_csv(file, sep='::', names=cols)



In [3]:
# split data
train_set, test_set = train_test_split(df, test_size = 0.2) 

In [4]:
# build zero data set 
n_users = train_set['UserID'].max()
n_items = train_set['MovieID'].max()
ratings = np.zeros((n_users, n_items))

In [5]:
# fill rating in training set [user, movie]
for row in train_set.itertuples():
    ratings[row[1]-1, row[2]-1] = row[3]

In [6]:
# define loss function
def rmse(pred, actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return np.sqrt(mean_squared_error(pred, actual))

In [7]:
ratings.shape

(6040, 3952)

In [8]:
# ——————————baseline——————————
def cal_mean(training_matrix):
    global all_mean, user_mean, item_mean
    all_mean = np.mean(training_matrix[training_matrix!=0])
    user_mean = sum(training_matrix.T) / sum((training_matrix!=0).T)
    item_mean = sum(training_matrix) / sum((training_matrix!=0)) # sum up each col.
    
    if np.isnan(user_mean).any():
        user_mean_nan = True
    else:
        user_mean_nan = False
    if np.isnan(item_mean).any():
        item_mean_nan = True
    else:
        item_mean_nan = False
    print('Existing User_NaN?', user_mean_nan)
    print('Existing Item_NaN?', item_mean_nan)

    # fill with all_mean while user/item mean isnan
    user_mean = np.where(np.isnan(user_mean), all_mean, user_mean)
    item_mean = np.where(np.isnan(item_mean), all_mean, item_mean)
    
    if np.isnan(user_mean).any():
        user_mean_nan = True
    else:
        user_mean_nan = False
    if np.isnan(item_mean).any():
        item_mean_nan = True
    else:
        item_mean_nan = False
    print('Existing User_NaN?', user_mean_nan)
    print('Existing Item_NaN?', item_mean_nan)
    print('all_mean %.4f' % all_mean)

In [9]:
cal_mean(ratings)

Existing User_NaN? False
Existing Item_NaN? True
Existing User_NaN? False
Existing Item_NaN? False
all_mean 3.5815




In [24]:
# calculate similarity by Cosine 
def cal_similarity(ratings, kind, epsilon=1e-9):
    # kind: calculate similarity based on user or item A.T@B/|A||B|
    # epsilon: prevent 1/0 
    if kind == 'user':
        similarity = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        similarity = ratings.T.dot(ratings) + epsilon
    square_vec = 1/np.diag(similarity)
    abs_vec = np.sqrt(square_vec)
    cosine = similarity * abs_vec
    cosine = cosine.T * abs_vec
    return cosine

In [25]:
user_similarity = cal_similarity(ratings, kind='user')
item_similarity = cal_similarity(ratings, kind='item')
# item similarity
print(np.round_(item_similarity[:10,:10], 3))

[[1.    0.308 0.222 0.141 0.208 0.285 0.235 0.108 0.096 0.314]
 [0.308 1.    0.187 0.115 0.209 0.205 0.211 0.116 0.149 0.293]
 [0.222 0.187 1.    0.167 0.22  0.137 0.211 0.058 0.103 0.194]
 [0.141 0.115 0.167 1.    0.211 0.094 0.156 0.056 0.04  0.105]
 [0.208 0.209 0.22  0.211 1.    0.126 0.222 0.07  0.098 0.208]
 [0.285 0.205 0.137 0.094 0.126 1.    0.151 0.053 0.129 0.348]
 [0.235 0.211 0.211 0.156 0.222 0.151 1.    0.047 0.061 0.201]
 [0.108 0.116 0.058 0.056 0.07  0.053 0.047 1.    0.041 0.078]
 [0.096 0.149 0.103 0.04  0.098 0.129 0.061 0.041 1.    0.161]
 [0.314 0.293 0.194 0.105 0.208 0.348 0.201 0.078 0.161 1.   ]]


In [26]:
# validation on test
def test_model(model, loss_function):
    predictions = []
    targets = []
    print('_____%s_____' % model)
    for row in test_set.itertuples():
        user, item, actual = row[1]-1, row[2]-1, row[3]
        predictions.append(model(user, item))
        targets.append(actual)

    print('rmse is %.4f' % loss_function(np.array(predictions), np.array(targets)))

In [27]:
# item based CF
def predict_itemCF(user, item):
    nzero = ratings[user].nonzero()[0]
    prediction = ratings[user, nzero].dot(item_similarity[item, nzero])\
                / sum(item_similarity[item, nzero])
    return prediction
test_model(predict_itemCF, rmse)

_____<function predict_itemCF at 0x1127f3b70>_____
rmse is 1.0025


In [32]:
# item based CF with baseline
def predict_itemCF_baseline(user, item):
    nzero = ratings[user].nonzero()[0] # index of nonzero item
    # bias_user + bias_item + all_mean (baseline of a list of item)
    baseline = item_mean + user_mean[user] - all_mean 
    prediction = (ratings[user, nzero] - baseline[nzero]).dot(item_similarity[item, nzero])\
                / sum(item_similarity[item, nzero]) + baseline[item] 
#     if prediction > 5:
#         prediction = 5
#     if prediction < 1:
#         prediciton = 1    
    return prediction 

In [33]:
test_model(predict_itemCF_baseline, rmse)

_____<function predict_itemCF_baseline at 0x1134359d8>_____
rmse is 0.8955


In [43]:
# user based CF with baseline
def predict_userCF_baseline(user, item):
    nzero = ratings[:,item].nonzero()[0]
    baseline = user_mean + item_mean[item] - all_mean
    prediction = (ratings[nzero, item] - baseline[nzero]).dot(user_similarity[user, nzero])\
                / sum(user_similarity[user, nzero]) + baseline[user]
    # no ratings on this item
    if np.isnan(prediction):
        prediction = baseline[user]
    return prediction

In [44]:
test_model(predict_userCF_baseline, rmse)

_____<function predict_userCF_baseline at 0x1127f37b8>_____




rmse is 0.9233


In [45]:
# adding top k method to item based CF
def predict_topkCF(user, item, k=10):
    nzero = ratings[user].nonzero()[0]
    baseline = item_mean + user_mean[user] - all_mean
    choice = nzero[item_similarity[item, nzero].argsort()[::-1][:k]] # top k similarity
    prediction = (ratings[user, choice] - baseline[choice]).dot(item_similarity[item, choice])\
                / sum(item_similarity[item, choice]) + baseline[item]
    return prediction 

In [46]:
test_model(predict_topkCF, rmse)

_____<function predict_topkCF at 0x113435a60>_____
rmse is 0.8743


In [53]:
# normalize similarity using Pearson correlation coefficient
def cal_similarity_norm(ratings, kind, epsilon=1e-9):
    if kind == 'user':
        # normalization
        rating_user_diff = ratings.copy()
        for i in range(ratings.shape[0]):
            nzero = ratings[i].nonzero()
            rating_user_diff[i][nzero] = ratings[i][nzero] - user_mean[i]
        similarity = rating_user_diff.dot(rating_user_diff.T) + epsilon
    elif kind == 'item':
        rating_item_diff = ratings.copy()
        for j in range(ratings.shape[1]):
            nzero = ratings[:,j].nonzero()
            rating_item_diff[:,j][nzero] = ratings[:,j][nzero] - item_mean[j]
        similarity = rating_item_diff.T.dot(rating_item_diff) + epsilon
    square_vec = 1/np.diag(similarity)
    # square_vec[np.isinf(square_vec)] = 0
    abs_vec = np.sqrt(square_vec)
    cosine = similarity * abs_vec
    cosine = cosine.T * abs_vec
    return cosine

In [54]:
user_similarity_norm = cal_similarity_norm(ratings, kind='user')
item_similarity_norm = cal_similarity_norm(ratings, kind='item')
print(np.round_(item_similarity[:10,:10], 3))

[[1.    0.308 0.222 0.141 0.208 0.285 0.235 0.108 0.096 0.314]
 [0.308 1.    0.187 0.115 0.209 0.205 0.211 0.116 0.149 0.293]
 [0.222 0.187 1.    0.167 0.22  0.137 0.211 0.058 0.103 0.194]
 [0.141 0.115 0.167 1.    0.211 0.094 0.156 0.056 0.04  0.105]
 [0.208 0.209 0.22  0.211 1.    0.126 0.222 0.07  0.098 0.208]
 [0.285 0.205 0.137 0.094 0.126 1.    0.151 0.053 0.129 0.348]
 [0.235 0.211 0.211 0.156 0.222 0.151 1.    0.047 0.061 0.201]
 [0.108 0.116 0.058 0.056 0.07  0.053 0.047 1.    0.041 0.078]
 [0.096 0.149 0.103 0.04  0.098 0.129 0.061 0.041 1.    0.161]
 [0.314 0.293 0.194 0.105 0.208 0.348 0.201 0.078 0.161 1.   ]]


In [57]:
# adding normalization to item based CF
def predict_norm_CF(user, item, k=30):
    nzero = ratings[user].nonzero()[0]
    baseline = item_mean + user_mean[user] - all_mean
    choice = nzero[item_similarity_norm[item, nzero].argsort()[::-1][:k]]
    prediction = (ratings[user, choice] - baseline[choice]).dot(item_similarity_norm[item, choice])\
                / sum(item_similarity_norm[item, choice]) + baseline[item]
    if prediction > 5: 
        prediction = 5
    if prediction < 1: 
        prediction = 1
    return prediction 

In [58]:
test_model(predict_norm_CF, rmse)

_____<function predict_norm_CF at 0x113424510>_____
rmse is 0.8543
