# Introduction

In this tutorial, we will implement some algorithms in Collaborative Filtering Algorithm with MovieLens dataset. All theories used in this tutorial could be found in *Collaborative Filtering.ipynb* file. <br>
**Note**: We need to implement with this dataset because we will compare results in variant models.

# Implementation

## Preprocessing Data

### Get data

In [1]:
import numpy as np
import tensorflow as tf
import sklearn
import csv
import pandas as pd

  from ._conv import register_converters as _register_converters


In [2]:
import os

dir_path = os.path.abspath(os.path.join('', os.pardir))

In [3]:
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(os.path.join(dir_path, 'data/ml-100k/u.data'), names=names, sep='\t')

In [4]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]

In [5]:
nan = np.nan
ratings_matrix = np.zeros((n_users, n_items)) * nan
for line in df.itertuples():
    ratings_matrix[line[1]-1, line[2]-1] = line[3]

In [6]:
print(ratings_matrix)

[[ 5.  3.  4. ... nan nan nan]
 [ 4. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [ 5. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan  5. nan ... nan nan nan]]


### Work with data

In [7]:
# indices for vector
def specified_rating_indices(u):
    return np.where(np.isfinite(u))

In [8]:
# mean rating for each user i using his specified rating
def mean(u):
    # may use specified_rating_indices but use more time
    specified_ratings = u[specified_rating_indices(u)]#u[np.isfinite(u)]
    m = sum(specified_ratings)/np.shape(specified_ratings)[0]
    return m

In [9]:
def all_user_mean_ratings(ratings_matrix):
    return np.array([mean(ratings_matrix[u, :]) for u in range(ratings_matrix.shape[0])])

In [10]:
def get_mean_centered_ratings_matrix(ratings_matrix):
    users_mean_rating = all_user_mean_ratings(ratings_matrix)
    mean_centered_ratings_matrix = ratings_matrix - np.reshape(users_mean_rating, [-1, 1])
    return mean_centered_ratings_matrix

In [11]:
mean_centered_ratings_matrix = get_mean_centered_ratings_matrix(ratings_matrix)

# Implementation

In [12]:
def pearson(u, v):
    mean_u = mean(u)
    mean_v = mean(v)
    
    specified_rating_indices_u = set(specified_rating_indices(u)[0])
    specified_rating_indices_v = set(specified_rating_indices(v)[0])
    
    mutually_specified_ratings_indices = specified_rating_indices_u.intersection(specified_rating_indices_v)
    mutually_specified_ratings_indices = list(mutually_specified_ratings_indices)
    
    u_mutually = u[mutually_specified_ratings_indices]
    v_mutually = v[mutually_specified_ratings_indices]
      
    centralized_mutually_u = u_mutually - mean_u
    centralized_mutually_v = v_mutually - mean_v
#     print(np.sqrt(np.sum(np.square(centralized_mutually_u))))

    result = np.sum(np.multiply(centralized_mutually_u, centralized_mutually_v)) 
    result = result / (np.sqrt(np.sum(np.square(centralized_mutually_u))) * np.sqrt(np.sum(np.square(centralized_mutually_v))))
    
    return result

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
from surprise import similarities

In [14]:
def mean_centered(u):
    return u - mean(u)

In [37]:
def get_user_similarity_value_for(u_index, ratings_matrix, func):
    user_ratings = ratings_matrix[u_index, :]
    similarity_value = np.array([func(ratings_matrix[i, :], user_ratings) for i in range(ratings_matrix.shape[0])])
    return similarity_value

In [38]:
from tqdm import tqdm
def get_user_similarity_matrix(ratings_matrix, func):
    similarity_matrix = []
    for u_index in tqdm(range(ratings_matrix.shape[0])):
        similarity_value = get_user_similarity_value_for(u_index, ratings_matrix, func)
        similarity_matrix.append(similarity_value)
    return np.array(similarity_matrix)
    

In [39]:
user_similarity_matrix = get_user_similarity_matrix(ratings_matrix, pearson)

100%|██████████| 943/943 [01:39<00:00,  9.50it/s]


In [18]:
users_mean_rating = all_user_mean_ratings(ratings_matrix)

In [55]:
def predict(u_index, i_index, k):
    
    similarity_value = user_similarity_matrix[u_index]
    sorted_users_similar = np.argsort(similarity_value)
    sorted_users_similar = np.flip(sorted_users_similar, axis=0)
        
    # only for this item
    users_rated_item = specified_rating_indices(ratings_matrix[:, i_index])[0]

    set_2 = frozenset(users_rated_item)
    ranked_similar_user_rated_item = [u for u in sorted_users_similar if u in set_2] 
    
    if k < len(ranked_similar_user_rated_item):
        top_k_similar_user = ranked_similar_user_rated_item[0:k]   
    else:
        top_k_similar_user = np.array(ranked_similar_user_rated_item)
            
    # replace with mean_centered for user
    
    ratings_in_item = mean_centered_ratings_matrix[:, i_index]
    top_k_ratings = ratings_in_item[top_k_similar_user]
    
    top_k_similarity_value = similarity_value[top_k_similar_user]

    r_hat = users_mean_rating[u_index] + np.sum(top_k_ratings * top_k_similarity_value)/np.sum(np.abs(top_k_similarity_value))
    return r_hat

In [56]:
def get_predicted_ratings_matrix():
    predicted_ratings = []
    for u_index in tqdm(range(n_users)):
        user_ratings = []
        for i_index in range(n_items):
#             rating = ratings_matrix[u_index][i_index]
#             if np.isnan(rating):
            rating = predict(u_index, i_index, 100)
            user_ratings.append(rating)
        predicted_ratings.append(user_ratings)
    return predicted_ratings            

In [57]:
predicted_ratings = get_predicted_ratings_matrix()
predicted_ratings = np.array(predicted_ratings)

100%|██████████| 943/943 [05:18<00:00,  2.96it/s]


In [58]:
from sklearn.metrics import mean_squared_error

def get_mse_error(ratings_matrix, predicted_ratings_matrix):    
#     fill_0_matrix = ratings_matrix.copy()
#     for i in range(n_users):
#         for j in range(n_items):
#             if np.isnan(fill_0_matrix[i, j]):
#                 fill_0_matrix[i, j] = 0

#     predict_fill_0_matrix = predicted_ratings_matrix.copy()
#     for i in range(n_users):
#         for j in range(n_items):
#             if np.isnan(predict_fill_0_matrix[i, j]):
#                 predict_fill_0_matrix[i, j] = 0
    
#     return mean_squared_error(fill_0_matrix, predict_fill_0_matrix)

    original_ratings = []
    predicted_ratings = []
    
    for i in range(n_users):
        for j in range(n_items):
            rating = ratings_matrix[i, j]
            if np.isfinite(rating):
                original_ratings.append(rating)
                predicted_ratings.append(predicted_ratings_matrix[i, j])
    
    return mean_squared_error(original_ratings, predicted_ratings)


In [59]:
print(get_mse_error(ratings_matrix, predicted_ratings))

0.7750288857939598


**OOPS!!!** We realize that total time that we need to use in predicting value is very long. **So we must change our code!!!**

We will look into every function that we use one more time!

In [60]:
# import _thread as thread
# import time

# # Define a function for the thread
# def fill_matrix_2(start, end):
#     for u_index in range(start, end):
#         for i_index in range(n_items):
#             if np.isnan(ratings_matrix[u_index][i_index]):
#                 rating = predict(u_index, i_index, 100)
#         print(u_index)       
                
# # fill_matrix()

# try:
#     thread.start_new_thread(fill_matrix_2, (0, 200 ))
#     thread.start_new_thread(fill_matrix_2, (200, 400 ))
#     thread.start_new_thread(fill_matrix_2, (400, 600 ))
#     thread.start_new_thread(fill_matrix_2, (600, 800 ))
#     thread.start_new_thread(fill_matrix_2, (800, n_users ))

# except:
#     print("Error: unable to start thread")
# while 1:
#     pass

## Other similarity functions

### Raw Cosine

In [61]:
def raw_cosine(u, v):
    specified_rating_indices_u = set(specified_rating_indices(u)[0])
    specified_rating_indices_v = set(specified_rating_indices(v)[0])
    
    mutually_specified_ratings_indices = specified_rating_indices_u.intersection(specified_rating_indices_v)
    mutually_specified_ratings_indices = list(mutually_specified_ratings_indices)
    
    u_mutually = u[mutually_specified_ratings_indices]
    v_mutually = v[mutually_specified_ratings_indices]
    
    result = np.sum(np.multiply(u_mutually, v_mutually)) / (np.sqrt(np.sum(np.square(u_mutually))) * np.sqrt(np.sum(np.square(v_mutually))))

    return result

In [62]:
user_similarity_matrix = get_user_similarity_matrix(ratings_matrix, raw_cosine)

  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 943/943 [00:58<00:00, 16.17it/s]


In [63]:
predicted_ratings = get_predicted_ratings_matrix()
predicted_ratings = np.array(predicted_ratings)

100%|██████████| 943/943 [04:51<00:00,  3.24it/s]


In [64]:
print(get_mse_error(ratings_matrix, predicted_ratings))

0.7710586634606635


### Raw Cosine 2

In [65]:
def raw_cosine_2(u, v):
    specified_rating_indices_u = set(specified_rating_indices(u)[0])
    specified_rating_indices_v = set(specified_rating_indices(v)[0])
    
    mutually_specified_ratings_indices = specified_rating_indices_u.intersection(specified_rating_indices_v)
    mutually_specified_ratings_indices = list(mutually_specified_ratings_indices)
    
    specified_ratings_u = u[list(specified_rating_indices_u)]
    specified_ratings_v = v[list(specified_rating_indices_v)]
    
    u_mutually = u[mutually_specified_ratings_indices]
    v_mutually = v[mutually_specified_ratings_indices]
    
    result = np.sum(np.multiply(u_mutually, v_mutually)) / (np.sqrt(np.sum(np.square(specified_ratings_u))) * np.sqrt(np.sum(np.square(specified_ratings_v))))

    return result

In [66]:
user_similarity_matrix = get_user_similarity_matrix(ratings_matrix, raw_cosine_2)

100%|██████████| 943/943 [01:19<00:00, 11.90it/s]


In [67]:
predicted_ratings = get_predicted_ratings_matrix()
predicted_ratings = np.array(predicted_ratings)

100%|██████████| 943/943 [05:15<00:00,  2.99it/s]


In [68]:
print(get_mse_error(ratings_matrix, predicted_ratings))

0.7750288857939598


### Discounted Similarity

In [74]:
def discounted_sim(u, v, beta, sim_func):
    specified_rating_indices_u = set(specified_rating_indices(u)[0])
    specified_rating_indices_v = set(specified_rating_indices(v)[0])
    
    mutually_specified_ratings_indices = specified_rating_indices_u.intersection(specified_rating_indices_v)
    mutually_specified_ratings_indices = list(mutually_specified_ratings_indices)
    
    result = sim_func(u, v) * min(len(mutually_specified_ratings_indices), beta) / beta
    
    return result

#### With Pearson

In [78]:
beta = 5

In [79]:
def discounted_sim_pearson(u, v):
    return discounted_sim(u, v, beta, pearson)

In [80]:
user_similarity_matrix = get_user_similarity_matrix(ratings_matrix, discounted_sim_pearson)

100%|██████████| 943/943 [02:16<00:00,  6.92it/s]


In [81]:
predicted_ratings = get_predicted_ratings_matrix()
predicted_ratings = np.array(predicted_ratings)

100%|██████████| 943/943 [05:30<00:00,  2.85it/s]


In [None]:
print(get_mse_error(ratings_matrix, predicted_ratings))

#### With Cosine

In [83]:
beta = 5

In [84]:
def discounted_sim_cosine(u, v):
    return discounted_sim(u, v, beta, cosine)

In [None]:
predicted_ratings = get_predicted_ratings_matrix()
predicted_ratings = np.array(predicted_ratings)

In [None]:
print(get_mse_error(ratings_matrix, predicted_ratings))

## Variants of the Prediction Function 

In [85]:
def standard_deviation(u):
    specified_rating_indices_u = set(specified_rating_indices(u)[0])
    specified_ratings_u = u[list(specified_rating_indices_u)]
    m = mean(u)
    
#     print(specified_ratings_u)
#     print(m)
    result = np.sqrt(np.sum(np.square(specified_ratings_u - m)) / (len(list(specified_rating_indices_u)) - 1))
    
    return result

In [95]:
def get_all_user_standard_deviation():
    results = []
    for u in ratings_matrix:
        results.append(standard_deviation(u))
    return results

all_user_standard_deviation = get_all_user_standard_deviation()

In [86]:
def get_standardized_ratings(u):
    specified_rating_indices_u = set(specified_rating_indices(u)[0])
    specified_ratings_u = u[list(specified_rating_indices_u)]
    m = mean(u)
    
    sigma = standard_deviation(u)
    
    result = (specified_ratings_u - m) / sigma

    r = []
    count = 0
    for i in range(len(u)):
        if np.isnan(u[i]):
            r.append(nan)
        else:
            r.append(result[count])
            count = count + 1
    return r

In [87]:
def get_standardized_ratings_matrix(ratings_matrix):
    result = []
    for u_index in range(ratings_matrix.shape[0]):
        u = get_standardized_ratings(ratings_matrix[u_index, :])
        result.append(u)
    return np.array(result)

In [88]:
standardized_ratings_matrix = get_standardized_ratings_matrix(ratings_matrix)
print(standardized_ratings_matrix)

[[ 1.09981217 -0.4829863   0.30841294 ...         nan         nan
          nan]
 [ 0.28173754         nan         nan ...         nan         nan
          nan]
 [        nan         nan         nan ...         nan         nan
          nan]
 ...
 [ 1.06233915         nan         nan ...         nan         nan
          nan]
 [        nan         nan         nan ...         nan         nan
          nan]
 [        nan  1.26245375         nan ...         nan         nan
          nan]]


In [94]:
print(users_mean_rating.shape)

(943,)


In [97]:
users_mean_rating = all_user_mean_ratings(ratings_matrix)

def predict_2(u_index, i_index, k):
    
    similarity_value = user_similarity_matrix[u_index]
    sorted_users_similar = np.argsort(similarity_value)
    sorted_users_similar = np.flip(sorted_users_similar, axis=0)
        
    # only for this item
    users_rated_item = specified_rating_indices(ratings_matrix[:, i_index])[0]
    
    set_2 = frozenset(users_rated_item)
    ranked_similar_user_rated_item = [u for u in sorted_users_similar if u in set_2] 
    
    if k < len(ranked_similar_user_rated_item):
        top_k_similar_user = ranked_similar_user_rated_item[0:k]   
    else:
        top_k_similar_user = np.array(ranked_similar_user_rated_item)
        
    # replace with mean_centered for user
    
    ratings_in_item = standardized_ratings_matrix[:, i_index]
    top_k_ratings = ratings_in_item[top_k_similar_user]

    top_k_similarity_value = similarity_value[top_k_similar_user]

    sigma = all_user_standard_deviation[u_index]#standard_deviation(users_mean_rating)
    
    r_hat = users_mean_rating[u_index] + sigma * np.sum(top_k_ratings * top_k_similarity_value)/np.sum(np.abs(top_k_similarity_value))
    return r_hat

In [98]:
def get_predicted_ratings_matrix_2():
    predicted_ratings = []
    for u_index in tqdm(range(n_users)):
        user_ratings = []
        for i_index in range(n_items):
#             rating = ratings_matrix[u_index][i_index]
#             if np.isnan(rating):
            rating = predict_2(u_index, i_index, 100)
            user_ratings.append(rating)
        predicted_ratings.append(user_ratings)
    return predicted_ratings 

In [90]:
user_similarity_matrix = get_user_similarity_matrix(ratings_matrix, raw_cosine)


  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 943/943 [01:03<00:00, 14.83it/s]


In [None]:
predicted_ratings = get_predicted_ratings_matrix_2()
predicted_ratings = np.array(predicted_ratings)

In [100]:
print(get_mse_error(ratings_matrix, predicted_ratings))

1.007600322130269
