# Introduction

In this tutorial, we will implement some algorithms in Collaborative Filtering Algorithm with MovieLens dataset. All theories used in this tutorial could be found in *Collaborative Filtering.ipynb* file. <br>
**Note**: We need to implement with this dataset because we will compare results in variant models.

# Implementation

## Preprocessing Data

### Get data

In [1]:
import numpy as np
import tensorflow as tf
import sklearn
import csv
import pandas as pd

  from ._conv import register_converters as _register_converters


In [2]:
import os

dir_path = os.path.abspath(os.path.join('', os.pardir))

In [3]:
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(os.path.join(dir_path, 'data/ml-100k/u.data'), names=names, sep='\t')

In [4]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]

In [5]:
nan = np.nan
ratings_matrix = np.zeros((n_users, n_items)) * nan
for line in df.itertuples():
    ratings_matrix[line[1]-1, line[2]-1] = line[3]

In [6]:
print(ratings_matrix)

[[ 5.  3.  4. ... nan nan nan]
 [ 4. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [ 5. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan  5. nan ... nan nan nan]]


### Work with data

In [7]:
# indices for vector
def specified_rating_indices(u):
    return np.where(np.isfinite(u))

In [8]:
# mean rating for each user i using his specified rating
def mean(u):
    # may use specified_rating_indices but use more time
    specified_ratings = u[specified_rating_indices(u)]#u[np.isfinite(u)]
    m = sum(specified_ratings)/np.shape(specified_ratings)[0]
    return m

In [9]:
def all_user_mean_ratings(ratings_matrix):
    return np.array([mean(ratings_matrix[u, :]) for u in range(ratings_matrix.shape[0])])

In [10]:
def get_mean_centered_ratings_matrix(ratings_matrix):
    users_mean_rating = all_user_mean_ratings(ratings_matrix)
    mean_centered_ratings_matrix = ratings_matrix - np.reshape(users_mean_rating, [-1, 1])
    return mean_centered_ratings_matrix

In [11]:
mean_centered_ratings_matrix = get_mean_centered_ratings_matrix(ratings_matrix)

# Implementation

In [12]:
def pearson(u, v):
    mean_u = mean(u)
    mean_v = mean(v)
    
    specified_rating_indices_u = set(specified_rating_indices(u)[0])
    specified_rating_indices_v = set(specified_rating_indices(v)[0])
    
    mutually_specified_ratings_indices = specified_rating_indices_u.intersection(specified_rating_indices_v)
    mutually_specified_ratings_indices = list(mutually_specified_ratings_indices)
    
    u_mutually = u[mutually_specified_ratings_indices]
    v_mutually = v[mutually_specified_ratings_indices]
      
    centralized_mutually_u = u_mutually - mean_u
    centralized_mutually_v = v_mutually - mean_v
#     print(np.sqrt(np.sum(np.square(centralized_mutually_u))))

    result = np.sum(np.multiply(centralized_mutually_u, centralized_mutually_v)) 
    result = result / (np.sqrt(np.sum(np.square(centralized_mutually_u))) * np.sqrt(np.sum(np.square(centralized_mutually_v))))
    
    return result

In [13]:
print(pearson(ratings_matrix[1, :], ratings_matrix[2, :]))

0.13955602954011723


In [14]:
from sklearn.metrics.pairwise import cosine_similarity
from surprise import similarities

In [15]:
print(pearson(ratings_matrix[1, :], ratings_matrix[2, :]))

0.13955602954011723


In [16]:
def mean_centered(u):
    return u - mean(u)

In [17]:
def get_user_similarity_value_for(u_index, ratings_matrix):
    user_ratings = ratings_matrix[u_index, :]
    similarity_value = np.array([pearson(ratings_matrix[i, :], user_ratings) for i in range(ratings_matrix.shape[0])])
    return similarity_value

In [18]:
from tqdm import tqdm
def get_user_similarity_matrix(ratings_matrix):
    similarity_matrix = []
    for u_index in tqdm(range(ratings_matrix.shape[0])):
        similarity_value = get_user_similarity_value_for(u_index, ratings_matrix)
        similarity_matrix.append(similarity_value)
    return np.array(similarity_matrix)
    

In [19]:
user_similarity_matrix = get_user_similarity_matrix(ratings_matrix)

100%|██████████| 943/943 [01:41<00:00,  9.32it/s]


In [20]:
users_mean_rating = all_user_mean_ratings(ratings_matrix)

def predict(u_index, i_index, k):
    
    similarity_value = user_similarity_matrix[u_index]
    sorted_users_similar = np.argsort(similarity_value)
    sorted_users_similar = np.flip(sorted_users_similar, axis=0)
        
    # only for this item
    users_rated_item = specified_rating_indices(ratings_matrix[:, i_index])[0]

    set_2 = frozenset(users_rated_item)
    ranked_similar_user_rated_item = [u for u in sorted_users_similar if u in set_2] 
    
    if k < len(ranked_similar_user_rated_item):
        top_k_similar_user = ranked_similar_user_rated_item[0:k]   
    else:
        top_k_similar_user = np.array(ranked_similar_user_rated_item)
            
    # replace with mean_centered for user
    
    ratings_in_item = mean_centered_ratings_matrix[:, i_index]
    top_k_ratings = ratings_in_item[top_k_similar_user]
    
    top_k_similarity_value = similarity_value[top_k_similar_user]

    r_hat = users_mean_rating[u_index] + np.sum(top_k_ratings * top_k_similarity_value)/np.sum(np.abs(top_k_similarity_value))
    return r_hat

In [21]:
def fill_matrix():
    for u_index in tqdm(range(n_users)):
        for i_index in range(n_items):
            if np.isnan(ratings_matrix[u_index][i_index]):
                rating = predict(u_index, i_index, 100)
                
fill_matrix()

  2%|▏         | 23/943 [02:14<1:29:40,  5.85s/it]

KeyboardInterrupt: 

**OOPS!!!** We realize that total time that we need to use in predicting value is very long. **So we must change our code!!!**

We will look into every function that we use one more time!