## Library

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

## Load DataSet

In [2]:
smd = pd.read_csv('../the-movies-dataset/movies_metadata_equal_ratings.csv')
ratings = pd.read_csv('../the-movies-dataset/ratings_equal_movies_metadata.csv')
train = pd.read_csv('../the-movies-dataset/ratings_train.csv')
test = pd.read_csv('../the-movies-dataset/ratings_test.csv')

In [3]:
smd

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,description,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"['Animation', 'Comedy', 'Family']",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"Led by Woody, Andy's toys live happily in his ...",1995
1,False,,65000000,"['Adventure', 'Fantasy', 'Family']",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,When siblings Judy and Peter discover an encha...,1995
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"['Romance', 'Comedy']",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,A family wedding reignites the ancient feud be...,1995
3,False,,16000000,"['Comedy', 'Drama', 'Romance']",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"Cheated on, mistreated and stepped on, the wom...",1995
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,['Comedy'],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,Just when George Banks has recovered from his ...,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9020,False,"{'id': 286023, 'name': 'Sharknado Collection',...",0,"['Comedy', 'Horror', 'Science Fiction']",http://www.syfy.com/sharknado4,390989,tt4831420,en,Sharknado 4: The 4th Awakens,The new installment of the Sharknado franchise...,...,85.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"What happens in Vegas, stays in Vegas. Unless ...",Sharknado 4: The 4th Awakens,False,4.3,88.0,The new installment of the Sharknado franchise...,2016
9021,False,,8000000,['Drama'],,159550,tt0255313,en,The Last Brickmaker in America,A man must cope with the loss of his wife and ...,...,85.0,[],Released,,The Last Brickmaker in America,False,7.0,1.0,A man must cope with the loss of his wife and ...,2001
9022,False,,1000000,"['Thriller', 'Romance']",,392572,tt5165344,hi,रुस्तम,"Rustom Pavri, an honourable officer of the Ind...",...,150.0,"[{'iso_639_1': 'hi', 'name': 'हिन्दी'}]",Released,Decorated Officer. Devoted Family Man. Defendi...,Rustom,False,7.3,25.0,"Rustom Pavri, an honourable officer of the Ind...",2016
9023,False,,15050000,"['Adventure', 'Drama', 'History', 'Romance']",,402672,tt3859980,hi,Mohenjo Daro,"Village lad Sarman is drawn to big, bad Mohenj...",...,155.0,"[{'iso_639_1': 'hi', 'name': 'हिन्दी'}]",Released,,Mohenjo Daro,False,6.7,26.0,"Village lad Sarman is drawn to big, bad Mohenj...",2016


In [4]:
train = train[['userId','movieIndex','rating']]

In [5]:
train

Unnamed: 0,userId,movieIndex,rating
0,0,2375,1.0
1,0,1958,2.5
2,0,1107,2.5
3,0,1083,2.0
4,0,1037,2.0
...,...,...,...
79843,670,3838,4.5
79844,670,2335,4.0
79845,670,3223,4.0
79846,670,1569,3.5


In [6]:
test = test[['userId','movieIndex','rating']]

In [7]:
train_matrix = train.values
train_matrix.shape

(79848, 3)

In [8]:
test_matrix = test.values
test_matrix.shape

(19962, 3)

In [9]:
#r_cols = ['user_id','item_id','rating']
#ratings_example = pd.read_csv('./ex.dat',sep =' ', names= r_cols, encoding = 'latin-1')

In [10]:
#ratings_example

In [11]:
#ratings_example_matrix = ratings_example.values
#ratings_example_matrix.shape

In [12]:
class CF(object):
    # Y_data is user-item matrix with have 3 column - (user_id, item_id, ratings) 
    # uuCF mode (user-user or item-item)
    # k is number of user similar to the target-user
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        self.uuCF = uuCF 
        self.Y_data = Y_data if uuCF else Y_data[:,[1,0,2]]
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        # number of users and items. 
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1
        self.items_id = np.unique(self.Y_data[:,1])
        self.items_id = [int(i) for i in self.items_id]
    
    def add(self, new_data):
        """
        Update Y_data matrix when new ratings come.
        For simplicity, suppose that there is no new user or item.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)
  
    def normalize_Y(self):
        # get all list of user
        users = self.Y_data[:, 0]
        self.Ybar_data = self.Y_data.copy() # save normalized user-item matrix
        # 1 dimention contain list of unique users which
        # after this with contain rating mean value corresponding user_id
        self.means = np.zeros((self.n_users,))

        for n in range(self.n_users):
            # row indices of rating done by user n 
            ids = np.where(users == n)[0].astype(np.int32)
            
            #indices of all item associated with user n
            item_ids = self.Y_data[ids, 1]
            
            # indices of all ratings associated with user n
            ratings = self.Y_data[ids, 2]
            
            #take mean
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
                
            self.means[n] = m  # save mean value
            self.Ybar_data[ids, 2] = ratings - self.means[n]
        ################################################
        # form the rating matrix as a sparse matrix. Sparsity is important 
        # for both memory and computing efficiency. For example, if #user = 1M, 
        # #item = 100k, then shape of the rating matrix would be (100k, 1M), 
        # you may not have enough memory to store this. Then, instead, we store 
        # nonzeros only, and, of course, their location
        
        # and sparse matrix which is usually contain non-zero value and index of it
        # systax : sparse_matrix = coo_matrix((data,(row,col)), shape=(length(row),length(column)))
        self.Ybar = sparse.coo_matrix((self.Ybar_data[:,2],(self.Ybar_data[:, 1], self.Ybar_data[:,0])), (self.n_items, self.n_users))
       
        # sort row and column
        self.Ybar = self.Ybar.tocsr()
        
    def similarity(self):
        # The "close-enough" factor. lay phan sau gia tri thap phan
        # self.Ybar.T : vì ma trận thực ra là một mảng lưu nhiều 
        # mảng khác. nên mỗi hàng đại diện một mảng 
        # self.Ybar.T đảo cột thành hàng. 1 mảng con đại diện cho 
        # 1 user 
        eps = 1e-6
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
    
    def refesh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity()
    
    def fit(self):
        self.refesh()
    
    # u - current user
    # i - current item need to predict rating
    def pred(self, u, i, normalized = 1):
        """ 
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        # Step 1: find all index that contain item index equal i
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
      
        # Step 2: find all user that corresponing to list index
        # which mean user that rated item i
        users_rated_i = (self.Y_data[ids, 0].astype(np.int32))
        
        # Step 3: find similarity btw the current user and  others
        # who already rated i
        sim = self.S[u, users_rated_i]
        
        #Step 4: find the k most similarity users
        # cái hay của argsort là nó sort giá trị xong thì lưu lại 
        # danh sách index đã được sort chứ không thay đổi array
        a = np.argsort(sim)[-self.k:]
        # and corresponding similarity 
        nearest_s = sim[a]
        # Value of ratings to target item bae on 'near' users r
        r = self.Ybar[i, users_rated_i[a]]
        
        if normalized:
        # adding a small number, for instance, 1e-8, to avoid 
        #dividing by 0
        # np.abs lấy giá trị tuyệt đối của mảng 
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)
        
        # lấy giá trị rating trước khi normalized
        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.means[u]
   

    def recommend(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which 
        have not been rated by u yet. 
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()
        recommended_items = {}
        
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                rating = self.pred(u,i,0)
                recommended_items[i] = rating
        
        
        return (recommended_items)
    

In [13]:
#rs = CF(ratings_example_matrix, k = 2, uuCF = 1)
#rs.fit()
#rs.recommend(0)

In [14]:
#ratings_example_1 = ratings_example[['item_id','user_id','rating']]
#ratings_example_matrix_1 = ratings_example_1.values
#ratings_example_matrix_1.shape

In [15]:
#rs = CF(ratings_example_matrix_1, k = 2, uuCF = 0)
#rs.fit()
#rs.recommend(0)

In [16]:
rs = CF(train_matrix, k = 30, uuCF = 1)
rs.fit()

In [17]:
number_row_test = test_matrix.shape[0]
SE = 0 # squared error

In [18]:
for i in range(number_row_test):
    pred = rs.pred(int(test_matrix[i,0]),int(test_matrix[i,1]),0)
    SE += (pred - int(test_matrix[i, 2]))**2 # [a**2] mean power 2 - a^2

RMSE = np.sqrt(SE/number_row_test)    
print('User-user CF, RMSE =', RMSE)

User-user CF, RMSE = 0.95571937310441


In [19]:
rs = CF(train_matrix, k = 30, uuCF = 0)
rs.fit()
#result = rs.recommend(0)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [20]:
#len(result)

In [21]:
#result[83]

In [22]:
#test[test['movieIndex'] == 0]

In [23]:
number_row_test = test_matrix.shape[0]
SE = 0

In [25]:
for i in range(number_row_test):
    pred = rs.pred(int(test_matrix[i,1]),int(test_matrix[i,0]),normalized = 0)
    SE += (pred - int(test_matrix[i,2]))**2

RMSE = np.sqrt(SE/number_row_test) 
print('Item-Item CF, RMSE =', RMSE)

Item-Item CF, RMSE = 1.118599008287405
