In [117]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
# r_cols = ['user_id','item_id','rating']
# df = pd.read_csv('ex.dat.txt',sep =' ', names =r_cols, encoding ='latin-1' )
# print(df)

class CF(object):
    # collaborative fitering
    def __init__(self,Y_data, k,dist_func = cosine_similarity, uuCF =1 ):
        self.uuCF = uuCF # user-user CF(1), item-item CF (0)
        self.k = k
        self.dist_func = dist_func
        self.Y_data = Y_data if uuCF ==1 else Y_data[:,[1,0,2]] # utility matrix
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = np.int(np.max(self.Y_data[:,0])) +1
        self.n_items = np.int(np.max(self.Y_data[:,1])) +1
        self.Ybar_data = None # matrix normalized
    # khi có dữ liệu mới thì thêm vào ( function add)
    def add(self, new_data):
        """
        Update Y_data matrix when new ratings come.
        For simplicity, suppose that there is no new user or item.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis =0)
        
        
    # TÍNH NORMALIZED MATRIX , SIMILARITY MATRIX
    
    
    def normalize_Y(self):
        users = self.Y_data[:,0] # all users : columns 0 of Y_data
        self.Ybar_data = self.Y_data.copy()
        self.mean_user   = np.zeros((self.n_users,)) #  vector : mean ratings of each user
        for n in range(self.n_users):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            user_ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = self.Y_data[user_ids,1]
            # rating of each user_ids
            ratings  = self.Y_data[user_ids, 2]
            #take mean
            mean = np.mean(ratings)
            if np.isnan(mean):
                mean = 0 # to avoid array empty or nan value
            self.mean_user[n] = mean
            #normalized data
            self.Ybar_data[user_ids, 2] = ratings - self.mean_user[n]
        ################################################
        # form the rating matrix as a sparse matrix. Sparsity is important 
        # for both memory and computing efficiency. For example, if #user = 1M, 
        # #item = 100k, then shape of the rating matrix would be (100k, 1M), 
        # you may not have enough memory to store this. Then, instead, we store 
        # nonzeros only, and, of course, their locations.
        
        self.Ybar = sparse.coo_matrix((self.Ybar_data[:,2],(self.Ybar_data[:,1], self.Ybar_data[:,0])),(self.n_items, self.n_users))
        self.Ybar = self.Ybar.tocsr()


    #function similarity of 2 user-user, item-item ( cosin)
    def similarity( self):
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
    # Thực hiện lai 2 hàm trên khi có thêm dữ liệu
    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity()
    def fit(self):
        self.refresh()
    
    # DỰ ĐOÁN KẾT QUẢ
    
    
        # phương thức private cho lớp CF
    def __pred(self, u, i, normalize =1):
        """ 
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        # Step 1: find all users who rated i
        ids = np.where(self.Y_data[:,1] == i)[0].astype(np.int32)
        # Step 2:
        user_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        # Step 3: find similarity btw the current user and others 
        # who already rated i
        sim = self.S[u, user_rated_i]
        # Step 4: find the k most similarity users (sorted)
        a = np.argsort(sim)[-self.k:] # argsort: lấy k chỉ số cuối
        # and the corresponding similarity levels
        nearest_s = sim[a] # mức độ tương quan của k user gần nó nhất
        # How did each of 'near' users rated item i
        r = self.Ybar[i, user_rated_i[a]]
        
        if normalize:
            # add a small number, for instance, 1e-8, to avoid dividing by 0
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)
        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.mean_user[u]
    def pred(self, u, i, normalize =1):
        if self.uuCF ==1:
            return self.__pred(u,i,normalize)
        else:
            return self.__pred(i,u,normalize) # cho item-item CF
    
    # TÌM TẤT CẢ CÁC ITEM
    
    
    def recommend(self, u, normalize =1):
        """
        Determine all items should be recommended for user u. (uuCF =1)
        or all users who might have interest on item u (uuCF = 0)
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which 
        have not been rated by u yet. 
        """
        ids = np.where(self.Y_data[:,0] == u)[0]
        item_rated_by_u = self.Y_data[ids, 1].tolist()
        recommend_items =[]
        for i in range(self.n_items):
            if i not in item_rated_by_u:
                rating = self.__pred(u, i)
                if rating >0 :
                    recommend_items.append(i)
        return recommend_items
    def  print_recommendation(self):
        """
        print all items which should be recommended for each user 
        """
        print('Recommendation: ')
        for u in range(self.n_users):
            recommend_items = self.recommend(u)
            if self.uuCF ==1:
                print('Recommend item ', recommend_items, 'to user', u)
            else :
                print('Recommend item ', u,'to user', recommend_items)
    

In [118]:
# Áp dụng vào ví dụ nhỏ trước ( User-User CF)
r_cols = ['user_ids','item_ids','rating']
ratings = pd.read_csv('ex.dat.txt', sep =' ',encoding ='latin-1', names = r_cols)
print(ratings)
Y_data = ratings.as_matrix() # utility matrix
print(Y_data)
rs = CF(Y_data, 2, uuCF =1)
rs.fit()
rs.print_recommendation()

    user_ids  item_ids  rating
0          0         0     5.0
1          0         1     4.0
2          0         3     2.0
3          0         4     2.0
4          1         0     5.0
5          1         2     4.0
6          1         3     2.0
7          1         4     0.0
8          2         0     2.0
9          2         2     1.0
10         2         3     3.0
11         2         4     4.0
12         3         0     0.0
13         3         1     0.0
14         3         3     4.0
15         4         0     1.0
16         4         3     4.0
17         5         1     2.0
18         5         2     1.0
19         6         2     1.0
20         6         3     4.0
21         6         4     5.0
[[0. 0. 5.]
 [0. 1. 4.]
 [0. 3. 2.]
 [0. 4. 2.]
 [1. 0. 5.]
 [1. 2. 4.]
 [1. 3. 2.]
 [1. 4. 0.]
 [2. 0. 2.]
 [2. 2. 1.]
 [2. 3. 3.]
 [2. 4. 4.]
 [3. 0. 0.]
 [3. 1. 0.]
 [3. 3. 4.]
 [4. 0. 1.]
 [4. 3. 4.]
 [5. 1. 2.]
 [5. 2. 1.]
 [6. 2. 1.]
 [6. 3. 4.]
 [6. 4. 5.]]
Recommendation: 
Recom

  """


In [119]:
# Áp dụng vào ví dụ nhỏ trước ( Item-item CF)
r_cols = ['user_ids','item_ids','rating']
ratings = pd.read_csv('ex.dat.txt', sep =' ',encoding ='latin-1', names = r_cols)
print(ratings)
Y_data = ratings.as_matrix() # utility matrix
print(Y_data)
rs = CF(Y_data, 2, uuCF =0)
rs.fit()
rs.print_recommendation()

    user_ids  item_ids  rating
0          0         0     5.0
1          0         1     4.0
2          0         3     2.0
3          0         4     2.0
4          1         0     5.0
5          1         2     4.0
6          1         3     2.0
7          1         4     0.0
8          2         0     2.0
9          2         2     1.0
10         2         3     3.0
11         2         4     4.0
12         3         0     0.0
13         3         1     0.0
14         3         3     4.0
15         4         0     1.0
16         4         3     4.0
17         5         1     2.0
18         5         2     1.0
19         6         2     1.0
20         6         3     4.0
21         6         4     5.0
[[0. 0. 5.]
 [0. 1. 4.]
 [0. 3. 2.]
 [0. 4. 2.]
 [1. 0. 5.]
 [1. 2. 4.]
 [1. 3. 2.]
 [1. 4. 0.]
 [2. 0. 2.]
 [2. 2. 1.]
 [2. 3. 3.]
 [2. 4. 4.]
 [3. 0. 0.]
 [3. 1. 0.]
 [3. 3. 4.]
 [4. 0. 1.]
 [4. 3. 4.]
 [5. 1. 2.]
 [5. 2. 1.]
 [6. 2. 1.]
 [6. 3. 4.]
 [6. 4. 5.]]
Recommendation: 
Recom

  """


# Áp dụng cho movie len 100k

In [126]:
# Kết quả dự đoán : User-user CF
r_cols =['user_ids','movie_ids','rating','time']
rating_base = pd.read_csv('ml-100k/ml-100k/ua.base', sep ='\t', names =r_cols)
rating_test = pd.read_csv('ml-100k/ml-100k/ua.test', sep ='\t', names =r_cols)

rate_train = rating_base.as_matrix()
rate_test  = rating_test.as_matrix()
# Vì các dữ liệu user, movie trong data đánh chỉ số từ 1 mà code thì from =0 
rate_train[:,:2] -=1
rate_test[:,:2]  -=1
print(rate_train)

rs = CF(rate_train, k=30, uuCF =1)
rs.fit()

n_test = rate_test.shape[0]
SE =0
for n in range(n_test):
    pred = rs.pred(rate_test[n, 0], rate_test[n,1], normalize =0)
    SE += (pred - rate_test[n,2])**2
RMSE = np.sqrt(SE/n_test)
print( 'User- user CF:, RMSE = ', RMSE)

  
  import sys


[[        0         0         5 874965758]
 [        0         1         3 876893171]
 [        0         2         4 878542960]
 ...
 [      942      1187         3 888640250]
 [      942      1227         3 888640275]
 [      942      1329         3 888692465]]
User- user CF:, RMSE =  0.9767347622900232


In [127]:
# Kết quả dự đoán cho Item-item CF
r_cols =['user_ids','movie_ids','rating','time']
rating_base = pd.read_csv('ml-100k/ml-100k/ua.base', sep ='\t', names =r_cols)
rating_test = pd.read_csv('ml-100k/ml-100k/ua.test', sep ='\t', names =r_cols)

rate_train = rating_base.as_matrix()
rate_test  = rating_test.as_matrix()
# Vì các dữ liệu user, movie trong data đánh chỉ số từ 1 mà code thì from =0 
rate_train[:,:2] -=1
rate_test[:,:2]  -=1
print(rate_train)

rs =CF(rate_train, k =40, uuCF =0)
rs.fit()

n_test = rate_test.shape[0]
SE =0
for n in range(n_test):
    pred = rs.pred(rate_test[n, 0], rate_test[n,1], normalize =0)
    SE += (pred - rate_test[n,2])**2
RMSE = np.sqrt(SE/n_test)
print( 'Item- item CF:, RMSE = ', RMSE)

  
  import sys


[[        0         0         5 874965758]
 [        0         1         3 876893171]
 [        0         2         4 878542960]
 ...
 [      942      1187         3 888640250]
 [      942      1227         3 888640275]
 [      942      1329         3 888692465]]
Item- item CF:, RMSE =  0.9688460838682366


# Nhận thấy Item-item CF tốt hơn  User-user CF và tốt hơn Content-Based RS