# Collaborative Filtering example 

In [262]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 
class CF(object):
    """Neighborhood-based collaborative filtering on a rating table.

    ``Y_data`` is a 2-D array whose rows are ``(user_id, item_id, rating)``.
    With ``uuCF`` true the model is user-user; otherwise the first two
    columns are swapped at construction time, so that every method below
    treats items as "users" and users as "items" (item-item CF).
    """
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        """
        Y_data    : array with rows (user_id, item_id, rating)
        k         : number of nearest neighbours used for a prediction
        dist_func : callable(A, B) returning the pairwise similarity matrix
                    between the rows of A and the rows of B
        uuCF      : user-user (1) or item-item (0) CF
        """
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        # in item-item mode the roles of the two id columns are exchanged
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1

    def add(self, new_data):
        """
        Update Y_data matrix when new ratings come.
        For simplicity, suppose that there is no new user or item.

        The rows are appended as they are; call fit() afterwards to refresh
        the normalized matrix and the similarities.
        NOTE(review): in item-item mode Y_data is stored with its first two
        columns swapped, so new_data presumably has to use that same column
        order -- confirm with callers.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)

    def normalize_Y(self):
        """
        Subtract from every rating the mean rating of its user and store the
        result as the sparse matrix self.Ybar of shape (n_items, n_users).
        The per-user means are kept in self.mu.
        """
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        # work on a float copy: centring inside an integer array would
        # silently truncate the normalized ratings
        self.Ybar_data = self.Y_data.astype(np.float64)
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            # row indices of the ratings done by user n
            ids = np.where(users == n)[0]
            if ids.size == 0:
                # no rating at all: keep a mean of 0 instead of NaN
                continue
            ratings = self.Y_data[ids, 2]
            self.mu[n] = np.mean(ratings)
            self.Ybar_data[ids, 2] = ratings - self.mu[n]

        # Store the normalized ratings as a sparse matrix (only the rated
        # entries and their locations), rows = items, columns = users.
        rows = self.Ybar_data[:, 1].astype(np.int32)
        cols = self.Ybar_data[:, 0].astype(np.int32)
        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2], (rows, cols)),
            shape = (self.n_items, self.n_users)).tocsr()

    def similarity(self):
        """Compute the user-by-user similarity matrix self.S from self.Ybar."""
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)

    def fit(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity()

    def __pred(self, u, i, normalized = 1):
        """
        Predict the rating of user u for item i from the k users most
        similar to u among those who rated i.

        Returns the normalized (mean-centred) prediction when `normalized`
        is true, otherwise the prediction shifted back by the mean of u.
        When there is no usable neighbour (nobody rated i, or every
        similarity is 0) the normalized prediction is 0.
        """
        # Step 1: find all users who rated i
        ids = np.where(self.Y_data[:, 1] == i)[0]
        users_rated_i = self.Y_data[ids, 0].astype(np.int32)
        # Step 2: similarity between the current user and those users
        sim = self.S[u, users_rated_i]
        # Step 3: keep the k most similar ones
        nearest = np.argsort(sim)[-self.k:]
        nearest_s = sim[nearest]
        total_weight = np.abs(nearest_s).sum()
        if total_weight == 0:
            # avoid 0/0, which would propagate NaN to the caller
            score = 0.0
        else:
            # normalized ratings those neighbours gave to item i (1 x k sparse)
            r = self.Ybar[i, users_rated_i[nearest]]
            score = r.dot(nearest_s)[0] / total_weight
        if normalized:
            return score
        return score + self.mu[u]

    def pred(self, u, i, normalized = 1):
        """
        Predict the rating of user u for item i.

        The normalized prediction is returned by default; pass
        normalized = 0 to get it shifted back by the stored mean.
        In item-item mode the two ids are exchanged before the lookup,
        matching the column swap done in __init__.
        """
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)

    def recommend(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that the normalized
        prediction for (u, i) is > 0. Only items which have not been
        rated by u yet are considered.
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        # a set makes the membership test below O(1)
        items_rated_by_u = set(self.Y_data[ids, 1].tolist())
        recommended_items = []
        for i in range(self.n_items):
            if i in items_rated_by_u:
                continue
            if self.__pred(u, i) > 0:
                recommended_items.append(i)

        return recommended_items

    def recommend2(self, u):
        """
        Same result as recommend(u); kept so that existing callers of
        recommend2 keep working.
        """
        return self.recommend(u)

    def print_recommendation(self):
        """
        print all items which should be recommended for each user
        """
        print('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print('    Recommend item(s): %s for user %s' % (recommended_items, u))
            else:
                print('    Recommend item %s for user(s) :  %s' % (u, recommended_items))



In [263]:
# Load the rating table: one "user_id item_id rating" triple per line
r_cols = ['user_id', 'item_id', 'rating']
ratings = pd.read_csv('ex.dat', sep = ' ', names = r_cols, encoding='latin-1')
# DataFrame.as_matrix() was removed from pandas; .values yields the same array
Y_data = ratings.values

# copy of the table with the user and item columns exchanged
# (fancy indexing already returns a new array)
Y_data2 = Y_data[:, [1, 0, 2]]

# item-item model (uuCF = 0) using the 2 nearest neighbours
rs = CF(Y_data, k = 2, uuCF = 0)
rs.fit()

rs.print_recommendation()

Recommendation: 
    Recommend item 0 for user(s) :  []
    Recommend item 1 for user(s) :  [1]
    Recommend item 2 for user(s) :  [0]
    Recommend item 3 for user(s) :  [5]
    Recommend item 4 for user(s) :  [3, 4, 5]


In [264]:
# single-argument print(...) runs unchanged on Python 2 and Python 3
print(rs.pred(6, 1))

-2.25


In [261]:
# show the similarity matrix with two decimals
np.set_printoptions(precision=2)
print(rs.S)

[[ 1.    0.77  0.49 -0.89 -0.52]
 [ 0.77  1.    0.   -0.64 -0.14]
 [ 0.49  0.    1.   -0.55 -0.88]
 [-0.89 -0.64 -0.55  1.    0.68]
 [-0.52 -0.14 -0.88  0.68  1.  ]]


In [265]:
# means subtracted during normalization (rs was built with uuCF = 0)
print(rs.mu)


[ 2.6   2.    1.75  3.17  2.75]


In [266]:
# normalized ratings, stored as a sparse matrix
print(rs.Ybar)

  (0, 0)	2.4
  (0, 1)	2.0
  (0, 3)	-1.16666666667
  (0, 4)	-0.75
  (1, 0)	2.4
  (1, 2)	2.25
  (1, 3)	-1.16666666667
  (1, 4)	-2.75
  (2, 0)	-0.6
  (2, 2)	-0.75
  (2, 3)	-0.166666666667
  (2, 4)	1.25
  (3, 0)	-2.6
  (3, 1)	-2.0
  (3, 3)	0.833333333333
  (4, 0)	-1.6
  (4, 3)	0.833333333333
  (5, 1)	0.0
  (5, 2)	-0.75
  (6, 2)	-0.75
  (6, 3)	0.833333333333
  (6, 4)	2.25


In [267]:
# single-argument print(...) runs unchanged on Python 2 and Python 3
print(rs.pred(0, 2))

2.4


In [268]:
# print "u i prediction" for each (u, i) pair; %s formatting keeps the
# space-separated output of the former print statement
for p in [[0, 2], [1, 1], [2, 1], [3, 2], [3, 4], [4, 1], [4, 2], [4, 4], [5, 0], [5, 3], [5, 4], [6, 0], [6, 1]]:
    print('%s %s %s' % (p[0], p[1], rs.pred(p[0], p[1])))
    

0 2 2.4
1 1 2.4
2 1 -0.6
3 2 -2.6
3 4 1.03001610441
4 1 -1.25051446472
4 2 -1.19517941575
4 4 1.16434850578
5 0 -0.291807704984
5 3 0.344707916846
5 4 0.648592324687
6 0 -1.52116760027
6 1 -2.25
