# Collaborative Filtering example 


In [315]:
import pandas as pd 
import numpy as np

# data file 
r_cols = ['user_id', 'item_id', 'rating']

ratings = pd.read_csv('ex.dat', sep = ' ', names = r_cols, encoding='latin-1')

Y_data = ratings.as_matrix()#.astype(np.float32)
n_users = int(np.max(Y_data[:, 0]))
n_items = int(np.max(Y_data[:, 1]))

# since indices in python start from 0, we need to -1
Y_data[:, 0] -= 1
Y_data[:, 1] -= 1

print 'Number of users: ', n_users
print 'Number of items: ', n_items

Number of users:  7
Number of items:  5


In [313]:
import numpy as np
# Find mean rating of each user 
# and normalize the rating matrix

def normalize(rating_data):
    y = rating_data[:, 0] # all users
    normalized_rating_data = rating_data.copy()
    mu = np.zeros((n_users,))
    for n in xrange(n_users):
        # row indices of rating done by user n
        ids = np.where(y == n)[0].astype(np.int32)
        # indices of all items rated by user n
        item_ids = rating_data[ids, 1] 
        # and the corresponding ratings 
        ratings = rating_data[ids, 2]
        # take mean
        mu[n] = np.mean(ratings) 
        # normalize
        normalized_rating_data[ids, 2] = ratings - mu[n]
        
    return normalized_rating_data, mu


Ybar_data, mu = normalize(rating_data)
# print Ybar_data, mu

In [302]:
# form the rating matrix as a sparse matrix 
# the sparsity is important for both memory and computing 
# efficient. For example, if #user = 1M, #item = 100k
# then shape of the rating matrix would be (100k, 1M), you may not
# have enough memory to store this. 
# Then, instead, we store nonzeros only, 
# and, of course, their locations
from scipy import sparse 
Ybar = sparse.coo_matrix((Ybar_data[:, 2],
            (Ybar_data[:, 1], Ybar_data[:, 0])), (n_items, n_users))

In [314]:
# cosine similarity matrix 
# If number of users is small, we can calculate all at once. 
# If not, we calculate cosine similarity of one user with others
# one by one. Note: the cosine_similarity function takes inputdata
# in form of row by row (each row is a datapoint). Then we need to 
# transpose the normalized raring matrix
from sklearn.metrics.pairwise import cosine_similarity
S = cosine_similarity(Ybar.T, Ybar.T)

# print S. For simplicity, we need only 2 digits after 
# the decimal point, 
np.set_printoptions(precision=2)
print S


[[ 1.    0.85 -0.55 -0.67 -0.87  0.   -0.31]
 [ 0.85  1.   -0.87 -0.4  -0.55 -0.23 -0.72]
 [-0.55 -0.87  1.    0.27  0.32  0.47  0.97]
 [-0.67 -0.4   0.27  1.    0.87 -0.29  0.13]
 [-0.87 -0.55  0.32  0.87  1.    0.    0.11]
 [ 0.   -0.23  0.47 -0.29  0.    1.    0.55]
 [-0.31 -0.72  0.97  0.13  0.11  0.55  1.  ]]


In [304]:
# d) 
Y_data = rating_data
# Ybar_data = normalized_rating_data
# example first u = 0, i = 2 
u = 0 
i = 2 
# Step 1: find all users who rated i_2 
ids = np.where(Y_data[:, 1] == i)[0].astype(np.int32)
# print Y_data[users_rated_i, :]
users_rated_i = (Y_data[ids, 0]).astype(np.int32)

print users_rated_i
# print Y_data

[1 2 5 6]


In [305]:
sim = S[u, users_rated_i]
print sim
# print Y_data[users_rated_i, 0]

[ 0.85 -0.55  0.   -0.31]


In [306]:
k = 2
a = np.argsort(sim)[-k:] 
nearest_s = sim[a]
print nearest_s

[ 0.    0.85]


In [307]:
# find rated 
Ybar = Ybar.tocsr()
# print Ybar[i, users_rated_i[0]], a, users_rated_i[a]
r = Ybar[i, users_rated_i[a]]
# print r[0, 0]
print r[0,0]*nearest_s[0] + r[0, 1]*nearest_s[1]
print np.abs(nearest_s).sum()
print r*nearest_s/np.abs(nearest_s).sum()

1.06298800691
0.850390405524
[ 1.25]


In [308]:
def CF_pred(s, r):
    return r*s/np.abs(s).sum()

In [310]:
def pred(i, u):
    # Step 1: find all users who rated i_2 
    ids = np.where(Y_data[:, 1] == i)[0].astype(np.int32)
    # Step 2 
    users_rated_i = (Y_data[ids, 0]).astype(np.int32)
    sim = S[u, users_rated_i]
    a = np.argsort(sim)[-k:] 
    nearest_s = sim[a]
    
    r = Ybar[i, users_rated_i[a]]
#     print r, nearest_s
    return (r*nearest_s)[0]/np.abs(nearest_s).sum()

Ybar = Ybar.tocsr()
print pred(4, 5) + mu[4]
    

3.91084877375


In [311]:
print 'user item   rating'
for u in xrange(n_users):
    ids = np.where(Y_data[:, 0] == u)[0]
    items_rated_by_u = Y_data[ids, 1].tolist()
    for i in xrange(n_items):
        if i not in items_rated_by_u:
            
            print u,'   ', i, '  ', pred(i, u) + mu[u]

user item   rating
0     2    4.25
1     1    2.53700934751
2     1    2.6459407245
3     2    -0.219181499415
3     4    2.78081850058
4     1    1.16666666667
4     2    0.957246050693
4     4    3.95724605069
5     0    1.5
5     3    2.41084877375
5     4    3.41084877375
6     0    3.07052889554
6     1    4.22890349047


# Class for hanlding data easier 

In [None]:
from scipy import sparse 
import pandas as pd 
import numpy as np

class CF(object):
    def _int_(self, Y_data, k, dist_fuc = cosine_similarity):
        self.Y_data = Y_data # a numpy array of shape (N, 3)
                    # each row is a (user, item, rating) data
        self.k = k  # number of nearest points
        self.distance = distance # similarity
        self.Ybar_data = None
        self.n_users = int(np.max(self.Y_data[:, 0]))
        self.n_items = int(np.max(self.Y_data[:, 1]))
    
    def add(self, new_data):
        """
        Update Y_data matrix when new ratings come.
        For simplicity, suppose that there is no new user or item.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)
    
    def normalize_Y(self):
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))
        for n in xrange(self.n_users):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = rating_data[ids, 1] 
            # and the corresponding ratings 
            ratings = rating_data[ids, 2]
            # take mean
            mu[n] = np.mean(ratings) 
            # normalize
            self.Ybar_data[ids, 2] = ratings - mu[n]
        ################################################
        # form the rating matrix as a sparse matrix 
        # the sparsity is important for both memory and computing 
        # efficient. For example, if #user = 1M, #item = 100k
        # then shape of the rating matrix would be (100k, 1M), you may not
        # have enough memory to store this. 
        # Then, instead, we store nonzeros only, 
        # and, of course, their locations
        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
            (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (n_items, n_users))
        self.Ybar = self.Ybar.tocsr()

    def similarity(self):
        self.S = dist_func(self.Ybar.T, self.Ybar.T)
        
    def fit(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalized_Y()
        self.similairity() 
        
    def pred(self, u, i):
        # Step 1: find all users who rated i
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        # Step 2: 
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        # Step 3: find similarity btw the current user and others 
        # who already rated i
        sim = S[u, users_rated_i]
        # Step 4: find the k most similarity users
        a = np.argsort(sim)[-k:] 
        # and the corresponding similarity levels
        nearest_s = sim[a]
        # How did each of 'near' users rate item i
        r = Ybar[i, users_rated_i[a]]
        return (r*nearest_s)[0]/np.abs(nearest_s).sum()
    
    def recommend(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which 
        have not been rated by u yet. 
        """
        # Find all rows corresponding to user u
        ids = np.where(Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist() 
        recommended_items = []
        for i in xrange(self.n_items):
            if i not in items_rated_by_u:
                rating = pred(u, i)
                if rating > 0: 
                    recommended_items.append(i)
        
        return recommended_items 

In [316]:


# data file 
r_cols = ['user_id', 'item_id', 'rating']

ratings = pd.read_csv('ex.dat', sep = ' ', names = r_cols, encoding='latin-1')

Y_data = ratings.as_matrix()#.astype(np.float32)
n_users = int(np.max(Y_data[:, 0]))
n_items = int(np.max(Y_data[:, 1]))


rs = CF(Y_data, k = 2)

[[1 2]
 [3 4]
 [5 6]]


In [None]:
print 'user item   rating'
for u in xrange(n_users):
    ids = np.where(Y_data[:, 0] == u)[0]
    items_rated_by_u = Y_data[ids, 1].tolist()
    for i in xrange(n_items):
        if i not in items_rated_by_u:
            
            print u,'   ', i, '  ', pred(i, u) + mu[u]