In [15]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse


class CF(object):
    """Neighbourhood-based collaborative filtering on a ratings table.

    ``Y_data`` is a 2-D array with one row per rating and three columns:
    column 0 is the user id, column 1 the item id and column 2 the rating.
    Ids are used directly as matrix indices, so they must be non-negative
    integers (stored as ints or as integral floats).

    When ``uuCF`` is falsy the first two columns are swapped at construction
    time, so every "user" inside the class is really an item and vice versa;
    ``pred`` swaps its arguments back so callers always pass ``(user, item)``.
    """

    def __init__(self, Y_data, k, dist_func=cosine_similarity, uuCF=1):
        """
        Parameters
        ----------
        Y_data : array-like, shape (n_ratings, 3)
            Rows of (user id, item id, rating).
        k : int
            Number of most-similar neighbours used for a prediction.
        dist_func : callable
            Called as ``dist_func(A, A)`` with the sparse, mean-centred
            rating matrix (one row per user); the result is indexed as a
            dense 2-D array of pairwise similarities.
        uuCF : truthy/falsy
            User-user (1) or item-item (0) CF.
        """
        self.uuCF = uuCF  # user-user (1) or item-item (0) CF
        Y_data = np.asarray(Y_data)
        # Item-item mode reuses the same code path on swapped id columns.
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1

    def add(self, new_data):
        """
        Append new rating rows to ``Y_data``.

        For simplicity, suppose that there is no new user or item:
        ``n_users`` / ``n_items`` are not recomputed here. Call ``refresh``
        afterwards to rebuild the normalised matrix and the similarities.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis=0)

    def normalize_Y(self):
        """Mean-centre every user's ratings and build the sparse matrix.

        Sets ``self.mu`` (per-user mean rating, 0 for ids without ratings),
        ``self.Ybar_data`` (float copy of ``Y_data`` with centred ratings)
        and ``self.Ybar`` (CSR matrix of shape ``(n_items, n_users)``).
        """
        # Ids are used as array / matrix indices, so make them integers.
        user_idx = self.Y_data[:, 0].astype(np.int64)
        item_idx = self.Y_data[:, 1].astype(np.int64)
        ratings = self.Y_data[:, 2].astype(np.float64)

        # Per-user rating count and rating sum in one pass each, instead of
        # rescanning the whole table once per user.
        counts = np.bincount(user_idx, minlength=self.n_users)
        sums = np.bincount(user_idx, weights=ratings, minlength=self.n_users)
        # Ids with no ratings keep a mean of 0; ``where`` skips the division
        # for them, so no "mean of empty slice" / 0-divide warning is raised.
        self.mu = np.divide(sums, counts, out=np.zeros_like(sums),
                            where=counts > 0)

        # Always a float copy: with an integer ``Y_data`` the centred values
        # would otherwise be truncated when written back.
        self.Ybar_data = self.Y_data.astype(np.float64)
        self.Ybar_data[:, 2] = ratings - self.mu[user_idx]

        ################################################
        # form the rating matrix as a sparse matrix. Sparsity is important
        # for both memory and computing efficiency. For example, if #user = 1M,
        # #item = 100k, then shape of the rating matrix would be (100k, 1M),
        # you may not have enough memory to store this. Then, instead, we store
        # nonzeros only, and, of course, their locations.
        self.Ybar = sparse.coo_matrix(
            (self.Ybar_data[:, 2], (item_idx, user_idx)),
            shape=(self.n_items, self.n_users)).tocsr()

    def similarity(self):
        """Compute the pairwise similarity matrix ``self.S`` between users."""
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)

    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity()

    def fit(self):
        """Build the normalised matrix and the similarities (see ``refresh``)."""
        self.refresh()

    def __pred(self, u, i, normalized=1):
        """
        Predict the rating of (internal) user ``u`` for (internal) item ``i``.

        The prediction is the similarity-weighted average of the mean-centred
        ratings given to ``i`` by the ``k`` users most similar to ``u`` among
        those who rated ``i``. With a truthy ``normalized`` that centred value
        is returned; otherwise the mean rating of ``u`` is added back.
        If nobody rated ``i`` the centred prediction is 0.
        """
        u, i = int(u), int(i)
        # Step 1: rows of Y_data that are ratings of item i
        rows = np.where(self.Y_data[:, 1] == i)[0]
        # Step 2: the users who gave those ratings
        users_rated_i = self.Y_data[rows, 0].astype(np.int32)
        if users_rated_i.size == 0:
            # No neighbour information at all: fall back to "no deviation".
            score = 0.0
        else:
            # Step 3: similarity between u and everyone who rated i
            sim = self.S[u, users_rated_i]
            # Step 4: keep the k most similar of them (argsort is ascending)
            nearest = np.argsort(sim)[-self.k:]
            nearest_s = sim[nearest]
            # centred ratings those neighbours gave to item i (1 x k sparse)
            r = self.Ybar[i, users_rated_i[nearest]]
            # explicit matrix-vector product; 1e-8 avoids dividing by 0
            score = r.dot(nearest_s)[0] / (np.abs(nearest_s).sum() + 1e-8)

        if normalized:
            return score
        return score + self.mu[u]

    def pred(self, u, i, normalized=1):
        """
        Predict the rating of user ``u`` for item ``i``.

        Returns the mean-centred prediction when ``normalized`` is truthy and
        the prediction on the original rating scale otherwise. In item-item
        mode the arguments are swapped to match the swapped ``Y_data``.
        """
        if self.uuCF:
            return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)

    def recommend(self, u, threshold=2.5):
        """
        Return the ids of all items to recommend to (internal) user ``u``.

        An item is recommended when ``u`` has not rated it yet and its
        mean-centred predicted rating is greater than ``threshold``.
        """
        return [item for item, _ in self.recommend2(u, threshold)]

    def recommend2(self, u, threshold=2.5):
        """
        Like ``recommend`` but return ``[item id, predicted rating]`` pairs.

        The predicted rating is the mean-centred one, i.e. a deviation from
        the mean rating of ``u``.
        """
        # NOTE(review): 2.5 is compared against a mean-centred prediction,
        # while the earlier docstring spoke of "> 0"; the default keeps the
        # value the code has always used - confirm which one is intended.
        rows = np.where(self.Y_data[:, 0] == u)[0]
        already_rated = set(self.Y_data[rows, 1].tolist())  # O(1) lookups
        recommended_items = []
        for i in range(self.n_items):
            if i in already_rated:
                continue
            rating = self.__pred(u, i)
            if rating > threshold:
                recommended_items.append([i, rating])

        return recommended_items

    def _print_for(self, u):
        """Print the recommendation line for a single (internal) user id."""
        recommended_items = self.recommend(u)
        if self.uuCF:
            print('Recommend item(s):', recommended_items, 'for user', u)
        else:
            print('Recommend item', u, 'for user(s) : ', recommended_items)

    def print_recommendation(self):
        """
        print all items which should be recommended for each user
        """
        print('Recommendation: ')
        for u in range(self.n_users):
            self._print_for(u)

    def print_recommendation2(self ,x):
        """
        print all items which should be recommended for the single user x
        """
        print('Recommendation: ')
        self._print_for(x)


In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
# Print floats in general format, width 6, up to 11 significant digits.
np.set_printoptions(formatter={'float': "{:6.11g}".format})

# Read the ratings table and discard the timestamp column, which is not
# used anywhere below.
rating = pd.read_csv('../BaseContent/input/ratings_small.csv').drop(columns='timestamp')

# Hold out 15% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
rating_train, rating_test = train_test_split(rating, test_size=0.15, random_state=42)

# Plain NumPy arrays, one row per rating. Three columns remain after the
# drop - presumably userId, movieId, rating; confirm against the CSV header.
train_set, test_set = rating_train.values, rating_test.values

print(test_set.shape)
# print(rating_train.loc[rating['userId']== 530])


(15001, 3)


In [17]:
import joblib

def store_model(model, model_name=""):
    """Serialise ``model`` with joblib under the model_users directory.

    Parameters
    ----------
    model : object
        The object to dump. One file holds exactly one object.
    model_name : str
        Base name of the file; when left empty the class name of ``model``
        is used instead. The file is written as ``<name>_model.pkl``.
    """
    base_name = model_name if model_name != "" else type(model).__name__
    target = '../Collaberative/model_users/' + base_name + '_model.pkl'
    joblib.dump(model, target)


def load_model(model_name):
    """Load the object previously written by ``store_model``.

    ``model_name`` is the base name used when storing, i.e. the file read is
    ``<model_name>_model.pkl`` in the model_users directory.
    """
    # NOTE: joblib files are pickles - loading one executes whatever code it
    # contains, so only load files this notebook (or a trusted source) wrote.
    source = '../Collaberative/model_users/' + model_name + '_model.pkl'
    return joblib.load(source)

In [8]:
def evaluate(model, dataset):
    """Root-mean-square error of ``model`` on ``dataset``.

    Parameters
    ----------
    model : object
        Anything with a ``pred(user, item, normalized=0)`` method returning
        the predicted rating on the original rating scale.
    dataset : array-like, shape (n, 3)
        Rows of (user id, item id, true rating).

    Returns
    -------
    float
        ``sqrt(mean((prediction - true rating) ** 2))``.

    Raises
    ------
    ZeroDivisionError
        If ``dataset`` has no rows.
    """
    dataset = np.asarray(dataset)
    n_tests = dataset.shape[0]
    SE = 0  # squared error
    for n in range(n_tests):
        # Only the ids are cast to int (they are used as indices). The true
        # rating stays as it is: casting the whole array to int would turn a
        # rating of 3.5 into 3 and distort the error.
        user_id = int(dataset[n, 0])
        item_id = int(dataset[n, 1])
        pred = model.pred(user_id, item_id, normalized=0)
        SE += (pred - float(dataset[n, 2])) ** 2
    return np.sqrt(SE / n_tests)


In [25]:


# User-user model on the training split, using the 2 nearest neighbours.
rs = CF(Y_data=train_set, k=2, uuCF=1)
rs.fit()

# NOTE(review): this second model is fitted on the held-out split itself and
# is only referenced by the commented-out evaluation further down - confirm
# that it is still needed.
rs_test = CF(Y_data=test_set, k=30, uuCF=1)
rs_test.fit()

# No name is given, so the file name is derived from the class name of `rs`.
store_model(model=rs)

# Last expression of the cell: [item id, predicted rating] pairs for user 2.
rs.recommend2(u=2)
# rs.print_recommendation()

# print('User-user CF train, RMSE =', evaluate(rs, train_set))
# print('User-user CF test, RMSE =', evaluate(rs_test, test_set))


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


[[96, 2.725489389366752],
 [573, 2.532251572440932],
 [981, 2.6387080581780538],
 [1826, 3.2260256561101177],
 [4492, 3.899996076247913],
 [6203, 2.716575644361859],
 [6219, 3.353260024568302],
 [6298, 2.875911730054926],
 [6772, 3.053907938733704],
 [7759, 2.6362860980960177],
 [8811, 2.875911730054926],
 [8859, 3.173231858525556],
 [31290, 2.875911730054926],
 [39408, 3.4102550949249744],
 [43177, 2.6701366954302106],
 [47815, 2.875911730054926],
 [48591, 2.875911730054926],
 [54290, 3.173231858525556],
 [56079, 3.14686340049742],
 [58146, 2.875911730054926],
 [61348, 2.875911730054926],
 [66066, 2.875911730054926],
 [68965, 2.875911730054926],
 [70946, 2.875911730054926],
 [71460, 2.580621140470801],
 [78321, 2.525209296439259],
 [106441, 2.5065334254331897],
 [107559, 2.5065334254331897]]

In [19]:
np.set_printoptions(formatter={'float':"{:6.11g}".format})

# Reload the model that was stored under the name "CF".
test = load_model("CF")

# Compare stored ratings with the model's predictions for one user.
user = 530
users = test.Y_data[:, 0]  # first column of Y_data holds the user ids
ids = np.where(users == user)[0].astype(np.int32)  # rows rated by `user`

user_rows = test.Y_data[ids]
item_ids = user_rows[:, 1]
ratings = user_rows[:, 2]

# Third positional argument 0 -> prediction on the original rating scale.
# Item ids are cast to int before being handed to the model.
predict_ratings = [
    test.pred(user, movie_id.astype(int), 0) for movie_id in item_ids
]

print('Rated movies ids :', item_ids)
print('True ratings     :', ratings)
print('Predicted ratings:', predict_ratings)



Rated movies ids : [  1196   1556   1064   1485    261   1605    832   1690    802    356
    367    804     36   1489    558   1148   1586    838   1573    480
   1025   1566   1584    780    260   1183   1606     62   1210   1367
   1029    733    673   1416   1073    661   1353    720   1042   1569
    239    647    783    991     95    185    158   1032   1580    527
     48      1    313   1357     40   1588    595   1721     14    745
    608   1223   1033    364]
True ratings     : [     5      2      2      2      4      3      5      2      4      5
      3      2      5      5      2      5      2      4      5      5
      4      5      5      3      5      5      1      4      5      4
      4      4      1      4      4      5      3      5      4      4
      4      4      5      3      5      3      3      4      5      5
      4      5      2      5      3      3      5      5      4      5
      5      4      4      5]
Predicted ratings: [5.019821302795635, 1.980733159