In [113]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse


class CF(object):
    """Neighborhood-based collaborative filtering (user-user or item-item).

    ``Y_data`` is a 2-D array whose rows are ``(user_id, item_id, rating)``
    with ids starting from 0.  In item-item mode (``uuCF=0``) the first two
    columns are swapped once in ``__init__`` so that every other method can
    treat column 0 as the "user" axis; the public ``pred`` swaps its
    arguments accordingly.

    Attributes set by ``fit``/``refresh``:
        mu    -- per-"user" mean rating (0 for users without ratings)
        Ybar  -- CSR matrix of mean-centred ratings, shape (n_items, n_users)
        S     -- similarity matrix between users, as returned by ``dist_func``
    """

    def __init__(self, Y_data, k, dist_func=cosine_similarity, uuCF=1):
        """
        Y_data    -- ratings array with columns (user_id, item_id, rating)
        k         -- number of nearest neighbours used for a prediction
        dist_func -- callable ``f(A, B)`` returning pairwise similarities
        uuCF      -- truthy for user-user CF, falsy for item-item CF
        """
        self.uuCF = uuCF  # user-user (1) or item-item (0) CF
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1

    def add(self, new_data):
        """
        Append new ratings (same column layout as ``Y_data``).

        For simplicity, suppose that there is no new user or item:
        ``n_users``/``n_items`` are not recomputed here.  The normalized
        matrix and the similarity matrix are not updated either; call
        ``refresh()`` afterwards.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis=0)

    def normalize_Y(self):
        """
        Mean-centre every user's ratings and build the sparse rating matrix.

        Sets ``self.mu``, ``self.Ybar_data`` and ``self.Ybar``.
        """
        users = self.Y_data[:, 0]  # all users - first col of the Y_data
        # Work on a float copy: copying an integer Y_data as-is would make
        # the assignment below silently truncate the mean-centred ratings.
        self.Ybar_data = self.Y_data.astype(np.float64)
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            # row indices of ratings done by user n
            ids = np.where(users == n)[0]
            if ids.size == 0:
                # No ratings: keep mu[n] = 0 and skip np.mean, which would
                # emit a RuntimeWarning on an empty slice.
                continue
            ratings = self.Y_data[ids, 2]
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0  # a NaN rating must not poison the user's mean
            self.mu[n] = m
            # normalize
            self.Ybar_data[ids, 2] = ratings - self.mu[n]

        # Form the rating matrix as a sparse matrix: only the observed
        # ratings and their locations are stored, which matters for both
        # memory and computing efficiency when #users * #items is large.
        # Ids may be stored as floats in Y_data, so cast them to integers
        # before using them as coordinates.
        item_idx = self.Y_data[:, 1].astype(np.int64)
        user_idx = self.Y_data[:, 0].astype(np.int64)
        self.Ybar = sparse.coo_matrix(
            (self.Ybar_data[:, 2], (item_idx, user_idx)),
            shape=(self.n_items, self.n_users)).tocsr()

    def similarity(self):
        """Compute the user-user similarity matrix ``self.S`` from ``Ybar``."""
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)

    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (e.g. after
        a few ratings were added with ``add``).
        """
        self.normalize_Y()
        self.similarity()

    def fit(self):
        """Prepare the model for prediction (alias of ``refresh``)."""
        self.refresh()

    def __pred(self, u, i, normalized=1):
        """
        Predict the rating of user u for item i.

        With ``normalized`` truthy the mean-centred prediction is returned;
        otherwise the mean rating of ``u`` is added back.  ``u`` and ``i``
        refer to the (possibly swapped) internal column layout.
        """
        # Ids read from a float ratings array are floats; indexing needs ints.
        u, i = int(u), int(i)
        # Step 1: find all ratings of item i
        ids = np.where(self.Y_data[:, 1] == i)[0]
        # Step 2: the users who gave those ratings
        users_rated_i = self.Y_data[ids, 0].astype(np.int64)
        if users_rated_i.size == 0:
            # Nobody rated i: the weighted sum is empty, i.e. 0.
            weighted = 0.0
        else:
            # Step 3: similarity between u and the users who rated i
            sim = self.S[u, users_rated_i]
            # Step 4: the k most similar users and their similarity levels
            a = np.argsort(sim)[-self.k:]
            nearest_s = sim[a]
            # How each of the 'near' users rated item i (mean-centred)
            r = self.Ybar[i, users_rated_i[a]]
            # add a small number, 1e-8, to avoid dividing by 0
            weighted = r.dot(nearest_s)[0] / (np.abs(nearest_s).sum() + 1e-8)

        if normalized:
            return weighted
        return weighted + self.mu[u]

    def pred(self, u, i, normalized=1):
        """
        Predict the rating of user u for item i.

        Returns the mean-centred prediction when ``normalized`` is truthy;
        pass ``normalized=0`` to get the prediction on the original rating
        scale.
        """
        if self.uuCF:
            return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)

    def recommend(self, u):
        """
        Determine all items which should be recommended for user u.

        An item i is recommended when u has not rated it yet and the
        normalized prediction for (u, i) is positive.  In item-item mode
        the roles are swapped, matching ``print_recommendation``.
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        # A set gives O(1) membership tests inside the loop over all items.
        items_rated_by_u = set(self.Y_data[ids, 1].tolist())
        return [i for i in range(self.n_items)
                if i not in items_rated_by_u and self.__pred(u, i) > 0]

    def recommend2(self, u):
        """Same as ``recommend``; kept for backward compatibility."""
        return self.recommend(u)

    def _print_recommended(self, u):
        """Print the recommendation line for a single user (or item)."""
        recommended_items = self.recommend(u)
        if self.uuCF:
            print('Recommend item(s):', recommended_items, 'for user', u)
        else:
            print('Recommend item', u, 'for user(s) : ', recommended_items)

    def print_recommendation(self):
        """
        Print all items which should be recommended for each user.
        """
        print('Recommendation: ')
        for u in range(self.n_users):
            self._print_recommended(u)

    def print_recommendation2(self, x):
        """
        Print all items which should be recommended for the single user x.
        """
        print('Recommendation: ')
        self._print_recommended(x)


In [97]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
# Print floats in general format: minimum width 6, up to 11 significant digits.
np.set_printoptions(formatter={'float': "{:6.11g}".format})

# Load the ratings table and discard the timestamp column.
rating = pd.read_csv(
    '../BaseContent/input/ratings_small - Copy.csv'
).drop(columns='timestamp')

# Hold out 15% of the ratings for evaluation; the fixed seed keeps the
# split reproducible between runs.
rating_train, rating_test = train_test_split(
    rating, test_size=0.15, random_state=42)

# Plain NumPy arrays, one row per rating.
train_set, test_set = rating_train.values, rating_test.values

print(train_set)


[[   530   1196      5]
 [   452   1953      4]
 [    30   2159      2]
 ...
 [   529   3712      4]
 [    15    110      3]
 [   152    185    1.5]]


In [117]:
import joblib

def store_model(model, model_name=""):
    """
    Dump ``model`` to '../Collaberative/model_users/<name>_model.pkl'.

    ``model_name`` supplies <name>; when it is the empty string the class
    name of ``model`` is used instead.  One object is written per file.
    """
    # joblib is used instead of the plain pickle module (the original
    # author's note: it is faster).
    name = type(model).__name__ if model_name == "" else model_name
    target = '../Collaberative/model_users/' + name + '_model.pkl'
    joblib.dump(model, target)


def load_model(model_name):
    """
    Load and return the object stored by ``store_model`` under ``model_name``,
    i.e. from '../Collaberative/model_users/<model_name>_model.pkl'.
    """
    # NOTE(review): joblib.load unpickles the file, so only load files
    # that come from a trusted source.
    source = '../Collaberative/model_users/' + model_name + '_model.pkl'
    return joblib.load(source)

In [135]:
# Fit the user-user model (30 nearest neighbours) on the training split and
# persist it under its class name ("CF").
rs = CF(train_set, k=30, uuCF=1)
rs.fit()
store_model(rs)
# rs.print_recommendation2(1)
print(type(test_set))
# NOTE(review): this cast happens after the model was fitted, so it does not
# affect `rs`; it does truncate fractional ratings in `train_set` for any
# cell executed afterwards (including a re-run of this one). Confirm it is
# intended before relying on `train_set` downstream.
train_set = train_set.astype(int)

# Root-mean-square error of the un-normalized predictions on the test split.
n_tests = test_set.shape[0]
SE = 0  # squared error
for n in range(n_tests):
    # The split arrays hold fractional ratings (e.g. 1.5), so the id columns
    # are stored as floats too; cast them to int because the model uses them
    # as array indices.
    pred = rs.pred(int(test_set[n, 0]), int(test_set[n, 1]), normalized=0)
    SE += (pred - test_set[n, 2])**2

RMSE = np.sqrt(SE/n_tests)
print('User-user CF, RMSE =', RMSE)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


<class 'numpy.ndarray'>
User-user CF, RMSE = 0.9463091058560089


In [123]:
# Reload the model that store_model() saved under its default name, i.e. the
# class name "CF".
test = load_model("CF")

# CF defines no __repr__/__str__, so this prints the default object repr.
print(test)

# For id 1: the ids not yet rated by it whose normalized predicted rating is
# positive (items for user 1, assuming the stored model is user-user — TODO confirm).
print(test.recommend(1))

<__main__.CF object at 0x00000170544C19D0>
[1, 3, 6, 11, 13, 15, 16, 17, 20, 21, 24, 28, 29, 30, 32, 36, 37, 38, 39, 42, 45, 46, 47, 50, 55, 60, 62, 64, 72, 74, 77, 78, 81, 82, 83, 84, 85, 92, 93, 94, 96, 99, 101, 102, 103, 104, 108, 110, 111, 121, 123, 125, 130, 146, 149, 150, 152, 155, 157, 159, 163, 165, 166, 169, 171, 175, 178, 179, 181, 183, 184, 189, 190, 193, 199, 200, 206, 207, 219, 223, 227, 228, 232, 235, 236, 245, 246, 247, 250, 253, 258, 259, 260, 261, 262, 265, 275, 276, 278, 287, 288, 291, 293, 296, 299, 300, 302, 305, 306, 307, 312, 314, 318, 326, 331, 332, 337, 339, 341, 345, 346, 347, 348, 349, 353, 356, 360, 361, 363, 364, 366, 372, 373, 374, 375, 378, 380, 382, 383, 384, 387, 390, 391, 408, 409, 414, 418, 421, 422, 427, 429, 431, 437, 441, 442, 445, 447, 449, 450, 451, 452, 453, 455, 457, 458, 464, 465, 466, 468, 469, 470, 471, 473, 480, 483, 487, 488, 494, 495, 496, 497, 498, 502, 505, 506, 509, 510, 511, 515, 516, 517, 518, 519, 521, 524, 527, 528, 529, 532, 533, 5