In [None]:
# Mount Google Drive inside the Colab runtime so files stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Folder on the mounted Drive that holds every CSV used in this notebook.
file_path = '/content/drive/My Drive/Data/ml-latest-small'

# Read the three raw tables in one pass (default encoding).
users, ratings, movies = (
    pd.read_csv(file_path + suffix)
    for suffix in ('/users.csv', '/ratings.csv', '/movies.csv')
)

# One row per user, so the row count is the number of users.
n_users = len(users)
print('Number of users:', n_users)

Number of users: 943


In [None]:
# Load the pre-split rating tables.
ratings_base, ratings_test = (
    pd.read_csv(file_path + suffix)
    for suffix in ('/train_data.csv', '/test_data.csv')
)

# NumPy representations of both tables, used by the models below.
rate_train, rate_test = ratings_base.values, ratings_test.values

print('Number of traing rates:', len(rate_train))
print('Number of test rates:', len(rate_test))

Number of traing rates: 80000
Number of test rates: 20000


In [None]:
# Item table, decoded as latin-1; one row per item.
items = pd.read_csv(file_path + '/movies.csv', encoding='latin-1')
n_items = len(items)
print('Number of items:', n_items)

Number of items: 1682


In [None]:
# How many trailing columns of `items` form the item feature vector
# (presumably the per-genre indicator flags - confirm against movies.csv).
N_FEATURE_COLUMNS = 19

X0 = items.values
# Keep only the last N_FEATURE_COLUMNS columns of every row.
X_train_counts = X0[:, -N_FEATURE_COLUMNS:]
print(X_train_counts )

[[0 0 0 ... 0 0 0]
 [0 1 1 ... 1 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# Turn the raw count features into TF-IDF features (smoothed IDF, L2 norm).
transformer = TfidfTransformer(norm='l2', smooth_idf=True)
tfidf_sparse = transformer.fit_transform(X_train_counts.tolist())
# Densify so the matrix can be printed and indexed like a plain array.
tfidf = tfidf_sparse.toarray()
print(tfidf)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.53676706 0.65097024 ... 0.53676706 0.         0.        ]
 [0.         0.         0.         ... 1.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [None]:
def get_items_rated_by_user(rate_matrix, user_id):
    """Return the items one user rated together with the scores given.

    Args:
        rate_matrix: 2-D array whose columns 0, 1 and 2 hold the user id,
            the item id and the score of each rating. Ids stored in the
            matrix are offset by +1 relative to the ids this function
            accepts and returns.
        user_id: user to look up, i.e. the id stored in the matrix minus one.

    Returns:
        Tuple ``(item_ids, scores)``: the stored item ids minus one and the
        matching scores, both in the row order of ``rate_matrix``.
    """
    belongs_to_user = rate_matrix[:, 0] == user_id + 1
    user_rows = rate_matrix[belongs_to_user]
    return user_rows[:, 1] - 1, user_rows[:, 2]

In [None]:
class Contentbased:
    """Content-based recommender: one ridge regression per user on TF-IDF item features.

    Attributes set by ``__init__``:
        Y: rating matrix passed to ``get_items_rated_by_user`` (columns 0, 1, 2
            are read as user id, item id, score).
        X_train: raw item feature matrix; it is TF-IDF transformed in ``fit``.
        n_users: number of users to fit a model for (users ``0 .. n_users-1``).
        n_items: number of items considered by ``recommend``.
        lamda: regularisation strength handed to ``Ridge(alpha=...)``.

    Attribute set by ``fit``:
        Yhat: array of predicted scores, one row per item and one column per user.
    """

    def __init__(self, Y, X_train, n_users, n_items, lamda = 1):
        self.Y = Y
        self.lamda = lamda
        self.X_train = X_train
        self.n_users = n_users
        self.n_items = n_items

    def fit(self):
        """Fit one ridge model per user and store all predictions in ``self.Yhat``.

        Users without any rating in ``self.Y`` are skipped instead of crashing
        the whole fit; their weights and bias stay at zero, so every predicted
        score for them is 0.
        """
        transformer = TfidfTransformer(smooth_idf=True, norm ='l2')
        tfidf = transformer.fit_transform(self.X_train.tolist()).toarray()
        d = tfidf.shape[1] # data dimension
        W = np.zeros((d, self.n_users))
        b = np.zeros((1, self.n_users))
        for n in range(self.n_users):
            ids, scores = get_items_rated_by_user(self.Y, n)
            ids = np.asarray(ids).astype(int)  # row indices must be integers
            if ids.size == 0:
                # Ridge cannot be fitted on zero samples.
                continue
            clf = Ridge(alpha=self.lamda, fit_intercept=True)
            clf.fit(tfidf[ids, :], scores)
            W[:, n] = clf.coef_
            b[0, n] = clf.intercept_
        self.Yhat = tfidf.dot(W) + b

    def RMSE(self, Data_test):
        """Root-mean-square error of ``self.Yhat`` against the ratings in ``Data_test``.

        Args:
            Data_test: rating matrix with the same column layout as ``self.Y``.

        Returns:
            The RMSE over all ratings of users ``0 .. n_users-1``, or ``nan``
            when there is no such rating.
        """
        se = 0.0
        cnt = 0
        for n in range(self.n_users):
            ids, scores_truth = get_items_rated_by_user(Data_test, n)
            ids = np.asarray(ids).astype(int)
            e = scores_truth - self.Yhat[ids, n]
            se += (e * e).sum()
            cnt += e.size
        if cnt == 0:
            return float('nan')
        return np.sqrt(se / cnt)

    def recommend(self, user_id, top):
        """Return the indices of the ``top`` best-scored items the user has not rated.

        Args:
            user_id: 0-based column of ``self.Yhat`` to recommend for.
            top: maximum number of items to return.

        Returns:
            Array of item indices ordered by ascending predicted score (best
            item last). It holds fewer than ``top`` entries when the user has
            fewer unrated items, and is empty when ``top`` is not positive.
        """
        rated, _ = get_items_rated_by_user(self.Y, user_id)
        rated = np.asarray(rated).astype(int)

        scores = np.array(self.Yhat[:self.n_items, user_id], dtype=float)
        rated = rated[(rated >= 0) & (rated < scores.size)]

        # Already-rated items must never be recommended. A placeholder score of
        # 0 would let them outrank unrated items with a negative prediction,
        # so push them to the very bottom of the ranking instead.
        is_candidate = np.ones(scores.size, dtype=bool)
        is_candidate[rated] = False
        scores[~is_candidate] = -np.inf

        n_wanted = min(int(top), int(is_candidate.sum()))
        if n_wanted <= 0:
            # Guard needed because a slice of [-0:] would return every item.
            return np.array([], dtype=int)
        return np.argsort(scores)[-n_wanted:]

In [None]:
# Try several ridge strengths and remember the one with the lowest RMSE.
# NOTE(review): the RMSE here is measured on the same ratings the model was
# fitted on, which favours the weakest regularisation; consider scoring on a
# held-out validation split instead.
_lamda = None
best_rmse = np.inf
for lamda in [1, 3, 5, 7, 9]:
    cb = Contentbased(rate_train, X_train_counts, n_users= n_users, n_items = n_items, lamda=lamda)
    cb.fit()
    RMSE = cb.RMSE(Data_test=rate_train)
    print(f"lamda {lamda}: ",RMSE)
    # Keep the hyper-parameter value itself, not the error it produced.
    if RMSE < best_rmse:
        best_rmse, _lamda = RMSE, lamda

lamda 1:  0.914272803909531
lamda 3:  0.9462376421311541
lamda 5:  0.9641451356615657
lamda 7:  0.9763004410314862
lamda 9:  0.9853143122695543


In [None]:
# Refit with the selected regularisation strength, restricted to the first 100 users.
# NOTE(review): this rebinds the notebook-wide `n_users`; cells re-run after
# this one will see 100 instead of the count read from users.csv.
n_users = 100
cb = Contentbased(
    Y=rate_train,
    X_train=X_train_counts,
    n_users=n_users,
    n_items=n_items,
    lamda=_lamda,
)
cb.fit()
# Error on the ratings the model was fitted on.
cb.RMSE(rate_train)


0.9139241142767481

In [None]:
# Error of the fitted content-based model on the test ratings.
cb.RMSE(rate_test)

1.0867415309002284

In [None]:
# Ten recommendations for the user at 0-based index 99, ordered by ascending
# predicted score (best item last).
rcm_list =  list(cb.recommend(99, 10))
# `recommend` returns 0-based row positions into the feature matrix, and that
# matrix was built from `items` row by row, so titles must be looked up by
# position. Matching the positions against the 'movie id' column is only right
# when ids equal row positions; the ids here appear to start at 1, which
# shifted every title by one.
rcm_movie = items.iloc[rcm_list]['movie title']
print(rcm_movie)

9                         Richard III (1995)
10                      Seven (Se7en) (1995)
154                     Dirty Dancing (1987)
334                How to Be a Player (1997)
359                        Wonderland (1997)
581                        Piano, The (1993)
912     Love and Death on Long Island (1997)
1081               Female Perversions (1996)
1196                  Family Thing, A (1996)
1596                   Romper Stomper (1992)
Name: movie title, dtype: object


**Collaborative Filtering**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
class CF(object):
    """Neighbourhood-based collaborative filtering (user-user or item-item).

    The rating data is an array whose columns 0, 1 and 2 are read as user id,
    item id and rating; ids are 0-based. In item-item mode (``uuCF`` falsy) the
    first two columns are swapped at construction time, so everywhere inside
    the class "user" means "the entity in column 0" and "item" means "the
    entity in column 1".
    """
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        """Store the data and settings.

        Args:
            Y_data: rating array (user id, item id, rating, ...).
            k: number of most similar neighbours used for a prediction.
            dist_func: callable ``f(A, B)`` returning the pairwise similarity
                matrix of the rows of ``A`` and ``B``.
            uuCF: truthy for user-user CF, falsy for item-item CF.
        """
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1

    def add(self, new_data):
        """
        Update Y_data matrix when new ratings come.
        For simplicity, suppose that there is no new user or item.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)

    def normalize_Y(self):
        """Mean-centre every user's ratings and build the sparse rating matrix.

        Sets ``self.mu`` (per-user mean, 0 for users without ratings),
        ``self.Ybar_data`` (float copy of ``Y_data`` with centred ratings in
        column 2) and ``self.Ybar`` (CSR matrix of shape (n_items, n_users)).
        """
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        # Work on a float copy: with an integer Y_data a same-dtype copy would
        # silently truncate the centred ratings when they are written back.
        self.Ybar_data = self.Y_data.astype(np.float64)
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            # row indices of the ratings done by user n
            ids = np.where(users == n)[0].astype(np.int32)
            if ids.size == 0:
                # No ratings: keep mu[n] = 0 and avoid the "mean of empty
                # slice" RuntimeWarning.
                continue
            ratings = self.Y_data[ids, 2]
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0 # ratings contained nan; fall back to a zero mean
            self.mu[n] = m
            # normalize
            self.Ybar_data[ids, 2] = ratings - self.mu[n]

        ################################################
        # form the rating matrix as a sparse matrix. Sparsity is important
        # for both memory and computing efficiency. For example, if #user = 1M,
        # #item = 100k, then shape of the rating matrix would be (100k, 1M),
        # you may not have enough memory to store this. Then, instead, we store
        # nonzeros only, and, of course, their locations.
        item_rows = self.Y_data[:, 1].astype(np.int64)
        user_cols = self.Y_data[:, 0].astype(np.int64)
        self.Ybar = sparse.coo_matrix(
            (self.Ybar_data[:, 2], (item_rows, user_cols)),
            (self.n_items, self.n_users))
        self.Ybar = self.Ybar.tocsr()

    def similarity(self):
        """Compute ``self.S``, the pairwise similarity between the columns of ``Ybar``."""
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)


    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity()

    def fit(self):
        """Prepare the model for predictions (normalise, then compute similarities)."""
        self.refresh()


    def __pred(self, u, i, normalized = 1):
        """
        Predict the rating of user u for item i.

        With ``normalized`` truthy the mean-centred prediction is returned;
        otherwise the mean rating of ``u`` is added back to give a rating on
        the original scale.
        """
        u, i = int(u), int(i)  # ids are used as array indices below
        # Step 1: find all users who rated i
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        # Step 2:
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        # Step 3: find similarity btw the current user and others
        # who already rated i
        sim = self.S[u, users_rated_i]
        # Step 4: find the k most similar users
        a = np.argsort(sim)[-self.k:]
        # and the corresponding similarity levels
        nearest_s = sim[a]
        # How did each of 'near' users rate item i (1 x k sparse row)
        r = self.Ybar[i, users_rated_i[a]]
        # Similarity-weighted average of the neighbours' centred ratings;
        # add a small number, 1e-8, to avoid dividing by 0
        weighted = r.dot(nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)
        if normalized:
            return weighted

        return weighted + self.mu[u]

    def pred(self, u, i, normalized = 1):
        """
        Predict the rating of user u for item i.

        ``u`` is always a user id and ``i`` an item id, whatever the mode; in
        item-item mode the two are swapped before the internal lookup. Pass
        ``normalized = 0`` to get the prediction on the original rating scale
        instead of the mean-centred one.
        """
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)


    def recommend(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which
        have not been rated by u yet.

        ``u`` indexes column 0 of the stored data, so in item-item mode it is
        an item id and the returned list contains user ids.
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        # set -> constant-time membership test inside the loop
        items_rated_by_u = set(self.Y_data[ids, 1].tolist())
        recommended_items = []
        for i in range(self.n_items):
            if i in items_rated_by_u:
                continue
            if self.__pred(u, i) > 0:
                recommended_items.append(i)

        return recommended_items

    def recommend2(self, u):
        """
        Same contract as ``recommend``; kept as an alias for existing callers.
        """
        return self.recommend(u)

    def print_recommendation(self):
        """
        print all items which should be recommended for each user
        """
        print('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print ('Recommend item(s):', recommended_items, 'for user', u)
            else:
                print ('Recommend item', u, 'for user(s) : ', recommended_items)

In [None]:
# Rating arrays used by the collaborative-filtering cells.
# NOTE(review): `.values` is called on the same DataFrames that produced
# `rate_train` / `rate_test`; if pandas hands back a view, these names share
# memory with them and the in-place shift below changes those arrays too - confirm.
rate_train_2 = ratings_base.values
rate_test_2 = ratings_test.values

# Subtract 1 from the first two columns in place (presumably turning 1-based
# user and item ids into 0-based ones, which is what the CF class indexes
# with - confirm).
# NOTE(review): if the arrays are views of the DataFrames, re-running this
# cell shifts the ids a second time.
rate_train_2[:, :2] -= 1
rate_test_2[:, :2] -= 1

In [None]:
# User-user CF with 30 neighbours, fitted on the id-shifted training ratings.
rs = CF(rate_train_2, k = 30, uuCF = 1)
rs.fit()

# Evaluate on `rate_test_2`, the array whose ids were shifted the same way as
# the training data. Reading `rate_test` here is only correct if it happens to
# share memory with `rate_test_2`.
n_tests = rate_test_2.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs.pred(rate_test_2[n, 0], rate_test_2[n, 1], normalized = 0)
    SE += (pred - rate_test_2[n, 2])**2

RMSE = np.sqrt(SE/n_tests)
print ('User-user CF, RMSE =', RMSE)

User-user CF, RMSE = 0.9803853931638371


In [None]:
# Item-item CF with 30 neighbours, fitted on the id-shifted training ratings.
rs = CF(rate_train_2, k = 30, uuCF = 0)
rs.fit()

# Evaluate on `rate_test_2`, the array whose ids were shifted the same way as
# the training data. Reading `rate_test` here is only correct if it happens to
# share memory with `rate_test_2`.
n_tests = rate_test_2.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs.pred(rate_test_2[n, 0], rate_test_2[n, 1], normalized = 0)
    SE += (pred - rate_test_2[n, 2])**2

RMSE = np.sqrt(SE/n_tests)
print('Item-item CF, RMSE =', RMSE)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Item-item CF, RMSE = 0.9648439493582727


In [None]:
# User-user CF with 10 neighbours; prints the recommended items of every user.
# NOTE(review): this model is built from the *test* ratings (`rate_test_2`),
# not the training ratings - confirm that is intended.
rs = CF(Y_data=rate_test_2, k=10, uuCF=1)
rs.fit()
rs.print_recommendation()

Recommendation: 
Recommend item(s): [0, 1, 3, 6, 7, 8, 10, 12, 14, 15, 24, 25, 27, 41, 44, 45, 47, 49, 51, 54, 56, 57, 58, 65, 70, 78, 82, 86, 87, 88, 93, 98, 100, 108, 115, 118, 123, 125, 126, 130, 132, 134, 135, 136, 143, 148, 151, 152, 155, 161, 164, 168, 171, 172, 177, 178, 180, 181, 186, 190, 191, 193, 194, 196, 197, 198, 202, 204, 206, 210, 222, 236, 237, 245, 250, 255, 256, 262, 267, 268, 274, 275, 276, 277, 279, 281, 284, 285, 292, 294, 296, 297, 301, 303, 305, 306, 309, 310, 314, 315, 316, 317, 327, 345, 356, 365, 371, 377, 379, 380, 381, 384, 387, 389, 402, 403, 407, 413, 418, 420, 421, 422, 424, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 442, 444, 446, 448, 450, 460, 462, 463, 464, 465, 466, 468, 469, 472, 473, 474, 477, 478, 479, 480, 482, 483, 484, 485, 486, 487, 489, 490, 491, 492, 495, 497, 498, 501, 503, 504, 505, 506, 508, 509, 510, 511, 512, 513, 514, 515, 516, 518, 519, 520, 521, 522, 524, 525, 526, 528, 529, 530, 546, 548, 549, 557, 558, 565, 567, 569, 574, 5