## Library

In [74]:
import pandas as pd
import numpy as np

from collections import Counter
from surprise.model_selection import cross_validate
from surprise import KNNBasic
from surprise import Reader, SVD, Dataset 
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

## Loading Data

In [2]:
smd = pd.read_csv('../the-movies-dataset/movies_metadata_merge_crew_keywords.csv')
id_map = pd.read_csv('../the-movies-dataset/links_small.csv')[['movieId','tmdbId']]

In [3]:
smd.id.max()

416437

In [4]:
titles = smd['title']

In [5]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [6]:
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId','id']

In [7]:
id_map = id_map.merge(smd[['title','id']], on='id').set_index('title')
id_map

Unnamed: 0_level_0,movieId,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story,1,862.0
Jumanji,2,8844.0
Grumpier Old Men,3,15602.0
Waiting to Exhale,4,31357.0
Father of the Bride Part II,5,11862.0
...,...,...
The Last Brickmaker in America,161944,159550.0
Rustom,162542,392572.0
Mohenjo Daro,162672,402672.0
Shin Godzilla,163056,315011.0


In [8]:
indicase_map = id_map.set_index('id')
indicase_map

Unnamed: 0_level_0,movieId
id,Unnamed: 1_level_1
862.0,1
8844.0,2
15602.0,3
31357.0,4
11862.0,5
...,...
159550.0,161944
392572.0,162542
402672.0,162672
315011.0,163056


## Collaborative Filter Using Suprise Framework 

In [123]:
reader = Reader()
algo = SVD()

In [124]:
ratings = pd.read_csv('../the-movies-dataset/ratings_small.csv')
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [129]:
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f7172bb9e80>

In [130]:
n_users = ratings.userId.unique().shape[0]
n_items = ratings.movieId.unique().shape[0]

In [131]:
algo.predict(1,318,3)

Prediction(uid=1, iid=318, r_ui=3, est=3.639681348906029, details={'was_impossible': False})

In [132]:
import operator
result = {}
for i in range(n_items):
    result[1,i] = algo.predict(1,i,3).est

result = sorted(result.items(), key = operator.itemgetter(1))
result = result[-10:]
result = list(reversed(result))
result

[((1, 50), 3.775946000580255),
 ((1, 1267), 3.6642868026149666),
 ((1, 1276), 3.6446566317977287),
 ((1, 318), 3.639681348906029),
 ((1, 2064), 3.635444318292605),
 ((1, 1203), 3.6354103169277487),
 ((1, 898), 3.629742612695806),
 ((1, 2571), 3.624740808694967),
 ((1, 2395), 3.619470337707355),
 ((1, 1136), 3.6136200804940355)]

## Hibrid 

In [14]:
from ipynb.fs.full.Cast_Director_Gernes_Content_Base_Recommender import  get_recommendations as module2_recommender

In [134]:
id_map = pd.read_csv('../the-movies-dataset/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map

Unnamed: 0,movieId,id
0,1,862.0
1,2,8844.0
2,3,15602.0
3,4,31357.0
4,5,11862.0
...,...,...
9120,162672,402672.0
9121,163056,315011.0
9122,163949,391698.0
9123,164977,137608.0


In [135]:
len(id_map.id.unique())

9113

In [136]:
check_id = ratings[ratings['movieId'].notnull()]['movieId'].astype('int')
check_id = check_id.unique()

list_movie_id = list(id_map.movieId.astype('int'))
count = []

for i in check_id:
    if i in list_movie_id: count.append(i)
len(count)        

9066

In [137]:
len(check_id)

9066

In [138]:
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
id_map.shape

(9219, 2)

In [139]:
# make a column become a index column 
indices_map = id_map.set_index('id')
indices_map

Unnamed: 0_level_0,movieId
id,Unnamed: 1_level_1
862.0,1
8844.0,2
15602.0,3
31357.0,4
11862.0,5
...,...
159550.0,161944
392572.0,162542
402672.0,162672
315011.0,163056


In [140]:
userId = 1
title = 'Avatar'

In [141]:
tmdbId = id_map.loc[title]['id']
tmdbId

19995.0

In [142]:
movie_id = id_map.loc[title]['movieId']

In [143]:
result = module2_recommender(title,26)

In [144]:
for i in result:
    movie_indices = list(result[i])

for i in movie_indices:
    print(titles[i]) 

Aliens
Terminator 2: Judgment Day
The Terminator
The Abyss
Piranha Part Two: The Spawning
True Lies
Titanic
Star Trek Into Darkness
Dungeons & Dragons
Jupiter Ascending
Small Soldiers
Dragonball Evolution
Sinbad and the Eye of the Tiger
Hercules in New York
Beastmaster 2: Through the Portal of Time
Man of Steel
Return from Witch Mountain
Hawk the Slayer
Superman II
X-Men: Days of Future Past
Teenage Mutant Ninja Turtles III
Star Wars: The Clone Wars
Escape to Witch Mountain
Darby O'Gill and the Little People
Fantastic Planet


In [145]:
movies = smd.iloc[movie_indices][['title','vote_count','vote_average','year','id']]
movies['est'] = movies['id'].apply(lambda x: algo.predict(userId,indices_map.loc[x]['movieId']).est)
movies = movies.sort_values('est', ascending = False)

movies.head(10)

Unnamed: 0,title,vote_count,vote_average,year,id,est
1011,The Terminator,4208.0,7.4,1984,218,3.23328
974,Aliens,3282.0,7.7,1986,679,3.120768
8658,X-Men: Days of Future Past,6155.0,7.5,2014,127585,3.033041
522,Terminator 2: Judgment Day,4274.0,7.7,1991,280,3.03275
1621,Darby O'Gill and the Little People,35.0,6.7,1959,18887,2.930777
1668,Return from Witch Mountain,38.0,5.6,1978,14822,2.928221
8401,Star Trek Into Darkness,4479.0,7.4,2013,54138,2.912751
922,The Abyss,822.0,7.1,1989,2756,2.835549
2014,Fantastic Planet,140.0,7.6,1973,16306,2.789065
7088,Star Wars: The Clone Wars,434.0,5.8,2008,12180,2.74644


## From scatch

In [28]:
# hàm tính đố tương đồng giữa 2 vector 
def cosine_sim_calculate(a,b):
    cos_sim = np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

In [29]:
movies = movies[['title','vote_count','vote_average','year','id']]
movies['id']

8658    127585
1011       218
974        679
8401     54138
1668     14822
1376       597
3060     11940
2132      8536
1621     18887
522        280
922       2756
7265     14164
2014     16306
344      36955
6084     27549
4966      5227
4347     31646
4017     25628
7088     12180
831      14821
8419     49521
8724     76757
2761      1499
1500     11551
3216     11849
Name: id, dtype: int64

In [30]:
# the shape attribute describe the len base on the dimention value input
# o mean rows of the matrix and 1 mean columns of the matrix
n_users = ratings.userId.unique().shape[0]
n_items = ratings.movieId.unique().shape[0]

n_items_table = ratings.movieId.unique()
n_items_table = list(n_items_table)


In [31]:
check_movieId = list(ratings.movieId)
index_of_movieId =  Counter(check_movieId)

In [32]:
index_of_movieId.values()

dict_values([42, 42, 33, 48, 46, 48, 46, 46, 52, 39, 47, 46, 46, 47, 36, 42, 53, 47, 43, 62, 122, 86, 120, 201, 201, 51, 87, 228, 26, 200, 129, 80, 142, 43, 102, 48, 113, 22, 109, 52, 78, 13, 125, 51, 62, 73, 39, 27, 110, 324, 90, 17, 70, 38, 101, 115, 70, 341, 122, 200, 157, 67, 20, 34, 180, 19, 16, 73, 110, 213, 33, 80, 274, 53, 60, 153, 86, 78, 46, 244, 10, 125, 11, 90, 50, 41, 129, 126, 215, 237, 202, 196, 304, 42, 53, 45, 32, 43, 13, 311, 39, 176, 150, 124, 27, 163, 217, 44, 40, 28, 190, 164, 44, 191, 23, 18, 29, 21, 147, 193, 22, 220, 202, 8, 41, 69, 134, 58, 122, 176, 106, 46, 84, 2, 73, 4, 11, 121, 1, 148, 49, 93, 70, 291, 14, 114, 198, 32, 106, 55, 90, 95, 9, 146, 71, 66, 33, 200, 69, 66, 62, 117, 9, 10, 55, 78, 18, 22, 49, 21, 151, 148, 89, 132, 160, 28, 71, 145, 15, 234, 220, 125, 121, 112, 131, 127, 77, 94, 104, 100, 80, 158, 22, 24, 101, 112, 165, 226, 74, 64, 61, 82, 147, 24, 87, 7, 15, 19, 82, 47, 72, 65, 64, 90, 108, 27, 51, 67, 40, 56, 39, 6, 113, 42, 69, 119, 36, 117,

In [33]:
len(n_items_table)

9066

In [34]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [35]:
data_matrix = np.zeros((n_users,n_items))

for line in ratings.itertuples():
    data_matrix[line[1] - 1,n_items_table.index(line[2])] = line[3]

In [36]:
data_matrix.shape

(671, 9066)

In [37]:
ratings_fix = ratings.drop(['timestamp'], axis = 1)
ratings_fix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 3 columns):
userId     100004 non-null int64
movieId    100004 non-null int64
rating     100004 non-null float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB


In [38]:
class CF(object):
    """docstring for CF"""
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        self.Y_data = Y_data
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1 
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1
    
    def add(self, new_data):
        """
        Update Y_data matrix when new ratings come.
        For simplicity, suppose that there is no new user or item.
        """
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)
    
    def normalize_Y(self):
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        print(users)
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            # ids is the index of value equal n-user_index in users-arrays 
            ids = np.where(users == n)[0].astype(np.int32)
            print(ids)
            # indices of all ratings associated with user n
            #get item_id corespone with index values of user in users_arrays
            item_ids = self.Y_data[ids, 1]
            # and the corresponding ratings 
            ratings = self.Y_data[ids, 2]
            # take mean
            m = np.mean(ratings) 
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            # mean value corespone with user_id    
            self.mu[n] = m
            # normalize
            self.Ybar_data[ids, 2] = ratings - self.mu[n]

        ################################################
        # form the rating matrix as a sparse matrix. Sparsity is important 
        # for both memory and computing efficiency. For example, if #user = 1M, 
        # #item = 100k, then shape of the rating matrix would be (100k, 1M), 
        # you may not have enough memory to store this. Then, instead, we store 
        # nonzeros only, and, of course, their locations.
        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
            (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (self.n_items, self.n_users))
        self.Ybar = self.Ybar.tocsr()

    def similarity(self):
        eps = 1e-6
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
    
        
    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity() 
        
    def fit(self):
        self.refresh()
        
    
    def __pred(self, u, i, normalized = 1):
        """ 
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        # Step 1: find all users who rated i
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)
        # Step 2: 
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        # Step 3: find similarity btw the current user and others 
        # who already rated i
        sim = self.S[u, users_rated_i]
        # Step 4: find the k most similarity users
        a = np.argsort(sim)[-self.k:] 
        # and the corresponding similarity levels
        nearest_s = sim[a]
        # How did each of 'near' users rated item i
        r = self.Ybar[i, users_rated_i[a]]
        if normalized:
            # add a small number, for instance, 1e-8, to avoid dividing by 0
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)

        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.mu[u]
    
    def pred(self, u, i, normalized = 1):
        """ 
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)
            
    
    def recommend(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which 
        have not been rated by u yet. 
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()              
        recommended_items = []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0: 
                    recommended_items.append(i)
        
        return recommended_items 
    
    def recommend2(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that:
        self.pred(u, i) > 0. Suppose we are considering items which 
        have not been rated by u yet. 
        """
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()              
        recommended_items = []
    
        for i in xrange(self.n_items):
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0: 
                    recommended_items.append(i)
        
        return recommended_items 

    def print_recommendation(self):
        """
        print all items which should be recommended for each user 
        """
        print ('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print ('    Recommend item(s):', recommended_items, 'for user', u)
            else: 
                print ('    Recommend item', u, 'for user(s) : ', recommended_items)

In [39]:
# data file 
r_cols = ['user_id', 'item_id', 'rating']
ratings_sample = pd.read_csv('ex.dat', sep = ' ', names = r_cols, encoding='latin-1')
Y_data = ratings_sample.values

rs = CF(Y_data, k = 2, uuCF = 1)
rs.fit()

rs.print_recommendation()

[0. 0. 0. 0. 1. 1. 1. 1. 2. 2. 2. 2. 3. 3. 3. 4. 4. 5. 5. 6. 6. 6.]
[0 1 2 3]
[4 5 6 7]
[ 8  9 10 11]
[12 13 14]
[15 16]
[17 18]
[19 20 21]
Recommendation: 
    Recommend item(s): [2] for user 0
    Recommend item(s): [1] for user 1
    Recommend item(s): [] for user 2
    Recommend item(s): [4] for user 3
    Recommend item(s): [4] for user 4
    Recommend item(s): [0, 3, 4] for user 5
    Recommend item(s): [1] for user 6


In [40]:
rs = CF(Y_data, k = 2, uuCF = 0)
rs.fit()

rs.print_recommendation()

[0. 0. 0. 0. 1. 1. 1. 1. 2. 2. 2. 2. 3. 3. 3. 4. 4. 5. 5. 6. 6. 6.]
[0 1 2 3]
[4 5 6 7]
[ 8  9 10 11]
[12 13 14]
[15 16]
[17 18]
[19 20 21]
Recommendation: 
    Recommend item 0 for user(s) :  [2]
    Recommend item 1 for user(s) :  [1]
    Recommend item 2 for user(s) :  []
    Recommend item 3 for user(s) :  [4]
    Recommend item 4 for user(s) :  [4]
    Recommend item 5 for user(s) :  [0, 3, 4]
    Recommend item 6 for user(s) :  [1]
