# Movie Recommendation System

Content-Based Filtering, Collaborative Filtering and Hybrid system which combine two previous systems.

Import libraries

In [1]:
# Load,preprocess and save data/model
import json
from collections import Counter
import pickle

#Build an embedding model
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot

#Build a classifier, collaborative filtering model
from sklearn.svm import SVC
from scipy.sparse.linalg import svds

#To build a recommendation system
import random
import numpy as np
import pandas as pd

#Evaluation
from sklearn.model_selection import train_test_split

Using TensorFlow backend.



<I>aaa</I><br>
We are going to use 2 datasets to build this movie recommendation system.<br>
First need a rating for 10,000 movies rated by 1,000 users.
The data used for this model is not real data, but randomly generated by numpy module to show how to create collaborative filtering model.

In [2]:
with open('data/wp_movies_10k.ndjson') as fin:
    movies = [json.loads(l) for l in fin]

with open('data/rating.ndjson') as finn:
    rates=[li.replace('\n', '').split(',') for li in finn]

In [3]:
rating_df = pd.DataFrame(rates, columns=['movie_id', 'person_id', 'rating'])
rating_df['movie_id'] = rating_df['movie_id'].apply(int)
rating_df['person_id'] = rating_df['person_id'].apply(int)
rating_df['rating'] = rating_df['rating'].apply(float)

rating_df = rating_df.groupby(['person_id', 'movie_id']).agg(np.mean)
rating_full_df = rating_df.reset_index()
rating_full_df.shape

del rating_df

rating_full_df.shape

(220251, 3)

In [4]:
rating = np.asarray(rating_full_df['rating'])
norm = rating / np.linalg.norm(rating)
rating_full_df['rating_norm'] = norm

### Load movie data

Train Embedding model using metadata(link) as an connector.

In [5]:
link_counts = Counter()
for movie in movies:
    link_counts.update(movie[2])
link_counts.most_common(3)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867)]

#(link index, movie index): links connected to movie is in meaningful
Links <br>
indexing links appeared more than 3 times<br>
indexing every movies<br>
66913 links and 10000 movies in data

In [6]:
top_links = [link for link, c in link_counts.items() if c >= 3]
link_to_idx = {link:idx for idx, link in enumerate(top_links)}

movie_to_idx = {movie[0]:idx for idx, movie in enumerate(movies)}
idx_to_movie = [movie[0] for movie in movies]

len(top_links), len(movie_to_idx)

(66913, 10000)

In [7]:
pairs=[]

for movie in movies:
    pairs.extend((link_to_idx[link], movie_to_idx[movie[0]]) 
                 for link in movie[2] 
                 if link in link_to_idx)

pairs_set = set(pairs)
len(pairs_set)

671403

Create Embedding Model

In [8]:
def movie_embedding_model(embedding_size=30):
    
    link = Input(name='link', shape=(1,))
    movie = Input(name='movie', shape=(1,))
    
    link_embedding = Embedding(name='link_embedding', 
                               input_dim=len(top_links), 
                               output_dim=embedding_size)(link)
    
    movie_embedding = Embedding(name='movie_embedding',
                               input_dim=len(movie_to_idx),
                               output_dim=embedding_size)(movie)
    
    dot=Dot(name='dot_product', normalize=True, axes=2)([link_embedding, movie_embedding])
    
    merged = Reshape((1,))(dot)
    
    model=Model(inputs=[link, movie], outputs=[merged])
    
    model.compile(optimizer='nadam', loss='mse')
    
    return model

In [11]:
model = movie_embedding_model()
model.summary()





Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
link (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
movie (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
link_embedding (Embedding)      (None, 1, 30)        2007390     link[0][0]                       
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 30)        300000      movie[0][0]                      
________________________________________________________________________________________

Create Data Generator 

In [9]:
def batchifier(pairs, positive_samples=50, negative_ratio=5):
    batch_size = positive_samples * (1+negative_ratio)
    batch=np.zeros((batch_size, 3))
    
    while True:
        
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx,:] = (link_id, movie_id, 1)
        idx = positive_samples
        
        while idx < batch_size:
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            
            if not (link_id, movie_id) in pairs_set:
                batch[idx,:] = (link_id, movie_id, -1)
                idx += 1
        
        np.random.shuffle(batch)
        yield {'link':batch[:,0], 'movie':batch[:,1]}, batch[:,2]

In [12]:
model.fit_generator(batchifier(pairs, positive_samples=512, negative_ratio=10), epochs=5,
                   steps_per_epoch= len(pairs)//512)

with open('embedding_movie_model.pkl', 'wb') as fout:
    pickle.dump(model, fout)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Saving Model

In [56]:
import pickle
with open('embedding_movie_model.pkl', 'rb') as fin:
    model = pickle.load(fin)





Getting Embedding layer weights
movie_weight = item profile movie-to-features (10000,30)
Normalize weights

In [14]:
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]

norm_per_movie = np.linalg.norm(movie_weights, axis=1)
normalized_movies = (movie_weights.T / norm_per_movie).T
normalized_movies.shape

(10000, 30)

# Data Preparation

### Data split for train/test

###
When you recommend items, the item shouldn't be in train set. So define the function to filter items already reviewed.

In [15]:
rating_train_df, rating_test_df = train_test_split(rating_full_df, stratify =rating_full_df['person_id'], test_size=0.3)
rating_indexed_train = rating_train_df.set_index('person_id')
rating_indexed_test = rating_test_df.set_index('person_id')

In [16]:
def get_items_reviewed(person_id, rating_full_df):
    return rating_train_df[rating_train_df['person_id']==person_id].movie_id.tolist()

# Content-Based Filtering Model

In [17]:
class CBF_Recommender:
    
    MODEL_NAME='Content-Based Filtering'
    
    def __init__(self, normalized_movies, rating_train_df):
        self.normalized_movies = normalized_movies
        self.rating_train_df = rating_train_df
    
    def get_model_name(self):
        return self.MODEL_NAME
    
    
    def recommend_items(self, person_id, topn):
        user_df = rating_train_df[rating_train_df['person_id']==person_id]
        user_norm = []
        
        for i in range(user_df.shape[0]):
            r = normalized_movies[user_df['movie_id']][i]*user_df['rating_norm'].values[i]
            user_norm.append(r)
        user_profile = np.sum(user_norm, axis=0)
        
        dists = np.dot(normalized_movies, user_profile)
        similar_items = np.argsort(dists)[-topn:]
        
        items_to_ignore = get_items_reviewed(person_id, rating_full_df)
        similar_items_filtered = list(filter(lambda x: x not in items_to_ignore, similar_items))
                
        recommend = [(x,dists[x]) for x in similar_items_filtered]
        cbf_recs_df = pd.DataFrame(recommend, columns=['movie_id','cbf_recStrength'])
        
        return cbf_recs_df.sort_values('cbf_recStrength', ascending=False)


In [18]:
cbf_rec = CBF_Recommender(normalized_movies, rating_train_df)
cbf_rec.recommend_items(0,10)

Unnamed: 0,movie_id,cbf_recStrength
8,8952,0.305348
7,8901,0.305338
6,7559,0.305255
5,1798,0.304793
4,7049,0.304742
3,6397,0.304657
2,5071,0.30456
1,9557,0.304554
0,6515,0.30454


In [34]:
class CLF_Recommender:
    
    MODEL_NAME='Classifier Model'
    
    def __init__(self, normalized_movies, rating_train_df):
        self.normalized_movies = normalized_movies
        self.rating_train_df = rating_train_df
    
    def get_model_name(self):
        return self.MODEL_NAME
    

    def recommend_items(self, person_id,topn):
        user_df = rating_train_df[rating_train_df['person_id']==person_id]
        mean_review = user_df.rating.mean()
        best = user_df[user_df['rating'] > mean_review].movie_id.tolist()
        worst = user_df[user_df['rating']< mean_review].movie_id.tolist()
        
        X = np.asarray([normalized_movies[movie] for movie in best+worst])
        y = np.asarray([1 for _ in best] + [0 for _ in worst])
        
        clf = SVC(kernel='linear')
        clf.fit(X, y) 

        estimated_movie_ratings = clf.decision_function(normalized_movies)

        best = np.argsort(estimated_movie_ratings)[-topn:]
        items_to_ignore = get_items_reviewed(person_id, rating_train_df)
        recommended_filtered = list(filter(lambda x: x not in items_to_ignore, best))
        
        recommend = [(x,estimated_movie_ratings[x]) for x in recommended_filtered]
        clf_recs_df = pd.DataFrame(recommend, columns=['movie_id','clf_recStrength'])
        
        return clf_recs_df.sort_values('clf_recStrength', ascending=False)

In [35]:
clf_rec = CLF_Recommender(normalized_movies, rating_train_df)
clf_rec.recommend_items(2,10)

Unnamed: 0,movie_id,clf_recStrength
9,3349,2.601849
8,6,2.490982
7,42,2.476291
6,19,2.45421
5,39,2.436628
4,85,2.397752
3,1364,2.311233
2,7,2.27944
1,149,2.277692
0,34,2.243436


# Collaborative Filtering Model

In [36]:
NUMBER_OF_FACTORS=32

class CF_Recommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, normalized_movies, rating_train_df):
        self.normalized_movies = normalized_movies
        self.rating_train_df = rating_train_df
        self.all_prediction = self.get_user_item(rating_train_df)
        
    def get_model_name(self):
        return self.MODEL_NAME
    
    def get_user_item(self, rating_train_df):
        user_item = pd.pivot(rating_train_df, index='person_id', columns='movie_id', values='rating_norm').fillna(0)
        user_item_matrix = user_item.values
        U, s, Vt = svds(user_item_matrix, k=NUMBER_OF_FACTORS)
        sigma = np.diag(s)
        all_prediction = np.dot(np.dot(U, sigma), Vt)
        return all_prediction
    
    def recommend_items(self, person_id, topn):
        person_items = self.all_prediction[person_id]
        best_indices = np.argsort(person_items)[-topn:]
        items_to_ignore = get_items_reviewed(person_id, rating_train_df)
        
        recommended_filtered = list(filter(lambda x: x not in items_to_ignore, best_indices))
        recommend = [(x,person_items[x]) for x in recommended_filtered]
        
        cf_recs_df = pd.DataFrame(recommend, columns=['movie_id','cf_recStrength'])
        
        return cf_recs_df.sort_values('cf_recStrength', ascending=False)
    

In [37]:
cf_rec = CF_Recommender(normalized_movies, rating_full_df)
cf_rec_items = cf_rec.recommend_items(2,10)

In [38]:
cf_rec_items

Unnamed: 0,movie_id,cf_recStrength
8,13,0.000796
7,2241,0.000734
6,252,0.000645
5,1082,0.000471
4,81,0.00044
3,6946,0.000423
2,880,0.000386
1,1619,0.000381
0,8848,0.000377


# Hybrid Model

In [59]:
class Hybrid_Recommender:
    
    MODEL_NAME = 'Hybrid Model'
    
    
    def __init__(self, cf_rec_model, cbf_rec_model, clf_rec_model):
        self.cf_rec_model = cf_rec_model
        self.cbf_rec_model = cbf_rec_model
        self.clf_rec_model = clf_rec_model
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, person_id, topn):
        
        cf_recs_df = self.cf_rec_model.recommend_items(person_id, 4000)
        cbf_recs_df = self.cbf_rec_model.recommend_items(person_id, 4000)
        clf_recs_df = self.clf_rec_model.recommend_items(person_id, 4000)
        
        hybrid_df = cf_recs_df.merge(cbf_recs_df, on='movie_id', how='inner')
        hybrid_df = hybrid_df.merge(clf_recs_df, on='movie_id', how='inner')
        hybrid_df['hb_recStrength'] = hybrid_df['cf_recStrength']*hybrid_df['cbf_recStrength']*hybrid_df['clf_recStrength']
        
        recommended_items = hybrid_df.sort_values('hb_recStrength', ascending=False)[:10]
        items_to_ignore = get_items_reviewed(person_id, rating_train_df)
        
        recommended_filtered = recommended_items[~recommended_items.movie_id.isin(items_to_ignore)]

        return recommended_filtered[['hb_recStrength','movie_id']]
    

In [63]:
class Hybrid_Recommender:
    
    MODEL_NAME = 'Hybrid Model'
    
    
    def __init__(self, cf_rec_model, cbf_rec_model):
        self.cf_rec_model = cf_rec_model
        self.cbf_rec_model = cbf_rec_model
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, person_id, topn):
        
        cf_recs_df = self.cf_rec_model.recommend_items(person_id, 4000)
        cbf_recs_df = self.cbf_rec_model.recommend_items(person_id, 4000)

        
        hybrid_df = cf_recs_df.merge(cbf_recs_df, on='movie_id', how='inner')
        hybrid_df['hb_recStrength'] = hybrid_df['cf_recStrength']*hybrid_df['cbf_recStrength']
        
        recommended_items = hybrid_df.sort_values('hb_recStrength', ascending=False)[:10]
        items_to_ignore = get_items_reviewed(person_id, rating_train_df)
        
        recommended_filtered = recommended_items[~recommended_items.movie_id.isin(items_to_ignore)]

        return recommended_filtered[['hb_recStrength','movie_id']]

In [65]:
hb_rec = Hybrid_Recommender(cf_rec, cbf_rec)
hb_rec_df = hb_rec.recommend_items(0,1000)
hb_rec_df

Unnamed: 0,hb_recStrength,movie_id
0,0.000187,4873
1,0.00014,7578
2,0.000115,3896
4,8.5e-05,8296
3,8.4e-05,2611
5,8.3e-05,9778
7,8.3e-05,7554
6,8.3e-05,6795
9,7.8e-05,7985
8,7.7e-05,5829


# Evaluation

In [55]:
EVAL_RANDOM_SAMPLE_NON_REVIEWED_ITEMS = 100

class ModelEvaluator:
    
    def __init__(self, rating_test_df):
        self.rating_test_df = rating_test_df
    
    def get_not_reviewed_sample(self, person_id, sample_size, seed=42):
        reviewed_items = get_items_reviewed(person_id, self.rating_test_df)
        all_items = np.asarray(range(0,10000))
        non_reviewed_items = set(all_items) - set(reviewed_items)

        random.seed(seed)
        non_reviewed_samples = random.sample(non_reviewed_items, sample_size)
        return set(non_reviewed_samples)

    #returning whether if the movie is in topn list and the index
    def _verify_hit_top_n(self, movie_id, recommended_items, topn):        
        try:
            index = next(i for i in range(len(recommended_items)) if recommended_items[i] == movie_id)
        except:
            index = -1
        hit = int(index in range(0, topn))
        return hit, index

        
    def evaluate_model_for_user(self, model, person_id):
        
        #Getting the items in test set
        person_testset = rating_indexed_test.loc[person_id]
        movies_test = person_testset['movie_id'].values
        movies_cnt_test = len(movies_test)

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, topn=1000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        
        #For each item the user has interacted in test set
        for item_id in movies_test:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_reviewed_sample = self.get_not_reviewed_sample(person_id, sample_size=EVAL_RANDOM_SAMPLE_NON_REVIEWED_ITEMS)

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_reviewed_sample.union(set([item_id]))
          

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['movie_id'].isin(items_to_filter_recs)]
            valid_recs = valid_recs_df['movie_id'].values
            
            
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10
            


        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(movies_cnt_test)
        recall_at_10 = hits_at_10_count / float(movies_cnt_test)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'reviewed_count': movies_cnt_test,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    
    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(rating_indexed_test.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        
       

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('reviewed_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['reviewed_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['reviewed_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}  
        
        return global_metrics, detailed_results_df

In [45]:
model_eval = ModelEvaluator(rating_test_df)
cf_result, cf_result_df = model_eval.evaluate_model(cf_rec)
cf_result

{'modelName': 'Collaborative Filtering',
 'recall@5': 0.35171620558145167,
 'recall@10': 0.5193262304013561}

In [53]:
cbf_result, cbf_result_df = model_eval.evaluate_model(cbf_rec)
cbf_result

{'modelName': 'Content-Based Filtering',
 'recall@5': 0.05691930504267813,
 'recall@10': 0.1111447424178219}

In [57]:
clf_result, clf_result_df = model_eval.evaluate_model(clf_rec)
clf_result

{'modelName': 'Classifier Model',
 'recall@5': 0.0498214177613657,
 'recall@10': 0.09929475149827471}

In [66]:
hb_rec_result, hb_rec_result_df = model_eval.evaluate_model(hb_rec)
hb_rec_result

{'modelName': 'Hybrid Model',
 'recall@5': 0.018130637447787395,
 'recall@10': 0.018130637447787395}

In [68]:
with open('modelresult.pkl', 'wb') as fout:
    pickle.dump({
        'cf': [cf_result, cf_result_df],
        'cbf': [cbf_result, cbf_result_df],
        'clf': [clf_result, clf_result_df],
        'hyb': [hb_rec_result, hb_rec_result_df]
    }, fout)