# Movie Recommendation System

### Part 1: Movie Embedding

Using the links associated with each movie, I trained an embedding model that connects movies and lets us find similarities between them.

### 1-1. Prepare data

In [1]:
# Load, preprocess and save data/model (random drives the negative sampling)
import json
import pickle
import random
from collections import Counter

# To build a recommendation system
import numpy as np
import pandas as pd

# Build an embedding model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot
from keras.models import Model

Using TensorFlow backend.


We are going to use two datasets to build this movie recommendation system.<br>
The first is a set of ratings for 10,000 movies given by 1,000 users.
These ratings are not real data: they were randomly generated with NumPy to demonstrate how to build a collaborative filtering model.

In [2]:
# Movie records: every line of the file is one JSON document.
movies = []
with open('data/wp_movies_10k.ndjson') as fin:
    for line in fin:
        movies.append(json.loads(line))

# Ratings: despite the .ndjson extension, each line is parsed as plain
# comma-separated text (newline removed, then split on ',').
rates = []
with open('data/rating.ndjson') as finn:
    for line in finn:
        rates.append(line.replace('\n', '').split(','))

In [3]:
# Build the raw ratings frame and cast the string fields to numeric dtypes
# in a single vectorised step (instead of a Python-level apply per column).
rating_df = pd.DataFrame(rates, columns=['movie_id', 'person_id', 'rating']).astype(
    {'movie_id': int, 'person_id': int, 'rating': float}
)

# Collapse any duplicate (person_id, movie_id) rows to their mean rating.
# .mean() is used rather than agg(np.mean), which recent pandas versions
# flag with a FutureWarning.
rating_full_df = rating_df.groupby(['person_id', 'movie_id']).mean().reset_index()

# The intermediate frame is no longer needed; release it.
del rating_df

rating_full_df.shape

(220251, 3)

In [4]:
# Scale every rating by the L2 norm of the complete rating vector and keep
# the result alongside the raw rating as a new 'rating_norm' column.
rating = np.asarray(rating_full_df['rating'])
rating_l2 = np.linalg.norm(rating)
norm = rating / rating_l2
rating_full_df['rating_norm'] = norm

<br>

### 1-2. Build the Embedding Model

Using `Counter`, I counted how often each link occurs and kept only the links that appear at least 3 times. <br>
I then created (link index, movie index) tuples and fed them, together with negative samples, to the embedding model to train it.

In [5]:
# Tally how many times each link occurs across all movies
# (element 2 of a movie record holds that movie's links).
link_counts = Counter()
for links in (record[2] for record in movies):
    link_counts.update(links)
link_counts.most_common(3)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867)]

In [7]:
# Keep only links that occur at least 3 times and give each a dense index.
top_links = [name for name, count in link_counts.items() if count >= 3]
link_to_idx = {name: position for position, name in enumerate(top_links)}

# Movie title (element 0 of a record) <-> position in `movies`.
idx_to_movie = [record[0] for record in movies]
movie_to_idx = {title: position for position, title in enumerate(idx_to_movie)}

# Every (link index, movie index) combination for links that survived the filter.
pairs = []
for movie in movies:
    movie_idx = movie_to_idx[movie[0]]
    for link in movie[2]:
        link_idx = link_to_idx.get(link)
        if link_idx is not None:
            pairs.append((link_idx, movie_idx))

# Set form gives O(1) membership checks when drawing negative samples.
pairs_set = set(pairs)
len(pairs_set)

671403

In [11]:
def movie_embedding_model(embedding_size=30):
    """Build and compile the link/movie embedding network.

    The network takes one link index and one movie index, looks each up in
    its own embedding table (sized from the notebook-level `top_links` and
    `movie_to_idx`), and outputs the normalised dot product of the two
    vectors (the Dot layer is created with normalize=True, which per the
    Keras documentation yields the cosine proximity).

    Parameters
    ----------
    embedding_size : int
        Length of each link / movie embedding vector.

    Returns
    -------
    A compiled Keras Model with inputs 'link' and 'movie', optimised with
    'nadam' on a mean-squared-error loss.
    """
    link_input = Input(name='link', shape=(1,))
    movie_input = Input(name='movie', shape=(1,))

    link_vectors = Embedding(
        name='link_embedding',
        input_dim=len(top_links),
        output_dim=embedding_size,
    )(link_input)
    movie_vectors = Embedding(
        name='movie_embedding',
        input_dim=len(movie_to_idx),
        output_dim=embedding_size,
    )(movie_input)

    # Normalised dot product along the embedding axis, then flatten to one value per sample.
    similarity = Dot(name='dot_product', normalize=True, axes=2)([link_vectors, movie_vectors])
    similarity = Reshape((1,))(similarity)

    network = Model(inputs=[link_input, movie_input], outputs=[similarity])
    network.compile(optimizer='nadam', loss='mse')
    return network


model = movie_embedding_model()
model.summary()





Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
link (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
movie (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
link_embedding (Embedding)      (None, 1, 30)        2007390     link[0][0]                       
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 30)        300000      movie[0][0]                      
________________________________________________________________________________________

In [9]:
def batchifier(pairs, positive_samples=50, negative_ratio=5,
               n_links=None, n_movies=None, known_pairs=None):
    """Endlessly yield shuffled training batches of (link, movie) pairs.

    Every batch contains `positive_samples` pairs sampled from `pairs`
    (label 1) followed by `positive_samples * negative_ratio` randomly drawn
    pairs that are not in `known_pairs` (label -1); the rows are shuffled
    before being yielded.

    Parameters
    ----------
    pairs : list of (link_idx, movie_idx) tuples
        Observed pairs to sample positives from. Must contain at least
        `positive_samples` entries (random.sample raises ValueError otherwise).
    positive_samples : int
        Number of positive rows per batch.
    negative_ratio : int
        Number of negative rows generated per positive row.
    n_links, n_movies : int, optional
        Exclusive upper bounds for randomly drawn link / movie indices.
        Default to len(top_links) and len(movie_to_idx) from the notebook.
    known_pairs : set of (link_idx, movie_idx), optional
        Pairs that must never be emitted as negatives. Defaults to the
        notebook-level `pairs_set`.

    Yields
    ------
    ({'link': ndarray, 'movie': ndarray}, ndarray)
        Model inputs keyed by input-layer name, and the matching labels.
        Each array has length positive_samples * (1 + negative_ratio).

    Notes
    -----
    The generator never terminates. It also cannot make progress if every
    possible (link, movie) combination is contained in `known_pairs`.
    """
    if n_links is None:
        n_links = len(top_links)
    if n_movies is None:
        n_movies = len(movie_to_idx)
    if known_pairs is None:
        known_pairs = pairs_set

    batch_size = positive_samples * (1 + negative_ratio)

    while True:
        # Allocate a fresh buffer for every batch. The yielded arrays are
        # views into it, so reusing one buffer would let later iterations
        # overwrite batches that a consumer (e.g. a prefetch queue) still holds.
        batch = np.zeros((batch_size, 3))

        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples

        # Rejection-sample negatives: draw random combinations and keep only
        # those that are not known positives.
        while idx < batch_size:
            movie_id = random.randrange(n_movies)
            link_id = random.randrange(n_links)

            if (link_id, movie_id) not in known_pairs:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1

        # Mix positives and negatives so the labels are not ordered within the batch.
        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]

In [12]:
# One epoch is sized so that roughly every positive pair is drawn once:
# 512 positives per batch, hence len(pairs) // 512 batches per epoch.
positives_per_batch = 512
training_batches = batchifier(pairs, positive_samples=positives_per_batch, negative_ratio=10)
model.fit_generator(
    training_batches,
    epochs=5,
    steps_per_epoch=len(pairs) // positives_per_batch,
)

# Persist the trained model to disk.
with open('embedding_movie_model.pkl', 'wb') as fout:
    pickle.dump(model, fout)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Extract the weights from the movie embedding layer and normalise each movie vector to unit length to create the item profile (movie-to-features).

In [14]:
# Pull the trained weight matrix out of the movie embedding layer
# (first entry of the layer's weight list).
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]

# Divide each row by its own L2 norm so every movie vector has unit length;
# the transposes let the per-row norms broadcast across the columns.
norm_per_movie = np.linalg.norm(movie_weights, axis=1)
normalized_movies = np.divide(movie_weights.T, norm_per_movie).T
normalized_movies.shape

(10000, 30)