Source code originally from
https://github.com/Wann-Jiun/nycdsa_project_5_recommender/blob/master/deep_learning.ipynb

Blog post from
https://nycdatascience.com/blog/student-works/deep-learning-meets-recommendation-systems/


<h2>Step 1 - Read in initial movie data</h2>

In [None]:
import os

import numpy as np
import pandas as pd

The links.csv file has a list of movies, and their IMDB IDs. This is the movie dataset that we'll be using in this notebook.

In [None]:
# Load links.csv (comma-separated) into a pandas DataFrame
df_data = pd.read_csv(
    '/home/nbuser/library/dataset/ml-latest-small/links.csv',
    sep=',',
)

In [None]:
# Check the type of df_data (read_csv should have produced a pandas DataFrame)
type(df_data)

In [None]:
# Preview the first few records of the dataframe to sanity-check the parsed columns
df_data.head()

In [None]:
# Check the number of movie links that were read in (one dataframe row per link).
len(df_data.index)

In [None]:
# Build the idx_to_movie dict from the dataframe rows.
# itertuples() yields (Index, first column, second column, ...), so row[1] is the
# first CSV column and row[2] is the second one, which holds the IMDB ID.
# Each key is the first column's value minus one.
# NOTE(review): the first column is presumably the MovieLens movieId - confirm against links.csv.
idx_to_movie = {row[1] - 1: row[2] for row in df_data.itertuples()}

In [None]:
# Confirm that idx_to_movie is a plain dict
type(idx_to_movie)

In [None]:
# Print the contents of the idx_to_movie dict. Each entry maps an index value to the corresponding IMDB ID
idx_to_movie

In [None]:
# Check the number of movies that are present in the dict object.
len(idx_to_movie)

In [None]:
# Pre-allocate one placeholder slot (0) per entry of idx_to_movie; the slots are
# filled with IMDB IDs in a later cell and the leftover zeros are filtered out.
total_movies = len(idx_to_movie)
movies = [0 for _ in range(total_movies)]

In [None]:
# Confirm that movies is a list
type(movies)

In [None]:
# Number of pre-allocated slots (same as the number of entries in idx_to_movie)
len(movies)

In [None]:
# Copy an IMDB ID into slot `position` when idx_to_movie has that position as a key
# and the ID is exactly six digits long (get_poster builds the TMDB identifier as
# "tt0" + ID). Only positions 0 .. len(movies)-1 are visited, so dict entries with a
# larger key are never copied; unfilled slots keep the placeholder 0.
for position in range(len(movies)):
    imdb_id = idx_to_movie.get(position)
    if imdb_id is not None and len(str(imdb_id)) == 6:
        movies[position] = imdb_id

In [None]:
# Drop the placeholder zeros: keep only the slots that actually received an IMDB ID.
# A list comprehension yields a list directly, so the result can be indexed and
# measured with len() in the following cells.
movies = [imdb for imdb in movies if imdb != 0]

In [None]:
# Confirm that movies is still a list after filtering
type(movies)

In [None]:
# Update the movie count to the number of IMDB IDs that survived the filter
total_movies  = len(movies)

In [None]:
# Display the updated movie count
total_movies

<h2> Step 2 - Fetch Movie Poster images</h2>

In [None]:
import requests
import json

from IPython.display import Image
from IPython.display import display
from IPython.display import HTML


In [None]:
# Get base url filepath structure. w185 corresponds to size of movie poster.
headers = {'Accept': 'application/json'}
payload = {'api_key': 'bb3beb7ec7af6d1c0c23ca7381b62a89'} 
response = requests.get("http://api.themoviedb.org/3/configuration", params=payload, headers=headers)
response = json.loads(response.text)
base_url = response['images']['base_url'] + 'w185'

In [None]:
def get_poster(imdbid, base_url, api_key=None):
    """Look up the poster URL of a movie on themoviedb.org.

    Parameters
    ----------
    imdbid : int or str
        Numeric part of the IMDB ID (without the "tt" prefix).
    base_url : str
        Image base URL (including the size segment) that the poster path is appended to.
    api_key : str, optional
        themoviedb.org API key. When omitted, the TMDB_API_KEY environment
        variable is used.

    Returns
    -------
    tuple
        (poster_url, imdbid). poster_url equals base_url when no poster path
        could be extracted from the API response.

    Raises
    ------
    RuntimeError
        If no API key is supplied and TMDB_API_KEY is not set.
    """
    if api_key is None:
        api_key = os.environ.get("TMDB_API_KEY")
    if not api_key:
        raise RuntimeError("Set the TMDB_API_KEY environment variable to a themoviedb.org API key")

    # Get IMDB movie ID: "tt" followed by the number zero-padded to at least seven
    # digits. For six-digit IDs this is identical to the former "tt0" + id, and it
    # no longer produces a wrong identifier for shorter or longer IDs.
    movie_id = "tt" + str(imdbid).zfill(7)

    # Query themoviedb.org API for movie poster path.
    # HTTPS keeps the key out of clear-text traffic; the timeout bounds a stalled call.
    movie_url = 'https://api.themoviedb.org/3/movie/{:}/images'.format(movie_id)
    headers = {'Accept': 'application/json'}
    payload = {'api_key': api_key}
    response = requests.get(movie_url, params=payload, headers=headers, timeout=10)
    try:
        # `or ""` covers a null file_path, which would otherwise break the concatenation below.
        file_path = json.loads(response.text)['posters'][0]['file_path'] or ""
    except (ValueError, KeyError, IndexError, TypeError):
        # Unparseable body, error payload without 'posters', or an empty poster list:
        # report "no poster" instead of failing the whole batch.
        file_path = ""

    return (base_url + file_path, imdbid)

In [None]:
# Query the poster URL of every movie. URL and IMDB hold the raw lookup result per
# movie; URL_IMDB collects only the movies for which a poster path was found
# (get_poster returns the bare base_url when there is none).
URL = [0]*total_movies
IMDB = [0]*total_movies
URL_IMDB = {"url": [], "imdb": []}
for i, movie in enumerate(movies):
    poster_url, imdb_id = get_poster(movie, base_url)
    URL[i], IMDB[i] = poster_url, imdb_id
    if poster_url != base_url+"":
        URL_IMDB["url"].append(poster_url)
        URL_IMDB["imdb"].append(imdb_id)


In [None]:
# Turn the collected poster URLs and IMDB IDs into a DataFrame with the
# columns "url" and "imdb" (one row per movie that has a poster).
df = pd.DataFrame(URL_IMDB)
df

In [None]:
# images = ''
# for i in range(n_display):
#     images += "<img style='width: 120px; margin: 0px; \
#                 float: left; border: 1px solid black;' src='%s' />" \
#                 % URL[i]

# display(HTML(images))

In [None]:
# From here on total_movies counts the rows of df, i.e. the movies with a poster
total_movies = df.shape[0]
total_movies

In [None]:
# Download the movie poster images from the URLs collected in df and store them locally.
# Each poster is saved as "<row position in df>.jpg" inside poster_path.
import urllib.request

# Directory that holds the downloaded poster files
poster_path = "/home/nbuser/library/dataset/ml-latest-small/posters/"

# Commenting out this code as the movie posters have already been downloaded. Only need to download it once.
# for i in range(total_movies):
#     urllib.request.urlretrieve(df.url[i], poster_path + str(i) + ".jpg")

<h2> Step 3 - Image Pre-processing</h2>

VGG is the Visual Geometry Group at the University of Oxford (http://www.robots.ox.ac.uk/~vgg/). In 2014 the group used deep convolutional neural networks to build image-recognition models that took first place in the localisation track and second place in the classification track of the ILSVRC 2014 challenge. The research paper outlining their approach and method is available at https://arxiv.org/pdf/1409.1556.pdf

In [None]:
# Import the VGG model that is included as part of the keras distribution.
# Here 16 refers to a 16 layer convolutional neural network.
from keras.applications import VGG16
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing import image as kimage

# Two lists with one slot per movie: img keeps the loaded images, x keeps the
# model-ready arrays.
img = [0]*total_movies
x = [0]*total_movies

# For every poster file "<i>.jpg":
#   1. Load the image, resized to 224 x 224
#   2. Convert the image instance to a Numpy array using the keras preprocessing function
#   3. Insert a new leading axis so the array becomes a batch holding a single image
#   4. Apply the preprocess_input function of keras.applications.vgg16
for i in range(total_movies):
    poster_file = poster_path + str(i) + ".jpg"
    img[i] = kimage.load_img(poster_file, target_size=(224, 224))
    pixels = kimage.img_to_array(img[i])
    batch = np.expand_dims(pixels, axis=0)
    x[i] = preprocess_input(batch)

<h2> Step 4 - Image Classification</h2>

Image pre-processing is done. Now it is time to run the posters through the pre-built, pre-trained VGG16 model. A pre-trained model has already been trained on a dataset and contains the weights and biases that represent the features of that dataset. Using a pre-trained model saves considerable computing time and resources.

In [None]:
# Create a Keras model instance for VGG16
# Arguments
#  - include_top: whether to include the 3 fully-connected layers at the top of the network
#  - weights: None (random initialization), 'imagenet' (pre-training on ImageNet), or the path to the weights file
# VGG16 is the constructor imported from keras.applications. The previous call,
# keras.applications.vgg16(...), referenced the never-imported name `keras` and
# targeted a module rather than the model constructor.
model = VGG16(include_top=False, weights='imagenet')


In [None]:
# The current model prediction takes a very long time. Hence reducing the size of the dataset here.
# From this point on only the first 5 pre-processed posters are used, so the
# similarity matrix computed from the predictions is 5 x 5 and any movie index used
# with it must be smaller than 5.
total_movies=5

In [None]:
# pred keeps the flattened model output per movie; pred_norm is allocated with the
# same size; matrix_res stacks the flattened outputs as rows.
# Every flattened prediction must hold exactly 25088 values to fit into a row.
# NOTE(review): 25088 is presumably 7 * 7 * 512, the VGG16 output without the top
# layers for a 224 x 224 input - confirm.
pred = [0]*total_movies
pred_norm = [0]*total_movies
matrix_res = np.zeros((total_movies, 25088))

for i in range(total_movies):
    features = model.predict(x[i]).ravel()
    pred[i] = features
    matrix_res[i] = features

In [None]:
# Cosine similarity between all pairs of feature rows:
#   1. gram holds the pairwise dot products; its diagonal is each row's squared length
#   2. dividing by the row-vector and the column-vector of lengths normalises every entry
gram = matrix_res @ matrix_res.T
norms = np.sqrt(np.diagonal(gram))[np.newaxis, :]
sim = gram / norms / norms.T
# Length of one flattened prediction
len(pred[0])

In [None]:
# Display the pairwise similarity matrix
sim

<h2> Step 5 - Build Movie Recommender</h2>

In [None]:
# Load in movie data: map each row position of df to that row's IMDB ID.
# The "imdb" column is selected by name. Picking the first column by position
# (row[1] of itertuples) depends on how pandas orders the columns of a DataFrame
# built from a dict and can yield the poster URL instead, which get_poster cannot
# turn into a movie identifier.
idx_to_movie2 = {position: imdb for position, imdb in enumerate(df["imdb"])}

In [None]:
def top_k_movies(similarity, mapper, movie_idx, k=6):
    """Return the mapped entries of the k highest values in one similarity row.

    Parameters
    ----------
    similarity : 2-D array
        Similarity matrix; row `movie_idx` is ranked.
    mapper : mapping
        Translates a column position of `similarity` into the value to return.
    movie_idx : int
        Row of `similarity` to rank.
    k : int, default 6
        Number of entries to return; fewer are returned when the row is shorter.

    Returns
    -------
    list
        mapper values ordered from the highest similarity to the lowest. The
        queried movie itself is included whenever its own similarity ranks in
        the top k.
    """
    ascending = np.argsort(similarity[movie_idx, :])
    best_first = ascending[::-1][:k]
    return [mapper[position] for position in best_first]

<h2> Step 6 - Generate Movie Recommendations</h2>

In [None]:
# Position (row of sim) of the movie to recommend for. It must be smaller than the
# number of rows in sim; the earlier value 1811 only works with a similarity
# matrix that covers at least 1812 posters.
#idx = 1811
idx = 3
# Rank the movies by similarity to idx and keep at most the first five
movies = top_k_movies(sim, idx_to_movie2, idx)[:5]

In [None]:
# Number of poster thumbnails to render
n_display = 5
URL = [0]*n_display

# Look up the poster URL of every recommended movie (IMDB is updated alongside)
for i, movie in enumerate(movies):
    (URL[i], IMDB[i]) = get_poster(movie, base_url)

# One <img> tag per slot of URL, concatenated into a single HTML snippet
images = ''.join(
    "<img style='width: 110px; margin: 0px; \
                float: left; border: 1px solid black;' src='%s' />" % poster_url
    for poster_url in URL
)

display(HTML(images))