# 04.2 Build a recommender system ...
# WESmith 06/21/23
## This is from Osinga's Deep Learning Cookbook

In [None]:
import json
from collections import Counter
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers import Dot
from sklearn.linear_model import LinearRegression
import numpy as np
import random
from sklearn import svm
import os

In [None]:
# Environment switch: when True, a later cell mounts Google Drive and reads the
# data from there; when False, the data is read from the local 'data' directory.
colab = False # set to True on colab

In [None]:
# List the compute devices TensorFlow reports for this machine; `devices` is
# inspected below to decide between training (GPU) and loading a saved model (CPU).
# NOTE(review): tensorflow.python.client is not a public TF module --
# tf.config.list_physical_devices() looks like the public equivalent; confirm.
from tensorflow.python.client import device_lib
devices = device_lib.list_local_devices()
devices

In [None]:
# WS addition: 'gpu' as soon as any listed device is a GPU, otherwise 'cpu'.
has_gpu = any(dev.device_type == 'GPU' for dev in devices)
device_type = 'gpu' if has_gpu else 'cpu'
device_type

In [None]:
# Record the library versions in use; `ks` is also used later to load a saved model.
import tensorflow as tf
import keras as ks
tf.__version__, ks.__version__  

In [None]:
# Resolve `fullpath`, the directory that holds the input data file.
if colab:
    # On colab the data lives in Google Drive, which has to be mounted first.
    from google.colab import drive
    drive.mount('/content/drive')
    base     = '/content/drive/My Drive'
    data_dir = 'data'
    fullpath = os.path.join(base, data_dir)
else:
    # Locally: a 'data' directory relative to the current working directory.
    fullpath = 'data'    

In [None]:
# Name of the newline-delimited JSON file with the movie records (read from `fullpath`).
filename = 'wp_movies_10k.ndjson'

In [None]:
# Load the movie records: one JSON document per line of the file.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding, and blank lines are skipped because
# json.loads() raises on an empty document.
with open(os.path.join(fullpath, filename), encoding='utf-8') as fin:
    movies = [json.loads(line) for line in fin if line.strip()]

In [None]:
# Number of movie records loaded.
len(movies)  # WS 11/13/18

In [None]:
# WS 11/13/18
# Layout of one movie record, by index:
#   0: name of the movie
#   1: movie metadata
#   2: actors, awards, misc info (the links of the Wikipedia page)
#   3: rotten score
#   4: other score
# `dd` is the first record, used as a sample in the next few cells.
dd = movies[0]
dd[0]

In [None]:
# Metadata field names available for the sample movie.
list(dd[1].keys())

In [None]:
# Number of links found on the sample movie's Wikipedia page.
# The links are a very long list of names, awards, reviewers and misc info.
len(dd[2]) # a very long list of names, awards, reviewers, misc info: links

In [None]:
#dd[2]

In [None]:
# The two score fields of the sample movie (rotten score, other score).
dd[3], dd[4]

In [None]:
# Count, over all movies, how often each link occurs in the movies' link lists,
# then show the 15 most frequent links.
link_counts = Counter()
for movie in movies:
    link_counts.update(movie[2])
link_counts.most_common(15)

In [None]:
#help(link_counts)  # WS Counter is very useful

In [None]:
# Sum of all counts. NOTE: Counter.total() was added in Python 3.10;
# on older interpreters use sum(link_counts.values()).
link_counts.total()  # total number of links over all movies

In [None]:
# WS: mean number of links per movie (`/` is already true division on ints).
link_counts.total() / len(movies)

In [None]:
# Confirm that link_counts is a collections.Counter.
type(link_counts)  # WS 11/13/18

In [None]:
#link_counts.items()

In [None]:
# Keep only the links that occur at least 3 times over all movies; the position
# of a link in this list becomes its integer id in the next cell.
top_links = [link for link, c in link_counts.items() if c >= 3]
top_links[:10]  # WS 11/13/18

In [None]:
# Map each retained link to its position in top_links (its integer id).
link_to_idx = {link: idx for idx, link in enumerate(top_links)}
#link_to_idx # WS 11/13/18  add an index to the top links

In [None]:
# Map each movie name to its position in `movies` (its integer id).
# If two records share a name, the later position wins.
movie_to_idx = {movie[0]: idx for idx, movie in enumerate(movies)}
#movie_to_idx  # WS 11/13/18 add an index to the movie names

In [None]:
# Every (link id, movie id) combination that actually occurs: one tuple per
# retained link on each movie's page. Links that did not make it into
# link_to_idx are dropped.
pairs = [
    (link_to_idx[link], movie_to_idx[record[0]])
    for record in movies
    for link in record[2]
    if link in link_to_idx
]
# Set copy of the same tuples for fast membership tests during negative sampling.
pairs_set = set(pairs)
len(pairs), len(top_links), len(movie_to_idx)

In [None]:
# First few (link id, movie id) tuples.
pairs[:10]  # WS 11/13/18 tuple linking link index to movie index

In [None]:
random.seed(5)
def batchifier(pairs, positive_samples=50, negative_ratio=10,
               num_links=None, num_movies=None, known_pairs=None):
    """Yield an endless stream of training batches of (link, movie) examples.

    Each batch holds `positive_samples` pairs drawn from `pairs` (label 1) and
    `positive_samples * negative_ratio` random (link, movie) combinations that
    are not in `known_pairs` (label -1), shuffled together.

    Args:
        pairs: sequence of (link_id, movie_id) tuples that really occur.
        positive_samples: number of positive examples per batch.
        negative_ratio: number of negative examples per positive example.
        num_links: number of distinct link ids to sample negatives from;
            defaults to len(top_links).
        num_movies: number of distinct movie ids to sample negatives from;
            defaults to len(movie_to_idx).
        known_pairs: container of real pairs that must not be used as
            negatives; defaults to the module-level pairs_set.

    Yields:
        ({'link': link_ids, 'movie': movie_ids}, labels), three 1-D arrays of
        length positive_samples * (1 + negative_ratio).

    Raises:
        ValueError: from random.sample() if positive_samples > len(pairs).
    """
    if num_links is None:
        num_links = len(top_links)
    if num_movies is None:
        num_movies = len(movie_to_idx)
    if known_pairs is None:
        known_pairs = pairs_set
    batch_size = positive_samples * (1 + negative_ratio)
    while True:
        # A fresh buffer per batch: the yielded arrays are views into it, so
        # reusing one buffer would overwrite batches the consumer still holds.
        batch = np.zeros((batch_size, 3))
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        # Rejection sampling: draw random combinations until enough of them
        # are not real pairs.
        while idx < batch_size:
            movie_id = random.randrange(num_movies)
            link_id = random.randrange(num_links)
            if (link_id, movie_id) not in known_pairs:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]

In [None]:
def movie_embedding_model(embedding_size=50):
    """Build and compile the link/movie embedding model.

    The model takes a link id and a movie id, looks each up in its own
    embedding table ('link_embedding' and 'movie_embedding') and outputs the
    normalized dot product of the two vectors as a single number per example.
    Table sizes come from the module-level top_links and movie_to_idx.

    Args:
        embedding_size: length of each embedding vector.

    Returns:
        The compiled Keras Model (optimizer 'nadam', loss 'mse').
    """
    link_input = Input(shape=(1,), name='link')
    movie_input = Input(shape=(1,), name='movie')

    link_vectors = Embedding(
        input_dim=len(top_links),
        output_dim=embedding_size,
        name='link_embedding',
    )(link_input)
    movie_vectors = Embedding(
        input_dim=len(movie_to_idx),
        output_dim=embedding_size,
        name='movie_embedding',
    )(movie_input)

    # normalize=True L2-normalizes both vectors before the dot product.
    similarity = Dot(axes=2, normalize=True, name='dot_product')(
        [link_vectors, movie_vectors])
    # Collapse the result to one value per example.
    score = Reshape((1,))(similarity)

    net = Model(inputs=[link_input, movie_input], outputs=[score])
    net.compile(loss='mse', optimizer='nadam')
    return net

In [None]:
# Draw one small batch (3 positives, 6 negatives) to inspect the generator's output.
next(batchifier(pairs, positive_samples=3, negative_ratio=2))  # example

## DEFINE, TRAIN (ON GPU), OR LOAD (ON CPU) THE MODEL

In [None]:
# Obtain `model`: without a GPU, load a previously trained model from disk;
# with a GPU, build a fresh model and train it.
if device_type == 'cpu':  # on local machine, copy a colab-trained model
    # NOTE(review): machine-specific absolute path; this also rebinds the
    # notebook-level `data_dir` and `filename` used by the data-loading cells.
    data_dir = '/home/smithw/Downloads/deep_learning'
    filename = '230621-003000_colab_model.h5' # WS file generated on colab's GPU
    model = ks.models.load_model(os.path.join(data_dir, filename))
else:  # on colab with GPU: train the model
    model = movie_embedding_model()
    # WS 06/20/23 15 epochs took 11m 23s on colab with a compute-capability 7.5 GPU
    positive_samples_per_batch = 512
    # WS model.fit_generator() is deprecated; use model.fit()
    # steps_per_epoch is chosen so that one epoch draws about len(pairs)
    # positive examples from the (endless) generator.
    model.fit(
        batchifier(pairs, positive_samples=positive_samples_per_batch, negative_ratio=10),
        epochs=15,
        steps_per_epoch=len(pairs) // positive_samples_per_batch,
        verbose=2)

In [None]:
# After training on a GPU, save the model to disk; on colab also download it.
if device_type == 'gpu':  # on colab
    savenam = 'WS_model.h5'
    model.save(savenam)
    if colab:
        # `files` only exists on colab and was never imported in this notebook,
        # so import it here and skip the download on a local GPU machine.
        from google.colab import files
        files.download(savenam)  # this will go to ~/Downloads, need to move to appropriate area

In [None]:
# Print the layer-by-layer summary of the trained or loaded model.
model.summary()                  

## USE THE MODEL

In [None]:
# Take the trained movie embedding matrix (one row per movie id) and scale each
# row to unit length, so the dot product of two rows is their cosine similarity.
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = (movie_weights.T / movie_lengths).T

def similar_movies(movie, top_n=10):
    """Print the movies whose embeddings are most similar to that of `movie`.

    Args:
        movie: movie name, as used for the keys of movie_to_idx.
        top_n: how many movies to print, most similar first (the query movie
            itself is normally the first one).

    Raises:
        KeyError: if `movie` is not a key of movie_to_idx.
    """
    dists = np.dot(normalized_movies, normalized_movies[movie_to_idx[movie]])
    # Sort descending, then cut: unlike [-top_n:], this also handles top_n == 0.
    closest = np.argsort(dists)[::-1][:top_n]
    for c in closest:
        print(c, movies[c][0], dists[c])

similar_movies('Rogue One')

In [None]:
# Take the trained link embedding matrix (one row per link id) and scale each
# row to unit length, so the dot product of two rows is their cosine similarity.
link = model.get_layer('link_embedding')
link_weights = link.get_weights()[0]
link_lengths = np.linalg.norm(link_weights, axis=1)
normalized_links = (link_weights.T / link_lengths).T

def similar_links(link, top_n=10):
    """Print the links whose embeddings are most similar to that of `link`.

    Args:
        link: link text, as used for the keys of link_to_idx.
        top_n: how many links to print, most similar first (the query link
            itself is normally the first one).

    Raises:
        KeyError: if `link` is not a key of link_to_idx.
    """
    dists = np.dot(normalized_links, normalized_links[link_to_idx[link]])
    # Sort descending, then cut: unlike [-top_n:], this also handles top_n == 0.
    closest = np.argsort(dists)[::-1][:top_n]
    for c in closest:
        print(c, top_links[c], dists[c])

similar_links('George Lucas')

In [None]:
# Hand-picked training examples for a binary classifier: titles in `best` are
# labelled 1, titles in `worst` are labelled 0.
best = ['Star Wars: The Force Awakens', 'The Martian (film)', 'Tangerine (film)', 'Straight Outta Compton (film)',
        'Brooklyn (film)', 'Carol (film)', 'Spotlight (film)']
worst = ['American Ultra', 'The Cobbler (2014 film)', 'Entourage (film)', 'Fantastic Four (2015 film)',
         'Get Hard', 'Hot Pursuit (2015 film)', 'Mortdecai (film)', 'Serena (2014 film)', 'Vacation (2015 film)']
# Labels, in the same order as the rows of X (best first, then worst).
y = np.asarray([1 for _ in best] + [0 for _ in worst])
# Features: the unit-length embedding of each listed movie.
# Raises KeyError if a title is not present in movie_to_idx.
X = np.asarray([normalized_movies[movie_to_idx[movie]] for movie in best + worst])
X.shape

In [None]:
# Fit a linear-kernel support vector classifier on the hand-labelled embeddings.
clf = svm.SVC(kernel='linear')
clf.fit(X, y) 

In [None]:
# Score every movie with the classifier's decision function and list the five
# highest- and five lowest-scoring movies.
estimated_movie_ratings = clf.decision_function(normalized_movies)
# NOTE: this rebinds `best` from the list of titles defined earlier to an array
# of movie indices sorted by ascending score.
best = np.argsort(estimated_movie_ratings)
print('best:')
for c in reversed(best[-5:]):
    print(c, movies[c][0], estimated_movie_ratings[c])

print('worst:')
for c in best[:5]:
    print(c, movies[c][0], estimated_movie_ratings[c])


In [None]:
# Regression data for the score stored in the second-to-last field of each
# record; movies where that field is empty are skipped in both arrays, so the
# rows of rotten_X line up with the entries of rotten_y.
# The last character of the field is dropped and the rest is divided by 100
# -- presumably the field looks like '87%'; confirm against the data.
rotten_y = np.asarray([float(movie[-2][:-1]) / 100 for movie in movies if movie[-2]])
rotten_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie in movies if movie[-2]])

In [None]:
# Fit a linear regression on the first 80% of the examples; the remaining 20%
# are held out for evaluation. The split follows file order (no shuffling).
TRAINING_CUT_OFF = int(len(rotten_X) * 0.8)
regr = LinearRegression()
regr.fit(rotten_X[:TRAINING_CUT_OFF], rotten_y[:TRAINING_CUT_OFF])

In [None]:
# Mean squared error of the regression on the held-out examples.
error = (regr.predict(rotten_X[TRAINING_CUT_OFF:]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

In [None]:
# Baseline for comparison: always predict the mean of the training targets.
error = (np.mean(rotten_y[:TRAINING_CUT_OFF]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

In [None]:
def gross(movie):
    """Parse a movie's 'gross' metadata entry into a number of millions.

    Only values of the exact form '$<number> million' or '$<number> billion'
    (unit in any letter case) are accepted; billions are converted to millions.

    Args:
        movie: a movie record whose element 1 is the metadata mapping.

    Returns:
        The gross in millions as a float, or None when the entry is missing,
        empty, or not in the accepted form.
    """
    raw = movie[1].get('gross')
    if not raw or ' ' not in raw:
        return None
    # Split at the first space: amount on the left, everything else is the unit.
    amount, _, unit = raw.partition(' ')
    unit = unit.lower()
    if unit not in ('million', 'billion') or not amount.startswith('$'):
        return None
    try:
        value = float(amount[1:])
    except ValueError:
        return None
    return value * 1000 if unit == 'billion' else value

# Gross per movie, kept index-aligned with `movies` (None where no usable value
# exists). Filtering the None entries out of movie_gross itself would shift the
# positions, so indices into it would no longer identify the right movie.
movie_gross = [gross(m) for m in movies]
# Positions in `movies` that have a gross value, and those values in the same order.
known_idx = np.asarray([i for i, gr in enumerate(movie_gross) if gr is not None], dtype=int)
known_gross = np.asarray([movie_gross[i] for i in known_idx])
# Map the 10 largest values back to positions in `movies`, then print largest first.
highest = known_idx[np.argsort(known_gross)[-10:]]
for c in reversed(highest):
    print(c, movies[c][0], movie_gross[c])

In [None]:
# Regression data for box-office gross. The gross is recomputed per movie so
# that each target is paired with the embedding of the same movie; pairing
# `movies` with an already filtered list of values would misalign them.
# Movies with no (or zero) gross are skipped in both arrays.
gross_by_movie = [(movie, gross(movie)) for movie in movies]
gross_y = np.asarray([gr for _, gr in gross_by_movie if gr])
gross_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie, gr in gross_by_movie if gr])

In [None]:
# Fit a linear regression on the first 80% of the gross examples; the remaining
# 20% are held out. This rebinds TRAINING_CUT_OFF and regr from the earlier
# score regression.
TRAINING_CUT_OFF = int(len(gross_X) * 0.8)
regr = LinearRegression()
regr.fit(gross_X[:TRAINING_CUT_OFF], gross_y[:TRAINING_CUT_OFF])

In [None]:
# Mean squared error of the gross regression on the held-out examples.
error = (regr.predict(gross_X[TRAINING_CUT_OFF:]) - gross_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

In [None]:
# Baseline for comparison: always predict the mean of the training targets.
error = (np.mean(gross_y[:TRAINING_CUT_OFF]) - gross_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)