In [1]:
import json
from collections import Counter
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot
from sklearn.linear_model import LinearRegression
import numpy as np
import random
from sklearn import svm

Using TensorFlow backend.


In [4]:
# WS check versions 11/13/18
# had to get version 2.1.2 of keras to be compatible with tensorflow 1.1.0
# 'pip install keras==2.1.2'
import tensorflow as tf
import keras as ks
tf.__version__, ks.__version__  

('1.1.0', '2.1.2')

In [5]:
with open('data/wp_movies_10k.ndjson') as fin:
    movies = [json.loads(l) for l in fin]

In [6]:
len(movies)  # WS 11/13/18

10000

In [7]:
# WS 11/13/18  
# 0 is name of movie, 1 is movie metadata, 2 is actors, awards, misc info, 3 is rotten score, 4 is other score
movies[0][4] 

'6.9/10'

In [8]:
link_counts = Counter()
for movie in movies:
    link_counts.update(movie[2])
link_counts.most_common(15)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867),
 ('Variety (magazine)', 5450),
 ('Metacritic', 5112),
 ('Box Office Mojo', 4186),
 ('The New York Times', 3818),
 ('The Hollywood Reporter', 3553),
 ('Roger Ebert', 2707),
 ('Los Angeles Times', 2454),
 ('Entertainment Weekly', 2375),
 ('British Board of Film Classification', 2236),
 ('Chicago Sun-Times', 1826),
 ('Deadline.com', 1814),
 ('The Guardian', 1528)]

In [9]:
type(link_counts)  # WS 11/13/18

collections.Counter

In [10]:
link_counts.items()



In [11]:
top_links = [link for link, c in link_counts.items() if c >= 3]
top_links[:10]  # WS 11/13/18

['Tim Miller (director)',
 'Simon Kinberg',
 'Ryan Reynolds',
 'Lauren Shuler Donner',
 'Rhett Reese',
 'Paul Wernick',
 'Deadpool',
 'Morena Baccarin',
 'Ed Skrein',
 'T.J. Miller']

In [12]:
link_to_idx = {link: idx for idx, link in enumerate(top_links)}
link_to_idx  # WS 11/13/18  add an index to the top links

{'Tim Miller (director)': 0,
 'Simon Kinberg': 1,
 'Ryan Reynolds': 2,
 'Lauren Shuler Donner': 3,
 'Rhett Reese': 4,
 'Paul Wernick': 5,
 'Deadpool': 6,
 'Morena Baccarin': 7,
 'Ed Skrein': 8,
 'T.J. Miller': 9,
 'Gina Carano': 10,
 'Leslie Uggams': 11,
 'Brianna Hildebrand': 12,
 'Stefan Kapičić': 13,
 'Junkie XL': 14,
 'Julian Clarke': 15,
 'Marvel Entertainment': 16,
 'Kinberg Genre': 17,
 'TSG Entertainment': 18,
 '20th Century Fox': 19,
 'Le Grand Rex': 20,
 'Variety (magazine)': 21,
 'Box Office Mojo': 22,
 'superhero film': 23,
 'Marvel Comics': 24,
 'X-Men (film series)': 25,
 'antihero': 26,
 'New Line Cinema': 27,
 'X-Men Origins: Wolverine': 28,
 'principal photography': 29,
 'Vancouver': 30,
 'IMAX': 31,
 'D-Box Technologies': 32,
 'Golden Globe Award': 33,
 'Golden Globe Award for Best Motion Picture – Musical or Comedy': 34,
 'Golden Globe Award for Best Actor – Motion Picture Musical or Comedy': 35,
 'Producers Guild of America Award': 36,
 "Critics' Choice Movie Awards

In [13]:
movie_to_idx = {movie[0]: idx for idx, movie in enumerate(movies)}
movie_to_idx  # WS 11/13/18 add an index to the movie names

{'Deadpool (film)': 0,
 'The Revenant (2015 film)': 1,
 'Suicide Squad (film)': 2,
 'Spectre (2015 film)': 3,
 'Rebel Without a Cause': 4,
 'Warcraft (film)': 5,
 'The Martian (film)': 6,
 'List of Marvel Cinematic Universe films': 7,
 'X-Men (film series)': 8,
 'The Hateful Eight': 9,
 'The Jungle Book (2016 film)': 10,
 'The Big Short (film)': 11,
 '10 Cloverfield Lane': 12,
 'Spotlight (film)': 13,
 'Room (2015 film)': 14,
 'Creed (film)': 15,
 'DC Universe Animated Original Movies': 16,
 'Star Trek Beyond': 17,
 'Star Wars (film)': 18,
 'Interstellar (film)': 19,
 'Ant-Man (film)': 20,
 'Everest (2015 film)': 21,
 'Jurassic World': 22,
 'Joy (film)': 23,
 'Gods of Egypt (film)': 24,
 'Star Wars sequel trilogy': 25,
 'The Conjuring 2': 26,
 'The Danish Girl (film)': 27,
 'Sicario (2015 film)': 28,
 'Rogue One': 29,
 'Finding Dory': 30,
 'Black Mass (film)': 31,
 'Blade Runner': 32,
 'Harry Potter (film series)': 33,
 'Doctor Strange (film)': 34,
 'Titanic (1997 film)': 35,
 'Furious

In [14]:
pairs = []
for movie in movies:
    pairs.extend((link_to_idx[link], movie_to_idx[movie[0]]) for link in movie[2] if link in link_to_idx)
pairs_set = set(pairs)
len(pairs), len(top_links), len(movie_to_idx)

(949544, 66913, 10000)

In [15]:
pairs  # WS 11/13/18 tuple linking link index to movie index

[(0, 0),
 (1, 0),
 (2, 0),
 (3, 0),
 (4, 0),
 (5, 0),
 (6, 0),
 (7, 0),
 (8, 0),
 (9, 0),
 (10, 0),
 (11, 0),
 (12, 0),
 (13, 0),
 (14, 0),
 (15, 0),
 (16, 0),
 (17, 0),
 (3, 0),
 (18, 0),
 (19, 0),
 (20, 0),
 (21, 0),
 (22, 0),
 (23, 0),
 (0, 0),
 (4, 0),
 (5, 0),
 (24, 0),
 (6, 0),
 (25, 0),
 (2, 0),
 (7, 0),
 (8, 0),
 (9, 0),
 (10, 0),
 (11, 0),
 (12, 0),
 (13, 0),
 (26, 0),
 (27, 0),
 (19, 0),
 (28, 0),
 (29, 0),
 (30, 0),
 (31, 0),
 (32, 0),
 (33, 0),
 (34, 0),
 (35, 0),
 (36, 0),
 (37, 0),
 (38, 0),
 (39, 0),
 (40, 0),
 (41, 0),
 (6, 0),
 (42, 0),
 (43, 0),
 (44, 0),
 (45, 0),
 (46, 0),
 (47, 0),
 (48, 0),
 (49, 0),
 (50, 0),
 (51, 0),
 (52, 0),
 (53, 0),
 (54, 0),
 (55, 0),
 (2, 0),
 (6, 0),
 (56, 0),
 (28, 0),
 (57, 0),
 (7, 0),
 (43, 0),
 (58, 0),
 (8, 0),
 (45, 0),
 (59, 0),
 (0, 0),
 (9, 0),
 (49, 0),
 (1, 0),
 (60, 0),
 (10, 0),
 (46, 0),
 (11, 0),
 (50, 0),
 (61, 0),
 (12, 0),
 (52, 0),
 (53, 0),
 (62, 0),
 (63, 0),
 (64, 0),
 (65, 0),
 (13, 0),
 (51, 0),
 (66, 0),
 (67, 0

In [16]:
def movie_embedding_model(embedding_size=50):
    link = Input(name='link', shape=(1,))
    movie = Input(name='movie', shape=(1,))
    link_embedding = Embedding(name='link_embedding', 
                               input_dim=len(top_links), 
                               output_dim=embedding_size)(link)
    movie_embedding = Embedding(name='movie_embedding', 
                                input_dim=len(movie_to_idx), 
                                output_dim=embedding_size)(movie)
    dot = Dot(name='dot_product', normalize=True, axes=2)([link_embedding, movie_embedding])
    merged = Reshape((1,))(dot)
    model = Model(inputs=[link, movie], outputs=[merged])
    model.compile(optimizer='nadam', loss='mse')
    return model

WS 11/13/18 
there was a version issue between tensorflow and keras versions: fixed by going backwards to keras 2.1.2

see https://github.com/pierluigiferrari/ssd_keras/issues/83

In [23]:
import tensorflow as tf
import keras as ks
tf.__version__, ks.__version__

('1.1.0', '2.1.2')

In [24]:
model = movie_embedding_model()  # WS 11/13/18 this works now, with keras/tensorflow incompatibility fixed
model.summary()                  

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
link (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
movie (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
link_embedding (Embedding)      (None, 1, 50)        3345650     link[0][0]                       
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 50)        500000      movie[0][0]                      
__________________________________________________________________________________________________
dot_produc

In [25]:
random.seed(5)

def batchifier(pairs, positive_samples=50, negative_ratio=10):
    batch_size = positive_samples * (1 + negative_ratio)
    batch = np.zeros((batch_size, 3))
    while True:
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        while idx < batch_size:
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            if not (link_id, movie_id) in pairs_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]

In [26]:
next(batchifier(pairs, positive_samples=3, negative_ratio=2))

({'link': array([ 3801., 48731.,  1313., 13365., 31254., 20558., 32643., 32318.,
         22418.]),
  'movie': array([5874., 1854., 7236., 6238., 5530.,  849., 7628., 7685., 1529.])},
 array([-1., -1.,  1., -1.,  1., -1., -1., -1.,  1.]))

In [28]:
# WS 11/13/18 this will take time to run, killed it; consider reducing samples per batch? 

positive_samples_per_batch = 512

model.fit_generator(
    batchifier(pairs, positive_samples=positive_samples_per_batch, negative_ratio=10),
    epochs=15,
    steps_per_epoch=len(pairs) // positive_samples_per_batch,
    verbose=2
)

Epoch 1/15


KeyboardInterrupt: 

In [19]:
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = (movie_weights.T / movie_lengths).T

def similar_movies(movie):
    dists = np.dot(normalized_movies, normalized_movies[movie_to_idx[movie]])
    closest = np.argsort(dists)[-10:]
    for c in reversed(closest):
        print(c, movies[c][0], dists[c])

similar_movies('Rogue One')

29 Rogue One 0.9999999
3349 Star Wars: The Force Awakens 0.9722805
101 Prometheus (2012 film) 0.9653338
140 Star Trek Into Darkness 0.9635347
22 Jurassic World 0.962336
25 Star Wars sequel trilogy 0.95218825
659 Rise of the Planet of the Apes 0.9516557
62 Fantastic Beasts and Where to Find Them (film) 0.94662267
42 The Avengers (2012 film) 0.94634
37 Avatar (2009 film) 0.9460137


In [20]:
link = model.get_layer('link_embedding')
link_weights = link.get_weights()[0]
link_lengths = np.linalg.norm(link_weights, axis=1)
normalized_links = (link_weights.T / link_lengths).T

def similar_links(link):
    dists = np.dot(normalized_links, normalized_links[link_to_idx[link]])
    closest = np.argsort(dists)[-10:]
    for c in reversed(closest):
        print(c, top_links[c], dists[c])

similar_links('George Lucas')

14913 George Lucas 1.0
50812 Star Wars (film) 0.9670632
66120 Star Wars 0.9511891
466 Hugo Award for Best Dramatic Presentation 0.9418189
2254 Raiders of the Lost Ark 0.92919797
60696 Saturn Award for Best Science Fiction Film 0.92867565
42371 Hugo Award 0.92317486
35358 Lucasfilm 0.91876715
20994 2001: A Space Odyssey (film) 0.9185802
12959 London Symphony Orchestra 0.91714984


In [24]:
best = ['Star Wars: The Force Awakens', 'The Martian (film)', 'Tangerine (film)', 'Straight Outta Compton (film)',
        'Brooklyn (film)', 'Carol (film)', 'Spotlight (film)']
worst = ['American Ultra', 'The Cobbler (2014 film)', 'Entourage (film)', 'Fantastic Four (2015 film)',
         'Get Hard', 'Hot Pursuit (2015 film)', 'Mortdecai (film)', 'Serena (2014 film)', 'Vacation (2015 film)']
y = np.asarray([1 for _ in best] + [0 for _ in worst])
X = np.asarray([normalized_movies[movie_to_idx[movie]] for movie in best + worst])
X.shape

(16, 50)

In [25]:
clf = svm.SVC(kernel='linear')
clf.fit(X, y) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [26]:
estimated_movie_ratings = clf.decision_function(normalized_movies)
best = np.argsort(estimated_movie_ratings)
print('best:')
for c in reversed(best[-5:]):
    print(c, movies[c][0], estimated_movie_ratings[c])

print('worst:')
for c in best[:5]:
    print(c, movies[c][0], estimated_movie_ratings[c])


best:
307 Les Misérables (2012 film) 1.246511730519127
66 Skyfall 1.1888723752441601
481 The Devil Wears Prada (film) 1.1348285888204566
630 The Tree of Life (film) 1.1295026844583682
81 Birdman (film) 1.1121067681173762
worst:
9694 The Marine (film series) -1.6472428525072056
5097 Ready to Rumble -1.6412750149090598
8837 The Santa Clause (film series) -1.6391878640118387
1782 Scooby-Doo! WrestleMania Mystery -1.610221193972685
3188 Son of the Mask -1.6013579562623643


In [27]:
rotten_y = np.asarray([float(movie[-2][:-1]) / 100 for movie in movies if movie[-2]])
rotten_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie in movies if movie[-2]])

In [28]:
TRAINING_CUT_OFF = int(len(rotten_X) * 0.8)
regr = LinearRegression()
regr.fit(rotten_X[:TRAINING_CUT_OFF], rotten_y[:TRAINING_CUT_OFF])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [29]:
error = (regr.predict(rotten_X[TRAINING_CUT_OFF:]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 0.06'

In [30]:
error = (np.mean(rotten_y[:TRAINING_CUT_OFF]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 0.09'

In [34]:
def gross(movie):
    v = movie[1].get('gross')
    if not v or not ' ' in v:
        return None
    v, unit = v.split(' ', 1)
    unit = unit.lower()
    if not unit in ('million', 'billion'):
        return None
    if not v.startswith('$'):
        return None
    try:
        v = float(v[1:])
    except ValueError:
        return None
    if unit == 'billion':
        v *= 1000
    return v

movie_gross = [gross(m) for m in movies]
movie_gross = np.asarray([gr for gr in movie_gross if gr is not None])
highest = np.argsort(movie_gross)[-10:]
for c in reversed(highest):
    print(c, movies[c][0], movie_gross[c])

6 The Martian (film) 10900.0
7 List of Marvel Cinematic Universe films 4300.0
49 Back to the Future 3900.0
71 The Conjuring 2932.0
162 Thor (film) 2464.0
36 Furious 7 2340.0
30 Finding Dory 2187.0
1906 Jane Eyre (2011 film) 2068.0
19 Interstellar (film) 1670.0
2251 An American Werewolf in London 1655.0


In [35]:
gross_y = np.asarray([gr for gr in movie_gross if gr])
gross_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie, gr in zip(movies, movie_gross) if gr])

In [36]:
TRAINING_CUT_OFF = int(len(gross_X) * 0.8)
regr = LinearRegression()
regr.fit(gross_X[:TRAINING_CUT_OFF], gross_y[:TRAINING_CUT_OFF])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [37]:
error = (regr.predict(gross_X[TRAINING_CUT_OFF:]) - gross_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 7729.44'

In [38]:
error = (np.mean(gross_y[:TRAINING_CUT_OFF]) - gross_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 14115.59'