In [1]:
import json
from collections import Counter
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot
from sklearn.linear_model import LinearRegression
import numpy as np
import random
from sklearn import svm

Using TensorFlow backend.


In [2]:
# Load one JSON document per line. JSON text is UTF-8, so the encoding is
# given explicitly instead of relying on the platform default; blank lines
# (e.g. a trailing newline at the end of the file) are skipped.
with open('data/wp_movies_10k.ndjson', encoding='utf-8') as fin:
    movies = [json.loads(line) for line in fin if line.strip()]

FileNotFoundError: [Errno 2] No such file or directory: 'data/wp_movies_10k.ndjson'

In [11]:
# Tally, across all movies, how often each entry of movie[2] occurs,
# then display the three most frequent entries.
link_counts = Counter()
for movie in movies:
    link_counts.update(movie[2])
link_counts.most_common(3)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867)]

In [12]:
# Index the links that appear at least 3 times (count >= 3)
top_links = [link for link, c in link_counts.items() if c >= 3]
link_to_idx = {link:idx for idx, link in enumerate(top_links)}

# Index every movie: movie[0] -> position of the movie in `movies`
movie_to_idx = {movie[0]:idx for idx, movie in enumerate(movies)}

In [13]:
# Number of links kept in the link vocabulary, number of indexed movies.
len(top_links), len(movie_to_idx)

(66913, 10000)

In [14]:
# Collect one (link index, movie index) tuple for every entry of movie[2]
# that survived the frequency filter, i.e. is a key of link_to_idx.
pairs = []
for movie in movies:
    pairs += [
        (link_to_idx[link], movie_to_idx[movie[0]])
        for link in movie[2]
        if link in link_to_idx
    ]

# Set version of the same tuples, used for fast membership tests.
pairs_set = set(pairs)
# Elements are (link index, movie index)
pairs_set

{(2454, 7796),
 (21670, 416),
 (2926, 6061),
 (4849, 255),
 (35937, 8392),
 (23746, 1366),
 (18849, 2311),
 (13688, 7765),
 (19206, 2373),
 (178, 2893),
 (11820, 8449),
 (42710, 5809),
 (965, 7725),
 (22, 1106),
 (1203, 8556),
 (33113, 1909),
 (43079, 8950),
 (321, 5887),
 (61712, 5361),
 (15937, 216),
 (1931, 369),
 (43991, 3682),
 (2988, 836),
 (402, 211),
 (13348, 3766),
 (23548, 3035),
 (433, 5722),
 (7609, 5065),
 (700, 4275),
 (22, 4515),
 (2966, 2139),
 (27551, 570),
 (943, 65),
 (22, 8142),
 (19893, 2249),
 (22873, 7291),
 (16570, 2666),
 (22698, 4247),
 (9868, 8215),
 (22, 9301),
 (24719, 4337),
 (56790, 4089),
 (24937, 481),
 (862, 6414),
 (55409, 3392),
 (51124, 6238),
 (4588, 6739),
 (25841, 2840),
 (1591, 3611),
 (11202, 2683),
 (66068, 8593),
 (60643, 5100),
 (10865, 5926),
 (690, 66),
 (53488, 9161),
 (35748, 996),
 (18468, 7543),
 (215, 2246),
 (7534, 6429),
 (19506, 310),
 (57410, 7048),
 (7381, 1666),
 (8904, 1043),
 (37124, 8908),
 (22144, 389),
 (31043, 1277),
 (464

In [15]:
def movie_embedding_model(embedding_size=30):
    """Build and compile the two-input link/movie embedding model.

    The model takes one link index and one movie index, looks each up in its
    own embedding table and outputs the normalized dot product of the two
    vectors (``Dot(normalize=True)``), reshaped to a single value per sample.

    Args:
        embedding_size: Width of both embedding tables.

    Returns:
        A Keras ``Model`` with inputs named 'link' and 'movie', compiled with
        the 'nadam' optimizer and 'mse' loss. The embedding tables are sized
        from the notebook-level ``top_links`` and ``movie_to_idx``.
    """
    link_input = Input(name='link', shape=(1,))
    movie_input = Input(name='movie', shape=(1,))

    # One embedding table per vocabulary; layer names are looked up later
    # via model.get_layer(), so they must stay as they are.
    link_vectors = Embedding(
        name='link_embedding',
        input_dim=len(top_links),
        output_dim=embedding_size,
    )(link_input)
    movie_vectors = Embedding(
        name='movie_embedding',
        input_dim=len(movie_to_idx),
        output_dim=embedding_size,
    )(movie_input)

    similarity = Dot(name='dot_product', normalize=True, axes=2)(
        [link_vectors, movie_vectors]
    )

    # Collapse the dot-product output to one value per sample.
    merged = Reshape((1,))(similarity)
    print(merged)

    model = Model(inputs=[link_input, movie_input], outputs=[merged])
    model.compile(optimizer='nadam', loss='mse')
    return model


In [16]:
import numpy as np
# Scratch check: a one-dimensional array of four zeros has shape (4,).
a = np.zeros(shape=(4,))
a.shape

(4,)

In [None]:
# Build the compiled model with the default embedding size and print its
# layer-by-layer summary.
model = movie_embedding_model()
model.summary()




Tensor("reshape_1/Reshape:0", shape=(?, 1), dtype=float32)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
link (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
movie (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
link_embedding (Embedding)      (None, 1, 30)        2007390     link[0][0]                       
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 30)        300000      movie[0][0]                      
_____________________________

In [None]:
def batchifier(pairs, positive_samples=50, negative_ratio=5, n_links=None, n_movies=None):
    """Endlessly generate shuffled training batches of positive and negative pairs.

    Each batch holds ``positive_samples`` rows drawn from ``pairs`` with label
    1, followed by ``positive_samples * negative_ratio`` randomly drawn
    (link, movie) combinations that are NOT in ``pairs`` with label -1; the
    rows are then shuffled.

    Args:
        pairs: Sequence of (link index, movie index) tuples (must support
            ``random.sample``, i.e. be a list/tuple, and hold hashable items).
        positive_samples: Number of positive rows per batch.
        negative_ratio: Number of negative rows generated per positive row.
        n_links: Exclusive upper bound for random link indices. Defaults to
            ``len(top_links)`` from the notebook namespace.
        n_movies: Exclusive upper bound for random movie indices. Defaults to
            ``len(movie_to_idx)`` from the notebook namespace.

    Yields:
        ``({'link': link_ids, 'movie': movie_ids}, labels)`` where each value
        is a 1-D float array of length ``positive_samples * (1 + negative_ratio)``.

    Raises:
        ValueError: (from ``random.sample``) if ``positive_samples`` exceeds
            ``len(pairs)``.

    Note:
        Negative sampling is rejection based; it does not terminate if every
        possible (link, movie) combination is present in ``pairs``.
    """
    if n_links is None:
        n_links = len(top_links)
    if n_movies is None:
        n_movies = len(movie_to_idx)

    # Reject negatives against the pairs actually passed in, rather than a
    # notebook-level set that may have been built from different data.
    positive_set = set(pairs)
    batch_size = positive_samples * (1 + negative_ratio)

    while True:
        # A fresh buffer per batch: the yielded arrays are views on it, and a
        # consumer that holds on to earlier batches (Keras can queue several
        # batches ahead) must not see them overwritten by later iterations.
        batch = np.zeros((batch_size, 3))

        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)

        idx = positive_samples
        while idx < batch_size:
            movie_id = random.randrange(n_movies)
            link_id = random.randrange(n_links)

            if (link_id, movie_id) not in positive_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1

        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]

In [None]:
# Train for 5 epochs. Every step draws 512 positive pairs plus 10 negatives
# per positive; an epoch is len(pairs)//512 steps. The 512 here must match
# the positive_samples value passed to batchifier.
model.fit_generator(batchifier(pairs, positive_samples=512, negative_ratio=10), epochs=5,
                   steps_per_epoch= len(pairs)//512)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/5
Epoch 2/5

In [45]:
# Pull the first weight array out of the trained 'movie_embedding' layer and
# show its number of rows and the length of its first row.
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
len(movie_weights), len(movie_weights[0])

(10000, 30)

임베딩 층의 가중치(input, output)를 찾아낸다.

In [51]:
# Euclidean norm of every row of the movie embedding matrix.
movie_lengths = np.linalg.norm(movie_weights, axis=1)
len(movie_lengths)

10000

In [53]:
# Divide every row by its own norm (transpose so the per-row norms broadcast,
# then transpose back), giving unit-length rows.
normalized_movies = (movie_weights.T / movie_lengths).T
normalized_movies.shape

(10000, 30)

In [56]:
# Inspect the normalized embedding of the movie at index 0.
normalized_movies[0]

array([ 0.14850336, -0.06961831,  0.28547525,  0.2940895 , -0.19868422,
        0.12792797, -0.00523895, -0.30956873,  0.13859656,  0.03159344,
       -0.08810817, -0.00750295,  0.01563228,  0.05389674, -0.11431464,
        0.06973672, -0.2443584 ,  0.05773252,  0.05812418,  0.29764917,
        0.02806741, -0.1788136 , -0.33404267, -0.09478752,  0.22174637,
       -0.04158582,  0.14229204,  0.23154363,  0.3365445 ,  0.24069883],
      dtype=float32)

In [None]:
def similar_movies(movie):
    """Print the 10 movies most similar to ``movie``.

    Similarity is the dot product between rows of ``normalized_movies``.
    One line is printed per result, best match first: the movie index, its
    ``movies[index][0]`` value and the similarity score.

    Args:
        movie: Key of ``movie_to_idx`` identifying the query movie.

    Raises:
        KeyError: If ``movie`` is not a key of ``movie_to_idx``.
    """
    query_vector = normalized_movies[movie_to_idx[movie]]
    dists = np.dot(normalized_movies, query_vector)
    # argsort is ascending, so take the last ten and flip them.
    top_ten = np.argsort(dists)[-10:][::-1]
    for c in top_ten:
        print(c, movies[c][0], dists[c])

similar_movies('Rogue One')

In [36]:
# Same normalization as for the movies, applied to the link embedding matrix.
link = model.get_layer('link_embedding')
link_weights = link.get_weights()[0]
link_lengths = np.linalg.norm(link_weights, axis=1)
normalized_links = (link_weights.T / link_lengths).T

def similar_links(link):
    """Print the 10 links most similar to ``link``.

    Similarity is the dot product between rows of ``normalized_links``.
    One line is printed per result, best match first: the link index, the
    corresponding ``top_links`` entry and the similarity score.

    Args:
        link: Key of ``link_to_idx`` identifying the query link.

    Raises:
        KeyError: If ``link`` is not a key of ``link_to_idx``.
    """
    query_vector = normalized_links[link_to_idx[link]]
    dists = np.dot(normalized_links, query_vector)
    # argsort is ascending, so take the last ten and flip them.
    top_ten = np.argsort(dists)[-10:][::-1]
    for c in top_ten:
        print(c, top_links[c], dists[c])

similar_links('George Lucas')

127 George Lucas 0.9999999
3176 Star Wars (film) 0.97726214
2723 E.T. the Extra-Terrestrial 0.9622769
2983 Hugo Award 0.93577754
4759 Playboy 0.93288386
3696 Raiders of the Lost Ark 0.9253376
3082 Category:Films that won the Best Visual Effects Academy Award 0.9223366
976 Hugo Award for Best Dramatic Presentation 0.91892844
4986 HarperCollins 0.9162214
4360 Cinefantastique 0.91432893


### Building a movie recommender

In [57]:
# Hand-picked titles used as the positive (1) and negative (0) examples.
best = [
    'Star Wars: The Force Awakens',
    'The Martian (film)',
    'Tangerine (film)',
    'Straight Outta Compton (film)',
    'Brooklyn (film)',
    'Carol (film)',
    'Spotlight (film)',
]
worst = [
    'American Ultra',
    'The Cobbler (2014 film)',
    'Entourage (film)',
    'Fantastic Four (2015 film)',
    'Get Hard',
    'Hot Pursuit (2015 film)',
    'Mortdecai (film)',
    'Serena (2014 film)',
    'Vacation (2015 film)',
]
# Labels follow the order of `best + worst`; features are the matching
# normalized movie embeddings.
y = np.asarray([1] * len(best) + [0] * len(worst))
X = np.asarray([normalized_movies[movie_to_idx[title]] for title in best + worst])
X.shape

(16, 30)

In [58]:
# Fit a linear-kernel support vector classifier on the labelled embeddings.
clf = svm.SVC(kernel='linear')
clf.fit(X, y) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [59]:
# Score every movie with the classifier's decision function and rank them.
estimated_movie_ratings = clf.decision_function(normalized_movies)
# Ascending order of scores. Stored under its own name so the `best` list of
# training titles is not overwritten by an index array.
rating_order = np.argsort(estimated_movie_ratings)
print('best:')
for c in reversed(rating_order[-5:]):
    print(c, movies[c][0], estimated_movie_ratings[c])

print('worst:')
for c in rating_order[:5]:
    print(c, movies[c][0], estimated_movie_ratings[c])


best:
630 The Tree of Life (film) 1.0990628587564395
481 The Devil Wears Prada (film) 1.0719270781456087
6870 Goodbye to Language 1.0603318449493053
66 Skyfall 1.033642170702758
70 Carol (film) 1.0000000178924902
worst:
9595 Speed Zone -1.7915385665168142
8927 The Big Green -1.7462241447300098
5582 CB4 -1.7113821576265076
9694 The Marine (film series) -1.7014690236743166
3584 Air Bud -1.6919734990679738


### Predict Simple Movie Properties

In [None]:
# Keep only movies with a truthy movie[-2]; that field is a string whose
# last character is dropped before conversion (presumably a trailing '%' --
# confirm against the data), and the number is scaled by 1/100.
rated_movies = [movie for movie in movies if movie[-2]]
rotten_y = np.asarray([float(movie[-2][:-1]) / 100 for movie in rated_movies])
rotten_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie in rated_movies])

In [None]:
# Positional 80/20 split (no shuffling is done here): fit a linear
# regression on the first 80% of the rows.
TRAINING_CUT_OFF = int(len(rotten_X) * 0.8)
regr = LinearRegression()
regr.fit(rotten_X[:TRAINING_CUT_OFF], rotten_y[:TRAINING_CUT_OFF])

In [None]:
# Mean squared error of the regression on the held-out rows.
error = (regr.predict(rotten_X[TRAINING_CUT_OFF:]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

In [None]:
# Baseline for comparison: predict the training-set mean for every held-out row.
error = (np.mean(rotten_y[:TRAINING_CUT_OFF]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

In [None]:
def gross(movie):
    """Parse a movie's 'gross' entry into a number expressed in millions.

    The entry is read from the mapping at ``movie[1]`` and is only accepted
    when it has the exact form ``'$<number> million'`` or
    ``'$<number> billion'`` (unit matched case-insensitively, nothing after
    the unit).

    Args:
        movie: Indexable record whose element 1 is a mapping that may contain
            a 'gross' key.

    Returns:
        The amount as a float in millions (billions are multiplied by 1000),
        or None when the entry is missing, empty or not in the accepted form.
    """
    raw = movie[1].get('gross')
    if not raw or ' ' not in raw:
        return None

    amount, unit = raw.split(' ', 1)
    # Factor converting each accepted unit into millions.
    to_millions = {'million': 1, 'billion': 1000}
    unit = unit.lower()
    if unit not in to_millions or not amount.startswith('$'):
        return None

    try:
        value = float(amount[1:])
    except ValueError:
        # e.g. thousands separators or ranges that float() cannot parse
        return None
    return value * to_millions[unit]

# Keep `movie_gross` aligned with `movies` (None where no gross could be
# parsed). Dropping the None entries first would shift positions, so that
# index c would no longer refer to movies[c].
movie_gross = [gross(m) for m in movies]
known_gross = [(idx, gr) for idx, gr in enumerate(movie_gross) if gr is not None]
# Indices (into `movies`) of the ten highest grossing movies, ascending.
highest = [idx for idx, _ in sorted(known_gross, key=lambda item: item[1])[-10:]]
for c in reversed(highest):
    print(c, movies[c][0], movie_gross[c])

In [None]:
# Pair every movie with its own parsed gross before filtering, so that the
# targets and the embeddings are guaranteed to describe the same movies.
# (Zipping `movies` with an already filtered gross list would misalign them.)
gross_by_movie = [(movie, gross(movie)) for movie in movies]
gross_y = np.asarray([gr for _, gr in gross_by_movie if gr])
gross_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie, gr in gross_by_movie if gr])

In [None]:
# Positional 80/20 split (no shuffling is done here): fit a linear
# regression on the first 80% of the rows. Note this rebinds
# TRAINING_CUT_OFF and regr from the rating experiment above.
TRAINING_CUT_OFF = int(len(gross_X) * 0.8)
regr = LinearRegression()
regr.fit(gross_X[:TRAINING_CUT_OFF], gross_y[:TRAINING_CUT_OFF])

In [None]:
# Mean squared error of the regression on the held-out rows.
error = (regr.predict(gross_X[TRAINING_CUT_OFF:]) - gross_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

In [None]:
# Baseline for comparison: predict the training-set mean for every held-out row.
error = (np.mean(gross_y[:TRAINING_CUT_OFF]) - gross_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)