In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px

In [5]:
df = pd.read_csv('data/movies.csv')

In [6]:
fig = px.scatter(df, x='title', y='rating', color='rating',
                 hover_data=['title', 'year'])

# change dimensions
fig.update_layout(width=1200, height=600)

fig.update_xaxes(showticklabels=False)
fig.update_xaxes(title_text="Movie")
fig.update_yaxes(title_text="Score")

fig.update_layout(title_text="Movie Rating Scores", 
                  title_x=0.5)



In [9]:
df.head(10)

Unnamed: 0,title,votes,rating,writer,director,actors,genres,plotwords,year
0,The Shawshank Redemption,2600000,137.370505,Stephen King,Frank Darabont,"['Frank Darabont', 'Tim Robbins', 'Morgan Free...",['Drama'],"['wrongful imprisonment', 'prison', 'based on ...",1994
1,The Dark Knight,2600000,132.939198,Jonathan Nolan,Christopher Nolan,"['Christopher Nolan', 'David S. Goyer', 'Chris...","['Action', 'Crime', 'Drama']","['dc comics', 'moral dilemma', 'psychopath', '...",2008
2,The Godfather,1800000,132.510334,Mario Puzo,Francis Ford Coppola,"['Francis Ford Coppola', 'Marlon Brando', 'Al ...","['Crime', 'Drama']","['mafia', 'patriarch', 'crime family', 'organi...",1972
3,The Lord of the Rings: The Return of the King,1800000,129.629675,J.R.R. Tolkien,Peter Jackson,"['Fran Walsh', 'Philippa Boyens', 'Elijah Wood...","['Action', 'Adventure', 'Drama']","['epic', 'orc', 'hobbit', 'ring', 'middle eart...",2003
4,Pulp Fiction,2000000,129.127054,Quentin Tarantino,Quentin Tarantino,"['Roger Avary', 'John Travolta', 'Uma Thurman'...","['Crime', 'Drama']","['nonlinear timeline', 'overdose', 'drug use',...",1994
5,Inception,2300000,128.906093,Christopher Nolan,Christopher Nolan,"['Leonardo DiCaprio', 'Joseph Gordon-Levitt', ...","['Action', 'Adventure', 'Sci-Fi']","['dream', 'ambiguous ending', 'subconscious', ...",2010
6,Forrest Gump,2000000,127.676188,Winston Groom,Robert Zemeckis,"['Eric Roth', 'Tom Hanks', 'Robin Wright', 'Ga...","['Drama', 'Romance']","['vietnam war', 'based on book', 'vietnam', 'm...",1994
7,Fight Club,2000000,127.676188,Chuck Palahniuk,David Fincher,"['Jim Uhls', 'Brad Pitt', 'Edward Norton', 'Me...",['Drama'],"['surprise ending', 'anti establishment', 'ins...",1999
8,The Lord of the Rings: The Fellowship of the Ring,1800000,126.749016,J.R.R. Tolkien,Peter Jackson,"['Fran Walsh', 'Philippa Boyens', 'Elijah Wood...","['Action', 'Adventure', 'Drama']","['ring', 'quest', 'hobbit', 'middle earth', 'e...",2001
9,Schindler's List,1300000,126.700873,Thomas Keneally,Steven Spielberg,"['Steven Zaillian', 'Liam Neeson', 'Ralph Fien...","['Biography', 'Drama', 'History']","['accountant', 'champagne', 'villa', 'womanize...",1993


## Create Embeddings for plotwords

In [95]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device(type='cuda')

In [34]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased').eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [132]:
def get_embeddings(arr):
    plot = ', '.join(eval(arr)).lower()
    tok = tokenizer(plot, return_tensors="pt")
    with torch.no_grad():
        out = model(**tok)
    
    yield out.last_hidden_state.squeeze().mean(dim=0)


In [149]:
# using generators to avoid immediate computation
df['embeddings'] = df.plotwords.apply(lambda x: get_embeddings(x))

In [150]:
from tqdm import tqdm

print(len(df.embeddings))

for i, val in tqdm(enumerate(df.embeddings)):
    df['embeddings'].at[i] = np.array([e for e in val], dtype=object)

22223



The input object of type 'Tensor' is an array-like implementing one of the corresponding protocols (`__array__`, `__array_interface__` or `__array_struct__`); but not a sequence (or 0-D). In the future, this object will be coerced as if it was first converted using `np.array(obj)`. To retain the old behaviour, you have to either modify the type 'Tensor', or assign to an empty array created with `np.empty(correct_shape, dtype=object)`.

22223it [05:13, 70.86it/s]


In [151]:
df[['title', 'embeddings']].to_csv('plotword_embeddings.csv', index=False)

## Use annoy index to search for similar movies

In [214]:
# df.embeddings = pd.read_csv('plotword_embeddings.csv')['embeddings']

In [215]:
from annoy import AnnoyIndex

n = 500

embedding_dimensions = len(df.embeddings[0][0])
annoy_index = AnnoyIndex(embedding_dimensions, 'euclidean')
for idx, embedding in tqdm(enumerate(df.embeddings)):
    annoy_index.add_item(idx, embedding[0])
annoy_index.build(n)
annoy_index.save(f'models/angular{n}.ann')


22223it [00:11, 2010.55it/s]


True

In [216]:
df[df.title.str.lower().str.contains('truman show')]


Unnamed: 0,title,votes,rating,writer,director,actors,genres,plotwords,year,embeddings
59,The Truman Show,1000000,113.287187,Andrew Niccol,Peter Weir,"['Jim Carrey', 'Ed Harris', 'Laura Linney', 'P...","['Comedy', 'Drama']","['hidden camera', 'simulated reality', 'fictio...",1998,"[[tensor(0.4149), tensor(0.2347), tensor(0.215..."


In [246]:
annoy_index.get_nns_by_item(
    8, 50, include_distances=True
)[1]

[0.0,
 3.874802350997925,
 3.9118878841400146,
 3.9201176166534424,
 4.2518768310546875,
 4.26020622253418,
 4.306495189666748,
 4.318349361419678,
 4.350052356719971,
 4.363883972167969,
 4.396214962005615,
 4.4211745262146,
 4.498809337615967,
 4.501717567443848,
 4.5411858558654785,
 4.558286190032959,
 4.583065986633301,
 4.592584609985352,
 4.6084442138671875,
 4.6196513175964355,
 4.620026588439941,
 4.643993854522705,
 4.65002965927124,
 4.651834964752197,
 4.659714221954346,
 4.662047386169434,
 4.662972927093506,
 4.688904762268066,
 4.71012020111084,
 4.7106099128723145,
 4.711930751800537,
 4.712278842926025,
 4.719250202178955,
 4.744794845581055,
 4.750583648681641,
 4.757874965667725,
 4.764669895172119,
 4.767683506011963,
 4.767960548400879,
 4.775468826293945,
 4.776240348815918,
 4.788448810577393,
 4.790950775146484,
 4.793685436248779,
 4.804899215698242,
 4.805782318115234,
 4.818269729614258,
 4.818339824676514,
 4.821322441101074,
 4.824385166168213]

In [245]:
df[['title', 'director', 'rating', 'plotwords']].iloc[annoy_index.get_nns_by_item(
    8, 30)].sort_values(by='rating', ascending=False)


Unnamed: 0,title,director,rating,plotwords
3,The Lord of the Rings: The Return of the King,Peter Jackson,129.629675,"['epic', 'orc', 'hobbit', 'ring', 'middle eart..."
8,The Lord of the Rings: The Fellowship of the Ring,Peter Jackson,126.749016,"['ring', 'quest', 'hobbit', 'middle earth', 'e..."
12,The Lord of the Rings: The Two Towers,Peter Jackson,125.712525,"['middle earth', 'hobbit', 'epic', 'wizard', '..."
164,Harry Potter and the Prisoner of Azkaban,Alfonso Cuarón,105.340526,"['magic', 'wizard', 'werewolf', 'rat', 'school..."
188,The Hobbit: The Desolation of Smaug,Peter Jackson,104.352727,"['lord of the rings', 'middle earth', 'sequel'..."
220,Harry Potter and the Sorcerer's Stone,Chris Columbus,102.882101,"['magic', 'wizard', 'orphan', 'school of magic..."
374,The Hobbit: The Battle of the Five Armies,Peter Jackson,97.352906,"['middle earth', 'army', 'epic', 'hobbit', 'ep..."
657,Thor: The Dark World,Alan Taylor,91.150806,"['elf', 'open ended', 'end of the world', 'ado..."
867,Doctor Sleep,Mike Flanagan,88.574564,"['based on the works of stephen king', 'hat', ..."
953,Hellboy II: The Golden Army,Guillermo del Toro,87.517267,"['superhero', 'twins', 'friendship', 'cat', 'e..."
