In [1]:
from glob import glob
import pandas as pd
import numpy as np
import os
import json
from collections import defaultdict
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import re
from gensim.models import Word2Vec
from tqdm import tqdm

In [2]:
# Movie metadata, restricted to the columns used later in the notebook.
df = pd.read_csv('dataset/tmdb_5000_movies_nonull.csv')[[
    'id', 'budget', 'genres', 'keywords', 'original_language', 'overview',
    'popularity', 'production_companies', 'production_countries',
    'release_date', 'revenue', 'runtime', 'spoken_languages', 'tagline',
    'title', 'vote_average', 'vote_count',
]]

# Credits table: only the cast column is needed. `movie_id` is renamed to
# `id` so that it lines up with the key column of the movie table.
df_credits = pd.read_csv('dataset/tmdb_5000_credits.csv')
credits_sub = df_credits[['movie_id', 'cast']].rename(
    columns={'movie_id': 'id'})

# `id` is the only column the two frames share, so it is the join key.
df = df.merge(credits_sub, on='id')

In [3]:
# Load the subtitle lookup (it is indexed by movie title further down).
# JSON text is UTF-8 by specification, so the encoding is pinned instead of
# relying on the platform's default text encoding.
with open('dataset/subtitles/subtitles.json', 'r', encoding='utf-8') as f:
    sub_dict = json.load(f)
# Wrap in a defaultdict so that a missing key yields '' instead of KeyError.
sub_dict = defaultdict(str, sub_dict)

In [4]:
# One string per movie: its subtitle blocks joined by a blank line.
# Titles that are not in sub_dict produce '' (sub_dict is a defaultdict(str)).
# To keep the blocks separated instead, store sub_dict[title] unjoined.
df["subtitles"] = ["\n\n".join(sub_dict[title]) for title in df.title]
df.shape

(4799, 19)

In [5]:
# Drop movies with no genre info (genres is the empty JSON list '[]'), then
# drop every row that still contains a missing value.
# A boolean mask replaces the previous row-by-row `iterrows` loop with an
# in-place `drop` per row: the same rows are removed, in a single pass and
# without mutating the frame while iterating over it.
df = df[df['genres'] != '[]'].dropna()
df.shape

(3956, 19)

In [6]:
# Keep only the movies that have subtitle text and renumber the rows 0..n-1.
df = df[df["subtitles"] != ''].reset_index(drop=True)
print(df.shape)
df.tail()

(3037, 19)


Unnamed: 0,id,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count,cast,subtitles
3032,226458,0,"[{""id"": 53, ""name"": ""Thriller""}, {""id"": 27, ""n...","[{""id"": 9712, ""name"": ""possession""}]",en,"During an all-night, drug-fueled party at an a...",3.619167,"[{""name"": ""GO Productions"", ""id"": 2943}, {""nam...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2015-01-16,0,91.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",nederlands,Backmask,4.7,79,"[{""cast_id"": 3, ""character"": ""Father Conway"", ...",The Exeter School for\nthe feeble minded.\n\nT...
3033,692,12000,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 35, ""nam...","[{""id"": 237, ""name"": ""gay""}, {""id"": 900, ""name...",en,Notorious Baltimore criminal and underground f...,4.553644,"[{""name"": ""Dreamland Productions"", ""id"": 407}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1972-03-12,6000000,93.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",An exercise in poor taste.,Pink Flamingos,6.2,110,"[{""cast_id"": 8, ""character"": ""Divine / Babs Jo...","Hello, moviegoers.\nThis is Mr. Jag...\n\nspea..."
3034,124606,0,"[{""id"": 18, ""name"": ""Drama""}]","[{""id"": 10726, ""name"": ""gang""}, {""id"": 33928, ...",en,A young woman in L.A. is having a bad day: she...,0.918116,"[{""name"": ""Asylum Films"", ""id"": 10571}, {""name...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",1995-09-09,0,98.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Sometimes you've got to break the rules,Bang,6.0,1,"[{""cast_id"": 2, ""character"": ""The Girl"", ""cred...","All right, Saturday\nis the big day.\n\nA lot ..."
3035,14337,7000,"[{""id"": 878, ""name"": ""Science Fiction""}, {""id""...","[{""id"": 1448, ""name"": ""distrust""}, {""id"": 2101...",en,Friends/fledgling entrepreneurs invent a devic...,23.307949,"[{""name"": ""Thinkfilm"", ""id"": 446}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2004-10-08,424760,77.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",What happens if it actually works?,Primer,6.9,658,"[{""cast_id"": 1, ""character"": ""Aaron"", ""credit_...",Here's what's going to happen.\nI'm going to r...
3036,9367,220000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 5616, ""name"": ""united states\u2013mexi...",es,El Mariachi just wants to play his guitar and ...,14.269792,"[{""name"": ""Columbia Pictures"", ""id"": 5}]","[{""iso_3166_1"": ""MX"", ""name"": ""Mexico""}, {""iso...",1992-09-04,2040920,81.0,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]","He didn't come looking for trouble, but troubl...",El Mariachi,6.6,238,"[{""cast_id"": 1, ""character"": ""El Mariachi"", ""c...",Yes.\n\nYes.\n\nYou know what to do.\n\nGood m...


In [7]:
def text_to_words(text):
    """Lower-case `text` and split it into word tokens.

    Every character that is not a word character, whitespace, an apostrophe
    or a hyphen is deleted first; whitespace, apostrophes and hyphens are
    therefore kept while all other punctuation is stripped. The cleaned,
    lower-cased text is then tokenized with nltk's `word_tokenize`.
    """
    without_punctuation = re.sub(r'[^\w\'\s-]', '', text)
    lowered = without_punctuation.lower()
    return word_tokenize(lowered)

In [8]:
# Movie ids in row order; paired below with each movie's tokenized subtitles.
idxs = df.id.tolist()
# Tokenize every subtitle string (timed with the %time line magic).
%time sub_words = [text_to_words(text) for text in df.subtitles.tolist()]
# Lookup table: movie id -> list of subtitle tokens.
subs = dict(zip(idxs, sub_words))

CPU times: user 1min 45s, sys: 932 ms, total: 1min 46s
Wall time: 1min 48s


In [9]:
# One TaggedDocument per movie: its subtitle tokens, tagged with the movie id.
%time tagged_data = [TaggedDocument(words=word_list, tags=[index]) for index, word_list in subs.items()]

CPU times: user 372 ms, sys: 4 ms, total: 376 ms
Wall time: 374 ms


In [10]:
# Baseline Doc2Vec model (no pretrained word vectors): 50-dimensional
# vectors, min_count=2, 4 worker threads.
model = Doc2Vec(vector_size=50, min_count=2, workers=4)
# Build the vocabulary from the tagged subtitle documents.
%time model.build_vocab(tagged_data)

CPU times: user 12.4 s, sys: 92 ms, total: 12.5 s
Wall time: 12.6 s


In [11]:
word2vec = Word2Vec(size=50)
%time word2vec.build_vocab([word for text in tagged_data for word in text])

CPU times: user 6.52 s, sys: 12 ms, total: 6.54 s
Wall time: 6.56 s


In [12]:
# Download glove.6B.50d.txt from
# https://www.kaggle.com/rtatman/glove-global-vectors-for-word-representation/version/1
# then run the following command to convert it to word2vec format (which adds an extra header line at the top):
# python -m gensim.scripts.glove2word2vec -i glove.6B.50d.txt -o glove.6B.50d.word2vec.txt
# Reference (gensim word2vec source):
# https://github.com/RaRe-Technologies/gensim/blob/4543646d3fe3496e11bc935e72cbf9b18504442e/gensim/models/word2vec.py

In [13]:
# Merge the converted GloVe vectors (text format, UTF-8) into word2vec.
# lockf=0 doesn't train the imported word vectors any further; 1.0 (used
# here) lets them keep training.
%time word2vec.intersect_word2vec_format("dataset/glove.6B.50d.word2vec.txt", lockf=1.0, binary=False, encoding='utf8', unicode_errors='strict')

CPU times: user 14.7 s, sys: 84 ms, total: 14.8 s
Wall time: 14.8 s


In [14]:
# Train the baseline Doc2Vec model on the tagged subtitles, using the corpus
# size recorded by build_vocab and the model's configured epoch count.
%time model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 5min 32s, sys: 1.53 s, total: 5min 33s
Wall time: 1min 47s


In [15]:
# Second Doc2Vec model, built with the same settings and corpus as `model`;
# its word vectors are replaced by the GloVe-initialised ones before training.
model_pretrained = Doc2Vec(vector_size=50, min_count=2, workers=4)
model_pretrained.build_vocab(tagged_data)

In [16]:
model_pretrained.wv = word2vec.wv
%time model_pretrained.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 5min 54s, sys: 1.9 s, total: 5min 56s
Wall time: 2min 11s


In [17]:
# Case-insensitive title search to look up the ids of the Star Wars movies.
df.loc[df.title.str.lower().str.contains("star wars"),
       ["title", "id", "release_date"]]

Unnamed: 0,title,id,release_date
204,Star Wars: Episode III - Revenge of the Sith,1895,2005-05-17
205,Star Wars: Episode II - Attack of the Clones,1894,2002-05-15
208,Star Wars: Episode I - The Phantom Menace,1893,1999-05-19
2199,Star Wars,11,1977-05-25


In [18]:
# Query movie: id 1894 is "Star Wars: Episode II - Attack of the Clones"
# in the title lookup above.
movie_id = 1894

In [19]:
# Subtitle tokens of the query movie, from the id -> tokens lookup.
words = subs[movie_id]

In [20]:
# Infer a fresh vector for the query movie with the baseline model and list
# the 10 stored document vectors closest to it, joined with movie metadata.
inferred_vector = model.infer_vector(words, steps=20)
similar = model.docvecs.most_similar([inferred_vector], topn=10)
pd.DataFrame({
    "id": [doc_id for doc_id, _score in similar],
    "similarity": [score for _doc_id, score in similar],
}).merge(df, on="id")[["title", "id", "release_date", "similarity"]]

Unnamed: 0,title,id,release_date,similarity
0,Star Wars: Episode II - Attack of the Clones,1894,2002-05-15,0.93787
1,Star Wars: Episode III - Revenge of the Sith,1895,2005-05-17,0.929543
2,Return of the Jedi,1892,1983-05-23,0.884209
3,Star Wars: Episode I - The Phantom Menace,1893,1999-05-19,0.872506
4,The Black Hole,9570,1979-12-18,0.851249
5,Star Trek III: The Search for Spock,157,1984-05-31,0.816023
6,John Carter,49529,2012-03-07,0.815593
7,The Empire Strikes Back,1891,1980-05-17,0.799171
8,Star Wars,11,1977-05-25,0.793723
9,Stargate: The Ark of Truth,13001,2008-03-11,0.787915


In [21]:
# Same query as for the baseline, answered by the GloVe-initialised model:
# infer a vector for the query movie and list its 10 nearest documents.
inferred_vector = model_pretrained.infer_vector(words, steps=20)
similar = model_pretrained.docvecs.most_similar([inferred_vector], topn=10)
pd.DataFrame({
    "id": [doc_id for doc_id, _score in similar],
    "similarity": [score for _doc_id, score in similar],
}).merge(df, on="id")[["title", "id", "release_date", "similarity"]]

Unnamed: 0,title,id,release_date,similarity
0,Star Wars: Episode II - Attack of the Clones,1894,2002-05-15,0.939697
1,Star Wars: Episode III - Revenge of the Sith,1895,2005-05-17,0.924172
2,Return of the Jedi,1892,1983-05-23,0.902013
3,Star Wars: Episode I - The Phantom Menace,1893,1999-05-19,0.85875
4,Star Trek III: The Search for Spock,157,1984-05-31,0.802331
5,Star Trek VI: The Undiscovered Country,174,1991-12-05,0.799173
6,Stargate: The Ark of Truth,13001,2008-03-11,0.794862
7,The Empire Strikes Back,1891,1980-05-17,0.789949
8,Star Wars,11,1977-05-25,0.787847
9,Warcraft,68735,2016-05-25,0.787183


In [22]:
def infer_docvecs(df):
    """Infer one document vector per movie in `df`.

    For every value of `df.id`, in row order, the movie's token list is
    looked up in the module-level `subs` dict and passed to
    `model_pretrained.infer_vector` with `steps=20`. Progress is reported
    through tqdm.

    Returns a float32 numpy array holding the inferred vectors, one per row.
    """
    inferred = [model_pretrained.infer_vector(subs[movie_id], steps=20)
                for movie_id in tqdm(df.id.tolist())]
    return np.array(inferred, dtype=np.float32)

In [23]:
def convert_list(cell):
    """Parse a JSON-encoded list of objects and return their 'name' values.

    The names are returned in the order in which the objects appear.
    """
    return [entry['name'] for entry in json.loads(cell)]


def larger_n(col, n, frame=None):
    """Return the names in column `col` that occur at least `n` times.

    Every cell of the column must be a JSON-encoded list of objects that
    each carry a 'name' key; the occurrences of each name are counted
    across all cells.

    Parameters
    ----------
    col : column to scan.
    n : minimum number of occurrences a name needs in order to be kept.
    frame : optional data to scan; any object for which `frame[col]`
        iterates over the JSON strings (e.g. a DataFrame). When omitted, the
        module-level `df` is used, which is what this function always did
        before the parameter existed.

    Returns
    -------
    list of names ordered by decreasing count; names with equal counts keep
    the order in which they were first encountered.
    """
    source = df if frame is None else frame
    counts = defaultdict(int)
    for cell in source[col]:
        for entry in json.loads(cell):
            counts[entry['name']] += 1
    # sorted() is stable, so ties stay in first-seen order.
    ranked = sorted(counts.items(), key=lambda item: -item[1])
    return [name for name, count in ranked if count >= n]


def extract_gender(cell):
    """Count the cast members of each gender in a JSON-encoded cast list.

    Entries whose 'gender' equals 1 are counted as female and entries whose
    'gender' equals 2 as male; every other value is ignored.

    Returns the tuple (female, male).
    """
    codes = [member['gender'] for member in json.loads(cell)]
    return codes.count(1), codes.count(2)


def concat_names(cell):
    """Return the names in `cell` with every space character removed.

    This fuses first and last names into a single token per person.
    """
    return [full_name.replace(' ', '') for full_name in cell]


def list2str(cell):
    """Join the strings in `cell` into one space-separated string."""
    separator = ' '
    return separator.join(cell)


def transform_cols(df, cols_to_transform):
    """Replace JSON-encoded columns of `df` with filtered lists of names.

    Parameters
    ----------
    df : DataFrame whose listed columns hold JSON-encoded lists of objects
        with a 'name' key.
    cols_to_transform : dict mapping a column name to the minimum number of
        occurrences a name needs (see `larger_n`) to be kept in that column.

    For the 'cast' column, two extra columns `female_pct` and `male_pct` are
    added before the cast is reduced to names.

    The frame is modified in place and the same object is returned.

    Note: `larger_n` is called without a frame, so the occurrence counts are
    taken from the module-level `df`, not from the `df` argument.
    """
    for col_name, min_count in cols_to_transform.items():
        # A set makes each membership test O(1); with a list, filtering
        # every cell scanned the whole list once per name.
        frequent = set(larger_n(col_name, min_count))
        if col_name == 'cast':
            gen = df[col_name].apply(extract_gender)
            # The 0.001 in the denominator avoids a division by zero when a
            # movie has no cast member of either counted gender.
            df['female_pct'] = gen.apply(lambda x: x[0]/(x[0]+x[1]+0.001))
            df['male_pct'] = gen.apply(lambda x: x[1]/(x[0]+x[1]+0.001))

        df[col_name] = df[col_name].apply(convert_list)\
            .apply(lambda cell: [kw for kw in cell if kw in frequent])
    return df

In [24]:
# Column name -> minimum number of occurrences a name needs to be kept
# in that column.
cols_to_transform = dict(
    keywords=30,
    genres=0,
    production_companies=5,
    production_countries=3,
    spoken_languages=10,
    cast=2,
)

# transform_cols assigns the new columns on the frame it receives and returns
# that same frame, so df_movies and df refer to the same object afterwards.
df_movies = transform_cols(df, cols_to_transform)

In [25]:
from sklearn.preprocessing import MultiLabelBinarizer

# Fit the binarizer on the genre lists of all movies; it is used below to
# turn each movie's genre list into a multi-hot label row.
mlb = MultiLabelBinarizer()
mlb.fit(df_movies['genres'])

MultiLabelBinarizer(classes=None, sparse_output=False)

In [26]:
import datetime

In [27]:
# Parse release dates so they can be compared with the split point.
# (The discarded `max(), min()` expression that used to follow was removed:
# in the middle of a cell its value was neither stored nor displayed.)
df['release_date'] = pd.to_datetime(df['release_date'])

# Temporal split: movies released before 2012 are used for training, movies
# released from 2012-01-01 onwards for testing.
split_date = datetime.datetime(2012, 1, 1)
train_df = df[df['release_date'] < split_date]
test_df = df[df['release_date'] >= split_date]

# Multi-hot genre labels, one `genre_<name>` column per class known to mlb.
# The label frames get a fresh 0..n-1 index, so their rows correspond to the
# rows of train_df / test_df by position, not by index label.
train_genres_dummies = pd.DataFrame(
    mlb.transform(train_df['genres']),
    columns=mlb.classes_).add_prefix('genre_')

test_genres_dummies = pd.DataFrame(
    mlb.transform(test_df['genres']),
    columns=mlb.classes_).add_prefix('genre_')

# Fraction of all movies that ended up in each split.
len(train_df) / len(df), len(test_df) / len(df)

(0.7912413566019098, 0.20875864339809022)

In [28]:
# Infer subtitle document vectors for both splits (train first, then test).
train_docvecs, test_docvecs = map(infer_docvecs, (train_df, test_df))

100%|██████████| 2403/2403 [11:54<00:00,  3.37it/s]
100%|██████████| 634/634 [02:31<00:00,  4.36it/s]


In [29]:
# Features are the inferred document vectors ...
train_data = train_docvecs
test_data = test_docvecs
# ... and targets are the multi-hot genre label frames.
train_labels = train_genres_dummies
test_labels = test_genres_dummies

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Hyper-parameter grid: only max_depth takes more than one value, the other
# settings are fixed to a single candidate each.
# ('criterion': ['gini', 'entropy'] is a further candidate that is left out.)
parameters = dict(
    n_estimators=[100],
    min_samples_leaf=[2],
    max_depth=[None, 5, 20, 40],
    min_samples_split=[2],
)

# 3-fold grid search over a random forest classifier.
rf = GridSearchCV(
    estimator=RandomForestClassifier(verbose=1, n_jobs=4, oob_score=True),
    param_grid=parameters,
    cv=3,
)

In [31]:
# Run the grid search on the training document vectors and genre labels.
%time rf.fit(train_data, train_labels)

[Parallel(n_jobs=4)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    2.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    2.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

CPU times: user 39.6 s, sys: 112 ms, total: 39.7 s
Wall time: 35.3 s


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=4,
            oob_score=True, random_state=None, verbose=1, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [100], 'min_samples_leaf': [2], 'max_depth': [None, 5, 20, 40], 'min_samples_split': [2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [32]:
# Parameter combination selected by the grid search.
rf.best_params_

{'max_depth': 40,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100}

In [33]:
from sklearn.metrics import accuracy_score
# Predict genre labels for both splits with the fitted grid search.
train_predictions = rf.predict(train_data)
predictions = rf.predict(test_data)

# For multilabel targets accuracy_score is subset accuracy: a movie only
# counts as correct when every one of its genre labels is predicted right.
train_accuracy = accuracy_score(train_labels, train_predictions)
test_accuracy = accuracy_score(test_labels, predictions)
print(f"Train accuracy: {train_accuracy}\n")
print(f"Test accuracy: {test_accuracy}\n")

# Per-genre precision / recall / F1 on the test split.
per_genre_report = classification_report(
    test_labels, predictions, target_names=mlb.classes_)
print(per_genre_report)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished


Train accuracy: 0.6729088639200999

Test accuracy: 0.14353312302839116

                 precision    recall  f1-score   support

         Action       0.73      0.51      0.60       176
      Adventure       0.80      0.26      0.40       125
      Animation       1.00      0.05      0.09        41
         Comedy       0.71      0.63      0.66       202
          Crime       0.73      0.17      0.28        92
    Documentary       0.00      0.00      0.00         7
          Drama       0.64      0.50      0.56       259
         Family       0.93      0.38      0.54        65
        Fantasy       0.67      0.03      0.06        65
        Foreign       0.00      0.00      0.00         1
        History       0.00      0.00      0.00        15
         Horror       1.00      0.07      0.13        83
          Music       0.00      0.00      0.00        25
        Mystery       0.00      0.00      0.00        35
        Romance       0.59      0.22      0.32        72
Science Fiction

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [34]:
from sklearn.metrics import hamming_loss
# Hamming loss (fraction of individual genre labels predicted wrongly)
# on the training and the test split.
train_hamming = hamming_loss(train_labels, train_predictions)
test_hamming = hamming_loss(test_labels, predictions)
print(f"Train Hamming Loss: {train_hamming}\n")
print(f"Hamming Loss: {test_hamming}\n")

Train Hamming Loss: 0.021306699958385352

Hamming Loss: 0.09755520504731861

