# Classification By Movie Overview
Classification of movie genre using movie overviews from TMDB exclusively, using the text featurization methods of bigrams and Doc2vec.

In [64]:
from collections import namedtuple
import numpy as np
import pandas as pd
import timeit
from gensim.models import doc2vec
from gensim.parsing.preprocessing import stem
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import hamming_loss
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
import pickle
import string

# Identify Movies for which we have posters
We want to use the same movies for train, validation and test across all our classification tasks, so let's make sure the movies that we're using for text classification are the same ones we're using for image classification

In [31]:
from os import listdir

movie_names = []
for subdirectory in ['train', 'test', 'validation']:
    movie_names += listdir("data/posters_split/" + subdirectory)
movie_ids = [filename.split("-")[-1].split("_")[0] for filename in movie_names]
movie_ids = [int(id) for id in movie_ids]

# Text Preprocessing
We need to preprocess the movie overviews by stripping out punctuation and turning everything to lowercase. We deliberately choose to avoid stop word removal and stemming (both common NLP procedures) due to the relatively short length of the movie overviews. 

We also employ a 70/30 train-test split.

In [32]:
# Pandas has issues reading in a dataframe this large
# so read in chunk by chunk instead
full_dataset = pd.concat(pd.read_csv('data/full_raw_movie_overviews.csv', iterator=True, chunksize=1000), ignore_index=True)
print full_dataset.shape
full_dataset.head()

(31173, 23)


Unnamed: 0,movie_id,movie_title,overview,genre_id,War,Crime,Music,Comedy,History,Western,...,TV_Movie,Fantasy,Animation,Drama,Documentary,Science_Fiction,Horror,Action,Romance,Family
0,329865,Arrival,Taking place after alien crafts land around th...,"[53, 18, 878, 9648]",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,210577,Gone Girl,With his wife's disappearance having become th...,"[9648, 53, 18]",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,27205,Inception,"Cobb, a skilled thief who commits corporate es...","[28, 53, 878, 9648, 12]",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,158852,Tomorrowland,"Bound by a shared destiny, a bright, optimisti...","[28, 10751, 878, 12, 9648]",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,198663,The Maze Runner,"Set in a post-apocalyptic world, young Thomas ...","[28, 9648, 878, 53]",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [33]:
# Remove rows where overview is missing
full_dataset = full_dataset.dropna()
print full_dataset.shape

(20291, 23)


In [34]:
# Filter for movies that we have posters for, dropping duplicates
full_dataset = full_dataset[full_dataset['movie_id'].isin(movie_ids)].drop_duplicates()
print full_dataset.shape

(10509, 23)


In [35]:
# Convert to lowercase and remove punctuation
full_dataset['overview'] = full_dataset['overview'].str.lower()
full_dataset['overview'] = full_dataset['overview'].apply(lambda x: x.translate(string.maketrans(" "," "), string.punctuation))

In [39]:
# Use same test-train-validation split as posters
train_val_test_byid = pickle.load(open('data/train_val_test_byid.pkl', 'rb'))
train = full_dataset[full_dataset['movie_id'].isin(train_val_test_byid['train'])]
test = full_dataset[full_dataset['movie_id'].isin(train_val_test_byid['test'])]
validation = full_dataset[full_dataset['movie_id'].isin(train_val_test_byid['validation'])]
print train.shape, test.shape, validation.shape

(8405, 23) (1049, 23) (1055, 23)


# Bigram feature generation

In [40]:
count_vectorizer = CountVectorizer(ngram_range=(1,2))
count_vectorizer.fit(train["overview"])

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [41]:
# Generate bigram features
train_bigrams = count_vectorizer.transform(train["overview"])
val_bigrams = count_vectorizer.transform(validation["overview"])

# Doc2vec feature generation
We use doc2vec to learn fixed length vector representations of each of the documents (movie overviews) and therefore avoid time intensive route of manual feature engineering. 

In [42]:
taggedXML = namedtuple('TaggedXML', 'words tags')
documents = []

for i in train.index:
    tags = [train['movie_id'][i]]
    documents.append(taggedXML(train['overview'][i].split(), tags))

In [45]:
d2v = doc2vec.Doc2Vec(documents, size=100, workers=4, iter=30)
# Save model
d2v.save("d2v_models/d2v_size100_iter30")

In [46]:
# Use trained d2v model in order to infer feature vectors for each of the documents
train_d2v_features = [d2v.infer_vector(document) for document in train["overview"]]
val_d2v_features = [d2v.infer_vector(document) for document in validation["overview"]]

# Multilabel Random Forest Classification

In [53]:
# Get labels into multilabel classification format for train and test
train_y = train.ix[:, 'War':]
val_y = validation.ix[:, 'War':]

In [56]:
# Train random forest on bigram features
bigram_rf = RandomForestClassifier(n_estimators=10)
bigram_rf.fit(train_bigrams, train_y)

train_bigram_pred = bigram_rf.predict(train_bigrams)
val_bigram_pred = bigram_rf.predict(val_bigrams)

bigram_rf.score(val_bigrams, val_y)

0.023696682464454975




In [81]:
print "Hamming loss on train with bigram features:", hamming_loss(train_y, train_bigram_pred)
print "Hamming loss on validation with bigram features:", hamming_loss(val_y, val_bigram_pred)

Hamming loss on train with bigram features: 0.0176023043927
Hamming loss on validation with bigram features: 0.134696931903


In [58]:
# Train random forest on doc2vec features
d2v_rf = RandomForestClassifier(n_estimators=10)
d2v_rf.fit(train_d2v_features, train_y)

train_d2v_pred = d2v_rf.predict(train_d2v_features)
val_d2v_pred = d2v_rf.predict(val_d2v_features)

d2v_rf.score(val_d2v_features, val_y)

0.0056872037914691941

In [82]:
print "Hamming loss on train with d2v features:",  hamming_loss(train_y, train_d2v_pred)
print "Hamming loss on validation with d2v features:", hamming_loss(val_y, val_d2v_pred)

Hamming loss on train with d2v features: 0.0214095619775
Hamming loss on validation with d2v features: 0.141032676478


# Naive 0 Classifier (Baseline)

In [83]:
print "Hamming loss on train with naive all 0 classifier:",  hamming_loss(train_y, np.zeros((train_d2v_pred.shape)))
print "Hamming loss on validation with naive all 0 classifier:",  hamming_loss(val_y, np.zeros((val_d2v_pred.shape)))

Hamming loss on train with naive all 0 classifier: 0.14239644322
Hamming loss on validation with naive all 0 classifier: 0.139835370417


# Naive Bayes on Bigram Features, One v. Rest Classifier for Multilabels

In [65]:
# Train random forest on bigram features
nb = OneVsRestClassifier(MultinomialNB())
nb.fit(train_bigrams, train_y)

train_nb_pred = nb.predict(train_bigrams)
val_nb_pred = nb.predict(val_bigrams)

nb.score(val_bigrams, val_y)

0.016113744075829384

In [84]:
print "Hamming loss on train with naive bayes:",  hamming_loss(train_y, train_nb_pred)
print "Hamming loss on validation with naive bayes:", hamming_loss(val_y, val_nb_pred)

Hamming loss on train with naive bayes: 0.020470271455
Hamming loss on validation with naive bayes: 0.128261411823


In [80]:
# Check the datapoints for which the naive bayes predictions match the actual 
# multilabels in the validation set
val_y[(val_y == val_nb_pred).all(axis=1)]

Unnamed: 0,War,Crime,Music,Comedy,History,Western,Thriller,Mystery,Adventure,TV_Movie,Fantasy,Animation,Drama,Documentary,Science_Fiction,Horror,Action,Romance,Family
6726,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
26891,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
27489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
27574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
27613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


Unfortunately from the abysmal performance in terms of both Hamming loss and subset accuracy, it does not look like predicting by movie overview alone is a feasible approach for this multilabel classification problem.