# Classification By Movie Overview
Classification of movie genre using movie overviews from TMDB exclusively, using the text featurization methods of bigrams and Doc2vec.

In [154]:
from collections import namedtuple
import numpy as np
import pandas as pd
import timeit
from gensim.models import doc2vec
from gensim.parsing.preprocessing import stem
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import hamming_loss
import pickle
import string

# Identify Movies for which we have posters
We want to use the same movies to train and test across our classification tasks, so let's make sure the movies that we're using for text classification are the same ones we're using for image classification

In [73]:
from os import listdir

movie_names = []
with open("data/genre_dict.pickle", 'rb') as handle:
    genres = pickle.load(handle)
for genre_name in genres.values():
    movie_names += listdir("data/posters/" + genre_name)
movie_ids = [filename.split("-")[-1].split("_")[0] for filename in movie_names]
movie_ids = [int(id) for id in movie_ids]

# Text Preprocessing
We need to preprocess the movie overviews by stripping out punctuation and turning everything to lowercase. We deliberately choose to avoid stop word removal and stemming (both common NLP procedures) due to the relatively short length of the movie overviews. 

We also employ an 70/30 train-test split.

In [46]:
full_dataset = pd.read_csv('data/full_raw_movie_overviews.csv')
print full_dataset.shape
full_dataset.head()

(2260, 23)


Unnamed: 0,movie_id,movie_title,overview,genre_id,War,Crime,Music,Comedy,History,Western,...,TV_Movie,Fantasy,Animation,Drama,Documentary,Science_Fiction,Horror,Action,Romance,Family
0,329865,Arrival,Taking place after alien crafts land around th...,"[53, 18, 878, 9648]",0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
1,210577,Gone Girl,With his wife's disappearance having become th...,"[9648, 53, 18]",0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,419430,Get Out,A young black man visits his white girlfriend'...,"[27, 9648, 53]",0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,27205,Inception,"Cobb, a skilled thief who commits corporate es...","[28, 53, 878, 9648, 12]",0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
4,273248,The Hateful Eight,Bounty hunters seek shelter from a raging bliz...,"[80, 18, 9648, 37]",0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0


In [47]:
# Remove rows where overview is missing
full_dataset = full_dataset.dropna()
print full_dataset.shape

(2248, 23)


In [80]:
# Filter for movies that we have posters for, dropping duplicates
full_dataset = full_dataset[full_dataset['movie_id'].isin(movie_ids)].drop_duplicates()
print full_dataset.shape

(1177, 23)


In [81]:
# Convert to lowercase and remove punctuation
full_dataset['overview'] = full_dataset['overview'].str.lower()
full_dataset['overview'] = full_dataset['overview'].apply(lambda x: x.translate(string.maketrans(" "," "), string.punctuation))

In [82]:
# Randomly Train-test split, 70-30
train, test = train_test_split(full_dataset, test_size = 0.30)

In [89]:
# Save to csv
train.to_csv("data/preprocessed_overviews_train_small.csv", index=False)
test.to_csv("data/preprocessed_overviews_test_small.csv", index=False)

# Bigram feature generation

In [90]:
count_vectorizer = CountVectorizer(ngram_range=(1,2))
count_vectorizer.fit(train["overview"])

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [107]:
# Generate bigram features
train_bigrams = count_vectorizer.transform(train["overview"])
test_bigrams = count_vectorizer.transform(test["overview"])

# Doc2vec feature generation
We use doc2vec to learn fixed length vector representations of each of the documents (movie overviews) and therefore avoid time intensive route of manual feature engineering. 

In [91]:
taggedXML = namedtuple('TaggedXML', 'words tags')
documents = []

for i in train.index:
    tags = [train['movie_id'][i]]
    documents.append(taggedXML(train['overview'][i].split(), tags))

In [93]:
d2v = doc2vec.Doc2Vec(documents, size=10, workers=4, iter=30)
# Save model
d2v.save("d2v_models/d2v_size10_iter30")

In [106]:
# Use trained d2v model in order to infer feature vectors for each of the documents
train_d2v_features = [d2v.infer_vector(document) for document in train["overview"]]
test_d2v_features = [d2v.infer_vector(document) for document in test["overview"]]

# Multilabel Random Forest Classification

In [104]:
# Get labels into multilabel classification format for train and test
train_y = train.ix[:, 'War':]
test_y = test.ix[:, 'War':]

In [151]:
# Train random forest on bigram features
bigram_rf = RandomForestClassifier(n_estimators=10)
bigram_rf.fit(train_bigrams, train_y)

train_bigram_pred = bigram_rf.predict(train_bigrams)
test_bigram_pred = bigram_rf.predict(test_bigrams)

bigram_rf.score(test_bigrams, test_y)

0.0028248587570621469

In [163]:
print "Hamming loss on train with bigram features:", hamming_loss(train_y, train_bigram_pred)
print "Hamming loss on test with bigram features:", hamming_loss(test_y, test_bigram_pred)

Hamming loss on train with bigram features: 0.0227025644305
Hamming loss on test with bigram features: 0.150609574784


In [160]:
# Train random forest on doc2vec features
d2v_rf = RandomForestClassifier(n_estimators=10)
d2v_rf.fit(train_d2v_features, train_y)

train_d2v_pred = d2v_rf.predict(train_d2v_features)
test_d2v_pred = d2v_rf.predict(test_d2v_features)

d2v_rf.score(test_d2v_features, test_y)

0.0

In [162]:
print "Hamming loss on train with d2v features:",  hamming_loss(train_y, train_d2v_pred)
print "Hamming loss on test with d2v features:", hamming_loss(test_y, test_d2v_pred)

Hamming loss on train with d2v features: 0.0248129436593
Hamming loss on test with d2v features: 0.16696402022


In [172]:
print "Hamming loss on train with naive all 0 classifier:",  hamming_loss(train_y, np.zeros((train_d2v_pred.shape)))
print "Hamming loss on test with naive all 0 classifier:",  hamming_loss(test_y, np.zeros((test_d2v_pred.shape)))

 Hamming loss on train with naive all 0 classifier: 0.156487817356
Hamming loss on test with naive all 0 classifier: 0.155961938745
