** The notebook requires the python files we submitted. **

# Data Acquisition

Talk about how we got our data

# Preprocessing

First we need to preprocess our data. For details see common.py. The following code assumes tmdb_posters.pkl is in the ../data folder and glove.6B.300d.txt is in ../local/glove. Results are stored in /tmp; afterwards they should be moved to the ../local folder.

In [1]:
import pickle
import numpy as np
from common import Data, Split, Batches, encode_y, Vocab

# One-off preprocessing, disabled by default; set the guard to True to
# regenerate the artifacts under /tmp.
if False:
    # torch.save is used below, but torch is not imported at the top of this
    # cell, so bring it into scope here.
    import torch

    # load pickled data (only unpickle files you trust); the context manager
    # closes the file handle as soon as loading is done
    data_file = "../data/tmdb_posters.pkl"
    with open(data_file, 'rb') as data_fh:
        data = pickle.load(data_fh)


    # get overviews from data
    OVERVIEWS = Data(np.array([d['overview'] for d in data]))
    # get title from data
    TITLES = Data(np.array([d['title'] for d in data]))

    OVERVIEWS.save("/tmp/overviews.pkl")
    TITLES.save("/tmp/titles.pkl")

    # get genres, encode as 'one'-hot vectors
    GENRES = Data(encode_y(np.array([d['genre_ids'] for d in data])))
    GENRES.save("/tmp/genres.pkl")

    # create train-val-test split
    train, val, test = OVERVIEWS.create_splits(0.8,0.1)
    train.save("/tmp/train.pkl")
    val.save("/tmp/val.pkl")
    test.save("/tmp/test.pkl")

    # create vocab, this is to support fine-tuning of embeddings (otherwise don't call add_sentences)
    # during this step, all punctuation is removed and all words are converted to lower case.
    # only the training split is fed to the vocab
    vocab = Vocab()
    vocab.initialize_glove("../local/glove/glove.6B.300d.txt")
    vocab.add_sentences(train.get_data(OVERVIEWS))
    vocab.add_sentences(train.get_data(TITLES))
    vocab.save("/tmp/vocab.pkl")

    # create embedding layer, for now we freeze the embedding layer. (default is freeze=True)
    embedding = vocab.create_pytorch_embeddings()
    torch.save(embedding, "/tmp/embedding.pth")

    # encode data as indices
    OVERVIEWS_ENCODED = Data(vocab.encode_sentences(OVERVIEWS.data))
    TITLES_ENCODED = Data(vocab.encode_sentences(TITLES.data))

    OVERVIEWS_ENCODED.save("/tmp/overviews_encoded.pkl")
    TITLES_ENCODED.save("/tmp/titles_encoded.pkl")

    # drop the large intermediates so they do not linger in the notebook namespace
    del OVERVIEWS_ENCODED, TITLES_ENCODED, vocab, embedding, train, val, test, GENRES, TITLES, OVERVIEWS, data, data_file, data_fh

In [2]:
# Reload the preprocessed artifacts that were moved into ../local.
from common import load_data, load_split
GENRES = load_data("../local/genres.pkl")
train, val, test = (
    load_split(split_path)
    for split_path in ("../local/train.pkl", "../local/val.pkl", "../local/test.pkl")
)
# Both names are bound to the same loaded object.
OVERVIEWS = OVERVIEWS_ENCODED = load_data("../local/overviews_encoded.pkl")

from sklearn.metrics import accuracy_score, f1_score

def report(Y_true, Y_pred):
    """Print accuracy and micro-averaged F1 of Y_pred against Y_true on one line."""
    acc = accuracy_score(Y_true, Y_pred)
    f1 = f1_score(Y_true, Y_pred, average="micro")
    print("acc:", acc, "\tf1:", f1)

# EDA

# SVM and NaiveBayes (Bag of Words)

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
    
# Slice the encoded overviews (inputs) and the genres (labels) by split.
X_train, X_val, X_test = (part.get_data(OVERVIEWS) for part in (train, val, test))
Y_train, Y_val, Y_test = (part.get_data(GENRES) for part in (train, val, test))

# Render every sample as one space-separated string of its elements so that
# CountVectorizer can tokenize it.
X_train = [" ".join(map(str, sample)) for sample in X_train]
X_val = [" ".join(map(str, sample)) for sample in X_val]
X_test = [" ".join(map(str, sample)) for sample in X_test]

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
# two or more characters, so one-character tokens (e.g. ids 0-9) would be
# dropped -- confirm that is acceptable.
cv = CountVectorizer()

# The vocabulary is fitted on the training split only.
X_train_cv = cv.fit_transform(X_train)
X_val_cv, X_test_cv = cv.transform(X_val), cv.transform(X_test)

In [4]:
# Baseline: one-vs-rest linear SVMs on the raw count features.
svm = OneVsRestClassifier(LinearSVC()).fit(X_train_cv, Y_train)

# Validation scores are printed first, then test scores.
val_predict_svm = svm.predict(X_val_cv)
report(Y_val, val_predict_svm)
test_predict_svm = svm.predict(X_test_cv)
report(Y_test, test_predict_svm)

acc: 0.3373333333333333 	f1: 0.5013595752945745
acc: 0.33 	f1: 0.49223070845404265


In [5]:
# Rebuild the count features with document-frequency pruning
# (max_df=0.95, min_df=0.005), fitted on the training split only.
cv = CountVectorizer(min_df=0.005, max_df=0.95)
X_train_cv = cv.fit_transform(X_train)
X_val_cv, X_test_cv = cv.transform(X_val), cv.transform(X_test)

# One-vs-rest multinomial Naive Bayes on the pruned counts.
nb = OneVsRestClassifier(MultinomialNB()).fit(X_train_cv, Y_train)
val_predict_nb = nb.predict(X_val_cv)
report(Y_val, val_predict_nb)
test_predict_nb = nb.predict(X_test_cv)
report(Y_test, test_predict_nb)

acc: 0.185 	f1: 0.4241896889275614
acc: 0.188 	f1: 0.42417035996108526


The performance of SVM is better than NB in our experiment.

# tf-idf

In [6]:
# Re-weight the current count matrices with tf-idf; the transformer is fitted
# on the training counts and then applied to validation and test.
tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train_cv)
X_val_tfidf, X_test_tfidf = tfidf.transform(X_val_cv), tfidf.transform(X_test_cv)

In [7]:
# One-vs-rest linear SVMs again, this time on the tf-idf features.
svm = OneVsRestClassifier(LinearSVC()).fit(X_train_tfidf, Y_train)

val_predict_svm = svm.predict(X_val_tfidf)
report(Y_val, val_predict_svm)
test_predict_svm = svm.predict(X_test_tfidf)
report(Y_test, test_predict_svm)

acc: 0.279 	f1: 0.3320158102766798
acc: 0.277 	f1: 0.3219440891649918


In [8]:
# Pruned count features, re-weighted by a re-fitted tf-idf transformer.
cv = CountVectorizer(min_df=0.005, max_df=0.95)

# Fit both stages on the training split before touching validation/test.
X_train_cv = cv.fit_transform(X_train)
X_train_tfidf = tfidf.fit_transform(X_train_cv)

X_val_cv = cv.transform(X_val)
X_val_tfidf = tfidf.transform(X_val_cv)
X_test_cv = cv.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_cv)

# One-vs-rest multinomial Naive Bayes on the tf-idf features.
nb = OneVsRestClassifier(MultinomialNB()).fit(X_train_tfidf, Y_train)
val_predict_nb = nb.predict(X_val_tfidf)
report(Y_val, val_predict_nb)
test_predict_nb = nb.predict(X_test_tfidf)
report(Y_test, test_predict_nb)

acc: 0.24966666666666668 	f1: 0.1397802197802198
acc: 0.25133333333333335 	f1: 0.14345991561181437


tfidf doesn't improve the performance of our model.

# word2vec Embeddings

# Deep Learning

## Text-Only Model

The model is composed of an encoder and a classifier. The encoder definition is:
```
class Encoder2(torch.nn.Module):
    """Recurrent text encoder producing one fixed-size vector per sequence.

    Args:
        encoder: RNN class to instantiate (e.g. torch.nn.LSTM); it is called with
            input_size, hidden_size, batch_first, bidirectional, num_layers
            and dropout keyword arguments.
        embedding: embedding module handed to ``pack.get_pack`` in forward.
        hidden_dim: hidden size of the RNN (per direction).
        input_channel: input feature size of the RNN.
        num_layers: number of stacked RNN layers.
        bidirectional: whether the RNN runs in both directions.
        dropout: dropout value forwarded to the RNN.
        cuda: when true, the embedding and the RNN are moved to the GPU.
    """
    def __init__(self, encoder, embedding, hidden_dim, input_channel, num_layers, bidirectional, dropout, cuda):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = embedding
        self.bidirectional = bidirectional
        # NOTE(review): this instance attribute shadows torch.nn.Module.cuda(),
        # so `instance.cuda()` would try to call this flag instead of the
        # method. The name is kept so existing code and saved objects that
        # carry the attribute keep working -- consider renaming in a
        # coordinated change.
        self.cuda = cuda

        self.encoder = encoder(input_size=input_channel, hidden_size=hidden_dim, batch_first=True,
                                bidirectional=bidirectional, num_layers=num_layers, dropout=dropout)

        if cuda:
            self.embedding.cuda()
            self.encoder.cuda()

    def forward(self, pack:Pack):
        """Encode a packed batch and return the encodings indexed by ``pack.get_rev()``."""
        # presumably the index that restores the original sample order -- verify against Pack
        rev = pack.get_rev()
        data = pack.get_pack(self.embedding, torch_var=True)
        if self.cuda:
            # Tensor.cuda() returns the moved tensor instead of working in
            # place, so the result has to be rebound (assumes rev is a tensor).
            rev = rev.cuda()
        states_packed, _ = self.encoder(data) # (packed_sequence, hidden_state)
        # pad_packed_sequence is called without batch_first, so the first axis is time
        states, _ = torch.nn.utils.rnn.pad_packed_sequence(states_packed)
        # Forward half taken at the last time step, backward half at the first;
        # when the RNN is unidirectional the second slice is empty.
        # NOTE(review): states[-1] is the last *padded* step; for sequences
        # shorter than the longest one in the batch that position is padding --
        # confirm whether the last valid step (via the returned lengths) was intended.
        states = torch.cat([states[-1,:,:self.hidden_dim], states[0,:,self.hidden_dim:]], dim=1)
        return states[rev, :]

    def init_hidden(self):
        """No-op placeholder."""
        pass
```
The classifier definition is:
```
class BRClassifier(torch.nn.Module):
    """Bank of ``num_class`` independent single-output classifiers over one shared encoding."""
    def __init__(self, dims, num_class, encoding_size, cuda):
        super().__init__()

        # One single-output head per class.
        self.classifiers = [
            MultiLayerFCReLUClassifier(dims, 1, encoding_size, cuda)
            for _ in range(num_class)
        ]
        # Register each head under its index so it becomes a submodule.
        for position, head in enumerate(self.classifiers):
            self.add_module(str(position), head)

    def forward(self, encodings):
        """Run every head on ``encodings``; the head axis ends up second in the result."""
        per_head = [head(encodings) for head in self.classifiers]
        stacked = torch.stack(per_head)
        # Drop the trailing single-output axis, then swap the first two axes.
        squeezed = stacked[:, :, 0]
        return torch.transpose(squeezed, 0, 1)
```

where MultiLayerFCReLUClassifier is defined as:
```
class MultiLayerFCReLUClassifier(torch.nn.Module):
    """Feed-forward classifier: Linear+ReLU layers followed by a linear output layer.

    Args:
        dims: non-empty sequence of hidden layer widths; ``dims[0]`` is the width
            of the first layer, each further entry adds one Linear+ReLU pair.
        num_class: width of the output layer.
        encoding_size: width of the input encodings.
        cuda: when true, all layers are moved to the GPU.
    """
    def __init__(self, dims, num_class, encoding_size, cuda):
        super().__init__()
        assert(len(dims)>0)
        self.fc1 = torch.nn.Linear(encoding_size, dims[0])
        self.relu1 = torch.nn.ReLU()
        # ModuleList instead of a plain Python list: layers kept in a plain list
        # are not registered as submodules, so parameters() and state_dict()
        # would skip them and an optimizer built from parameters() would never
        # update them.
        self.fcs = torch.nn.ModuleList()
        self.relus = torch.nn.ModuleList()
        prev_dim = dims[0]
        for dim in dims[1:]:
            self.fcs.append(torch.nn.Linear(prev_dim, dim))
            self.relus.append(torch.nn.ReLU())
            prev_dim = dim

        self.out_fc = torch.nn.Linear(dims[-1], num_class)
        if cuda:
            # Every layer is registered now, so one call moves them all.
            self.cuda()

    def forward(self, encodings):
        """Return the output-layer activations for ``encodings`` (no final non-linearity)."""
        l_out = self.relu1(self.fc1(encodings))
        for fc, relu in zip(self.fcs, self.relus):
            l_out = relu(fc(l_out))
        return self.out_fc(l_out)
```

The basic idea is that the encoder is a multi-layer LSTM/GRU network (unidirectional or bidirectional) and the classifier is a combination of 19 binary classifiers. In our experiments, a 3-layer bidirectional LSTM performed best. It is easy to switch to any combination of LSTM/GRU, uni-/bidirectional, hidden dimension, and number of layers by changing the parameters of the Encoder2 class.

In [9]:
import torch

# Embedding layer saved by the preprocessing step, moved to the GPU.
embedding = torch.load('../local/embedding.pth')
embedding = embedding.cuda()

from cls import BRClassifier
from torch_models import Encoder2
from model import TextOnlyModel
from train import train_epoches

# 19 heads on a 1024-wide encoding (512 hidden units per direction, two directions).
classifier = BRClassifier(
    dims=[1024, 512],
    num_class=19,
    encoding_size=1024,
    cuda=True,
)
# Pass torch.nn.GRU instead of torch.nn.LSTM to use a GRU; num_layers,
# hidden_dim and bidirectional can be varied to try other configurations.
# Dropout stays at 0: in our runs it led to underfitting.
encoder = Encoder2(
    encoder=torch.nn.LSTM,
    embedding=embedding,
    input_channel=embedding.embedding_dim,
    hidden_dim=512,
    num_layers=3,
    cuda=True,
    bidirectional=True,
    dropout=0,
)

model = TextOnlyModel(encoder, classifier, OVERVIEWS_ENCODED, GENRES)
loss = torch.nn.BCEWithLogitsLoss().cuda()
# Only parameters that still require gradients are handed to the optimizer.
adam = torch.optim.Adam([p for p in model.parameters() if p.requires_grad])

optimizer, scheduler = adam, None

loss_hist = []
save_per_epoch = 10
n_epochs = 0 # change this to train

# Train in chunks of save_per_epoch epochs and checkpoint after every chunk.
for i in range(int(n_epochs/save_per_epoch)):
    epoch_losses = train_epoches(n_epochs=save_per_epoch, model=model, train=train, loss=loss, val=val,
                  batch_size=32, optimizer=optimizer, scheduler=scheduler)
    loss_hist.append(epoch_losses)
    bn = (i+1)*save_per_epoch
    # File-name suffix: first four characters of the last entry of epoch_losses[1][1].
    tag = str(epoch_losses[1][1][-1])[:4]
    torch.save(model.encoder, "/tmp/encoder_{}_{}.pth".format(bn, tag))
    torch.save(model.classifier, "/tmp/cls_{}_{}.pth".format(bn, tag))

In [10]:
from utils import evaluate, inference
# Restore the saved text encoder/classifier pair.
encoder = torch.load("./saved/overview-lstm2/encoder_70_0.53.pth")
# Only the wrapped RNN is moved explicitly here.
encoder.encoder.cuda()
classifier = torch.load("./saved/overview-lstm2/cls_70_0.53.pth")
classifier = classifier.cuda()
model = TextOnlyModel(encoder, classifier, OVERVIEWS_ENCODED, GENRES)
# Validation split first, then test split.
(Yp_val, Yt_val), (Yp_test, Yt_test) = [
    inference(split=part, model=model, batch_size=128) for part in (val, test)
]

In [11]:
# Scores of the text-only model: validation line first, test line second.
report(Y_true=Yt_val, Y_pred=Yp_val)
report(Y_true=Yt_test, Y_pred=Yp_test)
# Unbind the text-model objects before the next experiment.
del (model, encoder, classifier, optimizer, scheduler, adam, loss)

acc: 0.3556666666666667 	f1: 0.5327067191667725
acc: 0.3486666666666667 	f1: 0.5147591921284308


Our RNN performs better than all previous models. But what if we try something more interesting, like using the posters?

## Poster-Only Model

We need to preprocess the posters. We assume the posters are in ../local/posters. After preprocessing, posters.npy needs to be copied from /tmp to ../local. We essentially convert the images into matrices and apply the transformation required by the torchvision models.

In [12]:
import cv2
import os

from torchvision import transforms

# All posters are decoded and preprocessed in memory at once, which is fast but
# needs A LOT OF RAM. The images could instead be processed individually and
# combined afterwards.
if False:
    image_folder = "../local/posters/"

    # NOTE(review): cv2.imread yields BGR channel order, while these mean/std
    # values are the usual ImageNet RGB statistics -- confirm the channel order
    # is intended before regenerating posters.npy.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    toTensor = transforms.ToTensor()
    preprocess = transforms.Compose([toTensor, normalize])

    images_list = []
    for poster_id in range(30000):
        poster_path = os.path.join(image_folder, "{}.jpg".format(poster_id))
        pixels = cv2.imread(poster_path)
        # imread returns None when the file cannot be read
        assert(pixels is not None)
        images_list.append(preprocess(pixels).numpy())

    # np.stack needs every poster to have the same shape.
    images = np.stack(images_list)

    POSTERS = Data(images)
    assert(len(POSTERS)==30000)
    POSTERS.save("/tmp/posters.npy")

The structure of the poster-only model is very similar to TextOnlyModel, except that the encoder is now a CNN. We tried both VGG16 and ResNet-152 as our encoder, and found that ResNet-152 works better.

In [13]:
from torchvision.models import resnet152, vgg16
from model import PosterOnlyModel

POSTERS = load_data("../local/posters.npy")

if True:
    MODEL = resnet152(pretrained=True)
    stages = list(MODEL.children())
    # Freeze every top-level child except the last three.
    for stage in stages[:-3]:
        for weight in stage.parameters():
            weight.requires_grad = False
    # Keep everything but the final child as the feature extractor.
    MODEL = torch.nn.Sequential(*stages[:-1]).cuda()

    # Conv4, turn off grad in first 2 blocks
    # NOTE(review): with torchvision's ResNet child order (..., layer3, layer4,
    # avgpool, fc) the [:-3] slice above appears to cover this stage already,
    # which would make this loop a no-op -- confirm which layers were meant
    # to stay trainable.
    conv4_blocks = list(list(MODEL.children())[-3].children())
    for block in conv4_blocks[:2]:
        for weight in block.parameters():
            weight.requires_grad = False
else:
    # in order to use VGG16, just use the following code instead of the code above
    MODEL = vgg16(pretrained=True)
    # Freeze base layers (except for the last three conv layers)
    feature_layers = list(MODEL.features.children())
    for layer in feature_layers[:-7]:
        for weight in layer.parameters():
            weight.requires_grad = False
    MODEL = MODEL.features.cuda()

# 19 heads on a 2048-wide poster encoding.
classifier = BRClassifier(
    dims=[1024, 512],
    num_class=19,
    encoding_size=2048,
    cuda=True,
)

model = PosterOnlyModel(MODEL, classifier, POSTERS, GENRES)

# Frozen parameters are excluded from the optimizer.
adam = torch.optim.Adam([p for p in model.parameters() if p.requires_grad])

optimizer, scheduler = adam, None

loss = torch.nn.BCEWithLogitsLoss().cuda()

n_epochs = 0 # change this to train
save_per_epoch = 10
# Train in chunks of save_per_epoch epochs and checkpoint after every chunk.
for i in range(int(n_epochs/save_per_epoch)):
    epoch_losses = train_epoches(n_epochs=save_per_epoch, model=model, train=train, loss=loss, val=val,
                  batch_size=16, optimizer=optimizer, scheduler=scheduler)
    # loss_hist is the list created in the text-model cell.
    loss_hist.append(epoch_losses)
    bn = (i+1)*save_per_epoch
    # File-name suffix: first four characters of the last entry of epoch_losses[1][1].
    tag = str(epoch_losses[1][1][-1])[:4]
    torch.save(model.encoder, "/tmp/cnn_encoder_{}_{}.pth".format(bn, tag))
    torch.save(model.classifier, "/tmp/cnn_cls_{}_{}.pth".format(bn, tag))

In [14]:
# Restore the saved poster encoder/classifier pair and move both to the GPU.
encoder = torch.load("./saved/poster-res2/encoder_80_0.45.pth")
encoder.cuda()
classifier = torch.load("./saved/poster-res2/cls_80_0.45.pth")
classifier = classifier.cuda()
model = PosterOnlyModel(encoder, classifier, POSTERS, GENRES)
# Validation split first, then test split.
(Yp_val, Yt_val), (Yp_test, Yt_test) = [
    inference(split=part, model=model, batch_size=32) for part in (val, test)
]

In [15]:
# Scores of the poster-only model: validation line first, test line second.
report(Y_true=Yt_val, Y_pred=Yp_val)
report(Y_true=Yt_test, Y_pred=Yp_test)
# Unbind the poster-model objects before the next experiment.
del (model, MODEL, classifier, optimizer, scheduler, adam, loss)

acc: 0.31966666666666665 	f1: 0.45208568207440814
acc: 0.31066666666666665 	f1: 0.4304740083058857


The poster-only model is worse than text-only model and several traditional models. But it's not surprising because inferring genre from poster is harder (even for humans) than from overview text. But what if we combine the poster model with text model? 

## Combined Model

The model is simple: the CNN and RNN operate independently, and the encodings from both networks are then stacked and fed to a classifier.

We wanted to train the combined model end to end, but the GPU RAM is too small! So instead we fixed the RNN and CNN, and only trained the classifier.

The combined encoder is as simple as:

```
class TextPosterCombinedEncoder(torch.nn.Module):
    """Wraps a text encoder and a poster encoder and runs both on their own inputs."""
    def __init__(self, text_encoder, poster_encoder):
        super().__init__()
        self.poster_encoder = poster_encoder
        self.text_encoder = text_encoder

    def forward(self, text_pack, posters):
        """Return ``(poster_encodings, text_encodings)``.

        The poster encoder output is flattened to one row per poster; the text
        encoder output is returned unchanged.
        """
        batch_size = len(posters)
        poster_features = self.poster_encoder(posters)
        flat_poster_features = poster_features.view(batch_size, -1)
        text_features = self.text_encoder(text_pack)
        return flat_poster_features, text_features
```

In [16]:
from torch_models import TextPosterCombinedEncoder
from model import TextPosterCombinedModel

# Reuse the two saved single-modality encoders.
text_encoder = torch.load("./saved/overview-lstm2/encoder_70_0.53.pth")
posters_encoder = torch.load("./saved/poster-res2/encoder_80_0.45.pth")

encoder = TextPosterCombinedEncoder(text_encoder, posters_encoder).cuda()

# Both encoders stay fixed; only the classifier below is trained.
for frozen in encoder.parameters():
    frozen.requires_grad = False

# Input width = 2048 (poster encoding) + 1024 (text encoding).
classifier = BRClassifier(
    dims=[1024, 512],
    num_class=19,
    encoding_size=2048+1024,
    cuda=True,
)

optimizer = torch.optim.Adam([p for p in classifier.parameters() if p.requires_grad])
loss = torch.nn.BCEWithLogitsLoss().cuda()

scheduler=None

model = TextPosterCombinedModel(encoder, classifier, OVERVIEWS_ENCODED, POSTERS, GENRES)

n_epochs = 0 # change this to train
save_per_epoch = 4
# Train in chunks of save_per_epoch epochs; only the classifier is checkpointed.
for i in range(int(n_epochs/save_per_epoch)):
    epoch_losses = train_epoches(n_epochs=save_per_epoch, model=model, train=train, loss=loss, val=val,
                  batch_size=32, optimizer=optimizer, scheduler=scheduler)
    bn = (i+1)*save_per_epoch
    # File-name suffix: first four characters of the last entry of epoch_losses[1][1].
    tag = str(epoch_losses[1][1][-1])[:4]
    torch.save(model.classifier, "/tmp/cbn_cls_{}_{}.pth".format(bn, tag))

In [17]:
# Restore the saved combined classifier; `encoder` is the combined encoder
# built in the previous cell.
classifier = torch.load("./saved/cbn/cls_4_0.54.pth")
classifier = classifier.cuda()
model = TextPosterCombinedModel(encoder, classifier, OVERVIEWS_ENCODED, POSTERS, GENRES)
# Validation split first, then test split.
(Yp_val, Yt_val), (Yp_test, Yt_test) = [
    inference(split=part, model=model, batch_size=32) for part in (val, test)
]

In [18]:
# Scores of the combined model: validation line first, test line second.
report(Y_true=Yt_val, Y_pred=Yp_val)
report(Y_true=Yt_test, Y_pred=Yp_test)
# Unbind the combined-model objects.
del (model, encoder, classifier, optimizer, scheduler, loss)

acc: 0.3973333333333333 	f1: 0.5420451215939057
acc: 0.37933333333333336 	f1: 0.5186186186186187


The result is slightly better than TextOnlyModel, making this our best-performing model. The result could be better if we unfroze some layers of the RNN and CNN, but that requires more GPU RAM. Training on a more powerful GPU or on multiple GPUs is a possible future direction.