In [0]:
import ast
import itertools
import numpy as np
import operator
import pandas as pd
import random
import spacy
import torch
from pathlib import Path
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from spacy.util import minibatch, compounding
from tqdm import tqdm

Constants used to parametrize notebook 

In [0]:
# Path to the movies_metadata.csv file
MOVIES_METADATA_FILE_PATH = '/opt/movie_classifier/data/movies_metadata.csv'

# Genres to be used as classes (only the ones with at least 5000 movies)
GENRES_TO_INCLUDE = ['Action', 'Comedy', 'Drama', 'Romance', 'Thriller']

# Directory for saving the output model
OUTPUT_DIR = '/opt/movie_classifier/spacy_model'

# Spacy pretrained model
SPACY_MODEL = 'en_core_web_lg'

# Training batch size
BATCH_SIZE = 64

# Early stopping patience
PATIENCE = 5

Setting random seeds to make results reproducible


In [0]:
random.seed(18)
np.random.seed(18)

In [0]:
if OUTPUT_DIR is not None:
    OUTPUT_DIR = Path(OUTPUT_DIR)
    if not OUTPUT_DIR.exists():
        OUTPUT_DIR.mkdir()

## Data preparation

Loading the movies_metadata.csv in a pandas dataframe and selecting the colums of interest for the classification task

In [44]:
training_df = pd.read_csv(MOVIES_METADATA_FILE_PATH)
training_df = training_df[['overview', 'genres']]
training_df = training_df.dropna()
training_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,original_title,overview,genres
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]"


The genres field currently is currently valued with a complex structure containing multiple entries (list of dicts). We are going to simplify it by mapping it to a list of strings corresponding to the genres names.

In [0]:
training_df.genres = training_df.genres.map(lambda x: [genre['name'] for genre in ast.literal_eval(x)])

In order to obtain a more balanced dataset, we are going to only keep movies in the dataset if their genre appears in at least 5000 entries; these genres were previously extracted and listed in the **GENRES_TO_INCLUDE** list.

In [0]:
training_df.genres = training_df.genres.map(lambda x: np.intersect1d(GENRES_TO_INCLUDE, x).tolist())
training_df = training_df[training_df['genres'].astype(str) != '[]']

Final dataframe

In [48]:
training_df.head()

Unnamed: 0,original_title,overview,genres
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",[Comedy]
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Comedy, Romance]"
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]"
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy]
5,Heat,"Obsessive master thief, Neil McCauley leads a ...","[Action, Drama, Thriller]"


Splitting the dataframe in training, validation and test sets.


In [50]:
texts = training_df['overview'].values
labels = training_df['genres'].values

train_texts, test_texts, train_cats, test_cats = train_test_split(texts, labels, test_size=0.2, random_state=42)

train_texts, val_texts, train_cats, val_cats = train_test_split(train_texts, train_cats, test_size=0.1, random_state=42)

train_cats = [{genre: genre in y for genre in GENRES_TO_INCLUDE} for y in train_cats]
test_cats = [{genre: genre in y for genre in GENRES_TO_INCLUDE} for y in test_cats]
val_cats = [{genre: genre in y for genre in GENRES_TO_INCLUDE} for y in val_cats]

train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
print(f'Using {len(train_texts) + len(test_texts)} examples ({len(train_texts)} training, {len(val_texts)} validation, {len(test_texts)} test)')

Using 31868 examples (24940 training, 2772 validation, 6928 test)


## Training

Definin an evaluation funciton. This function will be used to evaluate the model's performance at each epoch.

In [0]:
def evaluate(nlp, texts, cats):
    """
    This method evaluates the performance of the nlp model on the
    test texts and cats passed as parameters.
    It returns a dictionary containing the following keys:
      - textcat_p: the model precision (tp / (tp + fp))
      - textcat_r: the model recall (tp / (tp + fn))
      - textcat_f: the model F1 score (2 * (precision * recall) / (precision + recall))
      - textcat_a: the model accuracy ((tp + tn) / (tp + tn + fp + fn))
    """
    tp = 0.0  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0  # True negatives
    for i, doc in enumerate(nlp.pipe(texts)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score, "textcat_a": accuracy}

Initializing the Spacy nlp model

In [52]:
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    spacy.require_gpu()
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
nlp = spacy.load(SPACY_MODEL)
print(f'Loaded model {SPACY_MODEL}')

Loaded model en_core_web_lg


Adding the model a new TextCategorizer pipeline component to be trained from scratch

In [0]:
# add the text classifier to the pipeline
textcat = nlp.create_pipe(
    "textcat", config={"exclusive_classes": False, "architecture": "simple_cnn"}
)
nlp.add_pipe(textcat, last=True)

# add label to text classifier
for label in GENRES_TO_INCLUDE:
    textcat.add_label(label)

Training loop with early stopping

In [54]:
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]

with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print('EPOCH\tLOSS\tP\tR\tF\tA')
    best_f1 = -1
    epochs = 0
    while True:
        losses = {}
        random.shuffle(train_data)
        batches = minibatch(train_data, size=BATCH_SIZE)
        with tqdm(total=len(train_data) // BATCH_SIZE) as pbar:
            for i, batch in enumerate(batches):
                if i < len(train_data) // BATCH_SIZE:
                    texts, annotations = zip(*batch)
                    nlp.update(texts, annotations, sgd=optimizer, losses=losses)
                    pbar.update(1)
            with nlp.use_params(optimizer.averages):
                  scores = evaluate(nlp, val_texts, val_cats)
            print(f'\n{epochs}\t{losses["textcat"]:.3f}\t{scores["textcat_p"]:.3f}\t{scores["textcat_r"]:.3f}\t{scores["textcat_f"]:.3f}\t{scores["textcat_a"]:.3f}')
            if scores["textcat_f"] > best_f1:
                best_acc = scores["textcat_f"]
                with nlp.use_params(optimizer.averages):
                    nlp.to_disk(OUTPUT_DIR)
                print("Saved model to", OUTPUT_DIR)
                bad_epochs = 0
            else:
                bad_epochs += 1
            epochs += 1
            if bad_epochs == PATIENCE:
                break

  0%|          | 1/389 [00:00<00:40,  9.51it/s]

Training the model...
EPOCH	LOSS	P	R	F	A


100%|██████████| 389/389 [00:27<00:00, 14.65it/s]


0	0.075	0.723	0.572	0.639	0.797



  1%|          | 2/389 [00:00<00:30, 12.69it/s]

Saved model to genre-classifier


100%|█████████▉| 388/389 [00:25<00:00, 14.55it/s]


1	0.061	0.730	0.585	0.649	0.802


100%|██████████| 389/389 [00:33<00:00, 11.70it/s]
  1%|          | 2/389 [00:00<00:26, 14.70it/s]

Saved model to genre-classifier


100%|█████████▉| 388/389 [00:25<00:00, 15.18it/s]


2	0.053	0.723	0.610	0.662	0.805


100%|██████████| 389/389 [00:33<00:00, 11.66it/s]
  1%|          | 2/389 [00:00<00:27, 14.13it/s]

Saved model to genre-classifier


100%|██████████| 389/389 [00:27<00:00, 14.34it/s]
  1%|          | 2/389 [00:00<00:25, 15.44it/s]


3	0.046	0.704	0.614	0.656	0.799


100%|█████████▉| 388/389 [00:25<00:00, 14.66it/s]


4	0.038	0.705	0.627	0.664	0.801


100%|██████████| 389/389 [00:32<00:00, 11.91it/s]
  1%|          | 2/389 [00:00<00:28, 13.81it/s]

Saved model to genre-classifier


100%|██████████| 389/389 [00:27<00:00, 14.37it/s]
  1%|          | 2/389 [00:00<00:25, 15.15it/s]


5	0.031	0.693	0.622	0.655	0.795


100%|██████████| 389/389 [00:27<00:00, 14.33it/s]
  1%|          | 2/389 [00:00<00:24, 16.11it/s]


6	0.026	0.688	0.618	0.651	0.793


100%|██████████| 389/389 [00:27<00:00, 14.39it/s]
  1%|          | 2/389 [00:00<00:26, 14.68it/s]


7	0.021	0.680	0.617	0.647	0.789


100%|██████████| 389/389 [00:26<00:00, 14.58it/s]
  1%|          | 2/389 [00:00<00:23, 16.31it/s]


8	0.018	0.681	0.617	0.647	0.790


100%|██████████| 389/389 [00:27<00:00, 14.27it/s]


9	0.016	0.678	0.615	0.645	0.788





# Performance evaluation

Loading the model

In [55]:
print(f"Loading from {OUTPUT_DIR}")
nlp = spacy.load(OUTPUT_DIR)

Loading from ./genre-classifier


Evaluating the model's performance

In [56]:
test_scores = evaluate(nlp, val_texts, val_cats)
print(f'Precision: {scores["textcat_p"]:.3f}\n Recall: {scores["textcat_r"]:.3f}\n F1 Score: {scores["textcat_f"]:.3f}\n Accuracy: {scores["textcat_a"]:.3f}')

Precision: 0.678
 Recall: 0.615
 F1 Score: 0.645
 Accuracy: 0.788


Model usage and output examples

In [59]:
test_description = "The evil Iago pretends to be friend of Othello in order to manipulate him to serve his own end in the film version of this Shakespeare classic."

# test the saved model
doc = nlp(test_description)
print(test_description)
print(doc.cats)

The evil Iago pretends to be friend of Othello in order to manipulate him to serve his own end in the film version of this Shakespeare classic.
{'Action': 0.012063146568834782, 'Comedy': 0.17406924068927765, 'Drama': 0.571480929851532, 'Romance': 0.008160477504134178, 'Thriller': 0.009414924308657646}
