In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import Dataset, load_metric
from torch import logical_and, logical_or, nn
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

In [2]:
# Load data and nest genres
df = pd.read_csv("data/df_fixed.csv")
df_genres = pd.read_csv("data/df_genres.csv")

In [3]:
# Only keep the top 30 genres
top_genres = (
    df_genres.query("Genre != 'unknown'")
    .groupby("Genre")
    .agg(n=("Genre", "count"))
    .reset_index()
    .sort_values("n", ascending=False)
    .head(30)
    .Genre.values
)

In [4]:
# Encode genre labels to wide arrays
df_genres = (
    df_genres.query("Genre in @top_genres")
    .assign(cnt=1)
    .pivot_table(index=["movieID"], columns="Genre", values=["cnt"])
    .fillna(0)
    # .astype(int)
    .reset_index(col_level=1)  # get movieID out
)

df_genres.columns = [x[1] for x in df_genres.columns]
df_genres = df_genres.set_index("movieID")

genre_names = df_genres.columns.tolist()
labels = df_genres.values.tolist()
df_genres = pd.DataFrame({"movieID": df_genres.index, "labels": labels})

In [5]:
df = (
    df.reset_index()
    .rename(columns={"index": "movieID"})
    .filter(["movieID", "Plot"])
    .merge(df_genres, on="movieID")
)

df.sample(10)

Unnamed: 0,movieID,Plot,labels
11327,11891,David Greene (Brendan Fraser) is a working-cla...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
6303,6514,An old friend of the boys returns to town and ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
27658,33949,"In 1995, five mysterious murders took place. I...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
4481,4658,Ross McEwen pulls an unusual bank job in the N...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
16465,17210,The film begins with a getaway driver waiting ...,"[1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
27432,33660,"Hikaru Oshiro, noticing that there was a lack ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
15541,16212,"In a high tech underground facility, senior te...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
22958,25266,"Shankar's (Amitabh Bachchan) father, who is a ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
22113,24096,Biren Dutta is a lawyer. He is unsuccessful in...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
10774,11315,Eddie Dodd is a burnt-out attorney who has lef...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."


## Multi-label classification

Recall what our goal was: to predict the genres of movies based on the movie plots. We can do this by using a multi-label classifier, such as `sklearn.ensemble.RandomForestClassifier`. But before getting into the modeling, we need to construct a dataset that can be used by the classifier, i.e. break down the plot text into features.

In [6]:
# Configurations
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
test_ratio = 0.1
val_ratio = 0.1
batch_size = 8
num_labels = len(genre_names)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Construct dataset
dat = Dataset.from_pandas(df)

# Tokenize the Plot column
dat = dat.map(
    lambda batch: tokenizer.batch_encode_plus(
        batch["Plot"], padding="max_length", truncation=True
    ),
    batched=True,
    remove_columns=["Plot", "__index_level_0__", "movieID"],
)

# Retrieve tensors of the following columns as model inputs
valid_cols = ["input_ids", "token_type_ids", "attention_mask", "labels"]
cols = [c for c in dat.column_names if c in valid_cols]
dat.set_format(type="torch", columns=cols)

# Train/validation/test split
dat = dat.train_test_split(test_size=test_ratio, seed=42)
dat_train = dat["train"].train_test_split(test_size=val_ratio, seed=42)
dat["train"] = dat_train["train"]
dat["validation"] = dat_train["test"]

  0%|          | 0/29 [00:00<?, ?ba/s]

In [7]:
# Modify last layer of model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, problem_type="multi_label_classification"
)
model.classifier = nn.Linear(768, num_labels)
model.to("cuda")

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [8]:
training_args = TrainingArguments(
    output_dir="distilbert_multilabel",
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    num_train_epochs=10,
    save_strategy="epoch",
    label_names=genre_names,
)

In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dat["train"],
    eval_dataset=dat["validation"],
)
trainer.train()

***** Running training *****
  Num examples = 22704
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 9460


Epoch,Training Loss,Validation Loss
1,0.1918,No log
2,0.0985,No log
3,0.0832,No log
4,0.072,No log
5,0.0612,No log
6,0.0532,No log
7,0.0459,No log
8,0.0411,No log
9,0.0364,No log
10,0.0343,No log


***** Running Evaluation *****
  Num examples = 2523
  Batch size = 24
Saving model checkpoint to distilbert_multilabel/checkpoint-946
Configuration saved in distilbert_multilabel/checkpoint-946/config.json
Model weights saved in distilbert_multilabel/checkpoint-946/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2523
  Batch size = 24
Saving model checkpoint to distilbert_multilabel/checkpoint-1892
Configuration saved in distilbert_multilabel/checkpoint-1892/config.json
Model weights saved in distilbert_multilabel/checkpoint-1892/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2523
  Batch size = 24
Saving model checkpoint to distilbert_multilabel/checkpoint-2838
Configuration saved in distilbert_multilabel/checkpoint-2838/config.json
Model weights saved in distilbert_multilabel/checkpoint-2838/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2523
  Batch size = 24
Saving model checkpoint to distilbert_multilabel/checkpoint-3784
Con

TrainOutput(global_step=9460, training_loss=0.0680310230174478, metrics={'train_runtime': 3144.6302, 'train_samples_per_second': 72.199, 'train_steps_per_second': 3.008, 'total_flos': 3.00904160477184e+16, 'train_loss': 0.0680310230174478, 'epoch': 10.0})

In [10]:
model.to("cuda:0")
model.eval()

dl = DataLoader(dat["test"], batch_size=8)

In [11]:
hamming_accuracies = []
precisions = []
recalls = []
for batch in dl:
    batch = {k: v.to("cuda:0") for k, v in batch.items()}

    logits = model(**batch).get("logits")
    y_true = batch.get("labels").bool()
    y_pred = nn.Sigmoid()(logits) > 0.5

    true_pos = logical_and(y_true, y_pred).sum(axis=1)
    pred_pos = logical_or(y_true, y_pred).sum(axis=1)

    hamming_score = (true_pos / pred_pos).nansum().cpu().item()
    precision = (true_pos / y_true.sum(axis=1)).nansum().cpu().item()
    recall = (true_pos / y_pred.sum(axis=1)).nansum().cpu().item()

    hamming_accuracies.append(hamming_score / y_true.shape[0])
    precisions.append(precision / y_true.shape[0])
    recalls.append(recall / y_true.shape[0])

In [12]:
print(
    f"""
    Hamming accuracy: {sum([8 * x for x in hamming_accuracies]) / dat["test"].shape[0]}
    Precision: {sum([8 * x for x in precisions]) / dat["test"].shape[0]}
    Recall: {sum([8 * x for x in recalls]) / dat["test"].shape[0]}
"""
)


    Hamming accuracy: 0.48030087160772117
    Precision: 0.5124152715204162
    Recall: 0.5462599597372472

