In [3]:
!pip install datasets
!pip install transformers
!pip install seqeval



In [8]:
import pandas as pd
import itertools
import os
from datasets import Dataset
from datasets import load_dataset
import json
import transformers
import torch
from data_loader import GzippedJSONDataset, LabelledCaptionsDataset, get_intersection_range, load_captions_from_chunks

## Tokenized, labelled dataset

In [6]:
class LabelledTokensDataset(torch.utils.data.IterableDataset):
    """
    IterableDataset that tokenizes the transcripts of a given caption dataset
    and labels the tokens according to whether they are included in a sponsor-labeled caption.

    Each transcript is cut into consecutive windows of at most ``WINDOW_SIZE`` tokens.
    Every window is yielded as a dict with ``input_ids``, ``labels`` and ``attention_mask``;
    special tokens and padding carry the label ``IGNORED_LABEL`` (-100).
    To balance the data, at most one window per video that contains no sponsor token is yielded.
    """

    # Transcript tokens per window; 510 leaves room for the two special tokens
    # that prepare_for_model adds around a 512-token BERT input.
    WINDOW_SIZE = 510

    # Label given to special tokens and padding (-100, for the BERT model to ignore).
    IGNORED_LABEL = -100

    def __init__(self, dataset: torch.utils.data.IterableDataset, tokenizer: transformers.PreTrainedTokenizer):
        """
        :param dataset: iterable yielding ``(video_id, captions, sponsor_times)`` triples, where
            ``captions`` is an indexable sequence of dicts with a ``'text'`` entry and
            ``sponsor_times`` is an iterable of ``(start_time, end_time)`` pairs
        :param tokenizer: tokenizer used to encode the caption texts and to add special tokens / padding
        """
        super().__init__()
        self.dataset = dataset
        self.tokenizer = tokenizer

    def _mark_sponsor_captions(self, video_id, captions, sponsor_times):
        """Flag the captions inside every sponsor range; return False if the row must be dropped."""
        for start_time, end_time in sponsor_times:
            # get intersection range of the sponsor segment with the captions
            start_index, end_index = get_intersection_range(captions, start_time, end_time)
            if start_index is None or end_index is None:
                print(f'Dropping {video_id} because sponsor times do not match the captions')
                return False

            # mark range as sponsor (note: this writes into the caption dicts of the wrapped dataset)
            for i in range(start_index, end_index):
                captions[i]['is_sponsor'] = True
        return True

    def _tokenize_and_label(self, captions):
        """Return the concatenated token ids of all captions and one 0/1 label per token."""
        input_ids = []
        labels = []
        for caption in captions:
            # no special tokens here: they are added once per window later on
            caption_ids = self.tokenizer(caption['text'], add_special_tokens=False)['input_ids']
            input_ids += caption_ids

            # label every token of the caption accordingly
            label = 1 if 'is_sponsor' in caption else 0
            labels += [label] * len(caption_ids)
        return input_ids, labels

    def __iter__(self):
        for video_id, captions, sponsor_times in self.dataset:
            if not self._mark_sponsor_captions(video_id, captions, sponsor_times):
                continue

            input_ids, labels = self._tokenize_and_label(captions)

            # flag indicating whether a completely non-sponsor segment has been yielded.
            # limiting the number of fully non-sponsor segments to balance the data
            yielded_non_sponsor = False

            # go through the transcript window by window
            for window_start in range(0, len(input_ids), self.WINDOW_SIZE):
                window_end = window_start + self.WINDOW_SIZE
                # slice out exactly this window, so the sponsor check below looks at the
                # window itself and not at the whole rest of the transcript
                w_input_ids = input_ids[window_start:window_end]
                w_labels = labels[window_start:window_end]

                # make sure to yield at most 1 completely non-sponsor segment
                if 1 not in w_labels:
                    if yielded_non_sponsor:
                        continue
                    yielded_non_sponsor = True

                # add back special tokens and pad to the maximum length
                prepared_tokenizer = self.tokenizer.prepare_for_model(w_input_ids, truncation=True, padding='max_length')
                attention_mask = prepared_tokenizer['attention_mask']

                # Positions that never receive a real label: the leading special token,
                # the last position of the padded sequence and the special token that
                # directly follows the window's tokens. Padding (mask == 0) is ignored too.
                last_position = len(attention_mask) - 1
                closing_position = len(w_input_ids) + 1
                new_labels = [
                    w_labels[i - 1]
                    if m == 1 and i not in (0, last_position, closing_position)
                    else self.IGNORED_LABEL
                    for i, m in enumerate(attention_mask)
                ]

                yield {
                    'input_ids': prepared_tokenizer['input_ids'],
                    'labels': new_labels,
                    'attention_mask': attention_mask,
                }

    def __len__(self):
        # needed for training, 20001 is the length of every chunk in the caption dataset
        # NOTE(review): this is a fixed value, not the number of windows actually yielded
        return 20001
    

## Load the dataset

In [10]:
# Directory containing the gzipped caption chunk files (relative to the working directory).
dataset_dir = 'dataset'
# Training data is read from chunks 1 and 2 (range(1, 3) excludes 3).
train_dataset = load_captions_from_chunks('data', root_dir=dataset_dir, chunks=range(1, 3))
# Evaluation data is read from a single, different chunk file (16).
# NOTE(review): the meaning of the second argument (100) is defined in data_loader - presumably a row limit; confirm.
# NOTE(review): the test cell wraps GzippedJSONDataset in LabelledCaptionsDataset before unpacking
# (video_id, captions, sponsor_times); confirm the bare GzippedJSONDataset yields the same triples.
eval_dataset = GzippedJSONDataset(f'{dataset_dir}/data.16.json.gz', 100)

## Initialising training and datasets

In [11]:
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

import numpy as np

import torch


# Pretrained checkpoint to fine-tune, and the per-device batch size
# shared by the training and evaluation loops.
model_checkpoint = "bert-base-cased"
batch_size = 4

# Token-classification head with two classes, plus the tokenizer of the same checkpoint.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Wrap both caption datasets so they yield tokenized, labelled windows.
labelled_eval_dataset = LabelledTokensDataset(eval_dataset, tokenizer)
labelled_train_dataset = LabelledTokensDataset(train_dataset, tokenizer)

# Hyper-parameters of the run; checkpoints are written to "test_sponsors" every
# 500 steps and only the most recent one is kept.
args = TrainingArguments(
    output_dir="test_sponsors",
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_steps=500,
    save_total_limit=1,
)


# use SeqEval as the evaluation library
metric = load_metric("seqeval")

# seqeval scores sequences of IOB-style tag strings, not integer class ids, so the
# two classes are mapped to tags: 0 -> "O" (not sponsor), 1 -> "I-SPONSOR".
SEQEVAL_TAGS = ("O", "I-SPONSOR")

# define which metrics will be reported
def compute_metrics(p):
    """
    Turn the model outputs of an evaluation run into seqeval scores.

    :param p: pair ``(predictions, labels)``; ``predictions`` holds the per-token class
        scores (the class is taken with argmax over axis 2) and ``labels`` the reference
        class ids, with -100 on positions that must be ignored
    :return: dict with the overall ``precision``, ``recall``, ``f1`` and ``accuracy``
        reported by seqeval; precision/recall/f1 are computed on sponsor spans,
        accuracy on individual tokens
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        # Remove ignored index (special tokens and padding)
        kept = [(pred, gold) for pred, gold in zip(predicted_row, label_row) if gold != -100]
        true_predictions.append([SEQEVAL_TAGS[pred] for pred, _ in kept])
        true_labels.append([SEQEVAL_TAGS[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Bring model, hyper-parameters, data and metric function together in one Trainer.
# Every argument is passed by keyword so the wiring is explicit.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=labelled_eval_dataset,
    train_dataset=labelled_train_dataset,
)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [None]:
# train, then save the fine-tuned model straight away, so that a failure during
# evaluation cannot cost the result of the training run
trainer.train()
trainer.save_model('seq_labelling.model')

# evaluate on the evaluation dataset; keeping the metrics as the last expression
# of the cell displays them instead of discarding them
eval_metrics = trainer.evaluate()
eval_metrics

***** Running training *****
  Num examples = 20001
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3753


## Prediction

In [12]:
def predict_sponsor_range(model, tokenizer, captions, window_size=510):
    """
    Predict which captions of a transcript belong to a sponsor segment.

    The caption texts are tokenized and concatenated, then classified window by window.
    Each window is wrapped in the tokenizer's special tokens, the same way the training
    windows are built, and the predictions for those special tokens are discarded.

    :param model: token-classification model; called with ``input_ids`` and expected to
        return an object whose ``logits`` have shape (1, sequence length, number of labels)
    :param tokenizer: tokenizer matching the model
    :param captions: sequence of dicts with a ``'text'`` entry
    :param window_size: number of transcript tokens per model call (without special tokens)
    :return: list of ``(first_caption_index, last_caption_index)`` tuples, both inclusive,
        one per consecutive run of tokens predicted as sponsor (label 1)
    """
    # flatten the transcript into one token sequence and remember, for every token,
    # the index of the caption it came from
    token_captions = []
    input_ids = []
    for caption_index, caption in enumerate(captions):
        caption_ids = tokenizer(caption['text'], add_special_tokens=False)['input_ids']
        input_ids += caption_ids
        token_captions += [caption_index] * len(caption_ids)

    # run on the model's device if it reports one
    device = getattr(model, 'device', None)
    # predict without dropout, and hand the model back in the mode it came in
    was_training = getattr(model, 'training', False)
    if was_training:
        model.eval()

    predicted_labels = []
    try:
        with torch.no_grad():
            for window_start in range(0, len(input_ids), window_size):
                window = input_ids[window_start:window_start + window_size]
                model_input = torch.tensor([tokenizer.build_inputs_with_special_tokens(window)])
                if device is not None:
                    model_input = model_input.to(device)

                # index the batch dimension explicitly: squeeze() would also drop the
                # sequence dimension of a one-token window
                logits = model(input_ids=model_input).logits[0]
                window_labels = torch.argmax(logits, dim=-1).tolist()

                # keep only the predictions of the transcript tokens, i.e. skip the
                # leading special token and everything after the window's tokens
                predicted_labels += window_labels[1:len(window) + 1]
    finally:
        if was_training:
            model.train()

    # collapse consecutive sponsor tokens into ranges of caption indices
    predicted_ranges = []
    run_start = None
    for position, label in enumerate(predicted_labels):
        if label == 1:
            if run_start is None:
                run_start = position
        elif run_start is not None:
            predicted_ranges.append((token_captions[run_start], token_captions[position - 1]))
            run_start = None
    # a sponsor run that lasts until the end of the transcript
    if run_start is not None:
        predicted_ranges.append((token_captions[run_start], token_captions[-1]))

    return predicted_ranges


## Applying the model

In [14]:
# Reload the tokeniser and the fine-tuned model from the directory written after training.
saved_model_dir = 'seq_labelling.model'
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)
model = AutoModelForTokenClassification.from_pretrained(saved_model_dir, num_labels=2)

# Use the first video of chunk 2 as the test example.
# NOTE(review): the training cell loads chunks range(1, 3); confirm chunk 2 is not part of the training data.
test_videos = LabelledCaptionsDataset(GzippedJSONDataset(f'{dataset_dir}/data.2.json.gz'))
video_id, captions, sponsor_times = next(iter(test_videos))

# NOTE(review): predict_sponsor_range returns caption indices; confirm sponsor_times
# is expressed in the same unit before comparing the two printed lists.
predicted_ranges = predict_sponsor_range(model, tokenizer, captions)

print(video_id)
print(f'True ranges: {sponsor_times}')
print(f'Labelled ranges: {predicted_ranges}')

Didn't find file seq_labelling.model/added_tokens.json. We won't load it.
loading file seq_labelling.model/vocab.txt
loading file seq_labelling.model/tokenizer.json
loading file None
loading file seq_labelling.model/special_tokens_map.json
loading file seq_labelling.model/tokenizer_config.json
loading configuration file seq_labelling.model/config.json
Model config BertConfig {
  "_name_or_path": "seq_labelling.model",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_

32hCmkB7VGk
True ranges: [[0, 26]]
Labelled ranges: []
