In [2]:
!pip install datasets
!pip install transformers
!pip install seqeval
!pip install log

Collecting datasets
  Downloading datasets-2.2.1-py3-none-any.whl (342 kB)
[K     |████████████████████████████████| 342 kB 2.6 MB/s eta 0:00:01
Collecting xxhash
  Downloading xxhash-3.0.0-cp38-cp38-macosx_10_9_x86_64.whl (34 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.12.2-py38-none-any.whl (128 kB)
[K     |████████████████████████████████| 128 kB 3.1 MB/s eta 0:00:01
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting tqdm>=4.62.1
  Downloading tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 5.9 MB/s eta 0:00:01
[?25hCollecting fsspec[http]>=2021.05.0
  Using cached fsspec-2022.3.0-py3-none-any.whl (136 kB)
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.8 MB/s eta 0:00:011
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp38-cp38-macosx_10_9_x86_64.whl (574 kB)
[K   

In [1]:
import pandas as pd
import itertools
import os
from datasets import Dataset
from datasets import load_dataset
import json
import transformers
import torch
from data_loader import GzippedJSONDataset, LabelledCaptionsDataset, get_intersection_range

In [2]:
class LabelledTokensDataset(torch.utils.data.IterableDataset):

    def __init__(self, dataset: torch.utils.data.IterableDataset, tokenizer: transformers.PreTrainedTokenizer):
        super().__init__()
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __iter__(self):
        # count = 0
        for video_id, captions, sponsor_times in self.dataset:
            # print(f'\r{count}', end='')
            # count+=1
            drop_row = False

            sponsor_ranges = []

            for start_time, end_time in sponsor_times:
                # get intersection range and extract the sponsor text from it
                start_index, end_index = get_intersection_range(captions, start_time, end_time)
                if start_index is None or end_index is None:
                    print(f'Dropping {video_id} because sponsor times do not match the captions')
                    drop_row = True
                    break

                # mark range as sponsor
                for i in range(start_index, end_index):
                    captions[i]['is_sponsor'] = True

                sponsor_ranges.append([start_index, end_index])

            if not drop_row:
                input_ids = []
                labels = []
                for caption in captions:
                    # remove special beginning/end tokens
                    tokenized_caption = self.tokenizer(caption['text'])['input_ids'][1:-1]
                    input_ids += tokenized_caption
                    label = 1 if 'is_sponsor' in caption else 0
                    labels += [label] * len(tokenized_caption)

                # add back special tokens
                input_ids = self.tokenizer.prepare_for_model(input_ids, truncation=True)['input_ids']

                labels = [-100] + labels[:len(input_ids)-2] + [-100]
 
                yield {'input_ids': input_ids, 'labels': labels}

    def __len__(self):
        return 20001

In [10]:
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import numpy as np

import torch

dir_path = './dataset'

model_checkpoint = 'allenai/longformer-base-4096'
batch_size = 16

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

train_dataset = GzippedJSONDataset(f'{dir_path}/data.1.json.gz')
labelled_train_dataset = LabelledTokensDataset(train_dataset, tokenizer)

eval_dataset = GzippedJSONDataset(f'{dir_path}/data.2.json.gz', 100)
labelled_eval_dataset = LabelledTokensDataset(eval_dataset, tokenizer)

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=2)

args = TrainingArguments(
    f"test_sponsors",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.00001,
)

data_collator = DataCollatorForTokenClassification(tokenizer)

# use SeqEval as the evaluation library
metric = load_metric("seqeval")

# define which metrics will be reported
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# specify components of the training and evaluation processes
trainer = Trainer(
    model,
    args,
    train_dataset=labelled_train_dataset,
    eval_dataset=labelled_eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/allenai/longformer-base-4096/resolve/main/config.json from cache at /Users/hisham/.cache/huggingface/transformers/0690955d8f70934f95adf0fb108d5f7322d02f8d7dd938b7b133cb7421e120e6.b25f41ff6acdcb7ab47c505c70e351b3fc01957b3798197e5ac6e8efc547ac99
Model config LongformerConfig {
  "_name_or_path": "allenai/longformer-base-4096",
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_emb

In [10]:
# train, evaluate and save the model
trainer.train()
trainer.evaluate()
trainer.save_model('SponsorML.model')

***** Running training *****
  Num examples = 20001
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3753


AttributeError: __enter__

### Code for extracting timestamps from mock model prediction

In [7]:
video_id, captions, sponsor_times = next(LabelledCaptionsDataset(GzippedJSONDataset(f'{dir_path}/data.1.json.gz')).__iter__())
token_timestamps = []
input_ids = []
labels = []
for caption in captions:
    # remove special beginning/end tokens
    # print(caption)
    tokenized_caption = tokenizer(caption['text'])['input_ids'][1:-1]
    input_ids += tokenized_caption
    label = 1 if caption['is_sponsor'] else 0
    labels += [label] * len(tokenized_caption)
    token_timestamps += [(caption['start'], caption['end'])] * len(tokenized_caption)

# add back special tokens
input_ids = tokenizer.prepare_for_model(input_ids, truncation=True)['input_ids']
labels = [-100] + labels[:len(input_ids)-2] + [-100]


sponsor_ranges = []
in_sponsor = False
start = -1
for i, label in enumerate(labels):
    if label == 1:
        if not in_sponsor:
            in_sponsor = True
            start = i
    elif label == 0:
        if in_sponsor:
            sponsor_ranges.append((token_timestamps[start][1], token_timestamps[i-1][1]))
        in_sponsor = False
else:
    if in_sponsor:
        sponsor_ranges.append((token_timestamps[start][1], token_timestamps[i][1]))

sponsor_times = [
        (captions[start_index]['start'], captions[end_index]['end'])
        for start_index, end_index in sponsor_times
    ]
print(f'True ranges: {sponsor_times}')
print(f'Labelled ranges: {sponsor_ranges}')

True ranges: [(59.67, 93.26)]
Labelled ranges: [(61.58, 93.26)]


In [9]:
'https://www.youtube.com/watch?v=' + video_id

'https://www.youtube.com/watch?v=---jcia5ufM'