In [1]:
%%capture
%pip install gdown
!gdown --fuzzy https://drive.google.com/file/d/19Lvgk4wPS0lwy-IFdZ9-bBQxuhnBL3hx/view?usp=sharing
!unzip -o data.zip
!rm data.zip

In [2]:
import itertools

from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    pipeline,
)
from datasets import Dataset
from torchmetrics.functional import f1_score
from transformers.pipelines.pt_utils import KeyDataset
import pandas as pd
import torch

2024-04-28 09:59:13.020213: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-28 09:59:13.020347: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-28 09:59:13.150145: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
train_df = pd.read_json('train.json', lines=False)
test_df = pd.read_json('test.json', lines=False)
dev_df = pd.read_json('dev.json', lines=False)

In [4]:
id2label = dict(enumerate(['O'] + sorted({part for ners in train_df.ners for _, _, part in ners})))
label2id = {label: idx for idx, label in id2label.items()}

In [5]:
model_id = 'sentence-transformers/LaBSE'

In [6]:
model = AutoModelForTokenClassification.from_pretrained(model_id, 
                                                        num_labels=len(id2label), 
                                                        ignore_mismatched_sizes=True,
                                                        id2label=id2label,
                                                        label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at sentence-transformers/LaBSE and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [7]:
def convert_dataframe(dataframe):
    dataframe = dataframe.copy()
    def convert_ners(ners):
        return [(start, end, label2id[label]) for start, end, label in ners]
    dataframe['ners'] = dataframe.ners.apply(convert_ners)
    return dataframe

In [8]:
train_set = Dataset.from_pandas(convert_dataframe(train_df))
test_set = Dataset.from_pandas(test_df)

In [9]:
class DatasetTokenizer:
    """Tokenizer for the dataset.

    Splits sentences in the dataset into tokens using provided
    HuggingFace tokenizer, and labels the tokens.
    """

    def __init__(self, tokenizer):
        """Initializes the tokenizer.

        Args:
            tokenizer (Tokenizer): HuggingFace tokenizer.
        """
        self.tokenizer = tokenizer

    def __call__(self, row: dict) -> dict:
        """Splits sentences into tokens and labels them.

        Args:
            row (dict): Row in the dataset. Should contain the following keys:
                sentences (str): text to tokenize
                ners (list[tuple[int, int, str]]): Labels for words as tuples
                    of (start, end, label).
        
        Returns:
            dict: Tokenized row. Contains:
                input_ids (list[int]): IDs of the tokens in sentences.
                labels (list[int]): List of label IDs corresponding to
                    tokens.
                token_type_ids (list[int]): List of token type IDs.
                    See HuggingFace documentation on tokenizers for
                    further detail.
                attention_mask (list[int]): Whether to attend to tokens
                    or not. See HuggingFace documentation for further
                    detail.
        """
        text_len = len(row['sentences'])
        char_labels = [label2id['O'] for _ in range(text_len)]
        for start, end, label in row['ners']:
            for i in range(start, end):
                char_labels[i] = label
        tokenized = tokenizer(row['sentences'])
        n_tokens = len(tokenized['input_ids'])
        labels = [label2id['O']] * n_tokens
        for i in range(n_tokens):
            span = tokenized.token_to_chars(i)
            if span is None:
                continue
            labels[i] = char_labels[span.start]
        tokenized['labels'] = labels
        return tokenized

In [10]:
def split_into_multiple(batch):
    """Splits one row into multiple to prevent overflowing the model's
    context window.

    Args:
        batch (dict[str, list]): Batch of dataset rows. Should contain column
            `labels`, and every other column should be the same length.
    """
    result = {column: [] for column in batch}
    batch_size = len(batch['labels'])
    for i in range(batch_size):
        n = len(batch['labels'][i])
        for j in itertools.count(0, 384):
            for column in batch:
                result[column].append(batch[column][i][j:j+512])
            if j + 512 >= n:
                break
    return result

In [11]:
dataset_tokenizer = DatasetTokenizer(tokenizer)

In [12]:
dataset = train_set\
    .map(dataset_tokenizer)\
    .remove_columns(['id', 'sentences', 'ners', '__index_level_0__'])\
    .map(split_into_multiple, batched=True)\
    .train_test_split(test_size=.1)

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/519 [00:00<?, ? examples/s]

In [13]:
def compute_metrics(results) -> dict[str, float]:
    """Computes the metrics for the model's evaluation.

    Args:
        results: Outputs of the model.
    
    Returns:
        dict[str, float]: Mapping from the name of the metric to its value.
    """
    preds, target = results
    f1 = f1_score(torch.tensor(preds).transpose(-1, -2),
                  torch.tensor(target),
                  num_classes=len(id2label),
                  average='macro',
                  task='multiclass',
                  ignore_index=-100)
    return {'f1_score': f1}

In [14]:
training_args = TrainingArguments(
    output_dir="/tmp",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    save_total_limit = 4,
    weight_decay=0.01,
    report_to='tensorboard',
    save_strategy='epoch',
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=DataCollatorForTokenClassification(tokenizer),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.567914,0.327124
2,No log,0.35151,0.598954
3,No log,0.308231,0.677104
4,No log,0.276562,0.744848
5,No log,0.267522,0.746542
6,No log,0.267779,0.761824
7,No log,0.267154,0.775623
8,0.368000,0.26815,0.790043
9,0.368000,0.2776,0.791928
10,0.368000,0.281527,0.793472


TrainOutput(global_step=1360, training_loss=0.17860160575193518, metrics={'train_runtime': 1010.9285, 'train_samples_per_second': 10.683, 'train_steps_per_second': 1.345, 'total_flos': 2728318339290240.0, 'train_loss': 0.17860160575193518, 'epoch': 20.0})

In [16]:
trainer.model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json',
 './model/tokenizer.json')

In [17]:
!zip -r model.zip -xi model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: model/ (stored 0%)
  adding: model/vocab.txt (deflated 43%)
  adding: model/tokenizer.json (deflated 65%)
  adding: model/config.json (deflated 58%)
  adding: model/tokenizer_config.json (deflated 75%)
  adding: model/model.safetensors (deflated 7%)
  adding: model/special_tokens_map.json (deflated 80%)


In [18]:
!rm *.jsonl
!rm -r model
!rm *.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


rm: cannot remove '*.jsonl': No such file or directory


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
