In [None]:
%%capture
%pip install gdown
!gdown --fuzzy https://drive.google.com/file/d/16M6_lcY-rDcOV3uMeNt8Ukg9A6E-T6Cv/view?usp=sharing
!unzip -o NLP3.zip
!rm NLP3.zip

In [None]:
import itertools

from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    pipeline,
)
from datasets import Dataset
from torchmetrics.functional import f1_score
from transformers.pipelines.pt_utils import KeyDataset
import pandas as pd
import torch

In [None]:
# Load the training data from a JSON Lines file (one JSON record per line).
# Later cells read the `id`, `sentences` and `ners` columns of this frame.
train_df = pd.read_json('train.jsonl', lines=True)

In [None]:
# Collect every distinct entity label that occurs in the training spans.
entity_labels = sorted({
    label
    for spans in train_df.ners
    for _start, _end, label in spans
})
# Id 0 is reserved for the "outside" label 'O'; entity labels follow in
# alphabetical order.
id2label = dict(enumerate(['O'] + entity_labels))
# Inverse mapping: label name -> integer id.
label2id = {name: index for index, name in id2label.items()}
print(len(id2label), 'labels in total')

In [None]:
# Identifier of the pretrained checkpoint passed to `from_pretrained` for both
# the model and the tokenizer.
model_id = 'sentence-transformers/LaBSE'

In [None]:
# Load the pretrained encoder with a token-classification head sized to the
# label set built from the training data.
model = AutoModelForTokenClassification.from_pretrained(
    model_id,
    id2label=id2label,
    label2id=label2id,
    num_labels=len(id2label),
    ignore_mismatched_sizes=True,
)
# NOTE(review): `add_prefix_space` is usually a byte-level BPE tokenizer
# option -- confirm it has any effect for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)

In [None]:
def convert_dataframe(dataframe):
    """Returns a copy of the frame with entity label names replaced by ids.

    Args:
        dataframe: Frame with a `ners` column whose cells are sequences of
            (start, end, label_name) triples.

    Returns:
        A new frame; the input is left untouched. Each `ners` cell becomes a
        list of (start, end, label_id) tuples, with ids looked up in the
        module-level `label2id` mapping.
    """
    def encode_spans(spans):
        # Keep the offsets, swap the label name for its integer id.
        return [(begin, stop, label2id[name]) for begin, stop, name in spans]

    converted = dataframe.copy()
    converted['ners'] = converted.ners.apply(encode_spans)
    return converted

In [None]:
# Replace label names with ids, then wrap the frame as a `datasets.Dataset`.
train_set = Dataset.from_pandas(convert_dataframe(train_df))

In [None]:
class DatasetTokenizer:
    """Tokenizer for the dataset.

    Splits sentences in the dataset into tokens using provided
    HuggingFace tokenizer, and labels the tokens.
    """

    def __init__(self, tokenizer):
        """Initializes the tokenizer.

        Args:
            tokenizer (Tokenizer): HuggingFace tokenizer. Its output must
                support `token_to_chars`, item access by 'input_ids' and
                item assignment.
        """
        self.tokenizer = tokenizer

    def __call__(self, row: dict) -> dict:
        """Splits sentences into tokens and labels them.

        Every character of the text is first assigned a label id ('O' unless
        covered by an entity span); each token then takes the label of the
        first character it covers.

        Args:
            row (dict): Row in the dataset. Should contain the following keys:
                sentences (str): text to tokenize
                ners (list[tuple[int, int, int]]): Entity spans as tuples
                    of (start, end, label_id), where `end` is exclusive and
                    `label_id` is already an integer id (it is copied into
                    `labels` unchanged).

        Returns:
            dict: Tokenized row, i.e. whatever the tokenizer returned
                (typically input_ids, token_type_ids and attention_mask;
                see HuggingFace documentation on tokenizers for further
                detail) plus:
                labels (list[int]): List of label IDs corresponding to
                    tokens.

        Raises:
            IndexError: If a span reaches past the end of the text.
        """
        outside_id = label2id['O']

        # Character-level labelling: 'O' everywhere, then paint entity spans.
        char_labels = [outside_id] * len(row['sentences'])
        for start, end, label in row['ners']:
            for i in range(start, end):
                char_labels[i] = label

        # Use the tokenizer handed to the constructor, not a global that
        # merely happens to share the name.
        tokenized = self.tokenizer(row['sentences'])

        n_tokens = len(tokenized['input_ids'])
        labels = [outside_id] * n_tokens
        for i in range(n_tokens):
            span = tokenized.token_to_chars(i)
            if span is None:
                # No character span (presumably special tokens): stays 'O'.
                # NOTE(review): consider -100 here so the loss and the
                # metric ignore these positions -- confirm intent.
                continue
            labels[i] = char_labels[span.start]
        tokenized['labels'] = labels
        return tokenized

In [None]:
def split_into_multiple(batch, window_size=512, step=384):
    """Splits one row into multiple to prevent overflowing the model's
    context window.

    Each row is cut into windows of at most `window_size` items that start
    every `step` items, so consecutive windows overlap by
    `window_size - step` items. A row that already fits yields exactly one
    window (an empty row yields one empty window).

    Args:
        batch (dict[str, list]): Batch of dataset rows. Should contain column
            `labels`, and every other column should be the same length.
        window_size (int): Maximum number of items per window.
        step (int): Distance between the starts of consecutive windows.

    Returns:
        dict[str, list]: Same columns as `batch`; every column holds the
            windows of all rows in order, sliced identically across columns.

    Raises:
        ValueError: If `window_size` or `step` is not positive, or if `step`
            exceeds `window_size` (items between windows would be dropped).
    """
    if window_size <= 0 or step <= 0:
        raise ValueError('window_size and step must be positive')
    if step > window_size:
        raise ValueError('step must not exceed window_size')

    # Look every column up once instead of once per window.
    columns = {name: batch[name] for name in batch}
    result = {name: [] for name in columns}
    for i, row_labels in enumerate(columns['labels']):
        n = len(row_labels)
        for start in itertools.count(0, step):
            for name, values in columns.items():
                result[name].append(values[i][start:start + window_size])
            # Stop once this window reaches the end of the row.
            if start + window_size >= n:
                break
    return result

In [None]:
# Callable that tokenizes one dataset row and attaches per-token labels.
dataset_tokenizer = DatasetTokenizer(tokenizer)

In [None]:
# Tokenize and label every row, drop the raw columns, then hold out 10% of
# the rows for evaluation.
# The split happens BEFORE windowing: windows cut from the same row overlap,
# so splitting afterwards would put near-duplicate text on both sides and
# inflate the evaluation score. The seed makes the split reproducible.
dataset = (
    train_set
    .map(dataset_tokenizer)
    .remove_columns(['id', 'sentences', 'ners'])
    .train_test_split(test_size=.1, seed=42)
    .map(split_into_multiple, batched=True)
)

In [None]:
def compute_metrics(results) -> dict[str, float]:
    """Computes the metrics for the model's evaluation.

    Args:
        results: Outputs of the model; unpacks into (predictions, targets).
            Assumes predictions are per-token class scores shaped
            (batch, sequence, num_labels) and targets are label ids shaped
            (batch, sequence) -- TODO confirm against the Trainer output.

    Returns:
        dict[str, float]: Mapping from the name of the metric to its value.
            Contains `f1_score`: macro-averaged multiclass F1, ignoring
            target positions equal to -100.
    """
    preds, target = results
    # Swap the last two axes so the class axis comes right after the batch
    # axis before handing the scores to `f1_score`.
    f1 = f1_score(torch.tensor(preds).transpose(-1, -2),
                  torch.tensor(target),
                  num_classes=len(id2label),
                  average='macro',
                  task='multiclass',
                  ignore_index=-100)
    # Return a plain Python float, as the annotation promises, rather than a
    # 0-dim tensor.
    return {'f1_score': f1.item()}

In [None]:
training_args = TrainingArguments(
    output_dir="/tmp",
    # Optimisation schedule.
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    # NOTE(review): `metric_for_best_model` is not set, so "best" follows the
    # library default rather than `f1_score` -- confirm this is intended.
    evaluation_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=4,
    load_best_model_at_end=True,
    # Logging backend.
    report_to='tensorboard',
)
trainer = Trainer(
    model=model,
    args=training_args,
    # The collator and tokenizer are both built from the shared `tokenizer`.
    data_collator=DataCollatorForTokenClassification(tokenizer),
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration above.
trainer.train()

In [None]:
# Save the model held by the trainer and the tokenizer into the same
# directory.
trainer.model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

In [None]:
!zip -r model.zip -xi model

In [None]:
# Clean up the working directory.
# Remove the JSON Lines files (this matches the `train.jsonl` loaded earlier).
!rm *.jsonl
# Remove the saved model directory.
!rm -r model
# Caution: this deletes EVERY .json file in the current directory.
!rm *.json