In [1]:
! pip install transformers datasets evaluate seqeval accelerate -U
from datasets import load_dataset,load_metric, Dataset, Features, ClassLabel
from transformers import RobertaForTokenClassification, RobertaTokenizerFast, Trainer, TrainingArguments, DataCollatorForTokenClassification
import torch
import numpy as np
import pandas as pd
import csv
import evaluate
from sklearn.metrics import confusion_matrix


Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ | done
Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl.metadata (19 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l- \ | done
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=

2024-06-11 11:48:36.785791: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-11 11:48:36.785886: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-11 11:48:36.916762: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# optional block to disable unnecessary wandb login
! pip install -q wandb
import wandb
wandb.init(mode="disabled")

  pid, fd = os.forkpty()




In [3]:
train_data_path = '/kaggle/input/qc1a-dataset/train.tsv'
eval_data_path = '/kaggle/input/qc1a-dataset/val.tsv'
test_data_path = '/kaggle/input/qc1a-dataset/test.tsv'
save_path = '/kaggle/working/'

train_data = pd.read_csv(train_data_path, sep='\t')
eval_data = pd.read_csv(eval_data_path, sep='\t')
test_data = pd.read_csv(test_data_path, sep='\t')

def str2list(token):
    if type(token) is float:
        return []
    token = token[1:-1]
    splitted = [word[1:-1] for word in token.split(", ")]
    return splitted

train_data["tokens"] = train_data["tokens"].map(str2list)
train_data["ner_tags"] = train_data["ner_tags"].map(str2list)
eval_data["tokens"] = eval_data["tokens"].map(str2list)
eval_data["ner_tags"] = eval_data["ner_tags"].map(str2list)
test_data["tokens"] = test_data["tokens"].map(str2list)
test_data["ner_tags"] = test_data["ner_tags"].map(str2list)

train_dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(eval_data)
test_dataset = Dataset.from_pandas(test_data)

classmap = ClassLabel(num_classes=3, names=['O', 'B-focus', 'I-focus'])

train_dataset = train_dataset.map(lambda y: {"ner_tags": classmap.str2int(y["ner_tags"])})
eval_dataset = eval_dataset.map(lambda y: {"ner_tags": classmap.str2int(y["ner_tags"])})
test_dataset = test_dataset.map(lambda y: {"ner_tags": classmap.str2int(y["ner_tags"])})

label_names = dict(zip([0, 1, 2], ['O', 'B-focus', 'I-focus']))
id2label = {i: classmap.int2str(i) for i in range(classmap.num_classes)}
label2id = {c: classmap.str2int(c) for c in classmap.names}


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/5285 [00:00<?, ? examples/s]

Map:   0%|          | 0/5285 [00:00<?, ? examples/s]

In [4]:

model_checkpoints = ["roberta-base"]
for model_checkpoint in model_checkpoints:
    tokenizer = RobertaTokenizerFast.from_pretrained(model_checkpoint,add_prefix_space=True)
    model = RobertaForTokenClassification.from_pretrained(model_checkpoint, id2label={i:classmap.int2str(i) for i in range(classmap.num_classes)},
                                                        label2id={c:classmap.str2int(c) for c in classmap.names},
                                                        finetuning_task="ner")

    def tokenize_and_align_labels(examples):
            tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
            labels = []
            for i, label in enumerate(examples["ner_tags"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:
                        label_ids.append(label[word_idx])
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx
                labels.append(label_ids)
            tokenized_inputs["labels"] = labels
            return tokenized_inputs

    train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
    eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)
    test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    metric = evaluate.load("seqeval")

    def compute_metrics(eval_preds):
        logits, labels = eval_preds
        predictions = np.argmax(logits, axis=-1)
        true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
        true_predictions = [
            [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

        flattened_true = [label for sublist in true_labels for label in sublist]
        flattened_pred = [label for sublist in true_predictions for label in sublist]

        labels_set = list(set(flattened_true + flattened_pred))

        cm = confusion_matrix(flattened_true, flattened_pred, labels=labels_set)

        per_class_accuracies = cm.diagonal() / cm.sum(axis=1)
        class_accuracies = {label: per_class_accuracies[idx] for idx, label in enumerate(labels_set)}

        return {
            "overall_precision": all_metrics["overall_precision"],
            "overall_recall": all_metrics["overall_recall"],
            "overall_f1": all_metrics["overall_f1"],
            "overall_accuracy": all_metrics["overall_accuracy"],
            "per_class_accuracies": class_accuracies,
            "entity_metrics": {
                entity: {
                    "precision": metrics["precision"],
                    "recall": metrics["recall"],
                    "f1": metrics["f1"],
                    "number": metrics["number"]
                }
                for entity, metrics in all_metrics.items()
                if entity not in ["overall_precision", "overall_recall", "overall_f1", "overall_accuracy"]
            }
        }

    training_args = TrainingArguments(
        output_dir=save_path,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=5e-5,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        logging_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    predictions, labels, _ = trainer.predict(test_dataset)
    predictions = np.argmax(predictions, axis=-1)

    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    errors = []
    for i, (true_label, true_prediction) in enumerate(zip(true_labels, true_predictions)):
        for j, (label, prediction) in enumerate(zip(true_label, true_prediction)):
            if label != prediction:
                errors.append({
                    "index": i,
                    "tokens": test_data.iloc[i]["tokens"],
                    "token": test_data.iloc[i]["tokens"][j],
                    "true_label": label,
                    "predicted_label": prediction
                })

    errors_df = pd.DataFrame(errors)
    errors_df.to_csv(f"{save_path}/errors.tsv", sep='\t', index=False)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/5285 [00:00<?, ? examples/s]

Map:   0%|          | 0/5285 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Per Class Accuracies,Entity Metrics
1,0.1401,0.107828,0.569273,0.583333,0.576218,0.961931,"{'B-focus': 0.6914954885774621, 'O': 0.9874779389195826, 'I-focus': 0.6140802556818182}","{'focus': {'precision': 0.5692732290708372, 'recall': 0.5833333333333334, 'f1': 0.5762175249092095, 'number': 5304}}"
2,0.1009,0.112283,0.597523,0.609351,0.603379,0.962242,"{'B-focus': 0.7158763678249184, 'O': 0.9830849063842848, 'I-focus': 0.6903409090909091}","{'focus': {'precision': 0.5975226474394527, 'recall': 0.6093514328808446, 'f1': 0.6033790721553253, 'number': 5304}}"
3,0.0702,0.121576,0.611059,0.610483,0.610771,0.963402,"{'B-focus': 0.7137646381263199, 'O': 0.9847730586249233, 'I-focus': 0.6832386363636364}","{'focus': {'precision': 0.6110586903189281, 'recall': 0.6104826546003017, 'f1': 0.6107705366405735, 'number': 5304}}"


Trainer is attempting to log a value of "{'B-focus': 0.6914954885774621, 'O': 0.9874779389195826, 'I-focus': 0.6140802556818182}" of type <class 'dict'> for key "eval/per_class_accuracies" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'focus': {'precision': 0.5692732290708372, 'recall': 0.5833333333333334, 'f1': 0.5762175249092095, 'number': 5304}}" of type <class 'dict'> for key "eval/entity_metrics" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'B-focus': 0.7158763678249184, 'O': 0.9830849063842848, 'I-focus': 0.6903409090909091}" of type <class 'dict'> for key "eval/per_class_accuracies" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'focus': {'precision': 0.5975226474394527

In [5]:
model.save_pretrained(save_path + "model/")
tokenizer.save_pretrained(save_path + "tokenizer/")

('/kaggle/working/tokenizer/tokenizer_config.json',
 '/kaggle/working/tokenizer/special_tokens_map.json',
 '/kaggle/working/tokenizer/vocab.json',
 '/kaggle/working/tokenizer/merges.txt',
 '/kaggle/working/tokenizer/added_tokens.json',
 '/kaggle/working/tokenizer/tokenizer.json')