<a href="https://colab.research.google.com/github/xuzean18/focus-anno/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Token classification (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [2]:
! pip install transformers datasets evaluate seqeval accelerate -U
from datasets import load_dataset,load_metric, Dataset, Features, ClassLabel
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForTokenClassification
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import csv
import evaluate



In [3]:
# Check GPU
print(torch.cuda.get_device_name(0))

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [4]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:


train_data_path = 'drive/My Drive/Colab Notebooks/train.tsv'
eval_data_path = 'drive/My Drive/Colab Notebooks/val.tsv'
save_model_path = 'drive/My Drive/Colab Notebooks'
save_tokenizer_path = 'drive/My Drive/Colab Notebooks'


train_data = pd.read_csv(train_data_path, sep='\t')#,names=['id', 'tokens', 'ner_tags'])
eval_data = pd.read_csv(eval_data_path, sep='\t', header=None,names=['id', 'tokens', 'ner_tags'])


def str2list(token):
    if type(token) is float:
        return []
    token = token[1:-1]
    splitted = [word[1:-1] for word in token.split(", ")]
    return splitted

train_data["tokens"] = train_data["tokens"].map(str2list)
train_data["ner_tags"] = train_data["ner_tags"].map(str2list)
eval_data["tokens"] = eval_data["tokens"].map(str2list)
eval_data["ner_tags"] = eval_data["ner_tags"].map(str2list)


train_dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(eval_data)

# Define a Classlabel object to use to map string labels to integers.
classmap = ClassLabel(num_classes=3, names=['O', 'B-focus', 'I-focus'])


# Map labels to label ids.
train_dataset = train_dataset.map(lambda y: {"ner_tags": classmap.str2int(y["ner_tags"])})
eval_dataset = eval_dataset.map(lambda y: {"ner_tags": classmap.str2int(y["ner_tags"])})

label_names = dict (zip([0,1,2],['O', 'B-focus', 'I-focus']))
id2label={i:classmap.int2str(i) for i in range(classmap.num_classes)}
label2id={c:classmap.str2int(c) for c in classmap.names}


model_checkpoints = [ "bert-base-uncased"]#"roberta-base", "deberta-base",
for model_checkpoint in model_checkpoints:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, id2label={i:classmap.int2str(i) for i in range(classmap.num_classes)},
                                                        label2id={c:classmap.str2int(c) for c in classmap.names},
                                                        finetuning_task="ner")

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

        labels = []
        for i, label in enumerate(examples["ner_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:  # Set the special tokens to -100.
                if word_idx is None:
                    label_ids.append(-100)
                elif word_idx != previous_word_idx: # The only condition that marks a diff word
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)  # Only label the first token of a given word.
                previous_word_idx = word_idx
            labels.append(label_ids)

        tokenized_inputs["labels"] = labels # New labels are stored as a list, and as the value for the "labels" key
        return tokenized_inputs

    train_dataset = train_dataset.map(
        tokenize_and_align_labels,
        batched=True,
    )

    eval_dataset = eval_dataset.map(
        tokenize_and_align_labels,
        batched=True,
    )

    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


    metric = evaluate.load("seqeval")


    # Function compute_metrics used to compute the precision, recall
    # F1 score, and accuracy of the predictions made by a model.

    def compute_metrics(eval_preds):

        # Unpack eval_preds into logits and labels
        # logits are the raw output values from the model
        # labels are the true labels.
        logits, labels = eval_preds


        # Find the indices of the maximum values along the last axis of logits
        # Indices represent the model's predictions
        predictions = np.argmax(logits, axis=-1)


        # Remove ignored index (special tokens) and convert to labels
        # creates a new list of labels called true_labels,
        # iterates over labels and replacing each label l with
        # its corresponding name from label_names
        # but only if l is not equal to -100 (special tokens)
        true_labels = [[label_names[l] for l in label if l != -100] for label in labels]


        # This line creates a new list of predictions, true_predictions,
        # by iterating over predictions and labels together, replacing each
        # prediction p with its corresponding name from label_names, but only
        # if the corresponding label l is not equal to -100
        true_predictions = [
            [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]


        # This line computes the metrics by calling the compute method
        # of the metric object with true_predictions and true_labels as arguments.
        all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "precision": all_metrics["overall_precision"],
            "recall": all_metrics["overall_recall"],
            "f1": all_metrics["overall_f1"],
            "accuracy": all_metrics["overall_accuracy"],
        }

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        logging_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
    )

    # Train and evaluate the model
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )


    trainer.train()

    trainer.save_model(save_model_path) #model.save_pretrained("path/to/model")
    tokenizer.save_pretrained(save_tokenizer_path)

Map:   0%|          | 0/7610 [00:00<?, ? examples/s]

Map:   0%|          | 0/571 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7610 [00:00<?, ? examples/s]

Map:   0%|          | 0/571 [00:00<?, ? examples/s]



ValueError: No valid checkpoint found in output directory (./results)