# Environment setting

In [None]:
! pip install datasets transformers seqeval

To be able to share your model with the community and generate results like the one shown in the picture below via the inference API, store the authentication token from the Hugging Face website (sign up [here](https://huggingface.co/join)) then execute the following cell and input your username and password:

In [None]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


install Git-LFS. Uncomment the following instructions:

In [None]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (4,239 kB/s)
Selecting previously unselected package git-lfs.
(Reading database ... 156210 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.3.4-1_amd64.deb ...
Unpacking git-lfs (2.3.4-1) ...
Setting up git-lfs (2.3.4-1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


Make sure your version of Transformers is at least 4.11.0 since the functionality was introduced in that version:

In [None]:
import transformers

print(transformers.__version__)

4.18.0


# Fine-tuning a model on a token classification task

load model checkpoint from Hugging face [Model Hub](https://huggingface.co/models), specify the task of NER(Named Entity Recognition). Pick the SciBERT which is trianed on Scientific Papers Corpus.

In [None]:
task = "ner" # Should be one of "ner", "pos" or "chunk"
# model_checkpoint = "distilbert-base-uncased"
model_checkpoint = 'allenai/scibert_scivocab_uncased'
batch_size = 16

## Loading the dataset

upload project directory summary_NER/Sci_NER/train.txt test.txt dev.txt to colab content folder

Preprocess the txt file to pandas.dataframe format

In [1]:
def load_txt(file_path,label_dict):
    # load data from txt
    with open(file_path,'rb') as f:
        lines = f.readlines()
    tok_str = []
    tok_label = []
    tok_pd = []
    label_pd = []
    for line in lines[:]:
        tok = str(line,'utf-8')
        if tok.find('DOCSTART')>0:
            continue

        if tok == '\n':
            if tok_str:
                tok_pd.append(tok_str)
                label_pd.append(tok_label)
            tok_str = []
            tok_label = []
            continue
        toks = tok.split()
        tok_str.append(toks[0])
        tok_label.append(toks[3])
        
    # add B-XXXX tag
    for l in range(len(label_pd)):
        first_flag = True
        for j in range(len(label_pd[l])):
            label_str = label_pd[l][j]
            
            if label_str != 'O':
                if first_flag:
                    label_pd[l][j]='B'+label_pd[l][j][1:]
                    first_flag = False
            else:
                first_flag = True
    
    # change label text to number
    label_num_pd = []
    for la in label_pd:
        label_num_pd.append([label_dict[j] for j in la])
    return {'tokens':tok_pd,'ner_tags':label_num_pd,'label_text':label_pd}

In [4]:
import pandas as pd
from datasets import Dataset
label_dict={'O':0,'B-Task':1, 'I-Task':2, 'B-Method':3 , 'I-Method':4,
            'B-OtherScientificTerm':5, 'I-OtherScientificTerm':6,
            'B-Generic':7, 'I-Generic':8, 'B-Material':9, 'I-Material':10,
            'B-Metric':11, 'I-Metric':12}
label_list = ['O', 'B-Task', 'I-Task', 'B-Method', 'I-Method', 'B-OtherScientificTerm', 'I-OtherScientificTerm',
        'B-Generic', 'I-Generic', 'B-Material', 'I-Material', 'B-Metric', 'I-Metric']
train_path = '/content/train.txt'
test_path = '/content/test.txt'
dev_path = '/content/dev.txt'

In [5]:
train_dict = load_txt(train_path,label_dict)
test_dict = load_txt(test_path,label_dict)
val_dict = load_txt(dev_path,label_dict)

train_df = pd.DataFrame(train_dict)
test_df = pd.DataFrame(test_dict)
val_df = pd.DataFrame(val_dict)

In [7]:
train_df.head()

Unnamed: 0,tokens,ner_tags,label_text
0,"[English, is, shown, to, be, trans-context-fre...","[9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, ...","[B-Material, O, O, O, O, O, O, O, O, O, B-Othe..."
1,"[The, agreement, in, question, involves, numbe...","[0, 7, 0, 0, 0, 0, 0, 5, 0, 5, 6, 0, 0, 0, 0, ...","[O, B-Generic, O, O, O, O, O, B-OtherScientifi..."
2,"[The, formal, proof, ,, which, makes, crucial,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, ...","[O, O, O, O, O, O, O, O, O, O, B-Method, I-Met..."
3,"[In, this, paper, ,, a, novel, method, to, lea...","[0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 5, 6, 6, 0, 1, ...","[O, O, O, O, O, O, B-Method, O, O, O, B-OtherS..."
4,"[The, basic, assumption, is, that, the, parame...","[0, 0, 0, 0, 0, 0, 5, 6, 6, 0, 0, 0, 5, 6, 6, ...","[O, O, O, O, O, O, B-OtherScientificTerm, I-Ot..."


Statistics of training data, length of text

In [None]:
len_pd = [len(text) for text in train_dict['tokens']]
import numpy as np
train_len = np.array(len_pd)
train_len.mean()

24.401934443847395

In [None]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
val_dataset = Dataset.from_pandas(val_df)

use the [Datasets](https://github.com/huggingface/datasets) library to convert to our custom dataset to hugging face supported dataset and get the metric use for evaluation (to compare our model to the benchmark). This can be done with the functions `load_dataset` and `load_metric`.  

In [None]:
from datasets import load_dataset, load_metric

## Preprocessing the data

Before we can feed those texts to our model, we need to preprocess them. This is done by a Transformers `Tokenizer` which will (as the name indicates) tokenize the inputs (including converting the tokens to their corresponding IDs in the pretrained vocabulary) and put it in a format the model expects, as well as generate the other inputs that the model requires.

Instantiate a tokenizer with the `AutoTokenizer.from_pretrained` method, which will ensure:

- get a tokenizer that corresponds to the model architecture we want to use,
- download the vocabulary used when pretraining this specific checkpoint.


In [None]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/223k [00:00<?, ?B/s]

In [None]:
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

preprocess the data(tokenization) and align the labels to corresponding tokens.

In [None]:
label_all_tokens = True
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

This will apply the function on all the elements of all the splits in `dataset`, so training, validation and testing data will be preprocessed in one single command.

In [None]:
tokenized_train_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_val_datasets = val_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## Fine-tuning the model

Now data is ready, we can download the pretrained model and fine-tune it. Since all our tasks are about token classification, use the `AutoModelForTokenClassification` class. Like with the tokenizer, the `from_pretrained` method will download and cache the model for us.

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

This will apply the function on all the elements of all the splits in `dataset`, so training, validation and testing data will be preprocessed in one single command.

In [None]:
model_name = model_checkpoint.split("/")[-1]+'_epoch20'
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.01,
    push_to_hub=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Then we will need a data collator that will batch our processed examples together while applying padding to make them all the same size (each pad will be padded to the length of its longest example). There is a data collator for this task in the Transformers library, that not only pads the inputs, but also the labels:

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

The last thing to define for `Trainer` is to compute the metrics from the predictions. Here load the [`seqeval`](https://github.com/chakki-works/seqeval) metric (which is commonly used to evaluate results on the CONLL dataset) via the Datasets library.

In [None]:
metric = load_metric("seqeval")

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

This metric takes list of labels for the predictions and references:

In [None]:
example = tokenized_train_datasets[0]
labels = [label_list[i] for i in example[f"{task}_tags"]]
metric.compute(predictions=[labels], references=[labels])

{'Material': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'OtherScientificTerm': {'f1': 1.0,
  'number': 2,
  'precision': 1.0,
  'recall': 1.0},
 'overall_accuracy': 1.0,
 'overall_f1': 1.0,
 'overall_precision': 1.0,
 'overall_recall': 1.0}

do a bit of post-processing on our predictions:
- select the predicted index (with the maximum logit) for each token
- convert it to its string label
- ignore everywhere we set a label of -100

In [None]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

drop the precision/recall/f1 computed for each category and only focus on the overall precision/recall/f1/accuracy.

pass all of this along with our datasets to the `Trainer`:

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_val_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

We can now finetune our model for **5 epochs** by just calling the `train` method:

In [None]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=True,
)
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_val_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Cloning https://huggingface.co/HenryHXR/scibert_scivocab_uncased_5epochstest-finetuned-ner into local empty directory.


In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: label_text, tokens, ner_tags. If label_text, tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1861
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 295


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.654529,0.366361,0.368509,0.367432,0.794637
2,No log,0.514967,0.50219,0.576214,0.536661,0.839772
3,No log,0.462815,0.573034,0.640704,0.604982,0.858357
4,No log,0.463578,0.573082,0.663317,0.614907,0.858622
5,No log,0.460901,0.581831,0.675879,0.625339,0.861808


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: label_text, tokens, ner_tags. If label_text, tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 275
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: label_text, tokens, ner_tags. If label_text, tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 275
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: label_text, tokens, ner_tags. If label_text, tokens, ner_tags are not expected by `BertForTokenClassification.forw

TrainOutput(global_step=295, training_loss=0.49020137463585806, metrics={'train_runtime': 225.4194, 'train_samples_per_second': 41.279, 'train_steps_per_second': 1.309, 'total_flos': 322637632893624.0, 'train_loss': 0.49020137463585806, 'epoch': 5.0})

The `evaluate` method allows you to evaluate again on the evaluation dataset or on the test dataset:

In [None]:
trainer.evaluate(eval_dataset=tokenized_test_datasets)

The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: label_text, tokens, ner_tags. If label_text, tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 551
  Batch size = 16


{'epoch': 5.0,
 'eval_accuracy': 0.86002326483133,
 'eval_f1': 0.5829650462032945,
 'eval_loss': 0.4457939863204956,
 'eval_precision': 0.558076923076923,
 'eval_recall': 0.6101766190075694,
 'eval_runtime': 5.7039,
 'eval_samples_per_second': 96.6,
 'eval_steps_per_second': 6.136}

model evaluation

To get the precision/recall/f1 computed for each category now the finished training.

In [None]:
predictions, labels, _ = trainer.predict(tokenized_test_datasets)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: label_text, tokens, ner_tags. If label_text, tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 551
  Batch size = 16


{'Generic': {'f1': 0.6894197952218429,
  'number': 280,
  'precision': 0.6601307189542484,
  'recall': 0.7214285714285714},
 'Material': {'f1': 0.5916666666666667,
  'number': 228,
  'precision': 0.5634920634920635,
  'recall': 0.6228070175438597},
 'Method': {'f1': 0.7033121916842846,
  'number': 712,
  'precision': 0.7057991513437057,
  'recall': 0.7008426966292135},
 'Metric': {'f1': 0.5714285714285715,
  'number': 93,
  'precision': 0.5,
  'recall': 0.6666666666666666},
 'OtherScientificTerm': {'f1': 0.5401755570560431,
  'number': 697,
  'precision': 0.5102040816326531,
  'recall': 0.5738880918220947},
 'Task': {'f1': 0.5755208333333334,
  'number': 368,
  'precision': 0.5525,
  'recall': 0.6005434782608695},
 'overall_accuracy': 0.8663564689156004,
 'overall_f1': 0.6164411230054534,
 'overall_precision': 0.5930820054411193,
 'overall_recall': 0.6417157275021026}

load the trained model to the *pipeline* to do NER inference, sapcify task by 'ner'

In [None]:
from transformers import pipeline
import torch
ner_pipe = pipeline(task="ner",model=model.to(torch.device('cpu')),tokenizer=tokenizer)

In [None]:
x = ' '.join(tokenized_test_datasets[12]['tokens'])
x

'Structural or numerical constraints can then be added locally to the reconstruction process through a constrained optimization scheme .'

In [None]:
x = ' '.join(tokenized_test_datasets[12]['tokens'])
entities = ner_pipe(x)
for entity in entities:
  print(entity)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'entity': 'LABEL_5', 'score': 0.9496698, 'index': 1, 'word': 'structural', 'start': 0, 'end': 10}
{'entity': 'LABEL_6', 'score': 0.87721187, 'index': 2, 'word': 'or', 'start': 11, 'end': 13}
{'entity': 'LABEL_5', 'score': 0.475167, 'index': 3, 'word': 'numerical', 'start': 14, 'end': 23}
{'entity': 'LABEL_6', 'score': 0.9518148, 'index': 4, 'word': 'constraints', 'start': 24, 'end': 35}
{'entity': 'LABEL_0', 'score': 0.99914324, 'index': 5, 'word': 'can', 'start': 36, 'end': 39}
{'entity': 'LABEL_0', 'score': 0.99910575, 'index': 6, 'word': 'then', 'start': 40, 'end': 44}
{'entity': 'LABEL_0', 'score': 0.99921286, 'index': 7, 'word': 'be', 'start': 45, 'end': 47}
{'entity': 'LABEL_0', 'score': 0.9977059, 'index': 8, 'word': 'added', 'start': 48, 'end': 53}
{'entity': 'LABEL_0', 'score': 0.99687284, 'index': 9, 'word': 'locally', 'start': 54, 'end': 61}
{'entity': 'LABEL_0', 'score': 0.99896824, 'index': 10, 'word': 'to', 'start': 62, 'end': 64}
{'entity': 'LABEL_0', 'score': 0.9983333

upload the result of the training to the Hub

In [None]:
trainer.push_to_hub()

Saving model checkpoint to scibert_scivocab_uncased_epoch20-finetuned-ner
Configuration saved in scibert_scivocab_uncased_epoch20-finetuned-ner/config.json
Model weights saved in scibert_scivocab_uncased_epoch20-finetuned-ner/pytorch_model.bin
tokenizer config file saved in scibert_scivocab_uncased_epoch20-finetuned-ner/tokenizer_config.json
Special tokens file saved in scibert_scivocab_uncased_epoch20-finetuned-ner/special_tokens_map.json


Upload file pytorch_model.bin:   0%|          | 3.34k/417M [00:00<?, ?B/s]

Upload file training_args.bin: 100%|##########| 2.98k/2.98k [00:00<?, ?B/s]

To https://huggingface.co/HenryHXR/scibert_scivocab_uncased_epoch20-finetuned-ner
   94c3c66..9ae2d06  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Token Classification', 'type': 'token-classification'}}
To https://huggingface.co/HenryHXR/scibert_scivocab_uncased_epoch20-finetuned-ner
   9ae2d06..492dd15  main -> main



'https://huggingface.co/HenryHXR/scibert_scivocab_uncased_epoch20-finetuned-ner/commit/9ae2d06717cdb5f5f97fb789f8d403fd31bd513f'

## Load model, tokenizer from hugging face

label_list is ['O', 'B-Task', 'I-Task', 'B-Method', 'I-Method', 'B-OtherScientificTerm', 'I-OtherScientificTerm',
        'B-Generic', 'I-Generic', 'B-Material', 'I-Material', 'B-Metric', 'I-Metric']

In [None]:
from transformers import pipeline
model_checkpoint = 'HenryHXR/scibert_scivocab_uncased-finetuned-ner'
ner_pipe = pipeline(task="ner",model=model_checkpoint)
label_list = ['O', 'B-Task', 'I-Task', 'B-Method', 'I-Method', 'B-OtherScientificTerm', 'I-OtherScientificTerm',
        'B-Generic', 'I-Generic', 'B-Material', 'I-Material', 'B-Metric', 'I-Metric']

In [None]:
text = 'Structural or numerical constraints can then be added locally to the reconstruction process through a constrained optimization scheme.'
entities = ner_pipe(text)
for entity in entities:
  print(entity)