## install transformers

### in this python file I have tried the Disease Ner data (ncbi_disease) which has the same data structure like SpeedofMagic.

In [1]:
! pip install transformers datasets evaluate seqeval accelerate


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate
  Downloading accelerate-

## Import libraries

In [None]:
from datasets import load_dataset, load_metric, ClassLabel, Sequence
import random
import pandas as pd
import numpy as np
from IPython.display import display, HTML

import transformers
from transformers import AutoTokenizer, RobertaForTokenClassification, TrainingArguments, Trainer
from transformers import RobertaTokenizerFast

from transformers import DataCollatorForTokenClassification




## Load the dataset

In [5]:
data = load_dataset('ncbi_disease')

Downloading builder script:   0%|          | 0.00/5.83k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.70k [00:00<?, ?B/s]

Downloading and preparing dataset ncbi_disease/ncbi_disease to /root/.cache/huggingface/datasets/ncbi_disease/ncbi_disease/1.0.0/92314c7992b0b8a5ea2ad101be33f365b684a2cc011e0ffa29c691e6d32b2d03...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/284k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.4k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/5433 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/924 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/941 [00:00<?, ? examples/s]

Dataset ncbi_disease downloaded and prepared to /root/.cache/huggingface/datasets/ncbi_disease/ncbi_disease/1.0.0/92314c7992b0b8a5ea2ad101be33f365b684a2cc011e0ffa29c691e6d32b2d03. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 5433
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 924
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 941
    })
})

In [7]:
data['train'][0]

{'id': '0',
 'tokens': ['Identification',
  'of',
  'APC2',
  ',',
  'a',
  'homologue',
  'of',
  'the',
  'adenomatous',
  'polyposis',
  'coli',
  'tumour',
  'suppressor',
  '.'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]}

## We are using NER task for token classification

In [8]:
task = "ner"
model_checkpoint = "roberta-base"
batch_size = 16

In [9]:
label = data['train'].features[f"{task}_tags"].feature.names
print(label)

['O', 'B-Disease', 'I-Disease']


## just to see how dataset look like

In [10]:
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [11]:
show_random_elements(data['train'])

Unnamed: 0,id,tokens,ner_tags
0,5204,"[We, have, used, gene, targeting, to, generate, mice, with, a, modified, Apob, allele, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,3076,"[We, found, the, same, CETP, gene, mutation, in, four, families, from, three, different, regions, of, Japan, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,1097,"[The, phenotype, is, likely, to, be, a, contiguous, gene, syndrome, involving, genes, which, are, paternally, expressed, only, ,, located, in, the, human, 15q11, -, q13, region, .]","[O, O, O, O, O, O, O, B-Disease, I-Disease, I-Disease, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
3,1171,"[2, -, cM, interval, between, D11S4082, and, D11S931, .]","[O, O, O, O, O, O, O, O, O]"
4,2429,"[On, the, other, hand, ,, the, occurrence, of, an, extremely, heterogeneous, spectrum, of, mutations, spread, throughout, the, entire, length, of, the, large, APC, gene, among, the, FAP, patients, indicates, that, this, approach, may, not, be, useful, as, a, rapid, presymptomatic, diagnostic, procedure, in, a, routine, laboratory, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-Disease, O, O, O, B-Disease, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
5,1819,"[In, the, inv, (, X, ), carried, by, several, affected, family, members, ,, FISH, showed, PLP, signals, at, Xp11, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
6,4982,"[The, Na, +, /, glucose, cotransporter, gene, SGLT1, encodes, the, primary, carrier, protein, responsible, for, the, uptake, of, the, dietary, sugars, glucose, and, galactose, from, the, intestinal, lumen, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
7,4059,"[G6PD, activity, of, the, patient, was, 5, .]","[O, O, O, O, O, O, O, O]"
8,1757,"[Age, penetrance, is, greater, for, BRCA1, -, linked, than, for, BRCA2, -, linked, cancers, in, this, population, .]","[O, O, O, O, O, B-Disease, I-Disease, I-Disease, I-Disease, I-Disease, I-Disease, I-Disease, I-Disease, I-Disease, O, O, O, O]"
9,4354,"[Two, overlapping, cDNA, clones, (, 1, ,, 991, bp, and, 736, bp, ,, respectively, ), encoding, the, precursor, of, human, mitochondrial, very, -, long, -, chain, acyl, -, coenzyme, A, dehydrogenase, (, VLCAD, ), were, cloned, and, sequenced, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"


## Lets preprocess the data

tokenize the data

In [12]:
tokenizer = RobertaTokenizerFast.from_pretrained(model_checkpoint, add_prefix_space=True)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [13]:
# this will make sure the tokenizer is fast
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [14]:
## we need to preprocess the data since if our words already tokinze it will tokenize again wchich will end up giving more ids
## we will use truncation = true to reduce the size of the text that are bigger than maximum size and will use
## is_split_into_words=true then we align them
label_all_tokens = True # we can change the value of this flag to change the strategy to set the label for first token only
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], return_offsets_mapping=True, truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [15]:
## use map to apply the above function on all sentences
tokenize_data = data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/5433 [00:00<?, ? examples/s]

Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Map:   0%|          | 0/941 [00:00<?, ? examples/s]

In [16]:
tokenize_data['train'][0]

{'id': '0',
 'tokens': ['Identification',
  'of',
  'APC2',
  ',',
  'a',
  'homologue',
  'of',
  'the',
  'adenomatous',
  'polyposis',
  'coli',
  'tumour',
  'suppressor',
  '.'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0],
 'input_ids': [0,
  36309,
  9,
  1480,
  347,
  176,
  2156,
  10,
  9486,
  28789,
  9,
  5,
  2329,
  225,
  1075,
  415,
  1827,
  11424,
  11474,
  354,
  31435,
  18093,
  2126,
  23192,
  368,
  479,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'offset_mapping': [[0, 0],
  [0, 14],
  [0, 2],
  [0, 2],
  [2, 3],
  [3, 4],
  [0, 1],
  [0, 1],
  [0, 3],
  [3, 9],
  [0, 2],
  [0, 3],
  [0, 2],
  [2, 4],
  [4, 6],
  [6, 8],
  [8, 11],
  [0, 4],
  [4, 7],
  [7, 9],
  [0, 4],
  [0, 3],
  [3, 6],
  [0, 8],
  [8, 10],
  [0, 1],
  [0, 0]],
 'labels': [-100,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  

## fine tune the model

In [17]:
model = RobertaForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label))

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

## Initiate training
### before that we need training arguments which will need folder name and other optional

In [18]:
args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
)

## make all our example to same size 

In [19]:
data_collector = DataCollatorForTokenClassification(tokenizer)

## last we need metric from prediction, we use seqeval metric

In [20]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]


*   selecting predicted index for each token with max logit
*   cnvert it to string label
*   ignore everywhere we set label of -100 from earliear





In [21]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## accuracy/f1score/recall/precision


In [22]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenize_data["train"],
    eval_dataset=tokenize_data["validation"],
    data_collator=data_collector,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

## Now we train our trainer

In [23]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.073782,0.826793,0.853519,0.839944,0.983485
2,0.111300,0.074738,0.819703,0.881751,0.849596,0.9843
3,0.036700,0.07782,0.834303,0.879705,0.856403,0.984266




TrainOutput(global_step=1020, training_loss=0.07317748028858036, metrics={'train_runtime': 253.6101, 'train_samples_per_second': 64.268, 'train_steps_per_second': 4.022, 'total_flos': 553390195925070.0, 'train_loss': 0.07317748028858036, 'epoch': 3.0})

In [24]:
trainer.evaluate()

{'eval_loss': 0.07782011479139328,
 'eval_precision': 0.8343034536282499,
 'eval_recall': 0.8797054009819967,
 'eval_f1': 0.8564031069508066,
 'eval_accuracy': 0.9842661501342305,
 'eval_runtime': 4.4648,
 'eval_samples_per_second': 206.954,
 'eval_steps_per_second': 12.991,
 'epoch': 3.0}

In [25]:
predictions, labels, _ = trainer.predict(tokenize_data["validation"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'100': {'precision': 0.8554077501445922,
  'recall': 0.8882882882882883,
  'f1': 0.8715380082498527,
  'number': 1665},
 '_': {'precision': 0.7912735849056604,
  'recall': 0.8613607188703466,
  'f1': 0.8248309772587585,
  'number': 779},
 'overall_precision': 0.8343034536282499,
 'overall_recall': 0.8797054009819967,
 'overall_f1': 0.8564031069508066,
 'overall_accuracy': 0.9842661501342305}