In [47]:
! pip install torch 
! pip install transformers
! pip install datasets
! pip install evaluate
! pip install pandas



In [48]:
from datasets import load_dataset,ClassLabel
# Load every split of the "wnut_17" dataset in one call.
raw_datasets = load_dataset("wnut_17")

In [49]:
# Display the loaded splits with their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1287
    })
})

In [50]:

# Keep the training split's schema and column names at hand for the cells below.
features = raw_datasets["train"].features
column_names = raw_datasets["train"].column_names

In [51]:
def get_label_list(labels):
    """Return the sorted list of distinct labels found in ``labels``.

    Args:
        labels: An iterable of label sequences (one sequence per example).

    Returns:
        A new list holding every distinct label exactly once, in ascending order.
    """
    seen = set()
    for sequence in labels:
        seen.update(sequence)
    return sorted(seen)

In [52]:
label_column_name = "ner_tags"
labels_are_int = isinstance(features[label_column_name].feature, ClassLabel)
if not labels_are_int:
    # Not a ClassLabel feature: build the label vocabulary from the training split
    # and number the labels by their sorted position.
    label_list = get_label_list(raw_datasets["train"][label_column_name])
    label_to_id = {name: idx for idx, name in enumerate(label_list)}
else:
    # ClassLabel feature: the names come from the feature itself and the stored
    # values are used as ids unchanged (identity mapping).
    label_list = features[label_column_name].feature.names
    label_to_id = {idx: idx for idx in range(len(label_list))}
label_list


['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

In [53]:
from transformers import AutoConfig,AutoModelForTokenClassification,AutoTokenizer
# One output class per entry of label_list.
num_labels = len(label_list)
model_name = "bert-base-uncased"
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
)
# No `token=True` here: that option requires a locally stored Hugging Face token and
# raises when the user is not logged in, while this checkpoint is publicly available.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    # A fast tokenizer is required: the label alignment below relies on word_ids().
    use_fast=True,
    # NOTE(review): add_prefix_space is aimed at byte-level BPE tokenizers
    # (e.g. RoBERTa/GPT-2) fed pre-split words; presumably a no-op for BERT.
    add_prefix_space=True,
)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    config=config,
    ignore_mismatched_sizes=True,
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
# Store both directions of the label mapping on the model config.
model.config.label2id = {name: idx for idx, name in enumerate(label_list)}
model.config.id2label = {idx: name for idx, name in enumerate(label_list)}

In [55]:
# b_to_i_label[i] is the index of the "I-" counterpart of label i when label i is a
# "B-" label whose counterpart exists in label_list; otherwise it is i itself.
b_to_i_label = []
for idx, label in enumerate(label_list):
    target = idx
    if label.startswith("B-"):
        inside = label.replace("B-", "I-")
        if inside in label_list:
            target = label_list.index(inside)
    b_to_i_label.append(target)

# Passed to the tokenizer in tokenize_and_align_labels; False means no padding there.
padding = False

In [56]:
def tokenize_and_align_labels(examples, label_all_tokens=False):
    """Tokenize pre-split words and align the word-level labels with the tokens.

    Relies on the notebook globals ``tokenizer``, ``padding``, ``label_column_name``,
    ``label_to_id`` and (only when ``label_all_tokens`` is true) ``b_to_i_label``.

    Args:
        examples: A batch of examples; ``examples["tokens"]`` holds one list of words
            per example and ``examples[label_column_name]`` the matching word labels.
        label_all_tokens: If False (the default, and the previous behaviour), only the
            first token of each word receives the word's label and the remaining
            tokens of that word receive -100. If True, the remaining tokens receive
            the word's label mapped through ``b_to_i_label`` (a "B-" label becomes its
            "I-" counterpart). With ``Dataset.map`` it can be supplied through
            ``fn_kwargs={"label_all_tokens": True}``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of label ids
        per example, containing one id for every produced token.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        padding=padding,
        truncation=True,
        max_length=512,
        # The texts in the dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )
    labels = []
    for batch_index, word_labels in enumerate(examples[label_column_name]):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens have no word id; -100 makes the loss ignore them.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First token of a word carries the word's label.
                label_ids.append(label_to_id[word_labels[word_idx]])
            elif label_all_tokens:
                # Continuation token: reuse the word's label, turning B- into I-.
                label_ids.append(b_to_i_label[label_to_id[word_labels[word_idx]]])
            else:
                # Continuation token: ignored in the loss.
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [57]:
# Tokenize every split and attach the aligned "labels" column.
# NOTE: the variable name (sic, "precessed") is kept because later cells refer to it.
precessed_dataset = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=8,
    # Must be the boolean False: the string "False" is truthy, so the cached result
    # was still being reused instead of re-running the tokenizer.
    load_from_cache_file=False,
    # The mapping runs over all splits, not only the training one.
    desc="Running tokenizer on dataset",
)
precessed_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1287
    })
})

In [58]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)

    # Metrics
! pip install seqeval    
import evaluate  
import numpy as np  
metric = evaluate.load("seqeval")
def compute_metrics(p):
    """Compute overall precision, recall, F1 and accuracy with the seqeval metric.

    Args:
        p: A pair ``(predictions, labels)``. The class scores lie along axis 2 of
            ``predictions``; ``labels`` holds label ids, with -100 marking positions
            that are excluded from scoring.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and ``"accuracy"``
        taken from the metric's overall results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions whose gold label is not the ignored index.
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    # Flatten the metric's overall_* entries into short names.
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }




In [60]:
from transformers import Trainer,TrainingArguments
# Only output_dir is overridden; every other training setting keeps its default.
training_args = TrainingArguments(output_dir="token-classification")
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=precessed_dataset["train"],
    eval_dataset=precessed_dataset["validation"],
)


In [61]:
# Fine-tune, then persist the model.
train_result = trainer.train()
trainer.save_model()  # Saves the tokenizer too for easy upload

# Record how many training examples were used alongside the training metrics.
metrics = train_result.metrics
max_train_samples = len(precessed_dataset["train"])
# No sub-sampling is configured here, so the min() simply yields the full split size.
metrics["train_samples"] = min(max_train_samples, len(precessed_dataset["train"]))

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

  0%|          | 0/1275 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 39%|███▉      | 500/1275 [01:57<02:49,  4.56it/s]

{'loss': 0.0615, 'learning_rate': 3.0392156862745097e-05, 'epoch': 1.18}


 78%|███████▊  | 1000/1275 [03:59<01:04,  4.27it/s]

{'loss': 0.0197, 'learning_rate': 1.0784313725490197e-05, 'epoch': 2.35}


100%|██████████| 1275/1275 [05:17<00:00,  4.01it/s]


{'train_runtime': 317.9285, 'train_samples_per_second': 32.026, 'train_steps_per_second': 4.01, 'train_loss': 0.034041978424670646, 'epoch': 3.0}
***** train metrics *****
  epoch                    =        3.0
  train_loss               =      0.034
  train_runtime            = 0:05:17.92
  train_samples            =       3394
  train_samples_per_second =     32.026
  train_steps_per_second   =       4.01


In [63]:
# Evaluate on the eval_dataset given to the Trainer (the validation split).
metrics = trainer.evaluate()

# Record the number of evaluated examples next to the metrics.
max_eval_samples = len(precessed_dataset["validation"])
# No sub-sampling is configured here, so the min() simply yields the full split size.
metrics["eval_samples"] = min(
    max_eval_samples,
    len(precessed_dataset["validation"]),
)

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

100%|██████████| 127/127 [00:07<00:00, 16.71it/s]

***** eval metrics *****
  epoch                   =        3.0
  eval_accuracy           =     0.9556
  eval_f1                 =     0.5884
  eval_loss               =     0.0963
  eval_precision          =     0.6852
  eval_recall             =     0.5156
  eval_runtime            = 0:00:08.08
  eval_samples            =       1009
  eval_samples_per_second =    124.748
  eval_steps_per_second   =     15.702



