In [1]:
from datasets import load_dataset
import pandas as pd
dataset = load_dataset("Babelscape/wikineural")

df = pd.DataFrame(dataset["train_en"])

In [2]:
df.head()

Unnamed: 0,tokens,ner_tags,lang
0,"[This, division, also, contains, the, Ventana,...","[0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 5, 0, 0]",en
1,"["", So, here, is, the, balance, NBC, has, to, ...","[0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 7, 8, 0, 0, ...",en
2,"[It, is, a, protest, song, that, "", creates, a...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",en
3,"[This, differs, from, approaches, such, as, IP...","[0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, ...",en
4,"[Since, then, ,, only, Terry, Bradshaw, in, 14...","[0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1, 2, 0, 0, 0, ...",en


In [3]:
pip show tokenizers

Name: tokenizers
Version: 0.19.1
Summary: 
Home-page: 
Author: Anthony MOI <m.anthony.moi@gmail.com>
Author-email: Nicolas Patry <patry.nicolas@protonmail.com>, Anthony Moi <anthony@huggingface.co>
License: 
Location: C:\ProgramData\anaconda3\envs\myfuckingwnviroment\Lib\site-packages
Requires: huggingface-hub
Required-by: transformers
Note: you may need to restart the kernel to use updated packages.


In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [5]:
example = dataset["train_en"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'this',
 'division',
 'also',
 'contains',
 'the',
 'vent',
 '##ana',
 'wilderness',
 ',',
 'home',
 'to',
 'the',
 'california',
 'condor',
 '.',
 '[SEP]']

In [6]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [7]:
tokenized_ds = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/11597 [00:00<?, ? examples/s]

In [8]:
pd.DataFrame(tokenized_ds['train_en'])

Unnamed: 0,tokens,ner_tags,lang,input_ids,attention_mask,labels
0,"[This, division, also, contains, the, Ventana,...","[0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 5, 0, 0]",en,"[101, 2023, 2407, 2036, 3397, 1996, 18834, 516...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 5, -100, 6, 0, 0, 0, 0, ..."
1,"["", So, here, is, the, balance, NBC, has, to, ...","[0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 7, 8, 0, 0, ...",en,"[101, 1000, 2061, 2182, 2003, 1996, 5703, 6788...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 7, 8, ..."
2,"[It, is, a, protest, song, that, "", creates, a...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",en,"[101, 2009, 2003, 1037, 6186, 2299, 2008, 1000...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[This, differs, from, approaches, such, as, IP...","[0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, ...",en,"[101, 2023, 12980, 2013, 8107, 2107, 2004, 129...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, ..."
4,"[Since, then, ,, only, Terry, Bradshaw, in, 14...","[0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1, 2, 0, 0, 0, ...",en,"[101, 2144, 2059, 1010, 2069, 6609, 23762, 199...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1, 2, 0, ..."
...,...,...,...,...,...,...
92715,"[The, couple, had, a, son, ,, David, ,, and, a...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0]",en,"[101, 1996, 3232, 2018, 1037, 2365, 1010, 2585...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[-100, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, ..."
92716,"[The, Home, Secretary, ,, J., R., Clynes, ,, w...","[0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0]",en,"[101, 1996, 2188, 3187, 1010, 1046, 1012, 1054...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 1, -100, 2, -100, 2, -100, ..."
92717,"[At, the, time, of, her, birth, ,, she, was, f...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",en,"[101, 2012, 1996, 2051, 1997, 2014, 4182, 1010...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
92718,"[The, film, was, based, on, the, Broadway, pla...","[0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0]",en,"[101, 1996, 2143, 2001, 2241, 2006, 1996, 5934...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[-100, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, ..."


In [9]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

In [10]:
import evaluate

seqeval = evaluate.load("seqeval")

In [11]:
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
id2label = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}
label2id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}

In [12]:
import numpy as np

labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [13]:
pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [14]:
from transformers import create_optimizer

batch_size = 32
num_train_epochs = 3
num_train_steps = (len(tokenized_ds["train_en"]) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)




In [15]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=9, id2label=id2label, label2id=label2id
)




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able t

In [16]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_ds["train_en"],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_ds["val_en"],
    shuffle=False,
    batch_size=32,
    collate_fn=data_collator,
)

In [17]:
import tensorflow as tf

model.compile(optimizer=optimizer)  # No loss argument!

In [18]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [19]:
callbacks = [metric_callback]
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks)

Epoch 1/3
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported




KeyboardInterrupt



In [21]:
model.summary()

Model: "tf_distil_bert_for_token_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  6921      
                                                                 
Total params: 66369801 (253.18 MB)
Trainable params: 66369801 (253.18 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [41]:
example = "Lobyntseva Anastasia Vladimirovna is my teacher of english language, she is living in Moscow and working in PFUR."
tokenized_input = tokenizer(example, is_split_into_words=False)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

k = model.predict(tokenized_input["input_ids"])['logits'][0]

for i in range(len(k)):
    for j in range(len(k[i])):
        if k[i][j] == max(k[i]):
            print(id2label[j])

O
B-PER
B-PER
I-PER
I-PER
I-PER
I-PER
I-PER
I-PER
O
O
O
O
O
O
O
O
O
O
O
B-LOC
O
O
O
B-ORG
O
O
O


In [42]:
tokens

['[CLS]',
 'lo',
 '##by',
 '##nts',
 '##eva',
 'anastasia',
 'vladimir',
 '##ov',
 '##na',
 'is',
 'my',
 'teacher',
 'of',
 'english',
 'language',
 ',',
 'she',
 'is',
 'living',
 'in',
 'moscow',
 'and',
 'working',
 'in',
 'p',
 '##fur',
 '.',
 '[SEP]']

In [34]:
model.predict(tokenized_input["input_ids"])['logits'][0]



array([ 7.0711894, -1.4391791, -1.9337852, -1.0683072, -0.8003571,
       -1.4253184, -1.8651264, -1.2917857, -1.1298411], dtype=float32)