# Token classification (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
!pip install datasets evaluate transformers[sentencepiece] -q
!pip install accelerate -q
# To run the training on TPU, you will need to uncomment the followin line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
# !apt install git-lfs

[K     |████████████████████████████████| 431 kB 5.1 MB/s 
[K     |████████████████████████████████| 69 kB 8.0 MB/s 
[K     |████████████████████████████████| 4.9 MB 66.3 MB/s 
[K     |████████████████████████████████| 115 kB 64.0 MB/s 
[K     |████████████████████████████████| 163 kB 70.1 MB/s 
[K     |████████████████████████████████| 212 kB 52.2 MB/s 
[K     |████████████████████████████████| 127 kB 62.6 MB/s 
[K     |████████████████████████████████| 6.6 MB 42.0 MB/s 
[K     |████████████████████████████████| 1.3 MB 66.8 MB/s 
[K     |████████████████████████████████| 143 kB 5.0 MB/s 
[?25h

You will need to setup git, adapt your email and name in the following cell.

In [10]:
import datasets

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [81]:
import pandas as pd
import ast
df=pd.read_csv('dataframe.csv')
df['ner_tags'] = df['ner_tags'].apply(ast.literal_eval)
df['tokens'] = df['tokens'].apply(ast.literal_eval)


# df.columns=['ner_tags','tokens','id']
df.head()

Unnamed: 0,tokens,ner_tags
0,"[Notice of Representation, Ed Eddy and Sarah L...","[Other, Law-Firm, Law-Office-Address, Insuranc..."
1,"[Notice of Representation, Hemingway & Poe PLL...","[Other, Law-Firm, Law-Office-Address, Insuranc..."
2,"[Notice of Representation, The Family Law Offi...","[Other, Law-Firm, Law-Office-Address, Insuranc..."
3,"[Notice of Representation, Gladwell Law, 530 S...","[Other, Law-Firm, Law-Office-Address, Insuranc..."
4,"[Notice of Representation, Budget Mutual Insur...","[Other, Insurance-Company, Insurance-Company-A..."


In [82]:
# labels=list(df.ner_tags.unique())
labels=[]
for i in range(len(df)):
  labels.extend(df.loc[i,'ner_tags'])
labels=list(set(labels))

In [83]:
labels

['Insurance-Company-Address',
 'Other',
 'Policy-Number',
 'Sender',
 'Law-Firm',
 'Insurance-Company',
 'Payout',
 'Required-Action',
 'Policy-Holder-Name',
 'Beneficiary-Name',
 'Law-Office-Address']

In [84]:
# from sklearn.model_selection import GroupShuffleSplit 

# splitter = GroupShuffleSplit(test_size=.20, n_splits=1, random_state = 7)
# split = splitter.split(df, groups=df['id'])
# train_inds, test_inds = next(split)

# train_df = df.iloc[train_inds]
# test_df = df.iloc[test_inds]

# print(len(train_df),len(test_df))

In [85]:
# Shuffle your dataset 
shuffle_df = df.sample(frac=1)

# Define a size for your train set 
train_size = int(0.7 * len(df))

# Split your dataset 
train_df = shuffle_df[:train_size]
test_df = shuffle_df[train_size:]

In [86]:
test_data=datasets.Dataset.from_pandas(test_df)
train_data=datasets.Dataset.from_pandas(train_df)

In [87]:
raw_datasets = datasets.DatasetDict({
    'train': train_data,
    'test':test_data})
raw_datasets = raw_datasets.remove_columns('__index_level_0__')
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 205
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 89
    })
})

In [88]:
raw_datasets=raw_datasets.cast_column("ner_tags",datasets.Sequence(datasets.ClassLabel(names=labels)))
raw_datasets=raw_datasets.cast_column("tokens",datasets.Sequence(datasets.Value(dtype='string')))

raw_datasets

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 205
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 89
    })
})

In [89]:
raw_datasets["train"][0]["tokens"]

['Notice of Representation',
 'Number One Insurance Company',
 '1234 Gateway Dr Chicago, IL 15002',
 'Stine Law',
 '32124 19th Ln SW Auburn, Washington(WA), 98001',
 'Re: Estate of',
 'Rosie Montoya',
 'Policy number:',
 '407189890',
 'Our client:',
 'Harold Arnold',
 'Date of death: 5/13/2020 To Whom It May Concern, I',
 'have been retained by',
 'Harold Arnold',
 'to',
 'handle the estate',
 'of',
 'Rosie Montoya',
 '. My understanding is that they had a life insurance',
 'policy with your company. If this is correct, please send',
 'a letter to my office indicating you have received our',
 'letter of representation. Additionally, please do not contact our client',
 'going forward. Our understanding is that the policy was for',
 'the amount of',
 '$500,000',
 '. If that is correct, please forward that amount to',
 'our office. If there are any forms that need to',
 'be completed, please forward those as well. If you are',
 'aware of any additional policies that are in force, send',
 

In [90]:
raw_datasets["train"][0]["ner_tags"]

[1,
 5,
 0,
 4,
 10,
 1,
 8,
 1,
 2,
 1,
 9,
 1,
 1,
 9,
 1,
 7,
 1,
 8,
 1,
 1,
 1,
 1,
 1,
 1,
 6,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1]

In [91]:
raw_datasets["train"].features["tokens"]

Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)

In [92]:
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(num_classes=11, names=['Insurance-Company-Address', 'Other', 'Policy-Number', 'Sender', 'Law-Firm', 'Insurance-Company', 'Payout', 'Required-Action', 'Policy-Holder-Name', 'Beneficiary-Name', 'Law-Office-Address'], id=None), length=-1, id=None)

In [93]:
label_names = ner_feature.feature.names
label_names

['Insurance-Company-Address',
 'Other',
 'Policy-Number',
 'Sender',
 'Law-Firm',
 'Insurance-Company',
 'Payout',
 'Required-Action',
 'Policy-Holder-Name',
 'Beneficiary-Name',
 'Law-Office-Address']

In [94]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

Notice of Representation Number One Insurance Company 1234 Gateway Dr Chicago, IL 15002 Stine Law 32124 19th Ln SW Auburn, Washington(WA), 98001 Re: Estate of Rosie Montoya      Policy number: 407189890     Our client: Harold Arnold    Date of death: 5/13/2020 To Whom It May Concern, I have been retained by Harold Arnold    to    handle the estate of    Rosie Montoya      . My understanding is that they had a life insurance policy with your company. If this is correct, please send a letter to my office indicating you have received our letter of representation. Additionally, please do not contact our client going forward. Our understanding is that the policy was for the amount of $500,000 . If that is correct, please forward that amount to our office. If there are any forms that need to be completed, please forward those as well. If you are aware of any additional policies that are in force, send information about those policies to our office. If you have any questions, please contact m

In [95]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [96]:
tokenizer.is_fast

True

In [97]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'Not',
 '##ice',
 'of',
 'Rep',
 '##res',
 '##entation',
 'Number',
 'One',
 'Insurance',
 'Company',
 '123',
 '##4',
 'Gateway',
 'Dr',
 'Chicago',
 ',',
 'IL',
 '1500',
 '##2',
 'St',
 '##ine',
 'Law',
 '321',
 '##24',
 '19th',
 'L',
 '##n',
 'S',
 '##W',
 'Auburn',
 ',',
 'Washington',
 '(',
 'WA',
 ')',
 ',',
 '98',
 '##00',
 '##1',
 'Re',
 ':',
 'Estate',
 'of',
 'Rosie',
 'Mont',
 '##oya',
 'Policy',
 'number',
 ':',
 '40',
 '##7',
 '##18',
 '##9',
 '##8',
 '##90',
 'Our',
 'client',
 ':',
 'Harold',
 'Arnold',
 'Date',
 'of',
 'death',
 ':',
 '5',
 '/',
 '13',
 '/',
 '2020',
 'To',
 'Who',
 '##m',
 'It',
 'May',
 'Con',
 '##cer',
 '##n',
 ',',
 'I',
 'have',
 'been',
 'retained',
 'by',
 'Harold',
 'Arnold',
 'to',
 'handle',
 'the',
 'estate',
 'of',
 'Rosie',
 'Mont',
 '##oya',
 '.',
 'My',
 'understanding',
 'is',
 'that',
 'they',
 'had',
 'a',
 'life',
 'insurance',
 'policy',
 'with',
 'your',
 'company',
 '.',
 'If',
 'this',
 'is',
 'correct',
 ',',
 'please',

In [98]:
inputs.word_ids()

[None,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 13,
 13,
 14,
 15,
 15,
 15,
 16,
 17,
 17,
 17,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 18,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 19,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 21,
 21,
 21,
 21,
 21,
 21,
 21,
 21,
 21,
 21,
 21,
 21,
 22,
 22,
 22,
 22,
 22,
 22,
 22,
 22,
 22,
 22,
 22,
 23,
 23,
 23,
 24,
 24,
 24,
 24,
 25,
 25,
 25,
 25,
 25,
 25,
 25,
 25,
 25,
 25,
 25,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 28,
 28,
 28,
 28,
 28,
 28,
 28,
 28,
 28,
 28,
 28,
 29,
 29,
 29,
 29,
 29,
 29,
 29,
 29,
 29,
 29,
 

In [99]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [100]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[1, 5, 0, 4, 10, 1, 8, 1, 2, 1, 9, 1, 1, 9, 1, 7, 1, 8, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 3, 1]
[-100, 1, 2, 2, 2, 2, 2, 5, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1, 2, 2, 2, 8, 8, 8, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 9, 10, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 9, 10, 1, 7, 8, 8, 1, 8, 8, 8, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 6, 6, 6, 6, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 4, 1, 2, 2, 2, -100]


In [101]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [102]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [103]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [104]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    1,    2,    2,    2,    2,    2,    5,    6,    6,    6,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    4,    4,    4,   10,
           10,   10,   10,   10,   10,   10,   10,   10,   10,   10,   10,   10,
           10,   10,   10,   10,    1,    2,    2,    2,    8,    8,    8,    1,
            2,    2,    2,    2,    2,    2,    2,    2,    1,    2,    2,    9,
           10,    1,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    1,    2,    2,    2,
            9,   10,    1,    7,    8,    8,    1,    8,    8,    8,    1,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    1,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    1,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    1,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    1,    2,    2,    2,    2,    2,
            2,    2,    2,  

In [105]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 1, 2, 2, 2, 2, 2, 5, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1, 2, 2, 2, 8, 8, 8, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 9, 10, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 9, 10, 1, 7, 8, 8, 1, 8, 8, 8, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 6, 6, 6, 6, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 4, 1, 2, 2, 2, -100]
[-100, 1, 2, 2, 2, 2, 2, 5, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1, 2, 2, 2, 9, 10, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 9, 10, 1, 7, 8, 8, 1, 8, 8, 1, 2, 2, 2, 2, 2, 2

In [106]:
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.6 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16180 sha256=d93af1b1b7a2f301f5aec5475fd353ec56e0a480bce69f1d13e0aea8af0063ab
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [107]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [108]:
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['Other',
 'Insurance-Company',
 'Insurance-Company-Address',
 'Law-Firm',
 'Law-Office-Address',
 'Other',
 'Policy-Holder-Name',
 'Other',
 'Policy-Number',
 'Other',
 'Beneficiary-Name',
 'Other',
 'Other',
 'Beneficiary-Name',
 'Other',
 'Required-Action',
 'Other',
 'Policy-Holder-Name',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Payout',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Sender',
 'Other']

In [109]:
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

  _warn_prf(average, modifier, msg_start, len(result))


{'Action': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'Company': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'Company-Address': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'Firm': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'Holder-Name': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'Name': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'Number': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'Office-Address': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'ayout': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'ender': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.9166666666666666,
 'overall_f1': 0.9565217391304348,
 'overall_accuracy': 0.9696969696969697}

In [110]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [111]:
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [112]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [113]:
model.config.num_labels

11

In [114]:
# from huggingface_hub import notebook_login

# notebook_login()

In [115]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    # push_to_hub=True,
)

In [117]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 205
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 78


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.313168,0.552846,0.432051,0.485041,0.931508
2,No log,0.095096,0.920459,0.878221,0.898844,0.985851
3,No log,0.068741,0.966455,0.966114,0.966284,0.992156


***** Running Evaluation *****
  Num examples = 89
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-26
Configuration saved in bert-finetuned-ner/checkpoint-26/config.json
Model weights saved in bert-finetuned-ner/checkpoint-26/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-26/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-26/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 89
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-52
Configuration saved in bert-finetuned-ner/checkpoint-52/config.json
Model weights saved in bert-finetuned-ner/checkpoint-52/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-52/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-52/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 89
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/ch

TrainOutput(global_step=78, training_loss=0.39805857340494794, metrics={'train_runtime': 50.9993, 'train_samples_per_second': 12.059, 'train_steps_per_second': 1.529, 'total_flos': 75405559809468.0, 'train_loss': 0.39805857340494794, 'epoch': 3.0})

In [None]:
trainer.push_to_hub(commit_message="Training complete")

'https://huggingface.co/sgugger/bert-finetuned-ner/commit/26ab21e5b1568f9afeccdaed2d8715f571d786ed'

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

In [None]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "bert-finetuned-ner-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'sgugger/bert-finetuned-ner-accelerate'

In [None]:
output_dir = "bert-finetuned-ner-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

In [None]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [None]:
from tqdm.auto import tqdm
import torch

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

In [None]:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "huggingface-course/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

[{'entity_group': 'PER', 'score': 0.9988506, 'word': 'Sylvain', 'start': 11, 'end': 18},
 {'entity_group': 'ORG', 'score': 0.9647625, 'word': 'Hugging Face', 'start': 33, 'end': 45},
 {'entity_group': 'LOC', 'score': 0.9986118, 'word': 'Brooklyn', 'start': 49, 'end': 57}]