In [6]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from datasets import load_dataset

In [7]:
dataset = load_dataset("conll2003")

README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [8]:
dataset['train']

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 14041
})

In [9]:
dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [18]:
labels = dataset['train'].features['ner_tags'].feature.names

In [19]:
labels

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [20]:
tokenized_labels = {label: i for i, label in enumerate(labels)}

In [21]:
tokenized_labels

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}

In [62]:
# We need to tokenize each word in the sentence with its corresponding label
training_data = []
def map_word_label(data):
    for i in range(len(data)):
        token2label = {"tokens": [], "labels": []}
        tokens = data[i]['tokens']
        labels = data[i]['ner_tags']
        token2label['tokens'], token2label['labels'] = tokens, labels
        training_data.append(token2label)


map_word_label(dataset['train'])

In [34]:
import pandas as pd

df = pd.DataFrame(training_data)

In [35]:
df

Unnamed: 0,tokens,labels
0,EU,3
1,rejects,0
2,German,7
3,call,0
4,to,0
...,...,...
23618,216,0
23619,217,0
23620,Swe,5
23621,Bradley,1


In [38]:
df['tokens'].nunique()

23623

In [46]:
df[df['labels'] == 3]['labels'].count().item()

1443

In [44]:
# label to count graph
label_counts = [df[df['labels'] == i]['labels'].count().item() for i in range(9)]

In [53]:
data = {}
idx = 0
for key in tokenized_labels.keys():
    data[key] = label_counts[idx]
    idx+=1

In [54]:
import matplotlib.pyplot as plt
import seaborn as sns

data

{'O': 15529,
 'B-PER': 1627,
 'I-PER': 2213,
 'B-ORG': 1443,
 'I-ORG': 819,
 'B-LOC': 1050,
 'I-LOC': 189,
 'B-MISC': 565,
 'I-MISC': 188}

In [66]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

tokenized_data = tokenizer(training_data[0]['tokens'], is_split_into_words=True)

tokenized_data

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [71]:
# Toeknize and encode each sentence
def encode_tokens():
    for sentence in training_data:
        sentence['tokens'] = tokenizer(sentence['tokens'], is_split_into_words=True)
encode_tokens()

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [72]:
training_data

[{'tokens': {'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
  'labels': [3, 0, 7, 0, 0, 0, 7, 0, 0]},
 {'tokens': {'input_ids': [101, 1943, 14428, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]},
  'labels': [1, 2]},
 {'tokens': {'input_ids': [101, 26660, 13329, 12649, 15928, 1820, 118, 4775, 118, 1659, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
  'labels': [5, 0]},
 {'tokens': {'input_ids': [101, 1109, 1735, 2827, 1163, 1113, 9170, 1122, 19786, 1114, 1528, 5566, 1106, 11060, 1106, 188, 17315, 1418, 2495, 12913, 1235, 6479, 4959, 2480, 6340, 13991, 3653, 1169, 1129, 12086, 1106, 8892, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1,