In [1]:
import nltk
from nltk.corpus import brown
import os

In [2]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
nltk.download("brown")
nltk.download("universal_tagset")

[nltk_data] Downloading package brown to /home/ubuntu/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [4]:
corpus = brown.tagged_sents(tagset="universal")

In [5]:
len(corpus[0])

25

In [6]:
corpus[0]

[('The', 'DET'),
 ('Fulton', 'NOUN'),
 ('County', 'NOUN'),
 ('Grand', 'ADJ'),
 ('Jury', 'NOUN'),
 ('said', 'VERB'),
 ('Friday', 'NOUN'),
 ('an', 'DET'),
 ('investigation', 'NOUN'),
 ('of', 'ADP'),
 ("Atlanta's", 'NOUN'),
 ('recent', 'ADJ'),
 ('primary', 'NOUN'),
 ('election', 'NOUN'),
 ('produced', 'VERB'),
 ('``', '.'),
 ('no', 'DET'),
 ('evidence', 'NOUN'),
 ("''", '.'),
 ('that', 'ADP'),
 ('any', 'DET'),
 ('irregularities', 'NOUN'),
 ('took', 'VERB'),
 ('place', 'NOUN'),
 ('.', '.')]

#### Make corpus in form of huggingface dataset

In [7]:
inputs = []
targets = []

for sentence_tag_pairs in corpus:
    tokens = []
    target = []
    for token,tag in sentence_tag_pairs:
        tokens.append(token)
        target.append(tag)
    inputs.append(tokens)
    targets.append(target)


In [8]:
len(inputs)

57340

In [9]:
len(targets)

57340

In [10]:
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


##### Saving data in json form now 

In [11]:
import json
with open("pos_data.json","w") as f:
    for x,y in zip(inputs,targets):
        j = {"inputs": x,"targets": y}
        s = json.dumps(j)
        f.write(f"{s}\n")

In [12]:
from datasets import load_dataset

In [13]:
data = load_dataset("json",data_files="pos_data.json")

Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 9642.08it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 1226.76it/s]
Generating train split: 57340 examples [00:00, 535438.69 examples/s]


In [14]:
data

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 57340
    })
})

In [15]:
#small = data["train"].shuffle(seed=42).select(range(20_000))

In [16]:
#small

In [17]:
from sklearn.model_selection import train_test_split


In [18]:
data  = data['train'].train_test_split(seed=42)

In [19]:
data

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 43005
    })
    test: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 14335
    })
})

In [20]:
data['train'].features

{'inputs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'targets': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [21]:
##### Map Target to Int
target_set = set()
for target in targets:
    target_set=target_set.union(target)
target_set

{'.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X'}

In [22]:
target_list =list(target_set)
id2label = {k:v for k,v in enumerate(target_list)}
label2id = {v:k for k,v in id2label.items()}

In [23]:

id2label

{0: 'NOUN',
 1: 'ADV',
 2: '.',
 3: 'PRT',
 4: 'DET',
 5: 'PRON',
 6: 'ADJ',
 7: 'X',
 8: 'CONJ',
 9: 'VERB',
 10: 'NUM',
 11: 'ADP'}

In [24]:
label2id

{'NOUN': 0,
 'ADV': 1,
 '.': 2,
 'PRT': 3,
 'DET': 4,
 'PRON': 5,
 'ADJ': 6,
 'X': 7,
 'CONJ': 8,
 'VERB': 9,
 'NUM': 10,
 'ADP': 11}

In [25]:
from transformers import AutoTokenizer

In [26]:
checkpoint = "bert-base-cased"

In [27]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [28]:
idx = 0
t = tokenizer(data['train'][idx]["inputs"],is_split_into_words=True)

In [29]:
t.tokens()

['[CLS]', 'Locke', '##d', '.', '[SEP]']

In [30]:
t.word_ids()

[None, 0, 0, 1, None]

#### We need to aligned the targets in this problem statement

In [31]:
def align_targets(labels,word_ids):
    aligned_labels = []
    for word in word_ids:
        if word is None:
            label = -100
        else:
            label = label2id[labels[word]]
        aligned_labels.append(label)
    return aligned_labels

In [32]:
labels = data["train"][idx]["targets"]
word_ids = t.word_ids()
aligned_targets = align_targets(labels,word_ids)

In [33]:
aligned_targets

[-100, 9, 9, 2, -100]

In [34]:
# Printing aligned label with corresponding tokenize word
aligned_labels = [id2label[i] if i>=0 else None for i in aligned_targets]
for x,y in zip(t.tokens(),aligned_labels):
    print(f"{x}\t {y}")

[CLS]	 None
Locke	 VERB
##d	 VERB
.	 .
[SEP]	 None


#### Tokenize both inputs and targets

In [35]:
def tokenize_func(batch):
    tokenized_inputs = tokenizer(
        batch['inputs'],truncation=True,is_split_into_words=True
    )

    labels_batch = batch['targets'] # original targets
    aligned_labels_batch =[]
    for i , labels in enumerate(labels_batch):
        word_ids = tokenized_inputs.word_ids(i)
        aligned_labels_batch.append(align_targets(labels,word_ids))
    tokenized_inputs['labels'] = aligned_labels_batch

    return tokenized_inputs

In [36]:
data['train'].column_names

['inputs', 'targets']

In [37]:
tokenized_datasets = data.map(
    tokenize_func,batched=True,remove_columns=data['train'].column_names
)

Map: 100%|██████████| 43005/43005 [00:04<00:00, 10528.45 examples/s]
Map: 100%|██████████| 14335/14335 [00:01<00:00, 10621.25 examples/s]


In [38]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 43005
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14335
    })
})

In [39]:
from transformers import DataCollatorForTokenClassification

In [40]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [41]:
def flatten(list_of_list):
    flattened = [val for sublist in list_of_list for val in sublist]
    return flattened

In [42]:
import numpy as np
from sklearn.metrics import f1_score,accuracy_score

In [43]:
def compute_metrics(logits_and_lables):
    logits,labels = logits_and_lables
    preds = np.argmax(logits,axis=-1)

    # remove -100 from the labels and predictions  
    labels_jagged = [[t for t in label if t!=-100 ]for label in labels]
    preds_jagged = [[p for p,t in zip(ps,ts) if t!=-100] for ps,ts in zip(preds,labels)]

    # flatten the labels and predictions
    labels_flat = flatten(labels_jagged)
    preds_flat = flatten(preds_jagged)

    acc = accuracy_score(labels_flat,preds_flat)
    f1 = f1_score(labels_flat,preds_flat,average="macro")

    return {
        "f1":f1,
        "accuracy" : acc
    }




In [44]:
import torch

In [45]:
from transformers import TrainingArguments,Trainer,AutoModelForTokenClassification

In [46]:
model = AutoModelForTokenClassification.from_pretrained(checkpoint,id2label=id2label,label2id=label2id)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
device = torch.device("cuda")
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [48]:
training_args = TrainingArguments(
    output_dir="bert-finetuned-pos",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    seed=4534
)

In [49]:
tokenized_datasets['train'].features['labels']

Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)

In [50]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [53]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0325,0.048763,0.966751,0.98765
2,0.0181,0.049261,0.965658,0.98871
3,0.0087,0.058965,0.968099,0.988901
4,0.0033,0.069712,0.971625,0.989332
5,0.0021,0.075378,0.972065,0.98958


TrainOutput(global_step=26880, training_loss=0.014048063783862051, metrics={'train_runtime': 2319.7145, 'train_samples_per_second': 92.695, 'train_steps_per_second': 11.588, 'total_flos': 5630720765568984.0, 'train_loss': 0.014048063783862051, 'epoch': 5.0})

In [54]:
trainer.save_model("bert_fined_tuned_pos_model")

In [55]:
from transformers import pipeline

pipe = pipeline(
    "token-classification",
    model="/home/ubuntu/uzair/NLP/pos_tagging/bert_fined_tuned_pos_model",
    device=0
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [58]:
test_text1 = "The quick brown fox jumps over the lazy dog."
test_text2 = "Just had the best vacation ever!"

In [59]:
pipe(test_text2)

[{'entity': 'ADV',
  'score': 0.99996483,
  'index': 1,
  'word': 'Just',
  'start': 0,
  'end': 4},
 {'entity': 'VERB',
  'score': 0.9999906,
  'index': 2,
  'word': 'had',
  'start': 5,
  'end': 8},
 {'entity': 'DET',
  'score': 0.9999924,
  'index': 3,
  'word': 'the',
  'start': 9,
  'end': 12},
 {'entity': 'ADJ',
  'score': 0.9999534,
  'index': 4,
  'word': 'best',
  'start': 13,
  'end': 17},
 {'entity': 'NOUN',
  'score': 0.9999869,
  'index': 5,
  'word': 'vacation',
  'start': 18,
  'end': 26},
 {'entity': 'ADV',
  'score': 0.99996257,
  'index': 6,
  'word': 'ever',
  'start': 27,
  'end': 31},
 {'entity': '.',
  'score': 0.99999607,
  'index': 7,
  'word': '!',
  'start': 31,
  'end': 32}]