# Installations, imports and global variables

In [None]:
! pip install datasets transformers seqeval

In [None]:
pip install evaluate


In [None]:
! pip install adapter_transformers

In [None]:
from re import template
from pathlib import Path
import random
import pandas as pd
import numpy as np
import torch
import transformers
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import pipeline
from datasets import load_dataset
from datasets import DatasetDict, Dataset
from sklearn.metrics import f1_score, accuracy_score


In [None]:
import os

# Single source of truth for every RNG seeded below.
SEED = 42

# CUDA_LAUNCH_BLOCKING only takes effect as an environment variable, and it has to be
# exported before the first CUDA context is created, so set it ahead of any torch.cuda call.
# The module-level name is kept for backward compatibility with code that may read it.
# NOTE(review): synchronous kernel launches make GPU work slower; drop both lines if the
# setting is not needed for debugging.
CUDA_LAUNCH_BLOCKING = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = CUDA_LAUNCH_BLOCKING

# Seed Python, NumPy and PyTorch (CPU and all GPUs) so reruns are repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and turn off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
OUT_PATH = Path("results")

# Checkpoint names used for the tokenizer and for the model weights.
TOKENIZER = "bert-base-uncased"
MODEL = "bert-base-uncased"

## Fine tuning

#### We fine-tune the bert-base-uncased model on the CoNLL-2003 dataset for the NER task

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align NER tags with the produced tokens.

    Reads the module-level ``tokenizer`` and ``label_all_tokens``.

    Args:
        examples: batch mapping with a ``"tokens"`` entry (handed to the tokenizer with
            ``is_split_into_words=True``) and a parallel ``"ner_tags"`` entry that is
            indexed by word position.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list per sentence with
        ``-100`` for tokens that map to no word, the word's tag for the first token of
        each word, and for the remaining tokens of that word either the word's tag
        (``label_all_tokens`` truthy) or ``-100``.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        sentence_labels = []
        last_word = None
        # word_ids gives, for every token, the index of the word it came from (or None).
        for current_word in encoded.word_ids(batch_index=row):
            if current_word is None:
                tag = -100  # token without a source word (e.g. special tokens)
            elif current_word == last_word and not label_all_tokens:
                tag = -100  # continuation piece of a word, not labelled in this mode
            else:
                tag = word_tags[current_word]
            sentence_labels.append(tag)
            last_word = current_word
        aligned_labels.append(sentence_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [None]:
from evaluate import load

# Metric object used by compute_metrics below, which calls metric.compute(...).
metric = load("seqeval")


def compute_metrics(p):
    """Score token-classification predictions with the module-level ``metric``.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` is arg-maxed over axis 2 to
            obtain one label id per position; ``labels`` holds the reference ids, with
            ``-100`` at positions that must be ignored.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``accuracy``, taken from the
        ``overall_*`` entries returned by ``metric.compute``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    predicted_tags = []
    gold_tags = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label (-100 marks ignored positions).
        kept = [(guess, truth) for guess, truth in zip(predicted_row, gold_row) if truth != -100]
        predicted_tags.append([label_list[guess] for guess, _ in kept])
        gold_tags.append([label_list[truth] for _, truth in kept])

    results = metric.compute(predictions=predicted_tags, references=gold_tags)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Load the CoNLL-2003 dataset and the tokenizer named by TOKENIZER.
conll = load_dataset("conll2003")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
# tokenize_and_align_labels calls .word_ids(...) on the tokenizer output; this check
# presumably guards that requirement by insisting on a fast tokenizer — TODO confirm.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

# Read by tokenize_and_align_labels: when True, every token of a word receives the word's
# tag instead of -100 on the continuation tokens.
label_all_tokens = True

# Tag names of the training split's "ner_tags" feature; compute_metrics indexes this list
# with tag ids to get tag names.
label_list = conll["train"].features[f"ner_tags"].feature.names

# Apply tokenization + label alignment to the dataset in batches.
tokenized_conll = conll.map(tokenize_and_align_labels, batched=True)
# Collator built from the same tokenizer; passed to the Trainer as data_collator.
data_collator = DataCollatorForTokenClassification(tokenizer)



Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

In [None]:
# Tag id <-> tag name mappings stored in the model config, so the saved / pushed model
# reports real entity names instead of generic LABEL_<n> placeholders.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    MODEL,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir=str(OUT_PATH),  # same "results" directory as before, taken from the config cell
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    do_train=True
)

# NOTE(review): the per-epoch evaluation runs on the "test" split. If these numbers are ever
# used to choose a checkpoint or hyper-parameters, evaluate on a held-out validation split
# instead (the dataset is expected to provide "validation" — confirm) and keep "test" for a
# single final evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_conll["train"],
    eval_dataset=tokenized_conll["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
# Run fine-tuning with the Trainer configured above; the call's return value is the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2283,0.109123,0.896824,0.891289,0.894048,0.974846
2,0.0458,0.124425,0.887151,0.902207,0.894616,0.974915
3,0.0255,0.128571,0.895956,0.904462,0.900189,0.976486
4,0.0143,0.139466,0.895451,0.908735,0.902044,0.976642
5,0.0102,0.14424,0.898756,0.90921,0.903953,0.977004


TrainOutput(global_step=4390, training_loss=0.05180166674637849, metrics={'train_runtime': 863.204, 'train_samples_per_second': 81.331, 'train_steps_per_second': 5.086, 'total_flos': 1702317283240608.0, 'train_loss': 0.05180166674637849, 'epoch': 5.0})

In [None]:
# Save the tokenizer next to the weights so the directory can be reloaded on its own
# (from_pretrained on this path for both the model and the tokenizer).
FINE_TUNED_DIR = './Fine_tune_BERT/'
tokenizer.save_pretrained(FINE_TUNED_DIR)
model.save_pretrained(FINE_TUNED_DIR)

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; the push_to_hub calls later in the notebook
# presumably rely on it — run this cell before pushing.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Token has not been saved to git credential helper.


In [None]:
# Upload the fine-tuned model and its tokenizer to the same Hub repository.
# NOTE(review): the repo id is hard-coded to one user's namespace — change it before
# running under another account.
model.push_to_hub("tranthai123/ner_bert")
tokenizer.push_to_hub("tranthai123/ner_bert")

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tranthai123/ner_bert/commit/22ebfe2b9c8f543eee33d59377dcdea7074fb5a4', commit_message='Upload tokenizer', commit_description='', oid='22ebfe2b9c8f543eee33d59377dcdea7074fb5a4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tranthai123/ner_bert', endpoint='https://huggingface.co', repo_type='model', repo_id='tranthai123/ner_bert'), pr_revision=None, pr_num=None)

In [None]:
from re import template

def _is_body_line(line):
    """Return True for lines to keep: no '*' (separator rows) and no 'CHAPTER' heading."""
    return '*' not in line and 'CHAPTER' not in line


def _group_paragraphs(lines):
    """Join each run of non-blank lines into one space-separated paragraph string."""
    paragraphs = []
    current = []
    for line in lines:
        # Strip only the newline; slicing off the last character would eat a real
        # character on a final line that has no trailing newline.
        text = line.rstrip('\n')
        if text:
            current.append(text)
        elif current:
            paragraphs.append(' '.join(current))
            current = []
    if current:
        # Flush the last paragraph when the input does not end on a blank line.
        paragraphs.append(' '.join(current))
    return paragraphs


# Explicit encoding so decoding does not depend on the platform default.
# NOTE(review): assumes the file is UTF-8 — confirm.
with open('Alice_book.txt', encoding='utf-8') as f:
    alice_book = f.readlines()

# Drop the first 40 and the last 2 lines (presumably front/back matter — confirm against the file).
alice_book = alice_book[40:-2]

# Build a filtered list instead of calling .remove() while iterating: removing during
# iteration skips the element that follows every removed line.
alice_book = [line for line in alice_book if _is_body_line(line)]

paragraphs_list = _group_paragraphs(alice_book)

In [None]:
# NER tag names; the position in the list is the tag id.
#   O             - outside of any named entity
#   B-<T> / I-<T> - beginning / inside of an entity of type <T>, where <T> is
#                   PER (person), ORG (organisation), LOC (location) or MISC (miscellaneous)
# NOTE(review): the sample output in this notebook tags a stand-alone name as B-PER, so
# B- appears to mark the first token of every entity rather than only an entity that
# directly follows another one — confirm against the dataset card.
label_list = [
    "O",
    "B-PER", "I-PER",
    "B-ORG", "I-ORG",
    "B-LOC", "I-LOC",
    "B-MISC", "I-MISC",
]
# Inverse lookup (tag name -> tag id), derived from label_list so the two cannot drift apart.
ner_tags_map = {tag: tag_id for tag_id, tag in enumerate(label_list)}

In [None]:
# Display the selected torch device.
device

device(type='cuda')

In [None]:
# `tokens` is otherwise only assigned inside the inference loop further down, so on a fresh
# kernel this cell raised NameError. Tokenize the first paragraph here so the cell runs on
# its own (Restart & Run All).
tokens = tokenizer(paragraphs_list[0], truncation=True)
input_ids = torch.tensor(tokens['input_ids']).unsqueeze(0).to(device)
attention_mask = torch.tensor(tokens['attention_mask']).unsqueeze(0).to(device)

In [None]:
# Move the model to the selected device before running inference on tensors placed there.
model.to(device)

In [None]:


# Per-paragraph inference. Tokens and tags are collected in plain lists and turned into a
# single DataFrame at the end; concatenating inside the loop re-copies every earlier row
# on each iteration.
all_tokens = []
all_tags = []

model.eval()  # inference mode for layers such as dropout
with torch.no_grad():  # prediction only: no autograd bookkeeping needed
    for paragraph in paragraphs_list:
        # truncation=True keeps an over-long paragraph within the model's maximum input
        # length instead of failing inside the forward pass.
        tokens = tokenizer(paragraph, truncation=True)
        input_ids = torch.tensor(tokens['input_ids']).unsqueeze(0).to(device)
        attention_mask = torch.tensor(tokens['attention_mask']).unsqueeze(0).to(device)

        preds = model(input_ids=input_ids, attention_mask=attention_mask)
        # Logits are expected as (1, seq_len, num_labels): drop only the batch axis, then
        # take the best-scoring label id per token as plain Python ints.
        tag_ids = preds.logits.squeeze(0).argmax(dim=-1).tolist()

        # One decoded string per input id, as before.
        all_tokens.extend(tokenizer.batch_decode(tokens['input_ids']))
        all_tags.extend(label_list[tag_id] for tag_id in tag_ids)

results = pd.DataFrame(
    {'tokens': all_tokens, 'ner_tags': all_tags},
    columns=['tokens', 'ner_tags'],
)

results

Unnamed: 0,tokens,ner_tags
0,[CLS],O
1,alice,B-PER
2,was,O
3,beginning,O
4,to,O
...,...,...
37990,s,O
37991,heavy,O
37992,sobs,O
37993,.,O


In [None]:
# Write the per-token predictions to CSV (with to_csv's defaults the row index is written
# as the first column).
results.to_csv('Alice_results.csv')