<a href="https://colab.research.google.com/github/pelinbalci/LLM_Notebooks/blob/main/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dependency

In [19]:
import sys
sys.path.append('..')
import pandas as pd
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments, AutoTokenizer, DataCollatorForTokenClassification
import nltk
import os
from nltk.stem import WordNetLemmatizer
from hydra.core.global_hydra import GlobalHydra
import hydra
from src.scripts.metrics import compute_metrics
from src.scripts.labels_manipulation import tokenize_and_align_labels
from src.scripts.preparation_ner_dataset import prepare_dataset
from src.scripts.predict_ner import predict

## Load conifg

In [2]:
# Clear existing Hydra instance
if GlobalHydra.instance().is_initialized():
    GlobalHydra.instance().clear()

# Re-initialize
with hydra.initialize(config_path="../config", version_base=None):
    cfg = hydra.compose(config_name="ner-config.yaml", return_hydra_config=True)
    global_cfg = hydra.compose(config_name="config.yaml")

## Paths

In [3]:
root_path = os.path.abspath(os.path.join('..')) 
data_path = os.path.join(root_path, "data", "processed")
models_path = os.path.join(root_path, "models")
output_model_path = os.path.join(root_path, "models", cfg.model.ner_model_name)
training_output_dir = os.path.join(root_path, cfg.paths.training_output_dir)
logging_dir = os.path.join(root_path, cfg.paths.logging_dir_name)


## Constants

In [4]:
EVALUATION_STRATEGY = "epoch"
SAVE_STRATEGY = "epoch"
REPORT_TO = "tensorboard"

# Load Data

In [5]:
structured_text_df = pd.read_csv(os.path.join(data_path, cfg.paths.data_file))

In [6]:
label2id = {label: id for id, label in enumerate(cfg.labels.label_ids)}
id2label = {id: label for label, id in label2id.items()}

In [7]:
# Download WordNet lexical database for English words
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vadim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
lemmatizer = WordNetLemmatizer()

## Tokenization

In [9]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model.name, use_fast=True)

In [12]:
raw_data = prepare_dataset(structured_text_df, lemmatizer, seed=global_cfg.SEED)
tokenized_datasets = raw_data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_data["train"].column_names,
    fn_kwargs={"tokenizer": tokenizer}
)

Map:   0%|          | 0/1212 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 1212/1212 [00:00<00:00, 11508.36 examples/s]
Map: 100%|██████████| 304/304 [00:00<00:00, 9842.06 examples/s]


In [13]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [14]:
label_ids_count = len(cfg.labels.label_ids)
model = AutoModelForTokenClassification.from_pretrained(
    cfg.model.name, num_labels=label_ids_count, id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
training_args = training_args = TrainingArguments(
    output_dir=training_output_dir,
    eval_strategy=cfg.training.evaluation_strategy,
    save_strategy=cfg.training.save_strategy,
    learning_rate=cfg.training.learning_rate,
    num_train_epochs=cfg.training.epochs,
    weight_decay=cfg.training.weight_decay,
    logging_dir=logging_dir,
    logging_steps=cfg.training.logging_steps,
    report_to=cfg.training.report_to
)

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer
)

trainer.train()

trainer.save_model(output_model_path)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.286,0.206489,0.995788,0.995798,0.995788,0.995721
2,0.124,0.087352,0.995788,0.995823,0.995788,0.995753
3,0.1375,0.074345,0.997051,0.9971,0.997051,0.997043


## Predict

In [17]:
model = AutoModelForTokenClassification.from_pretrained(output_model_path)
tokenizer = AutoTokenizer.from_pretrained(output_model_path, model_max_length=512)

In [20]:
# example text, must return "horse"
text = "Horse is there"
predictions = predict(text, model, tokenizer, lemmatizer)

print(predictions)

['horse']
