<a href="https://colab.research.google.com/github/pelinbalci/LLM_Notebooks/blob/main/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dependency

In [None]:
import pandas as pd
import torch
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments, AutoTokenizer, DataCollatorForTokenClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datasets import Dataset, DatasetDict
import nltk
import os
from nltk.stem import WordNetLemmatizer
from sklearn.utils import shuffle
from hydra.core.global_hydra import GlobalHydra
import hydra


## Load config

In [28]:
# Clear any existing global Hydra instance so this cell can be re-run
# in the same kernel without hydra.initialize() being called on top of it.
if GlobalHydra.instance().is_initialized():
    GlobalHydra.instance().clear()

# Re-initialize Hydra against the config directory (path as given: "../config")
# and compose the two configs used by this notebook:
#   cfg        - NER-specific settings (model, paths, labels, training)
#   global_cfg - project-wide settings (SEED is read from it further down)
with hydra.initialize(config_path="../config", version_base=None):
    cfg = hydra.compose(config_name="ner-config.yaml", return_hydra_config=True)
    global_cfg = hydra.compose(config_name="config.yaml")

## Paths

In [None]:
# The project root is the parent of the current working directory.
root_path = os.path.abspath(os.pardir)

# Fixed sub-directories of the project root.
models_path = os.path.join(root_path, "models")
data_path = os.path.join(root_path, "data", "processed")

# Locations whose names come from the NER config.
output_model_path = os.path.join(models_path, cfg.model.ner_model_name)
training_output_dir = os.path.join(root_path, cfg.paths.training_output_dir)
logging_dir = os.path.join(root_path, cfg.paths.logging_dir_name)


## Constants

In [30]:
# NOTE(review): none of these three constants is referenced by the visible
# cells; TrainingArguments below reads cfg.training.evaluation_strategy,
# cfg.training.save_strategy and cfg.training.report_to instead. Confirm whether
# they are still needed or are leftovers from before the config was introduced.
EVALUATION_STRATEGY = "epoch"
SAVE_STRATEGY = "epoch"
REPORT_TO = "tensorboard"

## Load Data

In [31]:
# Token-level table read from the processed-data directory; prepare_dataset()
# below reads its Sentence_ID, Words, Labels and ner_tags columns.
structured_text_df = pd.read_csv(os.path.join(data_path, cfg.paths.data_file))

In [32]:
# Label name -> integer id, using the position of each label in the config list.
label2id = {name: position for position, name in enumerate(cfg.labels.label_ids)}
# Inverse mapping (integer id -> label name), derived from label2id.
id2label = dict(zip(label2id.values(), label2id.keys()))

In [33]:
# Download the WordNet lexical database for English words; it is fetched here
# before the WordNetLemmatizer is created and used below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vadim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [34]:
# Shared lemmatizer: used as a global inside prepare_dataset() and passed
# explicitly to predict() in the example at the end of the notebook.
lemmatizer = WordNetLemmatizer()

In [35]:
def prepare_dataset(df, split_ratio=0.2, seed=42):
    """Turn a token-level frame into shuffled train/validation sentence datasets.

    Args:
        df: DataFrame with one row per word. The columns ``Sentence_ID``
            (sentence key), ``Words`` (token text) and ``ner_tags`` (tag per
            token) are read.
        split_ratio: Fraction of the sentences placed in the ``"valid"`` split.
        seed: Random state used both for the initial shuffle and for the
            train/validation split.

    Returns:
        DatasetDict with the keys ``"train"`` and ``"valid"``. Every example
        has the fields ``id``, ``words`` (lemmatized with the module-level
        ``lemmatizer``) and ``ner_tags``.
    """
    # Group the frame once; filtering the whole frame again for every sentence
    # id would cost O(sentences * rows).
    rows_by_sentence = {
        key: rows for key, rows in df.groupby("Sentence_ID", sort=False)
    }
    # Stand-in for ids that groupby does not emit (e.g. missing ids), so such
    # an id still yields an example with empty lists.
    no_rows = df.iloc[0:0]

    data_list = []
    # Iterate the unique ids in set order so the order fed to the seeded
    # shuffle below (and therefore the resulting split) stays the same.
    for idx in set(df.Sentence_ID.values):
        sentence = rows_by_sentence.get(idx, no_rows)
        data_list.append({
            'id': idx,
            'words': [lemmatizer.lemmatize(word) for word in sentence.Words.values],
            'ner_tags': list(sentence.ner_tags.values)
        })

    # Shuffle the sentences reproducibly before building the dataset
    # (train_test_split below is seeded with the same value).
    data_list = shuffle(data_list, random_state=seed)

    # Convert the list of records into the column-oriented dict Dataset expects.
    train_dataset = Dataset.from_dict({k: [d[k] for d in data_list] for k in data_list[0]})

    train_valid_split = train_dataset.train_test_split(test_size=split_ratio, seed=seed)

    # Expose the held-out part under the name "valid" rather than "test".
    return DatasetDict({
        "train": train_valid_split["train"],
        "valid": train_valid_split["test"]
    })

## Tokenization

In [36]:
# Tokenizer matching the base checkpoint named in the config. use_fast=True is
# requested explicitly; tokenize_and_align_labels() below calls word_ids() on
# the encoding to map sub-word tokens back to words.
tokenizer = AutoTokenizer.from_pretrained(cfg.model.name, use_fast=True)

In [37]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: Sequence of integer labels, indexed by word position.
        word_ids: For every token, the index of the word it came from, or
            ``None`` for tokens that belong to no word.

    Returns:
        A list with one entry per element of ``word_ids``:
        ``-100`` where the word id is ``None``; the word's label on the first
        token of a word; and on the following tokens of the same word the
        word's label, incremented by one when it is odd.
    """
    # NOTE(review): the "+1 on odd labels" step presumably maps a B-XXX id to
    # its I-XXX id and so assumes I-XXX == B-XXX + 1 in the label list -
    # confirm against cfg.labels.label_ids.
    aligned = []
    previous_word = None

    for word_id in word_ids:
        starts_new_word = word_id != previous_word
        if starts_new_word:
            previous_word = word_id

        # Tokens without a word always get the ignore index.
        if word_id is None:
            aligned.append(-100)
            continue

        tag = labels[word_id]
        # Continuation token of the current word: bump odd label ids.
        if not starts_new_word and tag % 2 == 1:
            tag += 1
        aligned.append(tag)

    return aligned

In [38]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch mapping with ``"words"`` (a list of word lists) and
            ``"ner_tags"`` (a list of per-word label lists of matching length).

    Returns:
        The tokenizer output for the batch with an extra ``"labels"`` entry
        holding, for each sentence, the labels produced by
        ``align_labels_with_tokens``.
    """
    # No padding here: sequences are left at their own length and padded per
    # training batch by the data collator (labels included), instead of padding
    # every example to the longest sentence of the whole map batch.
    tokenized_inputs = tokenizer(
        examples["words"], truncation=True, is_split_into_words=True
    )

    # word_ids(i) maps every token of sentence i back to its source word.
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(labels, tokenized_inputs.word_ids(i))
        for i, labels in enumerate(examples["ner_tags"])
    ]

    return tokenized_inputs

In [39]:
# Build the sentence-level train/valid splits using the project-wide seed.
raw_data = prepare_dataset(structured_text_df, seed=global_cfg.SEED)
# Tokenize both splits in batches. The original columns are removed, so only
# the fields returned by tokenize_and_align_labels remain.
tokenized_datasets = raw_data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_data["train"].column_names
)

Map:   0%|          | 0/1212 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 1212/1212 [00:00<00:00, 10711.70 examples/s]
Map: 100%|██████████| 304/304 [00:00<00:00, 10482.05 examples/s]


In [40]:
# Collator handed to the Trainer below to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [41]:
# One output class per label listed in the config.
label_ids_count = len(cfg.labels.label_ids)
# Load the base checkpoint with a token-classification head sized for our
# labels; the id<->label maps are passed to the model and later read back in
# predict() via model.config.id2label.
model = AutoModelForTokenClassification.from_pretrained(
    cfg.model.name, num_labels=label_ids_count, id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
def compute_metrics(p):
    """Compute token-level accuracy, precision, recall and F1.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores on its last axis; ``labels`` holds the reference label ids,
            with ``-100`` marking positions to ignore.
            Both are assumed to be NumPy arrays - TODO confirm.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 are weighted averages over the classes, with
        undefined values reported as 0.
    """
    scores, references = p
    predicted_ids = scores.argmax(axis=-1)

    # Keep only the positions that carry a real label.
    keep = references != -100
    y_true = references[keep]
    y_pred = predicted_ids[keep]

    results = {'accuracy': accuracy_score(y_true, y_pred)}
    averaged_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for metric_name, scorer in averaged_scorers:
        results[metric_name] = scorer(y_true, y_pred, average='weighted', zero_division=0)

    return results

In [43]:
# Training hyper-parameters and schedules all come from the NER config; only
# the two directories are resolved relative to the project root.
training_args = TrainingArguments(
    output_dir=training_output_dir,
    evaluation_strategy=cfg.training.evaluation_strategy,
    save_strategy=cfg.training.save_strategy,
    learning_rate=cfg.training.learning_rate,
    num_train_epochs=cfg.training.epochs,
    weight_decay=cfg.training.weight_decay,
    logging_dir=logging_dir,
    logging_steps=cfg.training.logging_steps,
    report_to=cfg.training.report_to
)



In [44]:
# Wire model, arguments, tokenized splits, collator and metrics together.
# NOTE(review): the Predict section loads the tokenizer from
# output_model_path, so this relies on the Trainer saving the tokenizer passed
# here alongside the model - confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

# Fine-tune; the captured output below reports loss and metrics per epoch.
trainer.train()

# Persist the fine-tuned model to the configured output directory.
trainer.save_model(output_model_path)


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2417,0.166201,0.995788,0.995817,0.995788,0.995732
2,0.1126,0.080554,0.99663,0.996668,0.99663,0.996595
3,0.1328,0.06801,0.997473,0.997507,0.997473,0.997461


## Predict

In [45]:
# Reload the fine-tuned model and its tokenizer from the directory written by
# trainer.save_model(); from here on `model` and `tokenizer` refer to these
# reloaded objects, not the ones used during training.
model = AutoModelForTokenClassification.from_pretrained(output_model_path)
tokenizer = AutoTokenizer.from_pretrained(output_model_path)

In [46]:
def predict(text, model, tokenizer, lemmatizer=None):
    """Return the non-"O" label names predicted for the tokens of ``text``.

    Args:
        text: Input sentence as a plain string.
        model: Token-classification model; called with the tokenizer output as
            keyword arguments, and expected to expose ``config.id2label``.
        tokenizer: Callable that encodes ``text`` into PyTorch tensors.
        lemmatizer: Optional object with a ``lemmatize(word)`` method. When
            given, ``text`` is split on whitespace, each word is lemmatized and
            the words are re-joined with single spaces before tokenization.

    Returns:
        List of label names (values of ``model.config.id2label``), one per
        token position whose predicted label is not ``"O"``, in token order.
        Every position of the encoded sequence is considered; positions of any
        special tokens the tokenizer adds are not filtered out.
    """
    # If lemmatizer is provided, lemmatize the text word by word
    if lemmatizer:
        text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Put the inputs on the same device as the model, so the call also works
    # when the model has been moved to an accelerator.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Disable gradient computation for inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Pick, for every token position, the class with the highest logit
    predictions = torch.argmax(outputs.logits, dim=-1)

    # Only one text was encoded, so take the first (and only) row of the batch
    predicted_labels = predictions[0].cpu().numpy()

    # Mapping from class id to label name stored in the model config
    label_ids = model.config.id2label

    # Keep the label names of all positions not predicted as 'O' (non-entity)
    predicted_animals = [label_ids[label] for label in predicted_labels if label_ids[label] != "O"]

    return predicted_animals

In [None]:
# Example text; the expected result is ["horse"] (see the captured output below).
# The reloaded model/tokenizer and the shared WordNet lemmatizer are used.
text = "Horse is there"
predictions = predict(text, model, tokenizer, lemmatizer)

print(predictions)

['horse']
