To do:


1.   Test more hyperparameters to push the metrics (precision, recall, F1) to 0.9 or above. The Hugging Face `Trainer` has native support for hyperparameter search using Optuna, Ray Tune, or Weights & Biases.
2.   Data augmentation: use another LLM to NER-label additional text, then add those labelled examples to the training data.



In [5]:
#!pip install --upgrade transformers
!pip install -q transformers datasets seqeval optuna torch tqdm evaluate


In [6]:


# Clone the repo
!git clone https://github.com/NER-AncientLanguages/NERAncientGreekML4AL.git
%cd NERAncientGreekML4AL

# Verify data exists
!ls final_dataset/normal/*.conll

Cloning into 'NERAncientGreekML4AL'...
remote: Enumerating objects: 251, done.[K
remote: Total 251 (delta 0), reused 0 (delta 0), pack-reused 251 (from 3)[K
Receiving objects: 100% (251/251), 106.47 MiB | 2.97 MiB/s, done.
Resolving deltas: 100% (98/98), done.
Updating files: 100% (199/199), done.
Downloading Data/homogenisation/full_dataset_FINAL.csv (113 MB)
Error downloading object: Data/homogenisation/full_dataset_FINAL.csv (82d984c): Smudge error: Error downloading Data/homogenisation/full_dataset_FINAL.csv (82d984c506fbdcea63db80edc6d34c42f5128b2de3a34df706c6ecae87f02254): batch response: This repository exceeded its LFS budget. The account responsible for the budget should increase it to restore access.

Errors logged to /content/NERAncientGreekML4AL/NERAncientGreekML4AL/.git/lfs/logs/20251117T203313.347080711.log
Use `git lfs logs last` to view the log.
error: external filter 'git-lfs filter-process' failed
fatal: Data/homogenisation/full_dataset_FINAL.csv: smudge filter lfs 

In [7]:
import os, warnings, unicodedata, numpy as np
from pathlib import Path
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification,
    TrainingArguments, Trainer, DataCollatorForTokenClassification
)

def read_conll(p: Path):
    """
    Read a whitespace-separated CoNLL file into parallel token / tag lists.

    Accepted row layouts:
        token  NER                      (exactly two columns)
        line_id  token  [...]  NER      (three or more columns)
    Example row:
        110089790	βίβλος	O

    Blank lines and lines starting with "#" close the current sentence.
    Rows with fewer than two columns are reported on stdout and skipped.
    Tokens are NFC-normalised.

    Returns: {"tokens": [[...], ...], "ner_tags": [[...], ...]}
    """
    sentences, tag_sequences = [], []
    current_tokens, current_tags = [], []

    with p.open(encoding="utf-8") as handle:
        for line_no, raw_line in enumerate(handle, start=1):
            stripped = raw_line.strip()

            # Sentence boundary: empty line or comment line.
            if stripped == "" or stripped.startswith("#"):
                if current_tokens:
                    sentences.append(current_tokens)
                    tag_sequences.append(current_tags)
                    current_tokens, current_tags = [], []
                continue

            # Any run of tabs/spaces separates columns.
            fields = stripped.split()
            if len(fields) < 2:
                print(f"Warning: Line {line_no} in {p.name} has <2 columns → SKIPPED")
                print(f"    → {stripped!r}")
                continue

            # Two columns: token comes first. More: column 0 is an ID, token is column 1.
            token_column = 0 if len(fields) == 2 else 1
            current_tokens.append(unicodedata.normalize("NFC", fields[token_column]))
            current_tags.append(fields[-1])  # the tag is always the last column

    # The file may end without a trailing blank line.
    if current_tokens:
        sentences.append(current_tokens)
        tag_sequences.append(current_tags)

    print(f"Loaded {len(sentences)} sentences from {p.name}")
    return {"tokens": sentences, "ner_tags": tag_sequences}

# Locations of the three CoNLL splits (relative to the current working directory).
train_path = Path("final_dataset", "normal", "train.conll")
val_path   = Path("final_dataset", "normal", "val.conll")
test_path  = Path("final_dataset", "normal", "test.conll")

# Parse each split, then wrap the parsed columns in a DatasetDict.
raw = dict(
    train=read_conll(train_path),
    validation=read_conll(val_path),
    test=read_conll(test_path),
)
data = DatasetDict(
    {split: Dataset.from_dict(columns) for split, columns in raw.items()}
)

# Checkpoint used for both the tokenizer and the model.
model_name = "Marijke/AG_BERT_hypopt_NER"
tokenizer  = AutoTokenizer.from_pretrained(model_name)

# Label vocabulary: every distinct tag seen in the training split, sorted so the
# tag -> id assignment is deterministic.
all_labels = sorted(
    set(tag for sentence_tags in data["train"]["ner_tags"] for tag in sentence_tags)
)
label2id   = dict(zip(all_labels, range(len(all_labels))))
id2label   = dict(enumerate(all_labels))
#tokenise + align labels
def tokenise_align(example):
    """
    Tokenise a batch of pre-split sentences and align word-level tags to sub-words.

    `example` is a batch with "tokens" (list of word lists) and "ner_tags"
    (list of tag lists). For each sentence, the first sub-word of every word
    receives that word's label id from `label2id`; special tokens and the
    remaining sub-words receive -100. The result is the tokenizer output with
    an extra "labels" entry.
    """
    encoded = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)

    def _label_ids(batch_index, word_tags):
        # Walk the sub-word -> word mapping, remembering the previous word id
        # so only the first piece of each word is labelled.
        label_ids, last_word = [], None
        for word_id in encoded.word_ids(batch_index=batch_index):
            if word_id is None or word_id == last_word:
                label_ids.append(-100)  # special token or continuation piece
            else:
                label_ids.append(label2id[word_tags[word_id]])
            last_word = word_id
        return label_ids

    encoded["labels"] = [
        _label_ids(idx, tags) for idx, tags in enumerate(example["ner_tags"])
    ]
    return encoded

# Run tokenise_align over every split in batches. The original columns
# ("tokens", "ner_tags") are removed, so each split keeps only what
# tokenise_align returns (tokenizer outputs plus "labels").
tokenised = data.map(tokenise_align, batched=True,
                     remove_columns=data["train"].column_names)


# Load the checkpoint as a token-classification model, with the label count and
# both label mappings taken from the training-split vocabulary built above.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
)

# Batch collator handed to the Trainer in the training cells below.
collator = DataCollatorForTokenClassification(tokenizer)


Loaded 30686 sentences from train.conll
Loaded 4434 sentences from val.conll
Loaded 4701 sentences from test.conll


Map:   0%|          | 0/30686 [00:00<?, ? examples/s]

Map:   0%|          | 0/4434 [00:00<?, ? examples/s]

Map:   0%|          | 0/4701 [00:00<?, ? examples/s]

In [8]:
import evaluate
from seqeval.metrics import classification_report

_SEQEVAL_METRIC = None  # lazily loaded seqeval metric, reused across evaluations


def _get_seqeval_metric():
    """Load the seqeval metric on first use and return the cached instance afterwards."""
    global _SEQEVAL_METRIC
    if _SEQEVAL_METRIC is None:
        _SEQEVAL_METRIC = evaluate.load("seqeval")
    return _SEQEVAL_METRIC


def compute_metrics(p):
    """
    Compute entity-level precision / recall / F1 for the Trainer.

    Parameters
    ----------
    p : tuple (predictions, labels)
        `predictions` are per-token class scores (argmax is taken over axis 2);
        `labels` are the gold label ids, with -100 marking positions to ignore.

    Returns
    -------
    dict with keys "precision", "recall", "f1" (seqeval's overall scores).
    All three are 0.0 when no sequence contains a scored position.
    """
    preds, labels = p
    preds = np.argmax(preds, axis=2)

    true_labels = []
    pred_labels = []

    for prediction, label in zip(preds, labels):
        # Keep only positions that carry a real label (first sub-word of a word).
        true_seq = [id2label[l] for l in label if l != -100]
        pred_seq = [id2label[pred] for pred, l in zip(prediction, label) if l != -100]
        if true_seq:  # skip sequences with nothing to score
            true_labels.append(true_seq)
            pred_labels.append(pred_seq)

    if not true_labels:
        return {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    # The metric is loaded once and cached instead of being re-loaded on every
    # evaluation pass.
    metric = _get_seqeval_metric()
    results = metric.compute(predictions=pred_labels, references=true_labels)
    print(results)  # full per-entity-type breakdown

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
    }



In [None]:
# ------------------------------------------------------------
# Hyper-parameters
# ------------------------------------------------------------
LEARNING_RATE = 3e-5
BATCH_SIZE    = 16
EPOCHS        = 5
WEIGHT_DECAY  = 0.01
WARMUP_RATIO  = 0.06
SEED          = 123
OUTPUT_DIR    = "./tuned_ner_model"

# Evaluate and checkpoint once per epoch; at the end, reload the checkpoint with
# the highest validation F1 (as reported by compute_metrics).
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    seed=SEED,
    logging_steps=10,
    save_total_limit=2,
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenised["train"],
    eval_dataset=tokenised["validation"],
    # `tokenizer=` is deprecated in recent transformers releases; the tokenizer
    # is now passed as `processing_class`.
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

#Train the model
print("\nSTARTING TRAINING ...\n")
trainer.train()

#Save the model
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"\nModel saved to {OUTPUT_DIR}")



  trainer = Trainer(



STARTING TRAINING ...



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0399,0.096823,0.82214,0.832586,0.82733


Downloading builder script: 0.00B [00:00, ?B/s]

{'GRP': {'precision': np.float64(0.8094170403587444), 'recall': np.float64(0.838884585592564), 'f1': np.float64(0.8238874096614682), 'number': np.int64(1291)}, 'LOC': {'precision': np.float64(0.7127583749109052), 'recall': np.float64(0.7434944237918215), 'f1': np.float64(0.727802037845706), 'number': np.int64(1345)}, 'PERS': {'precision': np.float64(0.8644025780862667), 'recall': np.float64(0.8601381351751357), 'f1': np.float64(0.8622650840751731), 'number': np.int64(4054)}, 'overall_precision': np.float64(0.822140221402214), 'overall_recall': np.float64(0.8325859491778774), 'overall_f1': np.float64(0.8273301151132566), 'overall_accuracy': 0.9771520316166481}


In [None]:
#Quick test: load the model and tokenizer saved in OUTPUT_DIR into an NER pipeline.
from transformers import pipeline
import unicodedata

# aggregation_strategy="simple" makes the pipeline return grouped entities
# (the loop further down reads their "entity_group", "word" and "score" fields).
ner = pipeline("ner", model=OUTPUT_DIR, tokenizer=OUTPUT_DIR,
               aggregation_strategy="simple")

# Sample Ancient Greek passage for the smoke test, NFC-normalised in the same
# way read_conll normalises the training tokens.
txt = unicodedata.normalize("NFC", """
  ᾿Ανέστη δὲ βασιλεὺς ἕτερος ἐπ᾿ Αἴγυπτον, ὃς οὐκ ᾔδει τὸν ᾿Ιωσήφ.
  εἶπε δὲ τῷ ἔθνει αὐτοῦ· ἰδοὺ τὸ γένος τῶν υἱῶν ᾿Ισραὴλ μέγα πλῆθος καὶ ἰσχύει ὑπὲρ ἡμᾶς·
  δεῦτε οὖν κατασοφισώμεθα αὐτούς, μή ποτε πληθυνθῇ, καὶ ἡνίκα ἂν συμβῇ ἡμῖν πόλεμος,
  προστεθήσονται καὶ οὗτοι πρὸς τοὺς ὑπεναντίους καὶ ἐκπολεμήσαντες ἡμᾶς ἐξελεύσονται ἐκ τῆς γῆς.
  καὶ ἐπέστησεν αὐτοῖς ἐπιστάτας τῶν ἔργων, ἵνα κακώσωσιν αὐτοὺς ἐν τοῖς ἔργοις· καὶ Ισραήλᾠκοδόμησαν πόλεις ὀχυρὰς τῷ Φαραώ, τήν τε Πειθὼ καὶ Ῥαμεσσῆ καὶ ῎Ων, ἥ ἐστιν ῾Ηλιούπολις.
  καθότι δὲ αὐτοὺς ἐταπείνουν, τοσούτῳ πλείους ἐγίγνοντο, καὶ ἴσχυον σφόδρα σφόδρα· καὶ ἐβδελύσσοντο οἱ Αἰγύπτιοι ἀπὸ τῶν υἱῶν ᾿.
  καὶ κατεδυνάστευον οἱ Αἰγύπτιοι τοὺς υἱοὺς ᾿Ισραὴλ βίᾳ καὶ κατωδύνων αὐτῶν τὴν ζωὴν ἐν τοῖς ἔργοις τοῖς σκληροῖς, τῷ πηλῷ καὶ τῇ πλινθείᾳ καὶ πᾶσι τοῖς ἔργοις τοῖς ἐν τοῖς πεδίοις, κατὰ πάντα τὰ ἔργα, ὧν κατεδουλοῦντο αὐτοὺς μετὰ βίας.
""")

# Glue WordPiece continuation pieces ("##...") back onto the entity before them,
# so each printed row is a whole word instead of a fragment.
merged_results = []

for r in ner(txt):
    piece = r["word"]
    if piece.startswith("##") and merged_results:
        previous = merged_results[-1]
        previous["word"] += piece[2:]                      # drop the "##" marker
        previous["score"] = max(previous["score"], r["score"])
        if "end" in r:
            previous["end"] = r["end"]                     # keep the character span in sync
    else:
        # Work on a copy so the pipeline's own result dicts are not mutated.
        entity = dict(r)
        if piece.startswith("##"):
            # A continuation piece with nothing before it: keep it as its own
            # entity rather than indexing into an empty list.
            entity["word"] = piece[2:]
        merged_results.append(entity)

for r in merged_results:
    print(f"{r['word']:<20} → {r['entity_group']:<6} ({r['score']:.3f})")


In [None]:
# ------------------------------------------------------------
# Hyper-parameters
# ------------------------------------------------------------
LEARNING_RATE = 3e-5
BATCH_SIZE    = 32
EPOCHS        = 5
WEIGHT_DECAY  = 0.01
# Was 1.0. warmup_ratio is the fraction of all training steps spent ramping the
# learning rate up, so 1.0 means the peak rate is only reached on the very last
# step and never decays.
# NOTE(review): 0.1 assumed to be the intended value — confirm.
WARMUP_RATIO  = 0.1
SEED          = 123
OUTPUT_DIR    = f"./tuned_ner_model_lr{LEARNING_RATE}_bs{BATCH_SIZE}_ep{EPOCHS}"

# Reload the checkpoint so this run starts from the same weights as the first
# training cell. Reusing `model` as-is would continue from the already
# fine-tuned weights and make the hyper-parameter comparison meaningless.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    seed=SEED,
    logging_steps=10,
    save_total_limit=2,
    report_to=[],
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenised["train"],
    eval_dataset=tokenised["validation"],
    # `tokenizer=` is deprecated in recent transformers releases; the tokenizer
    # is now passed as `processing_class`.
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainer.train()

**Zero-shot transfer.** Because ground-truth labels exist at the word level, supervised training of NER models often requires a large amount of human annotation effort. In real-world use cases where one needs to build multilingual models, the required human labour scales at least linearly with the number of languages, and even worse for low-resource languages.

Paraphrased from: Li, B., He, Y., & Xu, W. (2021). *Cross-Lingual Named Entity Recognition Using Parallel Corpus: A New Approach Using XLM-RoBERTa Alignment*. arXiv:2101.11112. https://arxiv.org/pdf/2101.11112



In [None]:
# Source dataset: https://huggingface.co/datasets/hmcgovern/original-language-bibles-greek
import pandas as pd

# NOTE(review): the path is under /content/drive, so Google Drive presumably has
# to be mounted first — no mount cell is visible in this notebook; confirm.
dataset_hf_path = '/content/drive/MyDrive/Deep Learning Group Project/train-00000-of-00001.parquet'
dataset_hf = pd.read_parquet(dataset_hf_path)

# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly; the row count stays the cell's output value.
display(dataset_hf.head(10))
len(dataset_hf)

In [None]:
import os
import torch
import joblib
import pandas as pd
from tqdm.auto import tqdm
from transformers import pipeline
from datasets import Dataset

# ----- SETTINGS -----
# Both output files live on Google Drive (absolute Colab paths).
CACHE_FILE     = "/content/drive/MyDrive/Deep Learning Group Project/ner_labels_wordlevel.pkl.gz"
CSV_OUT_FILE   = "/content/drive/MyDrive/Deep Learning Group Project/ner_labels.csv"
WORDS_PER_BATCH = 64          # number of consecutive *words* joined into one pseudo-sentence (the dataset has one word per row)
HF_BATCH_SIZE   = 64          # batch_size passed to the pipeline; NOTE(review): ner_on_word_batch sends one string per call, so this likely has no effect — confirm
# -------------------------

# Pipeline device index: 0 when CUDA is available, otherwise -1 (CPU).
device = 0 if torch.cuda.is_available() else -1
print(f"NER device: {'GPU' if device == 0 else 'CPU'}")

# Pipeline used to label the dataset's "translation" column. With
# aggregation_strategy="simple" each result carries an "entity_group" and a
# character "start" offset, which ner_on_word_batch relies on.
ner_model = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
    device=device,
    batch_size=HF_BATCH_SIZE,
)

def ner_on_word_batch(word_list):
    """
    Label a list of words by running NER on them joined into one pseudo-sentence.

    The words are joined with single spaces and passed to `ner_model`. Every
    word whose character span overlaps a returned entity receives that entity's
    `entity_group`; all other words get "O". Previously only the word containing
    the entity's first character was labelled, so the later words of a
    multi-word entity were left as "O".

    Parameters
    ----------
    word_list : list[str]
        Words in their original order.

    Returns
    -------
    list[str]
        One label per input word (empty list for empty input).
    """
    if not word_list:
        return []

    fake_sentence = " ".join(word_list)
    entities = ner_model(fake_sentence)
    labels = ["O"] * len(word_list)

    # Character span [start, end) of each word inside fake_sentence, computed once.
    word_spans = []
    cursor = 0
    for word in word_list:
        word_spans.append((cursor, cursor + len(word)))
        cursor += len(word) + 1  # +1 for the joining space

    for ent in entities:
        ent_start = ent["start"]
        # Fall back to a one-character span if the entity carries no "end" offset.
        ent_end = ent.get("end", ent_start + 1)
        ent_label = ent["entity_group"]

        for word_idx, (word_start, word_end) in enumerate(word_spans):
            if word_start >= ent_end:
                break  # spans are in ascending order; nothing further can overlap
            if ent_start < word_end:
                labels[word_idx] = ent_label

    return labels

# Load the word-level labels from the cache when it is usable, otherwise
# recompute them and refresh the cache.
final_labels = None
cache_found = os.path.exists(CACHE_FILE)

if cache_found:
    print(f"Loading cached NER labels from:\n    {CACHE_FILE}")
    # NOTE: joblib.load unpickles the file; only load caches written by this notebook.
    final_labels = joblib.load(CACHE_FILE)
    if len(final_labels) != len(dataset_hf):
        # A cache built from a different version of the dataset would make the
        # assign() below fail with a length mismatch, so it is discarded.
        print(f"Cached label count ({len(final_labels)}) != dataset rows ({len(dataset_hf)}) → recomputing")
        final_labels = None

if final_labels is None:
    if not cache_found:
        print(f"Cache missing → running batched NER on {len(dataset_hf)} words...")
    final_labels = []

    words = dataset_hf["translation"].tolist()

    for i in tqdm(range(0, len(words), WORDS_PER_BATCH),
                  desc="NER word batches",
                  total=(len(words) + WORDS_PER_BATCH - 1) // WORDS_PER_BATCH):

        batch_words = words[i:i + WORDS_PER_BATCH]
        batch_labels = ner_on_word_batch(batch_words)
        final_labels.extend(batch_labels)


    os.makedirs(os.path.dirname(CACHE_FILE), exist_ok=True)
    print(f"Saving {len(final_labels)} labels → {CACHE_FILE}")
    joblib.dump(final_labels, CACHE_FILE, compress=("gzip", 3))

# One label per row; assign() returns a new frame with the extra column.
dataset_hf = dataset_hf.assign(ner_label=final_labels)

os.makedirs(os.path.dirname(CSV_OUT_FILE), exist_ok=True)
dataset_hf[["text", "translation", "ner_label"]].to_csv(CSV_OUT_FILE, index=False)
print(f"CSV saved: {CSV_OUT_FILE}")


In [None]:
### 1. Need to convert the ner label to BIO format first
### 2. Join the new dataset to the existing trainingdata


from datasets import concatenate_datasets
import os

print(f"Original train rows: {len(data['train'])}")
print(f"Original train features: {data['train'].features}")

from datasets import Features, Value, Sequence

# Mapping from the labels produced by the external NER model to the tag set of
# the Greek training data (the keys of `label2id`).
# TODO: fill this in (step 1 above). Labels without an entry are kept as they are.
HF_LABEL_MAP = {}

# Normalise tokens the same way read_conll does, so both sources agree.
hf_tokens = [unicodedata.normalize("NFC", tok) for tok in dataset_hf["text"].tolist()]
hf_labels = [HF_LABEL_MAP.get(lab, lab) for lab in dataset_hf["ner_label"].tolist()]

# Fail early with a readable message instead of a KeyError inside map().
unknown_labels = sorted(set(hf_labels) - set(label2id))
if unknown_labels:
    raise ValueError(
        f"NER labels {unknown_labels} are not in the training tag set "
        f"{sorted(label2id)}; add them to HF_LABEL_MAP before augmenting."
    )

# The train split stores one *sentence* (list of tokens / list of tags) per row,
# while dataset_hf has one *word* per row. Group the words into the same
# WORDS_PER_BATCH-sized pseudo-sentences that were used for labelling, so the
# two datasets share a schema and can be concatenated.
# NOTE(review): real verse/sentence boundaries would be better if the dataset
# provides them — confirm.
chunk_starts = range(0, len(hf_tokens), WORDS_PER_BATCH)
new_dataset = Dataset.from_dict(
    {
        "tokens": [hf_tokens[i:i + WORDS_PER_BATCH] for i in chunk_starts],
        "ner_tags": [hf_labels[i:i + WORDS_PER_BATCH] for i in chunk_starts],
    },
    features=data["train"].features,
)
print(new_dataset)

print(f"Adding {len(new_dataset)} NER examples to train...")
# Keep data["train"] untouched so this cell can be re-run safely.
train_augmented = concatenate_datasets([data["train"], new_dataset])

print(f"New train size: {len(train_augmented)} rows")

print("Tokenizing full train split...")
tokenised_train = train_augmented.map(
    tokenise_align,
    batched=True,
    num_proc=os.cpu_count(),
    batch_size=2000,
    remove_columns=train_augmented.column_names,  # keep only tokenizer outputs
)


print(f"Tokenization complete!")
print(f"Final train rows: {len(tokenised_train)}")
print(f"Final columns: {tokenised_train.column_names}")
print(f"Example: {tokenised_train[0].keys()}")

In [None]:
#and then save it for future training
def save_hf_dataset_to_conll(dataset, output_file):
    """
    Write `dataset` to `output_file` as two-column, tab-separated CoNLL.

    Each example must provide "tokens" and "ner_tags". One "token<TAB>tag" line
    is written per (token, tag) pair, and every example is followed by a blank
    line marking the sentence boundary.
    """
    with open(output_file, mode="w", encoding="utf-8") as conll_out:
        for record in dataset:
            rows = [
                f"{tok}\t{tag}\n"  # tab-separated
                for tok, tag in zip(record["tokens"], record["ner_tags"])
            ]
            # Sentence rows followed by the blank separator line.
            conll_out.write("".join(rows) + "\n")

# Export the NER-labelled Hugging Face words for future training runs.
# `dataset_hf` is a pandas DataFrame with one word per row, so it has no
# "train"/"test" splits to index. The rows are grouped into the same
# WORDS_PER_BATCH-sized pseudo-sentences used for labelling and written out as
# one file.
export_words = dataset_hf["text"].tolist()
export_labels = dataset_hf["ner_label"].tolist()
export_examples = [
    {
        "tokens": export_words[i:i + WORDS_PER_BATCH],
        "ner_tags": export_labels[i:i + WORDS_PER_BATCH],
    }
    for i in range(0, len(export_words), WORDS_PER_BATCH)
]
save_hf_dataset_to_conll(export_examples, "train_from_hf.conll")



In [None]:
# ------------------------------------------------------------
# Hyper-parameters
# ------------------------------------------------------------
LEARNING_RATE = 3e-5
BATCH_SIZE    = 32
EPOCHS        = 10
WEIGHT_DECAY  = 0.001
# NOTE(review): 0.6 spends 60% of all training steps warming up the learning
# rate — confirm this is intended (the first run used 0.06).
WARMUP_RATIO  = 0.6
SEED          = 123
OUTPUT_DIR    = f"./tuned_ner_model_lr{LEARNING_RATE}_bs{BATCH_SIZE}_ep{EPOCHS}"

# Reload the checkpoint so this run does not continue from weights already
# fine-tuned by an earlier training cell.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    seed=SEED,
    logging_steps=10,
    save_total_limit=2,
    report_to=[],
)
trainer = Trainer(
    model=model,
    args=training_args,
    # Train on the augmented, tokenised split built by the augmentation cell.
    # tokenised["train"] holds only the original data, so using it here would
    # ignore the augmentation entirely.
    train_dataset=tokenised_train,
    eval_dataset=tokenised["validation"],
    # `tokenizer=` is deprecated in recent transformers releases; the tokenizer
    # is now passed as `processing_class`.
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainer.train()