Installs and Imports

In [3]:
################################ INSTALLS ################################

# import torch
# print("GPU available:", torch.cuda.is_available())

# !pip install seqeval
# !pip install datasets
# !pip install evaluate

# !pip install -U transformers==4.35.2 datasets evaluate sentencepiece
# !pip uninstall -y transformers peft
# !pip install --no-cache-dir transformers==4.35.2

# !pip install --no-cache-dir transformers==4.35.2 accelerate==0.24.0

# !pip install sacremoses

################################ IMPORTS ################################

import pandas as pd
import csv
from datasets import Dataset, DatasetDict
from transformers import CamembertTokenizerFast
from transformers import CamembertForTokenClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForTokenClassification
import evaluate
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

0. Utilities

In [4]:
def add_sentence_ids(df):
    sentence_ids = []
    sentence_id = 0
    for misc in df["MISC"].fillna("_"):
        sentence_ids.append(sentence_id)
        if "EndOfSentence" in misc.split("|"):
            sentence_id += 1
    df["sentence_id"] = sentence_ids
    return df

def group_sentences(df):
    grouped = (
        df.groupby("sentence_id", sort=False)
          .agg({"TOKEN": list, "NE-COARSE-LIT": list})
          .reset_index(drop=True)
    )
    return Dataset.from_pandas(grouped)

def tokenize_and_align(batch, tokenizer, max_lenght = 124):
    tokenized_batch = tokenizer(
        batch["TOKEN"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=max_lenght,
        return_tensors=None
    )
    all_labels = []
    for i in range(len(batch["TOKEN"])):
        word_ids = tokenized_batch.word_ids(batch_index=i)
        labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                labels.append(-100)
            elif word_idx != previous_word_idx:
                labels.append(label_to_id[batch["NE-COARSE-LIT"][i][word_idx]])
            else:
                labels.append(-100)
            previous_word_idx = word_idx
        all_labels.append(labels)
    tokenized_batch["labels"] = all_labels
    return tokenized_batch

def evaluate_model(trainer, dataset, id_to_label):
    predictions, labels, _ = trainer.predict(dataset)
    preds = np.argmax(predictions, axis=2)
    true_labels = [[id_to_label[l] for l in label if l != -100] for label in labels]
    pred_labels = [[id_to_label[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(preds, labels)]
    metric = evaluate.load("seqeval")
    results = metric.compute(predictions=pred_labels, references=true_labels)
    print("NER Evaluation Report:\n")
    for label, scores in results.items():
        if isinstance(scores, dict):
            print(f"Entity: {label}")
            for metric, value in scores.items():
                print(f"  {metric}: {value:.4f}")
            print()
    print(f"overall_f1: {results['overall_f1']:.4f}")
    print(f"overall_precision: {results['overall_precision']:.4f}")
    print(f"overall_recall: {results['overall_recall']:.4f}")
    print(f"overall_accuracy: {results['overall_accuracy']:.4f}")
    return results, pred_labels, true_labels

def save_predictions_to_file(preds, labels, tokens, filename="ner_predictions.tsv"):
    rows = []
    for i in range(len(tokens)):
        for token, gold, pred in zip(tokens[i], labels[i], preds[i]):
            rows.append((token, gold, pred))
        rows.append(("", "", ""))
    df = pd.DataFrame(rows, columns=["Token", "Gold", "Pred"])
    df.to_csv(filename, sep="\t", index=False)
    print(f" Saved to {filename}")

def count_errors_by_label(preds, labels, target="ORG"):
    wrong = Counter()
    for pred_seq, gold_seq in zip(preds, labels):
        for pred, gold in zip(pred_seq, gold_seq):
            gold_clean = gold.split("-")[-1].lower()
            pred_clean = pred.split("-")[-1].lower()
            if gold_clean == target.lower() and pred_clean != target.lower():
                wrong[pred_clean] += 1
    return wrong

def plot_confusion_matrix(preds, labels, id_to_label):
    all_preds = []
    all_labels = []

    for p_list, l_list in zip(preds, labels):
        for p, l in zip(p_list, l_list):
            if l != -100:
                all_preds.append(id_to_label[p])
                all_labels.append(id_to_label[l])

    labels_unique = sorted(set(all_labels + all_preds))
    cm = confusion_matrix(all_labels, all_preds, labels=labels_unique)

    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels_unique)
    disp.plot(xticks_rotation=45, cmap="Blues")
    plt.title("Confusion Matrix: Entity Classification")
    plt.grid(False)
    plt.show()

1. Data Preprocessing

In [5]:
train_path = "/content/HIPE-2022-v2.1-letemps-train-fr.tsv"
test_path = "/content/HIPE-2022-v2.1-letemps-test-fr.tsv"

columns = [
    "TOKEN", "NE-COARSE-LIT", "NE-COARSE-METO", "NE-FINE-LIT", "NE-FINE-METO",
    "NE-FINE-COMP", "NE-NESTED", "NEL-LIT", "NEL-METO", "MISC"
]

train_df = pd.read_csv(train_path, sep="\t", comment="#", header=0, quoting=csv.QUOTE_NONE, on_bad_lines="skip", names=columns)
test_df = pd.read_csv(test_path, sep="\t", comment="#", header=0, quoting=csv.QUOTE_NONE, on_bad_lines="skip", names=columns)

train_df = add_sentence_ids(train_df)
test_df = add_sentence_ids(test_df)
train_ds = group_sentences(train_df)
test_ds = group_sentences(test_df)
dataset = DatasetDict({"train": train_ds, "test": test_ds})

unique_labels = set(l for labels in train_ds["NE-COARSE-LIT"] for l in labels)
label_list = sorted(unique_labels)
label_to_id = {l: i for i, l in enumerate(label_list)}
id_to_label = {i: l for l, i in label_to_id.items()}

3. CamenBERT model

3.1 Tokenization

In [6]:
tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")

model = CamembertForTokenClassification.from_pretrained(
    "camembert-base",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

train_tokenized = train_ds.map(lambda x: tokenize_and_align(x, tokenizer), batched=True)
test_tokenized = test_ds.map(lambda x: tokenize_and_align(x, tokenizer), batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1178 [00:00<?, ? examples/s]

Map:   0%|          | 0/2381 [00:00<?, ? examples/s]

3.2 Training

In [7]:
training_args = TrainingArguments(
    output_dir="./camembert-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    load_best_model_at_end=True
)

camembert_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer
)

camembert_trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtg-thgaborieau[0m ([33mtg-thgaborieau-ensae[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,0.316712
2,0.465000,0.287507
3,0.465000,0.252957


TrainOutput(global_step=885, training_loss=0.3639827577407751, metrics={'train_runtime': 306.4893, 'train_samples_per_second': 11.531, 'train_steps_per_second': 2.888, 'total_flos': 223651554076368.0, 'train_loss': 0.3639827577407751, 'epoch': 3.0})

3.3 Testing

In [13]:
cam_results, cam_preds, cam_labels = evaluate_model(camembert_trainer, test_tokenized, id_to_label)

NER Evaluation Report:

Entity: loc
  precision: 0.2737
  recall: 0.8489
  f1: 0.4139
  number: 589.0000

Entity: org
  precision: 0.0000
  recall: 0.0000
  f1: 0.0000
  number: 78.0000

Entity: pers
  precision: 0.0932
  recall: 0.1268
  f1: 0.1074
  number: 355.0000

overall_f1: 0.3271
overall_precision: 0.2359
overall_recall: 0.5333
overall_accuracy: 0.9551


4. Jean-Baptiste NER model

4.1 Tokenization

In [9]:
model_name = "Jean-Baptiste/camembert-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
    ignore_mismatched_sizes=True
)

train_tokenized = train_ds.map(lambda x: tokenize_and_align(x, tokenizer), batched=True)
test_tokenized = test_ds.map(lambda x: tokenize_and_align(x, tokenizer), batched=True)



tokenizer_config.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/892 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at Jean-Baptiste/camembert-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1178 [00:00<?, ? examples/s]

Map:   0%|          | 0/2381 [00:00<?, ? examples/s]

4.2 Training

In [10]:
training_args = TrainingArguments(
    output_dir="./camembert-ner-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none"  # or "wandb"
)

jb_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer
)

jb_trainer.train()

You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.2035,0.212886
2,0.1284,0.220422
3,0.1052,0.208098


TrainOutput(global_step=885, training_loss=0.20925926380911788, metrics={'train_runtime': 233.4106, 'train_samples_per_second': 15.141, 'train_steps_per_second': 3.792, 'total_flos': 223651554076368.0, 'train_loss': 0.20925926380911788, 'epoch': 3.0})

4.3 Testing

In [12]:
jb_results, jb_preds, jb_labels = evaluate_model(jb_trainer, test_tokenized, id_to_label)

NER Evaluation Report:

Entity: loc
  precision: 0.3780
  recall: 0.8421
  f1: 0.5218
  number: 589.0000

Entity: org
  precision: 0.0000
  recall: 0.0000
  f1: 0.0000
  number: 78.0000

Entity: pers
  precision: 0.3683
  recall: 0.6930
  f1: 0.4809
  number: 355.0000

overall_f1: 0.4943
overall_precision: 0.3747
overall_recall: 0.7260
overall_accuracy: 0.9606
