In [27]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/coarse-and-fine-grained-ner-dataset/coarse-and-fine-grained-ner-dataset.csv


In [28]:
import ast 
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('/kaggle/input/coarse-and-fine-grained-ner-dataset/coarse-and-fine-grained-ner-dataset.csv')

print(df.head())

df['Coarse-grained Annotation'] = df['Coarse-grained Annotation'].apply(ast.literal_eval)

sentences = []
labels = []

for _, row in df.iterrows():
    text = row['Text']
    annotations = row['Coarse-grained Annotation'] 

    token_labels = ['O'] * len(text.split()) 

    for start, end, label in annotations:
        annotated_text = text[start:end].split()
        for idx, token in enumerate(text.split()):
            if token in annotated_text:
                token_labels[idx] = f"B-{label}" if idx == 0 else f"I-{label}"

    sentences.append(text.split()) 
    labels.append(token_labels)

unique_labels = sorted(set(label for sentence_labels in labels for label in sentence_labels))
label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

numerical_labels = [[label_to_id[label] for label in sentence_labels] for sentence_labels in labels]

X_train, X_test, y_train, y_test = train_test_split(sentences, numerical_labels, test_size=0.2, random_state=42)

print(f"Number of training sentences: {len(X_train)}")
print(f"Number of testing sentences: {len(X_test)}")


                                                Text  \
0   grandes feuilles opposées, oblongues-elliptiq...   
1   feuilles opposées, groupées à l'extrémité des...   
2   feuilles opposées, obovées oblongues, arrondi...   
3   arbustes  petites feuilles opposées, groupées...   
4   arbustes  feuilles opposées ou alternes, obla...   

                                      Organ Entities  \
0  ['bouton', 'pédicelle', 'corolle', 'tube', 'fe...   
1  ['limbe', 'style', 'filets', 'rameaux', 'sépal...   
2  ['corolle', 'limbe', 'ovaire', 'lobes', 'base'...   
3  ['anthères', 'pétales', 'tube', 'feuilles', 's...   
4  ['base', 'nervure', 'feuilles', 'arbustes', 'l...   

                                 Descriptor Entities  \
0  ['fermée', 'pubes-cents', 'cunéiformes', 'vent...   
1  ['elliptiques', '1 cm de longueur', 'extrorses...   
2  ['cunéiforme', '10,5 mm de longueur', 'long', ...   
3  ['secondaires', 'accusé', 'saillantes', 'apicu...   
4  ['proéminente', 'décurrente', 'alternes', '

In [29]:
from transformers import BertTokenizerFast
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
max_len = 128 
batch_size = 16

class NERDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.sentences[idx],
            is_split_into_words=True,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        
        word_ids = encoded.word_ids() 
        label_ids = []
        for word_id in word_ids:
            if word_id is None: 
                label_ids.append(-100) 
            else:
                label_ids.append(self.labels[idx][word_id])

        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'labels': torch.tensor(label_ids)
        }

# creating datasets and DataLoaders
train_dataset = NERDataset(X_train, y_train, tokenizer, max_len)
test_dataset = NERDataset(X_test, y_test, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


for batch in train_loader:
    print(batch.keys())
    print(batch['input_ids'].shape, batch['attention_mask'].shape, batch['labels'].shape)
    break

dict_keys(['input_ids', 'attention_mask', 'labels'])
torch.Size([16, 128]) torch.Size([16, 128]) torch.Size([16, 128])


In [30]:
from transformers import BertForTokenClassification

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_labels = len(label_to_id)
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# model
num_labels = len(label_to_id) 
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}")

# Saving model
model_save_path = "bert_ner_model.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/5], Training Loss: 0.5573
Epoch [2/5], Training Loss: 0.1867
Epoch [3/5], Training Loss: 0.1041
Epoch [4/5], Training Loss: 0.0721
Epoch [5/5], Training Loss: 0.0526
Model saved to bert_ner_model.pth


In [31]:
from sklearn.metrics import classification_report

model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)

        active_labels = labels[labels != -100]
        active_predictions = predictions[labels != -100]

        all_predictions.extend(active_predictions.cpu().numpy())
        all_labels.extend(active_labels.cpu().numpy())

report = classification_report(all_labels, all_predictions, target_names=list(label_to_id.keys()))
print("Classification Report:\n", report)


Classification Report:
                precision    recall  f1-score   support

B-DESCRIPTEUR       0.00      0.00      0.00         1
     B-ORGANE       1.00      0.99      1.00       380
I-DESCRIPTEUR       0.88      0.91      0.90      2521
     I-ORGANE       0.96      0.97      0.96      2604
            O       0.98      0.97      0.98     14932

     accuracy                           0.97     20438
    macro avg       0.76      0.77      0.77     20438
 weighted avg       0.97      0.97      0.97     20438



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The classification report reveals promising performance for most labels, but there are areas that need improvement:

Observations
Strong Performance:

Labels like I-ORGANE, B-ORGANE, and O show high precision, recall, and F1-scores, indicating that the model effectively recognizes these classes.
Weak Performance:

Labels such as B-DESCRIPTEUR have no predicted samples, resulting in undefined precision and F1-scores.
Imbalance Challenge:

The label B-DESCRIPTEUR appears to be significantly underrepresented in the dataset (only 1 instance in the test set).
Overall Performance:

The macro-average F1-score is affected by the underperformance on rare classes.
The weighted average F1-score is high because it is dominated by the majority classes.
Suggestions for Improvement
Handle Imbalance in Rare Classes:

Augment the dataset to include more instances of underrepresented classes like B-DESCRIPTEUR.
Use class weights in the loss function to give more importance to rare classes.

In [32]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=list(label_to_id.values()), y=all_labels)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights_tensor, ignore_index=-100)

In [33]:
from torch.optim.lr_scheduler import StepLR
scheduler = StepLR(optimizer, step_size=2, gamma=0.5)  # Decay learning rate every 2 epochs

In [34]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Flatten all labels in the training set to compute class frequencies
all_train_labels = np.concatenate([label for label in y_train])
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(all_train_labels),
    y=all_train_labels
)

# Convert to tensor for PyTorch
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

print("Class weights:", class_weights_tensor)

Class weights: tensor([2.7116e+04, 5.2247e+01, 1.4692e+00, 1.7624e+00, 2.6790e-01],
       device='cuda:0')


In [35]:
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor, ignore_index=-100)

In [36]:
# train model
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}")

# Saving model
model_save_path = "bert_ner_model_with_weights.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Epoch [1/5], Training Loss: 0.0410
Epoch [2/5], Training Loss: 0.0321
Epoch [3/5], Training Loss: 0.0239
Epoch [4/5], Training Loss: 0.0203
Epoch [5/5], Training Loss: 0.0196
Model saved to bert_ner_model_with_weights.pth


In [37]:
from sklearn.metrics import classification_report

# Load model
model.load_state_dict(torch.load("bert_ner_model_with_weights.pth"))
model.eval()

# Evaluation
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)

        # Collect predictions and labels ignoring padding tokens (-100)
        active_labels = labels[labels != -100]
        active_predictions = predictions[labels != -100]

        all_predictions.extend(active_predictions.cpu().numpy())
        all_labels.extend(active_labels.cpu().numpy())

# Generate a report
report = classification_report(all_labels, all_predictions, target_names=list(label_to_id.keys()))
print("Classification Report:\n", report)


  model.load_state_dict(torch.load("bert_ner_model_with_weights.pth"))


Classification Report:
                precision    recall  f1-score   support

B-DESCRIPTEUR       0.00      0.00      0.00         1
     B-ORGANE       1.00      1.00      1.00       380
I-DESCRIPTEUR       0.86      0.95      0.90      2521
     I-ORGANE       0.96      0.97      0.97      2604
            O       0.99      0.97      0.98     14932

     accuracy                           0.97     20438
    macro avg       0.76      0.78      0.77     20438
 weighted avg       0.97      0.97      0.97     20438



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Updated Evaluation Analysis
The re-training with class weights has improved the model’s performance slightly, but there are still issues with rare classes:

Observations
Majority Classes (B-ORGANE, I-ORGANE, O):

These classes maintain high precision, recall, and F1-scores, indicating robust performance.
Underrepresented Class (B-DESCRIPTEUR):

This class still has no predicted samples, leading to undefined precision and F1-scores.
Overall Metrics:

The macro average F1-score remains limited due to poor performance on rare classes.
The weighted average F1-score is high, reflecting dominance by majority classes.
Suggestions for Further Improvement
1. Data Augmentation
Generate synthetic training samples for rare classes like B-DESCRIPTEUR to improve representation.
2. Over-Sampling Rare Classes
Duplicate instances of rare classes in the training set to give them more weight during training.
3. Fine-Grained Class Balancing
Add dynamic sampling or more aggressive weighting for extremely rare classes.
4. Alternative Architectures
Experiment with models such as RoBERTa or DeBERTa, which may handle rare class features better.
5. Custom Loss Functions

In [38]:
from sklearn.metrics import classification_report

# Load model
model.load_state_dict(torch.load("bert_ner_model_with_weights.pth"))
model.eval()

# Evaluation
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)

        # Collect predictions and labels ignoring padding tokens (-100)
        active_labels = labels[labels != -100]
        active_predictions = predictions[labels != -100]

        all_predictions.extend(active_predictions.cpu().numpy())
        all_labels.extend(active_labels.cpu().numpy())

# Generate a report
report = classification_report(all_labels, all_predictions, target_names=list(label_to_id.keys()))
print("Classification Report:\n", report)


  model.load_state_dict(torch.load("bert_ner_model_with_weights.pth"))


Classification Report:
                precision    recall  f1-score   support

B-DESCRIPTEUR       0.00      0.00      0.00         1
     B-ORGANE       1.00      1.00      1.00       380
I-DESCRIPTEUR       0.86      0.95      0.90      2521
     I-ORGANE       0.96      0.97      0.97      2604
            O       0.99      0.97      0.98     14932

     accuracy                           0.97     20438
    macro avg       0.76      0.78      0.77     20438
 weighted avg       0.97      0.97      0.97     20438



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Let's focus on data augmentation to improve the representation of rare classes, particularly B-DESCRIPTEUR. Here’s how to proceed:

Step 1: Oversample Rare Classes
Duplicate instances of rare classes in the training set to balance the class distribution.


In [39]:
import random
from collections import Counter

flat_labels = [label for sentence_labels in y_train for label in sentence_labels]
label_counts = Counter(flat_labels)
print("Class Distribution Before Oversampling:", label_counts)

min_count = max(label_counts.values())
augmented_sentences = []
augmented_labels = []

for sentence, labels in zip(X_train, y_train):
    if any(label_counts[label] < min_count for label in labels):
        multiplier = min_count // label_counts[labels[0]]
        for _ in range(multiplier):
            augmented_sentences.append(sentence)
            augmented_labels.append(labels)

X_train_augmented = X_train + augmented_sentences
y_train_augmented = y_train + augmented_labels

print(f"Original Training Samples: {len(X_train)}, Augmented Samples: {len(X_train_augmented)}")

Class Distribution Before Oversampling: Counter({4: 101219, 2: 18457, 3: 15386, 1: 519, 0: 1})
Original Training Samples: 670, Augmented Samples: 203244


In [40]:
# creating augmented dataset
train_dataset_augmented = NERDataset(X_train_augmented, y_train_augmented, tokenizer, max_len)
train_loader_augmented = DataLoader(train_dataset_augmented, batch_size=batch_size, shuffle=True)

In [47]:
from transformers import BertForTokenClassification, AdamW
from torch.optim.lr_scheduler import StepLR
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset

subset_indices = list(range(100)) 
train_subset = Subset(train_dataset, subset_indices)
train_loader = DataLoader(train_subset, batch_size=16, shuffle=True)

num_labels = len(label_to_id)
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=-100) 
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = StepLR(optimizer, step_size=1, gamma=0.5) 

num_epochs = 9
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    scheduler.step()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}")

model_save_path = "efficient_bert_ner_model.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/9], Training Loss: 0.9475
Epoch [2/9], Training Loss: 0.7174
Epoch [3/9], Training Loss: 0.6303
Epoch [4/9], Training Loss: 0.5874
Epoch [5/9], Training Loss: 0.5593
Epoch [6/9], Training Loss: 0.5482
Epoch [7/9], Training Loss: 0.5459
Epoch [8/9], Training Loss: 0.5375
Epoch [9/9], Training Loss: 0.5371
Model saved to efficient_bert_ner_model.pth


In [48]:
from sklearn.metrics import classification_report

model.load_state_dict(torch.load("efficient_bert_ner_model.pth"))
model.eval()

all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)

        # Collect predictions and labels ignoring padding tokens (-100)
        active_labels = labels[labels != -100]
        active_predictions = predictions[labels != -100]

        all_predictions.extend(active_predictions.cpu().numpy())
        all_labels.extend(active_labels.cpu().numpy())

report = classification_report(all_labels, all_predictions, target_names=list(label_to_id.keys()))
print("Classification Report:\n", report)

  model.load_state_dict(torch.load("efficient_bert_ner_model.pth"))


Classification Report:
                precision    recall  f1-score   support

B-DESCRIPTEUR       0.00      0.00      0.00         1
     B-ORGANE       0.00      0.00      0.00       380
I-DESCRIPTEUR       0.17      0.04      0.06      2521
     I-ORGANE       0.70      0.70      0.70      2604
            O       0.83      0.96      0.89     14932

     accuracy                           0.80     20438
    macro avg       0.34      0.34      0.33     20438
 weighted avg       0.72      0.80      0.75     20438



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Analysis of Evaluation Results
Observations:
Strong Performance on Majority Class (O):

The model achieves high precision (0.83) and recall (0.96) for the O class, which dominates the dataset.
Mixed Results for Other Classes:

I-ORGANE performs moderately well, with an F1-score of 0.70.
I-DESCRIPTEUR shows poor performance, with an F1-score of 0.06.
The B-ORGANE and B-DESCRIPTEUR classes have no predictions (0.00 precision and recall).
Imbalanced Dataset Effect:

The model struggles with rare classes (B-DESCRIPTEUR and B-ORGANE) due to insufficient examples during training.
Overall Metrics:

The macro-average F1-score is 0.33, highlighting the model's difficulty with underrepresented classes.
Weighted average F1-score is 0.75, heavily influenced by the dominant O class.

Evaluate and Compare Models

In [49]:
from sklearn.metrics import classification_report

def evaluate_model(model_path, test_loader, label_to_id):
    model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_to_id))
    model.load_state_dict(torch.load(model_path))
    model.to(device)
    model.eval()
    
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)

            active_labels = labels[labels != -100]
            active_predictions = predictions[labels != -100]

            all_predictions.extend(active_predictions.cpu().numpy())
            all_labels.extend(active_labels.cpu().numpy())

    report = classification_report(all_labels, all_predictions, target_names=list(label_to_id.keys()), output_dict=True)
    return report

model_paths = [
    "/kaggle/working/bert_ner_model.pth",
    "/kaggle/working/bert_ner_model_with_weights.pth",
    "/kaggle/working/efficient_bert_ner_model.pth"
]

model_names = ["Base Model", "Model with Class Weights", "Efficient Model"]

results = {}
for name, path in zip(model_names, model_paths):
    print(f"Evaluating: {name}")
    report = evaluate_model(path, test_loader, label_to_id)
    results[name] = report
    print(f"{name} - Classification Report:")
    for label, metrics in report.items():
        if label in label_to_id.keys():
            print(f"{label}: Precision={metrics['precision']:.2f}, Recall={metrics['recall']:.2f}, F1-Score={metrics['f1-score']:.2f}")
    print()

print("Comparison Summary:")
for label in label_to_id.keys():
    print(f"\nLabel: {label}")
    for name in model_names:
        metrics = results[name][label]
        print(f"{name}: Precision={metrics['precision']:.2f}, Recall={metrics['recall']:.2f}, F1-Score={metrics['f1-score']:.2f}")

Evaluating: Base Model


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Base Model - Classification Report:
B-DESCRIPTEUR: Precision=0.00, Recall=0.00, F1-Score=0.00
B-ORGANE: Precision=1.00, Recall=0.99, F1-Score=1.00
I-DESCRIPTEUR: Precision=0.88, Recall=0.91, F1-Score=0.90
I-ORGANE: Precision=0.96, Recall=0.97, F1-Score=0.96
O: Precision=0.98, Recall=0.97, F1-Score=0.98

Evaluating: Model with Class Weights


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model with Class Weights - Classification Report:
B-DESCRIPTEUR: Precision=0.00, Recall=0.00, F1-Score=0.00
B-ORGANE: Precision=1.00, Recall=1.00, F1-Score=1.00
I-DESCRIPTEUR: Precision=0.86, Recall=0.95, F1-Score=0.90
I-ORGANE: Precision=0.96, Recall=0.97, F1-Score=0.97
O: Precision=0.99, Recall=0.97, F1-Score=0.98

Evaluating: Efficient Model


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path))


Efficient Model - Classification Report:
B-DESCRIPTEUR: Precision=0.00, Recall=0.00, F1-Score=0.00
B-ORGANE: Precision=0.00, Recall=0.00, F1-Score=0.00
I-DESCRIPTEUR: Precision=0.17, Recall=0.04, F1-Score=0.06
I-ORGANE: Precision=0.70, Recall=0.70, F1-Score=0.70
O: Precision=0.83, Recall=0.96, F1-Score=0.89

Comparison Summary:

Label: B-DESCRIPTEUR
Base Model: Precision=0.00, Recall=0.00, F1-Score=0.00
Model with Class Weights: Precision=0.00, Recall=0.00, F1-Score=0.00
Efficient Model: Precision=0.00, Recall=0.00, F1-Score=0.00

Label: B-ORGANE
Base Model: Precision=1.00, Recall=0.99, F1-Score=1.00
Model with Class Weights: Precision=1.00, Recall=1.00, F1-Score=1.00
Efficient Model: Precision=0.00, Recall=0.00, F1-Score=0.00

Label: I-DESCRIPTEUR
Base Model: Precision=0.88, Recall=0.91, F1-Score=0.90
Model with Class Weights: Precision=0.86, Recall=0.95, F1-Score=0.90
Efficient Model: Precision=0.17, Recall=0.04, F1-Score=0.06

Label: I-ORGANE
Base Model: Precision=0.96, Recall=0.97,

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
from torch.utils.data import ConcatDataset

b_descripteur_indices = [i for i, labels in enumerate(y_train) if 'B-DESCRIPTEUR' in labels]
b_descripteur_subset = Subset(train_dataset, b_descripteur_indices)

augmented_dataset = ConcatDataset([train_dataset, b_descripteur_subset] * 10)  # Multiply by 10
train_loader = DataLoader(augmented_dataset, batch_size=batch_size, shuffle=True)

In [54]:
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}")

model_save_path = "model_balanced.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Epoch [1/5], Training Loss: 0.4449
Epoch [2/5], Training Loss: 0.4031
Epoch [3/5], Training Loss: 0.3678
Epoch [4/5], Training Loss: 0.3380
Epoch [5/5], Training Loss: 0.3115
Model saved to model_balanced.pth


In [56]:
evaluate_model("model_balanced.pth", test_loader, label_to_id)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'B-DESCRIPTEUR': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 1},
 'B-ORGANE': {'precision': 0.890818858560794,
  'recall': 0.9447368421052632,
  'f1-score': 0.9169859514687101,
  'support': 380},
 'I-DESCRIPTEUR': {'precision': 0.6605966679581557,
  'recall': 0.6763189210630702,
  'f1-score': 0.6683653469227754,
  'support': 2521},
 'I-ORGANE': {'precision': 0.844559585492228,
  'recall': 0.9389400921658986,
  'f1-score': 0.889252591380251,
  'support': 2604},
 'O': {'precision': 0.9423037296517618,
  'recall': 0.9187650683096705,
  'f1-score': 0.9303855413515987,
  'support': 14932},
 'accuracy': 0.8918680888540953,
 'macro avg': {'precision': 0.6676557683325879,
  'recall': 0.6957521847287805,
  'f1-score': 0.680997886224667,
  'support': 20438},
 'weighted avg': {'precision': 0.8940986308815676,
  'recall': 0.8918680888540953,
  'f1-score': 0.8925303039713608,
  'support': 20438}}

Testing the model :

Retrain and evaluation :