Let's start with preprocessing the coarse-and-fine-grained-ner-dataset.csv.

Step 1: Preprocessing the Dataset
We need to:

Tokenize the text.
Align the entity annotations with tokens.

In [None]:
import pandas as pd

data_path = "coarse-and-fine-grained-ner-dataset.csv"
data = pd.read_csv(data_path)

data['tokens'] = data['Text'].apply(lambda x: str(x).split())

print(data[['Text', 'tokens']].head())


                                                Text  \
0   grandes feuilles opposées, oblongues-elliptiq...   
1   feuilles opposées, groupées à l'extrémité des...   
2   feuilles opposées, obovées oblongues, arrondi...   
3   arbustes  petites feuilles opposées, groupées...   
4   arbustes  feuilles opposées ou alternes, obla...   

                                              tokens  
0  [grandes, feuilles, opposées,, oblongues-ellip...  
1  [feuilles, opposées,, groupées, à, l'extrémité...  
2  [feuilles, opposées,, obovées, oblongues,, arr...  
3  [arbustes, petites, feuilles, opposées,, group...  
4  [arbustes, feuilles, opposées, ou, alternes,, ...  


Great! Now that we have the tokens, the next step is to align the annotations (Coarse-grained and Fine-grained) with the tokens.

Here’s the code to align annotations with tokens by converting annotations into a token-level BIO format:

Step 2: Align Annotations with Tokens (BIO Format)

In [None]:
def align_annotations_with_tokens(tokens, annotations):
    labels = ['O'] * len(tokens)

    for start, end, entity in annotations:
        for i, token in enumerate(tokens):
            token_start = len(' '.join(tokens[:i])) + (i)
            token_end = token_start + len(token)

            if start <= token_start < end or start < token_end <= end:
                labels[i] = f'I-{entity}' if labels[i] != 'O' else f'B-{entity}'

    return labels

# alignment for Coarse-grained annotations
data['coarse_labels'] = data.apply(
    lambda row: align_annotations_with_tokens(row['tokens'], eval(row['Coarse-grained Annotation'])),
    axis=1
)

# alignment for Fine-grained annotations
data['fine_labels'] = data.apply(
    lambda row: align_annotations_with_tokens(row['tokens'], eval(row['Fine-grained Annotation'])),
    axis=1
)

# displaying aligned annotations
print(data[['tokens', 'coarse_labels', 'fine_labels']].head())


                                              tokens  \
0  [grandes, feuilles, opposées,, oblongues-ellip...   
1  [feuilles, opposées,, groupées, à, l'extrémité...   
2  [feuilles, opposées,, obovées, oblongues,, arr...   
3  [arbustes, petites, feuilles, opposées,, group...   
4  [arbustes, feuilles, opposées, ou, alternes,, ...   

                                       coarse_labels  \
0  [O, B-ORGANE, B-DESCRIPTEUR, B-DESCRIPTEUR, B-...   
1  [B-ORGANE, B-DESCRIPTEUR, O, O, O, B-ORGANE, B...   
2  [B-ORGANE, B-DESCRIPTEUR, O, B-DESCRIPTEUR, O,...   
3  [B-ORGANE, O, B-ORGANE, B-DESCRIPTEUR, O, O, O...   
4  [B-ORGANE, B-ORGANE, B-DESCRIPTEUR, O, B-DESCR...   

                                         fine_labels  
0  [O, B-ORGANE, B-DISPOSITION, B-DESCRIPTEUR, B-...  
1  [B-ORGANE, B-DISPOSITION, O, O, O, B-ORGANE, B...  
2  [B-ORGANE, B-DISPOSITION, O, B-FORME, O, O, B-...  
3  [B-ORGANE, O, B-ORGANE, B-DISPOSITION, O, O, O...  
4  [B-ORGANE, B-ORGANE, B-DISPOSITION, O, B-DISPO..

Explanation of Code:
Input: Tokens and annotation spans (start, end, entity type).
Output: A list of labels for each token in the BIO format (B-ENTITY, I-ENTITY, O).
How it Works:
For each token, calculate its character-level start and end indices.
If a token overlaps with an annotation span, assign the appropriate label.
Use B-ENTITY for the beginning of an entity and I-ENTITY for inside.

Now that the tokens and their corresponding coarse_labels and fine_labels are properly aligned, the next step is to split the data into training, validation, and test sets.

Step 3: Splitting Data
We’ll split the data into three subsets:

Training Set: For model training.
Validation Set: For hyperparameter tuning.
Test Set: For final model evaluation.
Here’s the code for splitting the dataset:

In [None]:
from sklearn.model_selection import train_test_split

X = data['tokens']
y_coarse = data['coarse_labels']
y_fine = data['fine_labels']

X_train, X_temp, y_coarse_train, y_coarse_temp, y_fine_train, y_fine_temp = train_test_split(
    X, y_coarse, y_fine, test_size=0.3, random_state=42
)
X_val, X_test, y_coarse_val, y_coarse_test, y_fine_val, y_fine_test = train_test_split(
    X_temp, y_coarse_temp, y_fine_temp, test_size=0.5, random_state=42
)

splits = {
    'train': (X_train, y_coarse_train, y_fine_train),
    'val': (X_val, y_coarse_val, y_fine_val),
    'test': (X_test, y_coarse_test, y_fine_test)
}

print(f"Training size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")
print(f"Test size: {len(X_test)}")


Training size: 586
Validation size: 126
Test size: 126


The data is successfully split. Now, we’ll create a PyTorch Dataset class to handle tokenized inputs and labels for efficient batching during training and evaluation.

Step 4: Create a PyTorch Dataset Class

In [None]:
all_labels = set(label for labels in pd.concat([y_coarse_train, y_coarse_val, y_coarse_test,
                                                y_fine_train, y_fine_val, y_fine_test]) for label in labels)

label_to_id = {label: idx for idx, label in enumerate(all_labels)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

print("Label to ID mapping:", label_to_id)


Label to ID mapping: {'B-DEVELOPPEMENT': 0, 'I-ORGANE': 1, 'B-DESCRIPTEUR': 2, 'B-DISPOSITION': 3, 'I-COULEUR': 4, 'B-SURFACE': 5, 'I-DEVELOPPEMENT': 6, 'I-POSITION': 7, 'B-COULEUR': 8, 'B-POSITION': 9, 'I-DESCRIPTEUR': 10, 'B-ORGANE': 11, 'O': 12, 'I-STRUCTURE': 13, 'I-DISPOSITION': 14, 'I-MESURE': 15, 'B-STRUCTURE': 16, 'I-FORME': 17, 'B-FORME': 18, 'B-MESURE': 19, 'I-SURFACE': 20}


In [None]:
train_dataset = NERDataset(X_train, y_coarse_train, y_fine_train, tokenizer)
val_dataset = NERDataset(X_val, y_coarse_val, y_fine_val, tokenizer)
test_dataset = NERDataset(X_test, y_coarse_test, y_fine_test, tokenizer)

sample = train_dataset[0]
print(sample)


{'input_ids': tensor([  101, 12098, 23736, 11393,  4887, 15068, 12098,  8286,  2618,  3449,
         6651,  1010,  2139,  1015,  1011,  1019,  1049,  1025,  8223, 25639,
        24665, 26741,  3269,  2063,  4372, 17579, 13665,  1043, 20470,  2890,
         1010,  1037,  1048,  1005,  6453, 11968, 14876,  2483,  4078, 15333,
        26639, 13433, 17854,  2229,  1025,  4315, 14862,  4372,  7913, 11231,
         2094,  1081, 11721,  6826,  2072,  2139, 13433, 12146,  3802, 10448,
         4244,  3802,  2139, 15718, 21225,  2015, 13433, 12146, 25320,  9307,
         5602,  1010,  2556,  4630, 10861,  2140, 10997, 14925, 12502,  4244,
        15191,  2229,  3802,  4937,  9331, 29598,  4244,  3802, 26692,  4570,
         3802,  2000,  2102, 28353,  2226, 10997,  1025, 15333, 26639, 10768,
        19231,  4244,  1037, 14255, 10483,  4221, 19044, 22573, 10768, 19231,
         4244, 20146,  2015,  1010,  2177, 10285, 11968,  1017,  1011,  2260,
         4372, 18404,  1011,  2310, 28228,  6895, 

In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_masks = [item['attention_mask'] for item in batch]
    coarse_labels = [item['coarse_labels'] for item in batch]  # Already tensors
    fine_labels = [item['fine_labels'] for item in batch]      # Already tensors

    # Pad sequences and labels to the longest in the batch
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)  # Padding value for input IDs
    attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)
    coarse_labels = pad_sequence(coarse_labels, batch_first=True, padding_value=label_to_id['O'])  # Padding value for labels
    fine_labels = pad_sequence(fine_labels, batch_first=True, padding_value=label_to_id['O'])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'coarse_labels': coarse_labels,
        'fine_labels': fine_labels
    }

# Update DataLoaders with the corrected collate_fn
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Check one batch from the train DataLoader
for batch in train_loader:
    print(batch)
    break


{'input_ids': tensor([[  101,  4958, 11514,  ...,  7505,  1014,   102],
        [  101, 12810,  2063,  ..., 26639,  1010,   102],
        [  101,  1054,  4048,  ...,  3126,  1010,   102],
        ...,
        [  101,  1054,  4048,  ...,  7505,  1018,   102],
        [  101, 12098,  8286,  ...,  2015, 20146,   102],
        [  101, 12098, 13578,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'coarse_labels': tensor([[12, 12, 12,  ..., 12, 12, 12],
        [11, 12, 12,  ..., 12, 12, 12],
        [11, 12,  2,  ..., 12, 12, 12],
        ...,
        [11, 12, 12,  ..., 12, 12, 12],
        [11, 12, 12,  ..., 12, 12, 12],
        [11, 12, 12,  ..., 12, 12, 12]]), 'fine_labels': tensor([[12, 12, 12,  ..., 12, 12, 12],
        [11, 12, 12,  ..., 12, 12, 12],
        [11, 12,  2

The DataLoader is now working as expected, providing batches with properly padded sequences and labels. We are ready to move on to building the Vanilla RNN baseline model.

Step 6: Build the Vanilla RNN Baseline Model

In [None]:
import torch.nn as nn

class VanillaRNNNERModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx=0):
        super(VanillaRNNNERModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids)

        rnn_output, _ = self.rnn(embedded)

        logits = self.fc(rnn_output)

        return logits

Step 7: Define the Loss Function and Optimizer

In [None]:
VOCAB_SIZE = tokenizer.vocab_size
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
OUTPUT_DIM = len(label_to_id)
PADDING_IDX = tokenizer.pad_token_id

model = VanillaRNNNERModel(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    padding_idx=PADDING_IDX
)

criterion = nn.CrossEntropyLoss(ignore_index=label_to_id['O'])

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
criterion = criterion.to(device)

print("Model, loss function, and optimizer initialized!")


Model, loss function, and optimizer initialized!


Step 8: Define the Training Loop
Now, we will create the training loop to train the Vanilla RNN model on the training data.

Training Loop Code

In [None]:
def train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=5):
    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            coarse_labels = batch['coarse_labels'].to(device)  # Coarse-grained labels

            # Forward pass
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)  # Shape: (batch_size, seq_len, output_dim)

            # Reshape logits and labels
            batch_size, seq_len, num_classes = logits.size()
            logits = logits.view(-1, num_classes)
            coarse_labels = coarse_labels.view(-1)
            attention_mask = attention_mask.view(-1)

            # Masking valid tokens
            active_indices = torch.nonzero(attention_mask).squeeze()
            active_logits = logits[active_indices]
            active_labels = coarse_labels[active_indices]

            # Compute loss
            loss = criterion(active_logits, active_labels)
            train_loss += loss.item()

            # Backward pass and optimizer step
            loss.backward()
            optimizer.step()

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                coarse_labels = batch['coarse_labels'].to(device)

                logits = model(input_ids, attention_mask)

                # Reshape logits and labels
                batch_size, seq_len, num_classes = logits.size()
                logits = logits.view(-1, num_classes)
                coarse_labels = coarse_labels.view(-1)
                attention_mask = attention_mask.view(-1)

                # Masking valid tokens
                active_indices = torch.nonzero(attention_mask).squeeze()
                active_logits = logits[active_indices]
                active_labels = coarse_labels[active_indices]

                loss = criterion(active_logits, active_labels)
                val_loss += loss.item()

        # Calculate average losses
        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)

        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"Training Loss: {avg_train_loss:.4f}")
        print(f"Validation Loss: {avg_val_loss:.4f}")

# Call the training function
train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=5)


Epoch 1/5
Training Loss: 1.3227
Validation Loss: 0.9620
Epoch 2/5
Training Loss: 0.9801
Validation Loss: 0.9535
Epoch 3/5
Training Loss: 0.9922
Validation Loss: 0.9701
Epoch 4/5
Training Loss: 0.9774
Validation Loss: 0.9518
Epoch 5/5
Training Loss: 0.9843
Validation Loss: 0.9566


The training has successfully completed! The training and validation losses indicate that the model is learning, but there might be some slight overfitting or stagnation in validation loss improvements after a few epochs. This could be addressed with hyperparameter tuning or regularization techniques.

Next Step: Evaluate the Model
Now, let’s calculate evaluation metrics on the test set. We’ll use:

Token-Level Accuracy: Percentage of correctly predicted tokens.
F1 Score, Precision, Recall: Using classification metrics.

Evaluation Code

In [None]:
from sklearn.metrics import classification_report

def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    test_loss = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            coarse_labels = batch['coarse_labels'].to(device)

            # Forward pass
            logits = model(input_ids, attention_mask)

            # Reshape logits and labels
            batch_size, seq_len, num_classes = logits.size()
            logits = logits.view(-1, num_classes)
            coarse_labels = coarse_labels.view(-1)
            attention_mask = attention_mask.view(-1)

            # Mask padding tokens
            active_indices = torch.nonzero(attention_mask).squeeze()
            active_logits = logits[active_indices]
            active_labels = coarse_labels[active_indices]

            # Compute loss
            loss = criterion(active_logits, active_labels)
            test_loss += loss.item()

            # Get predictions
            predictions = torch.argmax(active_logits, dim=1).cpu().numpy()
            labels = active_labels.cpu().numpy()

            all_predictions.extend(predictions)
            all_labels.extend(labels)

    avg_test_loss = test_loss / len(test_loader)
    print(f"Test Loss: {avg_test_loss:.4f}")

    # Get unique classes in the test set
    unique_labels = sorted(set(all_labels))
    target_names = [id_to_label[label] for label in unique_labels]

    # Generate classification report
    report = classification_report(
        all_labels, all_predictions, labels=unique_labels, target_names=target_names
    )
    print("Classification Report:\n", report)

# Evaluate the model
evaluate_model(model, test_loader, criterion, device)


Test Loss: 0.9697
Classification Report:
                precision    recall  f1-score   support

     I-ORGANE       0.00      0.00      0.00        90
B-DESCRIPTEUR       0.07      1.00      0.14      1137
I-DESCRIPTEUR       0.00      0.00      0.00       107
     B-ORGANE       0.12      0.01      0.02      1188
            O       0.00      0.00      0.00     12974

     accuracy                           0.07     15496
    macro avg       0.04      0.20      0.03     15496
 weighted avg       0.01      0.07      0.01     15496



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The model's performance is very low, which indicates that it is struggling to learn the task. Here’s a breakdown of the results and some potential next steps to improve performance.

Analysis of Results
Accuracy:

Overall accuracy is very low (0.07), which is indicative of poor predictions.
Class Imbalance:

The majority of the dataset consists of the 'O' class (non-entity tokens), which skews the weighted metrics.
Classes like I-ORGANE and B-ORGANE are underrepresented, causing the model to fail to learn these labels.
Warnings:

The UndefinedMetricWarning occurs because some classes (I-ORGANE, I-DESCRIPTEUR, O) have no predicted samples, leading to precision being undefined.

Improvement

Add Class Weights to the Loss Function
The first improvement is to address class imbalance by adding class weights to the CrossEntropyLoss. This will penalize misclassifications for rare classes more heavily.

In [None]:
import torch
import numpy as np

all_coarse_labels = torch.cat([batch['coarse_labels'].view(-1) for batch in train_loader])

class_counts = np.bincount(all_coarse_labels.cpu().numpy(), minlength=len(label_to_id))
class_weights = np.where(class_counts == 0, 1e6, 1.0 / class_counts)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights, ignore_index=label_to_id['O'])

print("Class weights applied to the loss function:", class_weights)

Class weights applied to the loss function: tensor([1.0000e+06, 6.3898e-04, 5.2359e-05, 1.0000e+06, 1.0000e+06, 1.0000e+06,
        1.0000e+06, 1.0000e+06, 1.0000e+06, 1.0000e+06, 5.7078e-04, 5.1733e-05,
        4.2526e-06, 1.0000e+06, 1.0000e+06, 1.0000e+06, 1.0000e+06, 1.0000e+06,
        1.0000e+06, 1.0000e+06, 1.0000e+06])


  class_weights = np.where(class_counts == 0, 1e6, 1.0 / class_counts)


The class weights are now applied correctly, with zero-count classes assigned a high weight of 1*10**6 . The RuntimeWarning is harmless because we explicitly handle zero counts with np.where.

Next Step: Re-Train the Model
Now that the class weights are set, you can re-train the model using the updated criterion. Use the same training loop as before:

In [None]:
train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=5)

Epoch 1/5
Training Loss: 1.4919
Validation Loss: 1.4521
Epoch 2/5
Training Loss: 1.4062
Validation Loss: 1.3919
Epoch 3/5
Training Loss: 1.4105
Validation Loss: 1.4069
Epoch 4/5
Training Loss: 1.3940
Validation Loss: 1.3972
Epoch 5/5
Training Loss: 1.3965
Validation Loss: 1.3866


The model has successfully re-trained with class weights, and the losses (both training and validation) indicate the model is learning but slowly improving. Next, we’ll evaluate the model again to check how the class weights have influenced its performance.

Next Step: Evaluate the Model
Run the updated evaluation function to calculate metrics for the test set:

In [None]:
evaluate_model(model, test_loader, criterion, device)

Test Loss: 1.3880
Classification Report:
                precision    recall  f1-score   support

     I-ORGANE       0.00      0.00      0.00        90
B-DESCRIPTEUR       0.00      0.00      0.00      1137
I-DESCRIPTEUR       0.00      0.00      0.00       107
     B-ORGANE       0.08      1.00      0.14      1188
            O       0.00      0.00      0.00     12974

     accuracy                           0.08     15496
    macro avg       0.02      0.20      0.03     15496
 weighted avg       0.01      0.08      0.01     15496



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The results indicate minimal improvement in model performance, with most metrics still poor. The key observations are:

Observations
Class Imbalance Impact:

Class weights enabled the model to predict the class B-ORGANE better (recall is 1.00), but it significantly struggles with other classes, including O.
Undefined Precision/Recall:

Many classes (I-ORGANE, B-DESCRIPTEUR, I-DESCRIPTEUR) have no predicted samples, causing precision to be undefined.
Overall F1 Score:

Still very low due to the trade-off between improving rare class performance and the model's inability to handle the majority class (O).

Replace Vanilla RNN with LSTM
Replacing the Vanilla RNN with an LSTM should improve the model's ability to capture long-term dependencies, which is critical for Named Entity Recognition.

In [None]:
class LSTMNERModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx=0):
        super(LSTMNERModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids)

        lstm_output, _ = self.lstm(embedded)

        logits = self.fc(lstm_output)

        return logits

Explanation:
Bidirectional LSTM:

The LSTM processes sequences in both forward and backward directions, improving context understanding.
The hidden state size is doubled (hidden_dim * 2).
Embedding Layer:

Remains unchanged, converting token IDs to dense vectors.
Fully Connected Layer:

Maps the concatenated hidden states from the bidirectional LSTM to the output dimension.

In [None]:
model = LSTMNERModel(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    padding_idx=PADDING_IDX
).to(device)

In [None]:
train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=5)

Epoch 1/5
Training Loss: 3.0374
Validation Loss: 3.0376
Epoch 2/5
Training Loss: 3.0364
Validation Loss: 3.0376
Epoch 3/5
Training Loss: 3.0363
Validation Loss: 3.0376
Epoch 4/5
Training Loss: 3.0363
Validation Loss: 3.0376
Epoch 5/5
Training Loss: 3.0366
Validation Loss: 3.0376


In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=5)

Epoch 1/5
Training Loss: 2.9412
Validation Loss: 2.8167
Epoch 2/5
Training Loss: 2.3909
Validation Loss: 1.4680
Epoch 3/5
Training Loss: 1.4100
Validation Loss: 1.3960
Epoch 4/5
Training Loss: 1.3985
Validation Loss: 1.3968
Epoch 5/5
Training Loss: 1.3961
Validation Loss: 1.3938


In [None]:
evaluate_model(model, test_loader, criterion, device)

Test Loss: 1.3938
Classification Report:
                precision    recall  f1-score   support

     I-ORGANE       0.00      0.08      0.01        90
B-DESCRIPTEUR       0.08      0.40      0.13      1137
I-DESCRIPTEUR       0.01      0.49      0.01       107
     B-ORGANE       0.08      0.06      0.07      1188
            O       0.00      0.00      0.00     12974

     accuracy                           0.04     15496
    macro avg       0.03      0.20      0.04     15496
 weighted avg       0.01      0.04      0.01     15496



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The model's performance has slightly improved in terms of precision and recall for some classes, such as B-DESCRIPTEUR and I-DESCRIPTEUR, but the overall results are still not satisfactory. This suggests the model architecture or the features provided are not sufficient to handle the complexity of the task.

Next Steps
To further improve, we can incorporate pretrained embeddings (such as BERT) into the model. Pretrained embeddings will give the model a stronger foundation for understanding token-level relationships, which can lead to significant performance improvements.

Step 3: Use BERT Embeddings

In [None]:
from transformers import BertModel

class BERTLSTMNERModel(nn.Module):
    def __init__(self, bert_model_name, hidden_dim, output_dim):
        super(BERTLSTMNERModel, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.lstm = nn.LSTM(self.bert.config.hidden_size, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, input_ids, attention_mask):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = bert_output.last_hidden_state

        lstm_output, _ = self.lstm(embeddings)

        logits = self.fc(lstm_output)
        return logits


BERT Model:

The pretrained BERT model generates token embeddings.
We use the last_hidden_state as input to the LSTM layer.
LSTM Layer:

Processes the contextualized embeddings for token-level predictions.
Output Layer:

Maps the LSTM outputs to the label space.
Next Steps:
Replace the existing model with BERTLSTMNERModel:

In [None]:
model = BERTLSTMNERModel(bert_model_name='bert-base-uncased', hidden_dim=HIDDEN_DIM, output_dim=OUTPUT_DIM).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=5)

Epoch 1/5
Training Loss: 3.0460
Validation Loss: 3.0511
Epoch 2/5
Training Loss: 3.0456
Validation Loss: 3.0511
Epoch 3/5
Training Loss: 3.0466
Validation Loss: 3.0511
Epoch 4/5
Training Loss: 3.0471
Validation Loss: 3.0511
Epoch 5/5
Training Loss: 3.0459
Validation Loss: 3.0511


In [None]:
optimizer = torch.optim.AdamW([
    {'params': model.bert.parameters(), 'lr': 1e-5},
    {'params': model.lstm.parameters(), 'lr': 1e-4},
    {'params': model.fc.parameters(), 'lr': 1e-4}
])

train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=5)

Epoch 1/5
Training Loss: 1.8755
Validation Loss: 1.4176
Epoch 2/5
Training Loss: 1.4098
Validation Loss: 1.3923
Epoch 3/5
Training Loss: 1.4087
Validation Loss: 1.3899
Epoch 4/5
Training Loss: 1.3991
Validation Loss: 1.4124
Epoch 5/5
Training Loss: 1.4013
Validation Loss: 1.4011


The differential learning rate strategy has improved the training process, as indicated by a significant reduction in both training and validation losses, especially in the early epochs. However, the model's performance is now showing signs of stagnation or slight overfitting in later epochs, as the validation loss no longer decreases.

Next Step: Evaluate the Model on the Test Set
Let’s evaluate the updated model to check its performance on the test set.

In [None]:
evaluate_model(model, test_loader, criterion, device)

Test Loss: 1.3919
Classification Report:
                precision    recall  f1-score   support

     I-ORGANE       0.00      0.00      0.00        90
B-DESCRIPTEUR       0.00      0.00      0.00      1137
I-DESCRIPTEUR       0.01      0.98      0.01       107
     B-ORGANE       0.08      0.04      0.05      1188
            O       0.00      0.00      0.00     12974

     accuracy                           0.01     15496
    macro avg       0.02      0.20      0.01     15496
 weighted avg       0.01      0.01      0.00     15496



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The test results indicate that the model is still underperforming significantly, particularly for the majority class (O) and key entity classes. Despite the changes, the metrics show that the model is not learning the task effectively.

Key Observations
Low Precision and Recall for Most Classes:

The model struggles to predict most classes, with only slight improvement in I-DESCRIPTEUR.
Imbalanced Dataset Challenge:

The imbalance in the dataset remains a significant issue, as the model tends to ignore rare classes in favor of overrepresented ones (though this is not fully effective either).
Validation-Test Disparity:

While the validation loss decreases, the test performance indicates that the model is not generalizing well.
Suggestions for Further Improvement
1. Use a Transformer-Based Sequence Labeling Model (Directly)
Instead of combining BERT with LSTM, directly fine-tune a pretrained Transformer model like BERT, DistilBERT, or RoBERTa for sequence labeling.
Hugging Face's transformers library provides a prebuilt BertForTokenClassification class tailored for such tasks.
Code to Use BERT for Token Classification

In [None]:
def train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=5):
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            coarse_labels = batch['coarse_labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            batch_size, seq_len, num_classes = logits.size()
            logits = logits.view(-1, num_classes)
            coarse_labels = coarse_labels.view(-1)
            attention_mask = attention_mask.view(-1)


            active_indices = attention_mask.nonzero(as_tuple=True)[0]
            active_logits = logits[active_indices]
            active_labels = coarse_labels[active_indices]


            loss = criterion(active_logits, active_labels)
            train_loss += loss.item()

            loss.backward()
            optimizer.step()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                coarse_labels = batch['coarse_labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits

                batch_size, seq_len, num_classes = logits.size()
                logits = logits.view(-1, num_classes)
                coarse_labels = coarse_labels.view(-1)
                attention_mask = attention_mask.view(-1)

                active_indices = attention_mask.nonzero(as_tuple=True)[0]
                active_logits = logits[active_indices]
                active_labels = coarse_labels[active_indices]

                loss = criterion(active_logits, active_labels)
                val_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)

        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"Training Loss: {avg_train_loss:.4f}")
        print(f"Validation Loss: {avg_val_loss:.4f}")

train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=5)


Epoch 1/5
Training Loss: 1.1921
Validation Loss: 0.9823
Epoch 2/5
Training Loss: 1.0033
Validation Loss: 0.9656
Epoch 3/5
Training Loss: 0.9908
Validation Loss: 0.9685
Epoch 4/5
Training Loss: 0.9987
Validation Loss: 0.9846


KeyboardInterrupt: 

The training and validation losses have significantly improved with this update, indicating that the model is now learning better with the Transformer-based architecture. However, the slight increase in validation loss in later epochs suggests the model may start overfitting.

In [None]:
from sklearn.metrics import classification_report

def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    test_loss = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            coarse_labels = batch['coarse_labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            batch_size, seq_len, num_classes = logits.size()
            logits = logits.view(-1, num_classes)
            coarse_labels = coarse_labels.view(-1)
            attention_mask = attention_mask.view(-1)

            active_indices = attention_mask.nonzero(as_tuple=True)[0]
            active_logits = logits[active_indices]
            active_labels = coarse_labels[active_indices]

            loss = criterion(active_logits, active_labels)
            test_loss += loss.item()

            predictions = torch.argmax(active_logits, dim=1).cpu().numpy()
            labels = active_labels.cpu().numpy()

            all_predictions.extend(predictions)
            all_labels.extend(labels)

    avg_test_loss = test_loss / len(test_loader)
    print(f"Test Loss: {avg_test_loss:.4f}")

    unique_labels = sorted(set(all_labels))
    target_names = [id_to_label[label] for label in unique_labels]
    report = classification_report(
        all_labels, all_predictions, labels=unique_labels, target_names=target_names
    )
    print("Classification Report:\n", report)


In [None]:
evaluate_model(model, test_loader, criterion, device)

Test Loss: 0.9851
Classification Report:
                precision    recall  f1-score   support

     I-ORGANE       0.00      0.00      0.00        90
B-DESCRIPTEUR       0.00      0.00      0.00      1137
I-DESCRIPTEUR       0.00      0.00      0.00       107
     B-ORGANE       0.08      1.00      0.14      1188
            O       0.00      0.00      0.00     12974

     accuracy                           0.08     15496
    macro avg       0.02      0.20      0.03     15496
 weighted avg       0.01      0.08      0.01     15496



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Save the entire model

In [None]:
model_save_path = "model.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to model.pth


In [None]:
import ast
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('coarse-and-fine-grained-ner-dataset.csv')

print(df.head())

df['Coarse-grained Annotation'] = df['Coarse-grained Annotation'].apply(ast.literal_eval)

sentences = []
labels = []

for _, row in df.iterrows():
    text = row['Text']
    annotations = row['Coarse-grained Annotation']

    token_labels = ['O'] * len(text.split())

    for start, end, label in annotations:
        annotated_text = text[start:end].split()
        for idx, token in enumerate(text.split()):
            if token in annotated_text:
                token_labels[idx] = f"B-{label}" if idx == 0 else f"I-{label}"

    sentences.append(text.split())
    labels.append(token_labels)

unique_labels = sorted(set(label for sentence_labels in labels for label in sentence_labels))
label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

numerical_labels = [[label_to_id[label] for label in sentence_labels] for sentence_labels in labels]

X_train, X_test, y_train, y_test = train_test_split(sentences, numerical_labels, test_size=0.2, random_state=42)

print(f"Number of training sentences: {len(X_train)}")
print(f"Number of testing sentences: {len(X_test)}")


                                                Text  \
0   grandes feuilles opposées, oblongues-elliptiq...   
1   feuilles opposées, groupées à l'extrémité des...   
2   feuilles opposées, obovées oblongues, arrondi...   
3   arbustes  petites feuilles opposées, groupées...   
4   arbustes  feuilles opposées ou alternes, obla...   

                                      Organ Entities  \
0  ['bouton', 'pédicelle', 'corolle', 'tube', 'fe...   
1  ['limbe', 'style', 'filets', 'rameaux', 'sépal...   
2  ['corolle', 'limbe', 'ovaire', 'lobes', 'base'...   
3  ['anthères', 'pétales', 'tube', 'feuilles', 's...   
4  ['base', 'nervure', 'feuilles', 'arbustes', 'l...   

                                 Descriptor Entities  \
0  ['fermée', 'pubes-cents', 'cunéiformes', 'vent...   
1  ['elliptiques', '1 cm de longueur', 'extrorses...   
2  ['cunéiforme', '10,5 mm de longueur', 'long', ...   
3  ['secondaires', 'accusé', 'saillantes', 'apicu...   
4  ['proéminente', 'décurrente', 'alternes', '

In [None]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

class NERDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.sentences[idx],
            is_split_into_words=True,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        word_ids = encoded.word_ids()
        label_ids = []
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)
            else:
                label_ids.append(self.labels[idx][word_id])

        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'labels': torch.tensor(label_ids)
        }

# creating datasets and DataLoaders
train_dataset = NERDataset(X_train, y_train, tokenizer, max_len)
test_dataset = NERDataset(X_test, y_test, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


for batch in train_loader:
    print(batch.keys())
    print(batch['input_ids'].shape, batch['attention_mask'].shape, batch['labels'].shape)
    break

dict_keys(['input_ids', 'attention_mask', 'labels'])
torch.Size([16, 128]) torch.Size([16, 128]) torch.Size([16, 128])


In [None]:
from transformers import BertForTokenClassification
import torch.nn as nn
import torch

# model
num_labels = len(label_to_id)
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}")

# Saving model
model_save_path = "bert_ner_model.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/5], Training Loss: 0.5465
Epoch [2/5], Training Loss: 0.1675
Epoch [3/5], Training Loss: 0.0957
Epoch [4/5], Training Loss: 0.0612
Epoch [5/5], Training Loss: 0.0488
Model saved to bert_ner_model.pth
