# Bangla Named Entity Recognition Using Bangla T5 Encoder

In [None]:
import pandas as pd
import json
from datasets import Dataset, DatasetDict, ClassLabel, Sequence, Features, Value
from sklearn.model_selection import train_test_split

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, T5EncoderModel
from datasets import Dataset
from tqdm import tqdm

# Set up: load the pretrained `csebuetnlp/banglat5` tokenizer and encoder.
model_name = "csebuetnlp/banglat5"
# Tokenizer that belongs to the same checkpoint as the encoder below.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# T5EncoderModel loads only the encoder stack of the checkpoint (no decoder).
encoder = T5EncoderModel.from_pretrained(model_name)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import json

# Load the raw NER records from the Kaggle input dataset.
# The encoding is given explicitly: JSON text is UTF-8 and the corpus is Bangla,
# so relying on the platform's default encoding could corrupt or fail to decode it.
with open('/kaggle/input/bner-6k/data_storage.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Print loaded data
print(data)


# Process the data

In [None]:
import ast  # Safely evaluate string to list

dummy_data = data  # alias, not a copy: the records in `data` are modified in place

# Normalise every record. Each step tolerates records that were already
# processed, so re-running this notebook cell does not raise.
for item in dummy_data:
    # 'tokens' is stored as the string form of a list; parse it only while it
    # is still a string (literal_eval rejects an already-parsed list).
    if isinstance(item['tokens'], str):
        item['tokens'] = ast.literal_eval(item['tokens'])
    # Tag ids are read from 'ner_tag' and stored as ints under 'ner_tags'.
    # NOTE(review): the source key is 'ner_tag' (singular) - confirm against the JSON schema.
    item['ner_tags'] = [int(tag) for tag in item['ner_tag']]
    # Drop 'id'; pop() with a default is a no-op when the key is already gone.
    item.pop('id', None)


print(dummy_data)


In [None]:
# 80 / 10 / 10 split: hold out 20% first, then cut the held-out part in half.
train_data, temp_data = train_test_split(dummy_data, test_size=0.2, random_state=42)
validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Wrap each list of records in a Dataset, keyed by split name.
_split_records = (
    ("train", train_data),
    ("validation", validation_data),
    ("test", test_data),
)
dataset = DatasetDict(
    {split_name: Dataset.from_list(records) for split_name, records in _split_records}
)



In [None]:
def tokenize_and_align_labels(example, *, max_length=16, label_all_tokens=True):
    """Tokenize one pre-split example and align its word-level tags to tokens.

    Args:
        example: Mapping with "tokens" (a list of words) and "ner_tags"
            (one tag id per word).
        max_length: Length every encoding is padded or truncated to.
        label_all_tokens: When True (the default, matching the previous
            behaviour) every sub-word piece of a word receives that word's
            tag. When False only the first piece is labelled and the
            following pieces of the same word get -100.

    Returns:
        The tokenizer output with an added "labels" list holding one entry
        per token position.
    """
    tokenized_inputs = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    ner_tags = example["ner_tags"]

    aligned_labels = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids():
        if word_idx is None:
            # Position not mapped to any input word: mark it with -100.
            aligned_labels.append(-100)
        elif label_all_tokens or word_idx != previous_word_idx:
            aligned_labels.append(ner_tags[word_idx])
        else:
            # Continuation piece of the previous word, labelling disabled.
            aligned_labels.append(-100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

# Run tokenization and label alignment over every split of the DatasetDict.
tokenized_dataset = dataset.map(tokenize_and_align_labels)


In [None]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Turn a list of tokenized examples into a dict of padded batch tensors.

    Each element of ``batch`` must provide "input_ids", "attention_mask" and
    "labels" sequences. Within each field, shorter sequences are right-padded
    to the longest one in the batch: input ids with the tokenizer's pad id,
    the attention mask with 0 and the labels with -100.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    # (field name, value written into the padded positions)
    field_fill = (
        ("input_ids", tokenizer.pad_token_id),
        ("attention_mask", 0),
        ("labels", -100),
    )
    return {
        field: pad(
            [torch.tensor(sample[field]) for sample in batch],
            batch_first=True,
            padding_value=fill,
        )
        for field, fill in field_fill
    }

# Loaders over the fixed splits, two examples per batch; only training is shuffled.
_loader_options = {"batch_size": 2, "collate_fn": collate_fn}
train_loader = DataLoader(tokenized_dataset["train"], shuffle=True, **_loader_options)
val_loader = DataLoader(tokenized_dataset["validation"], **_loader_options)
test_loader = DataLoader(tokenized_dataset["test"], **_loader_options)


In [None]:
# Number of batches in the training loader (displayed as the cell's output).
len(train_loader)

# Model Definition

In [None]:

class BanglaT5ForTokenClassification(nn.Module):
    """Per-token classifier: an encoder followed by dropout and a linear head.

    The encoder must expose ``config.d_model`` and, when called with
    ``input_ids`` and ``attention_mask``, return an object that has a
    ``last_hidden_state`` attribute.
    """

    def __init__(self, encoder, num_labels):
        super().__init__()
        self.encoder = encoder
        self.dropout = nn.Dropout(0.1)
        # One logit per label for every token position.
        self.classifier = nn.Linear(encoder.config.d_model, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``{"loss", "logits"}``; ``loss`` is None when no labels are given."""
        hidden_states = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        logits = self.classifier(self.dropout(hidden_states))

        if labels is None:
            return {"loss": None, "logits": logits}

        # Flatten to (positions, labels) vs (positions,); -100 targets are skipped.
        num_classes = logits.size(-1)
        loss = nn.functional.cross_entropy(
            logits.view(-1, num_classes),
            labels.view(-1),
            ignore_index=-100,
        )
        return {"loss": loss, "logits": logits}

In [None]:
# Nine output classes for the token classifier.
num_labels = 9
# Module.to() returns the module itself, so construction and placement chain.
model = BanglaT5ForTokenClassification(encoder, num_labels).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [None]:
pip install seqeval


In [None]:
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score


# Label map (customize to match your training): position i in this tuple is
# the tag name for label id i.
_TAG_NAMES = (
    "O",
    "B-PER",
    "I-PER",
    "B-ORG",
    "I-ORG",
    "B-LOC",
    "I-LOC",
    "B-MISC",
    "I-MISC",
)
id2label = dict(enumerate(_TAG_NAMES))




In [None]:
import os
import torch
from torch.utils.data import ConcatDataset, DataLoader, Subset
from sklearn.model_selection import KFold
from tqdm import tqdm
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score


# Pool the three tokenized splits into one dataset for cross-validation.
train_dataset, val_dataset, test_dataset = (
    tokenized_dataset[split_name] for split_name in ("train", "validation", "test")
)
# From here on `dataset` names the pooled ConcatDataset.
dataset = ConcatDataset([train_dataset, val_dataset, test_dataset])

In [None]:

# Set config
# Same device selection expression as at the top of the notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
k_folds = 5  # number of cross-validation folds
num_epochs = 3  # training epochs per fold
batch_size = 8  # batch size of the per-fold data loaders
learning_rate = 2e-5  # learning rate for the per-fold AdamW optimizer
checkpoint_dir = "./checkpoints"
# Create the checkpoint directory up front; exist_ok avoids an error if it already exists.
os.makedirs(checkpoint_dir, exist_ok=True)

# Shuffled K-fold splitter with a fixed seed, so fold membership is reproducible.
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
dataset_size = len(dataset)
print(dataset_size)


# Five-fold cross-validation to check model performance

In [None]:


# Per-fold results; one dict per completed fold.
fold_metrics = []
for fold, (train_idx, val_idx) in enumerate(kf.split(range(dataset_size))):
    print(f"\n Fold {fold + 1}/{k_folds}")

    # Convert np.int64 → int before the indices are handed to Subset.
    train_idx = [int(i) for i in train_idx]
    val_idx = [int(i) for i in val_idx]

    # Prepare data loaders
    train_loader = DataLoader(
        Subset(dataset, train_idx),
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn
    )

    val_loader = DataLoader(
        Subset(dataset, val_idx),
        batch_size=batch_size,
        collate_fn=collate_fn
    )

    # Initialize model and optimizer.
    # Every fold must start from the pretrained weights. Wrapping the shared
    # `encoder` object again would keep the fine-tuning done in earlier folds,
    # so later folds would be validated on examples the encoder had already
    # been trained on. Reloading gives each fold an independent encoder.
    fold_encoder = T5EncoderModel.from_pretrained(model_name)
    model = BanglaT5ForTokenClassification(fold_encoder, num_labels)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Fold {fold+1} - Epoch {epoch+1}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs["loss"]

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Fold {fold+1} - Epoch {epoch+1} Training Loss: {avg_loss:.4f}")

    # Evaluation
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Convert predictions and gold labels to Python lists once per
            # batch instead of calling .item() for every single position.
            pred_rows = torch.argmax(outputs["logits"], dim=-1).tolist()
            label_rows = batch["labels"].tolist()

            for pred_row, label_row in zip(pred_rows, label_rows):
                # Score only positions that carry a real label (not -100).
                scored = [(p, t) for p, t in zip(pred_row, label_row) if t != -100]
                all_labels.append([id2label[t] for _, t in scored])
                all_preds.append([id2label[p] for p, _ in scored])

    # Compute metrics
    fold_f1 = f1_score(all_labels, all_preds)
    fold_precision = precision_score(all_labels, all_preds)
    fold_recall = recall_score(all_labels, all_preds)
    fold_accuracy = accuracy_score(all_labels, all_preds)

    print(f"Fold {fold+1} Metrics — F1: {fold_f1:.4f}, Precision: {fold_precision:.4f}, Recall: {fold_recall:.4f}, Accuracy: {fold_accuracy:.4f}")

    fold_metrics.append({
        "fold": fold + 1,
        "f1": fold_f1,
        "precision": fold_precision,
        "recall": fold_recall,
        "accuracy": fold_accuracy
    })




In [None]:
#  Final Summary
# The heading uses the configured fold count (`k_folds`), as the per-fold
# progress line does, instead of a hard-coded 5.
print(f"\n Summary of {k_folds}-Fold Cross-Validation:")

# Mean of each metric over the recorded folds.
_metric_names = ("f1", "precision", "recall", "accuracy")
_averages = {
    name: np.mean([m[name] for m in fold_metrics]) for name in _metric_names
}
avg_f1 = _averages["f1"]
avg_precision = _averages["precision"]
avg_recall = _averages["recall"]
avg_accuracy = _averages["accuracy"]

for m in fold_metrics:
    print(f"Fold {m['fold']} → F1: {m['f1']:.4f}, Precision: {m['precision']:.4f}, Recall: {m['recall']:.4f}, Accuracy: {m['accuracy']:.4f}")

print(f"\n Average — F1: {avg_f1:.4f}, Precision: {avg_precision:.4f}, Recall: {avg_recall:.4f}, Accuracy: {avg_accuracy:.4f}")