# Bangla Named Entity Recognition Using MuRIL (Multilingual Representations for Indian Languages)


Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
!apt install git-lfs
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118

In [None]:
import pandas as pd
import json
from datasets import Dataset, DatasetDict, ClassLabel, Sequence, Features, Value
from sklearn.model_selection import train_test_split

In [None]:
# Location of the annotated corpus (Kaggle dataset mount).
filepath = '/kaggle/input/bner-6k/data_storage.json'

# Decode explicitly as UTF-8: the corpus is Bangla text, and relying on the
# platform's default encoding can corrupt or reject non-ASCII characters.
with open(filepath, 'r', encoding='utf-8') as file:
    # Parse the whole file into Python objects.
    data = json.load(file)

In [None]:
# Inspect the first record as loaded from the JSON file.
data[0]

In [None]:
# Label vocabulary for the task; a label's position in this list is its class id.
ner_labels = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
ner_feature = ClassLabel(names=ner_labels)

# Lookup from the 1-based string codes '1' .. '9' to the label names above.
# NOTE(review): this mapping is not referenced by the other visible cells, and
# the alignment code treats tags as 0-based ids - confirm which numbering the
# raw data actually uses.
ner_mapping = {str(code): name for code, name in enumerate(ner_labels, start=1)}

In [None]:
# Show the first record again before the records are processed.
data[0]

# Function to process tokens and ner_tags

In [None]:

def process_data(data):
    """Normalise the 'tokens' and 'ner_tag' fields of every record in place.

    A field stored as the string representation of a list (for example
    "['a', 'b']") is parsed into a real list; a field that is already a
    sequence is left as it is. The tags themselves are not remapped or
    validated here - each record simply ends up with a fresh list holding the
    same tag values.

    Args:
        data: iterable of dict-like records, each with 'tokens' and 'ner_tag'.

    Returns:
        The same `data` object, with its records modified in place.

    Raises:
        ValueError / SyntaxError: if a string field is not a valid Python
            literal.
    """
    import ast  # function-scope import keeps this cell self-contained

    for entry in data:
        for field in ('tokens', 'ner_tag'):
            if isinstance(entry[field], str):
                # literal_eval only accepts Python literals. Unlike eval(), a
                # malformed or hostile string from the data file raises
                # instead of being executed as code.
                entry[field] = ast.literal_eval(entry[field])

        # Store the tags as a new list object (values unchanged).
        entry['ner_tag'] = list(entry['ner_tag'])

    return data

# Normalise every record. process_data edits the entries in place and returns
# the same list, so `data` still refers to the original object afterwards.
data = process_data(data)

In [None]:
# Tabular view of the processed records: one row per record.
data_df = pd.DataFrame(data)


In [None]:
# Retain only the columns declared in the Dataset schema
data_df = data_df[['id', 'tokens', 'ner_tag']].copy()

# Convert 'id' to int32
data_df['id'] = data_df['id'].astype('int32')

# Coerce every token AND every tag to str. The Dataset schema declares both
# columns as sequences of strings, so tags that were loaded as integers must be
# converted here; the label-alignment step turns them back into ints later.
data_df['tokens'] = data_df['tokens'].apply(lambda seq: list(map(str, seq)))
data_df['ner_tag'] = data_df['ner_tag'].apply(lambda seq: list(map(str, seq)))

# Convert the DataFrame to a Hugging Face Dataset

In [None]:
# Explicit schema for the Dataset. Tags are stored as strings here and are
# converted to integers later, during label alignment.
schema_fields = {
    'id': Value('int32'),
    'tokens': Sequence(Value('string')),
    'ner_tag': Sequence(Value('string')),
}
features = Features(schema_fields)

dataset = Dataset.from_pandas(data_df, features=features)

# Single-split container; the cross-validation folds are carved out of 'samples'.
raw_datasets = DatasetDict({'samples': dataset})


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, BertForTokenClassification
import torch

# Load MuRIL
model_checkpoint = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Size the token-classification head for the NER label set. Without num_labels
# the head gets the library's default size instead of one output per entry of
# ner_labels, which does not match the label ids used in this notebook.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(ner_labels),
    id2label=dict(enumerate(ner_labels)),
    label2id={label: idx for idx, label in enumerate(ner_labels)},
)

# Second name for the same checkpoint id.
checkpoint = model_checkpoint

In [None]:
# Display the loaded tokenizer's representation.
tokenizer

In [None]:
# NOTE: from here on `data` no longer refers to the list of raw records but to
# the DatasetDict; the tokenization cell below reads it under this name.
data = raw_datasets

In [None]:
# Check whether a fast tokenizer was loaded. The tokenization step below calls
# word_ids() on the encoding.
# NOTE(review): word_ids() is presumably only available with fast tokenizers - confirm.
tokenizer.is_fast

# Align labels with tokens function

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level tags to one label id per sub-word token.

    Args:
        labels: per-word tags; each must be convertible with int().
        word_ids: for every token, the index of the word it belongs to, or
            None for tokens that belong to no word.

    Returns:
        A list with one integer per entry of `word_ids`:
        * -100 when the word index is None, falls outside `labels`, or the tag
          cannot be converted to an int;
        * the tag's integer value for the first token of a word;
        * for further tokens of the same word, the tag's integer value, with
          odd values bumped by one (with the label order used in this notebook
          that turns a B-xxx id into the matching I-xxx id).
    """
    ignore_index = -100

    def _tag_for(word_id):
        # Word-level tag as an int, or the ignore index when unavailable.
        if word_id is None or not (0 <= word_id < len(labels)):
            return ignore_index
        try:
            return int(labels[word_id])
        except (ValueError, TypeError):
            return ignore_index

    aligned = []
    previous_word = None
    for word_id in word_ids:
        tag = _tag_for(word_id)
        continues_word = word_id == previous_word
        previous_word = word_id

        # Only continuation tokens carrying a real tag are shifted.
        if continues_word and tag != ignore_index and tag % 2 == 1:
            tag += 1
        aligned.append(tag)

    return aligned


In [None]:

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach aligned labels.

    Args:
        examples: batch with a 'tokens' entry (one word list per sentence) and
            an 'ner_tag' entry (one tag list per sentence).

    Returns:
        The tokenizer output for the batch, extended with a 'labels' entry that
        holds, for every sentence, the result of align_labels_with_tokens for
        that sentence's tags and word ids.
    """
    encoded = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    encoded['labels'] = [
        align_labels_with_tokens(tags, encoded.word_ids(row))
        for row, tags in enumerate(examples['ner_tag'])
    ]

    return encoded

In [None]:
# Tokenize every split in batches and drop the original columns, so only the
# fields produced by tokenize_and_align_labels remain.
tokenized_datasets = data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=data['samples'].column_names,
)

In [None]:
from transformers import DataCollatorForTokenClassification

# Batch collator for token classification, built around the MuRIL tokenizer;
# it is handed to the Trainer in the cross-validation loop.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# seqeval supplies the metric functions imported in the next cell.
# NOTE(review): `evaluate` is already installed by the first cell of the
# notebook, so the second line looks redundant - confirm before removing.
!pip install seqeval
!pip install evaluate

In [None]:
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score

# Position i in this list is the label name for class id i.
label_names = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

def compute_metrics(p):
    """Convert raw predictions into seqeval accuracy, precision, recall and F1.

    Args:
        p: a pair (predictions, labels). `predictions` holds per-class scores
            with the class dimension on axis 2; `labels` holds the reference
            class ids, with -100 marking positions to ignore.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    # Imported here because the notebook-level `import numpy as np` appears
    # only in a later cell than this definition.
    import numpy as np

    scores, label_ids = p
    pred_ids = np.argmax(scores, axis=2)

    # Drop ignored positions (-100) and map ids to label names, keeping the
    # reference and predicted sequences position-aligned.
    true_labels = [
        [label_names[gold] for gold in gold_row if gold != -100]
        for gold_row in label_ids
    ]
    true_preds = [
        [label_names[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        for pred_row, gold_row in zip(pred_ids, label_ids)
    ]

    return {
        "accuracy": accuracy_score(true_labels, true_preds),
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds),
    }


In [None]:
# Number of tokenized examples available for cross-validation.
len(tokenized_datasets["samples"])

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForTokenClassification
from sklearn.model_selection import KFold
import numpy as np
import torch

# Reproducibility: the same seed feeds the NumPy RNG, the torch RNG, the
# dataset shuffle below and the TrainingArguments of every fold.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Number of cross-validation folds. KFold is built with its defaults apart
# from n_splits; the shuffling is applied to the dataset itself instead.
k = 5
kf = KFold(n_splits=k)

# Shuffle once, then materialise the examples as a plain list so the folds can
# be assembled by index.
shuffled_samples = tokenized_datasets['samples'].shuffle(seed=seed)
dataset_list = shuffled_samples.to_list()

# Per-fold evaluation dictionaries are collected in `results`.
results = []
# NOTE(review): fold_metrics is not written to by the visible cells.
fold_metrics = []

# Five-fold cross-validation to check model performance

In [None]:
# Needed to rebuild Dataset objects from the per-fold example lists.
from datasets import Dataset

for fold, (train_index, val_index) in enumerate(kf.split(dataset_list)):
    fold_no = fold + 1
    print(f"\n=== Fold {fold_no}/{k} ===")

    # Gather this fold's examples by index and wrap them as Datasets again.
    train_data = [dataset_list[i] for i in train_index]
    val_data = [dataset_list[i] for i in val_index]
    train_dataset = Dataset.from_list(train_data)
    val_dataset = Dataset.from_list(val_data)

    # Every fold starts from the pretrained checkpoint, so nothing learned in
    # an earlier fold carries over.
    id2label = dict(enumerate(label_names))
    label2id = {name: idx for idx, name in id2label.items()}
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(label_names),
        id2label=id2label,
        label2id=label2id,
    )

    # Outputs and logs of each fold go to their own directory.
    fold_output_dir = f"./fold_{fold_no}_results"

    args = TrainingArguments(
        output_dir=fold_output_dir,
        logging_dir=f"{fold_output_dir}/logs",
        save_strategy="no",
        report_to="none",
        seed=seed,
        learning_rate=2e-5,
        weight_decay=0.01,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Fit on the training part, then score the held-out part of this fold.
    trainer.train()

    eval_result = trainer.evaluate()
    results.append(eval_result)
    print(f"Fold {fold_no} Evaluation:", eval_result)



In [None]:
import pandas as pd

# One row per fold, one column per entry of the evaluation dictionaries;
# the column means are the cross-validation averages.
df_results = pd.DataFrame(results)
print("\n=== Cross-Validation Average Results ===", df_results.mean(), sep="\n")
