In [114]:
import json

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_recall_fscore_support,
    precision_score,
    recall_score,
)
from torch.utils.data import random_split
from transformers import AutoModelForTokenClassification, AutoTokenizer, TrainingArguments, Trainer, AutoConfig

In [115]:
def load_json_to_dataset(file_path):
    """Read a UTF-8 JSON file and return its contents as a Hugging Face `Dataset`.

    The parsed JSON is first wrapped in a pandas DataFrame, which is then
    handed to `Dataset.from_pandas`.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The `Dataset` built from the file's contents.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        # Go through pandas so the JSON payload becomes a tabular structure first.
        frame = pd.DataFrame(json.load(handle))

    return Dataset.from_pandas(frame)

# Label inventory. An e-mail address is marked with a single B-EMAIL tag
# (there is no I-EMAIL continuation tag).
tags_list = ["O", "B-PER", "I-PER", "B-EMAIL"]
tag_to_id = {name: idx for idx, name in enumerate(tags_list)}
id_to_tag = dict(enumerate(tags_list))


MODEL_NAME = "prajjwal1/bert-tiny"  # 4.4M parameters
num_labels = len(tags_list)

# Configuration carries the label maps so the classification head is sized
# and labelled to match `tags_list`.
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id_to_tag,
    label2id=tag_to_id,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, config=config)

def encode_tags(tags, tag_to_id):
    """Translate a sequence of NER tag strings into integer ids.

    Args:
        tags: Iterable of tag keys to look up.
        tag_to_id: Mapping from tag to its id; each looked-up value is passed
            through `int`.

    Returns:
        list[int]: One id per input tag, in the same order.

    Raises:
        KeyError: If a tag is not a key of `tag_to_id`.
    """
    encoded = []
    for tag in tags:
        encoded.append(int(tag_to_id[tag]))
    return encoded

# Tokenize Data
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and build a token-level label list per example.

    Uses the module-level `tokenizer`. Every sequence is truncated/padded to
    128 tokens. Labels are aligned through the tokenizer's word ids:

    * positions with no word id get -100;
    * the first token of each word gets that word's label;
    * a later token of the same word keeps the word's label unless that label
      is 0 (the id of "O" in `tag_to_id`), in which case it gets -100.

    Args:
        examples: Batch with "tokens" (lists of words) and "ner_tags"
            (lists of integer label ids, one per word).

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',  # every sequence ends up the same length
        max_length=128,
    )

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=batch_idx):
            if current_word is None:
                row.append(-100)
            else:
                tag_id = word_labels[current_word]
                is_continuation = current_word == last_word
                # Continuation pieces of label-0 words are masked; all other
                # pieces carry the word's label.
                if is_continuation and tag_id == 0:
                    row.append(-100)
                else:
                    row.append(tag_id)
            last_word = current_word
        aligned_labels.append(row)

    encoded["labels"] = aligned_labels
    return encoded


# Prepare Dataset
def prepare_dataset(dataset):
    """Encode NER tags as ids, then tokenize and align labels.

    Args:
        dataset: Dataset whose examples have a 'ner_tags' field of tag strings
            (keys of `tag_to_id`) plus whatever `tokenize_and_align_labels`
            reads.

    Returns:
        The dataset after both `map` passes.
    """
    def _tags_to_ids(example):
        # Per-example pass: replace tag strings with their integer ids.
        return {'ner_tags': encode_tags(example['ner_tags'], tag_to_id)}

    with_ids = dataset.map(_tags_to_ids)
    return with_ids.map(tokenize_and_align_labels, batched=True)
    


def compute_metrics(p):
    """Compute token-level metrics for an evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores and is reduced with an argmax over axis 2; ``labels`` holds
            the gold label ids, where ``-100`` marks positions to ignore.

    Returns:
        dict: ``accuracy`` plus support-weighted ``f1``, ``precision`` and
        ``recall`` over all kept tokens, and ``entity_metrics``, a dict keyed
        by label id (``int``) with one-vs-rest ``precision``, ``recall``,
        ``f1``, ``fpr`` and ``fnr`` for every label present in the gold labels.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    labels = np.asarray(labels)

    # Drop ignored positions. Boolean masking flattens both arrays in the same
    # row-major order, so predictions and labels stay aligned.
    keep = labels != -100
    true_predictions = predictions[keep]
    true_labels = labels[keep]

    # Overall metrics. zero_division=0 yields the same value sklearn falls
    # back to by default, but without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, true_predictions, average='weighted', zero_division=0
    )
    accuracy = accuracy_score(true_labels, true_predictions)

    entity_metrics = {}
    for label in np.unique(true_labels):
        # One-vs-rest view of the current label.
        binary_true = (true_labels == label).astype(int)
        binary_pred = (true_predictions == label).astype(int)

        # labels=[0, 1] forces a 2x2 matrix even when only one class occurs,
        # so the four-way unpacking below cannot fail.
        tn, fp, fn, tp = confusion_matrix(
            binary_true, binary_pred, labels=[0, 1]
        ).ravel()

        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0

        # Native Python types keep the result serialisable.
        entity_metrics[int(label)] = {
            'precision': float(precision_score(binary_true, binary_pred, zero_division=0)),
            'recall': float(recall_score(binary_true, binary_pred, zero_division=0)),
            'f1': float(f1_score(binary_true, binary_pred, zero_division=0)),
            'fpr': float(fpr),
            'fnr': float(fnr)
        }

    return {
        'accuracy': float(accuracy),
        'f1': float(f1),
        'precision': float(precision),
        'recall': float(recall),
        'entity_metrics': entity_metrics
    }



# Train the Model
def train_ner_model(dataset, test_data):
    """Train the module-level `model` and return the `Trainer` that ran it.

    Args:
        dataset: Dataset passed to the trainer as `train_dataset`.
        test_data: Dataset passed to the trainer as `eval_dataset`; with
            `evaluation_strategy="epoch"` it is evaluated once per epoch
            using `compute_metrics`.

    Returns:
        The `Trainer` instance after `train()` has completed.
    """
    hyperparameters = dict(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
    )
    ner_trainer = Trainer(
        model=model,
        args=TrainingArguments(**hyperparameters),
        train_dataset=dataset,
        eval_dataset=test_data,
        compute_metrics=compute_metrics,
    )
    ner_trainer.train()
    return ner_trainer

Some weights of BertForTokenClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [116]:
# Load the two splits: `dataset` is later used for training, `test_data` for evaluation.
file_path = "email_added_data2.json"
dataset = load_json_to_dataset(file_path)
test_data = load_json_to_dataset("Internship_task_data/Internship_task_data/test_data.json")

# Header and dataset summary on separate lines.
print("Loaded Dataset:", dataset, sep="\n")

Loaded Dataset:
Dataset({
    features: ['sequence', 'tokens', 'ner_tags'],
    num_rows: 28516
})


In [117]:
# Encode tag strings to ids, then tokenize and align labels for both splits.
# NOTE(review): both names are rebound to their prepared versions, so this cell
# is not re-runnable as-is — a second run would feed integer ids to
# encode_tags, whose tag_to_id lookup is keyed by tag strings.
dataset = prepare_dataset(dataset)
test_data = prepare_dataset(test_data)

Map:   0%|          | 0/28516 [00:00<?, ? examples/s]

Map:   0%|          | 0/28516 [00:00<?, ? examples/s]

Map:   0%|          | 0/3650 [00:00<?, ? examples/s]

Map:   0%|          | 0/3650 [00:00<?, ? examples/s]

In [118]:
# Train on `dataset`; `test_data` is evaluated after every epoch (see the metrics table below).
trainer = train_ner_model(dataset, test_data)



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Entity Metrics
1,0.1158,0.075507,0.975502,0.975309,0.975283,0.975502,"{0: {'precision': 0.9885505948573622, 'recall': 0.9925502536767067, 'f1': 0.9905463867969877, 'fpr': 0.05670299036999493, 'fnr': 0.007449746323293302}, 1: {'precision': 0.9094315245478036, 'recall': 0.8560136203332117, 'f1': 0.8819144271126981, 'fpr': 0.008206893322094221, 'fnr': 0.1439863796667883}, 2: {'precision': 0.9102796478508545, 'recall': 0.9299034519243486, 'f1': 0.9199869152764149, 'fpr': 0.008050837612398058, 'fnr': 0.07009654807565137}}"
2,0.088,0.060834,0.979175,0.97904,0.979019,0.979175,"{0: {'precision': 0.9895871976257787, 'recall': 0.993629182454563, 'f1': 0.991604071064168, 'fpr': 0.051571211353269135, 'fnr': 0.006370817545437031}, 1: {'precision': 0.9244848329706898, 'recall': 0.8783898820381856, 'f1': 0.9008480917934647, 'fpr': 0.006907370984358902, 'fnr': 0.12161011796181442}, 2: {'precision': 0.9295056238556108, 'recall': 0.9399550324031213, 'f1': 0.9347011244821464, 'fpr': 0.006261762587420712, 'fnr': 0.06004496759687872}}"
3,0.0801,0.058456,0.979923,0.979798,0.979808,0.979923,"{0: {'precision': 0.9899799086290518, 'recall': 0.9936420268447755, 'f1': 0.99180758727676, 'fpr': 0.049607197161682715, 'fnr': 0.006357973155224456}, 1: {'precision': 0.9299011426370523, 'recall': 0.8808220843974218, 'f1': 0.9046964776417685, 'fpr': 0.006392245012644001, 'fnr': 0.11917791560257814}, 2: {'precision': 0.9293506493506494, 'recall': 0.9464356566591721, 'f1': 0.9378153463075815, 'fpr': 0.006319849438881015, 'fnr': 0.053564343340827936}}"


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [103]:
# Run one more evaluation pass with the trainer and print the resulting metrics dict.
# NOTE(review): this cell's execution count (In [103]) is lower than the training
# cell's (In [118]), so the saved output below likely predates the latest
# training run — re-run it after training to refresh the numbers.
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

  _warn_prf(average, modifier, msg_start, len(result))


Evaluation Results: {'eval_loss': 0.09296087175607681, 'eval_accuracy': 0.9708668396715044, 'eval_f1': 0.9705975578101452, 'eval_precision': 0.9705608735485539, 'eval_recall': 0.9708668396715044, 'eval_entity_metrics': {0: {'precision': 0.986765213610731, 'recall': 0.9911759039239612, 'f1': 0.9889656409796358, 'fpr': 0.0655727318803852, 'fnr': 0.00882409607603879}, 1: {'precision': 0.8899634082592787, 'recall': 0.8281649033199562, 'f1': 0.8579527559055118, 'fpr': 0.009857637913271518, 'fnr': 0.1718350966800438}, 2: {'precision': 0.89136024685009, 'recall': 0.9169422034122471, 'f1': 0.9039702718560532, 'fpr': 0.009816677896791283, 'fnr': 0.08305779658775295}}, 'eval_runtime': 10.924, 'eval_samples_per_second': 334.127, 'eval_steps_per_second': 41.835, 'epoch': 3.0}


In [119]:
# Save the model and the tokenizer into the same directory so they can be
# reloaded together. The last expression's return value (the tokenizer file
# paths) is what the cell displays.
save_directory = "./saved_model3" 
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('./saved_model3\\tokenizer_config.json',
 './saved_model3\\special_tokens_map.json',
 './saved_model3\\vocab.txt',
 './saved_model3\\added_tokens.json',
 './saved_model3\\tokenizer.json')