In [1]:
!pip install -U datasets evaluate transformers
import json
from tqdm import tqdm
from transformers import (
    T5ForConditionalGeneration, T5Tokenizer,
    BartForConditionalGeneration, BartTokenizer,
)
from transformers import (
    MT5ForConditionalGeneration, MT5Tokenizer,
    LEDForConditionalGeneration, LEDTokenizer,
    PegasusForConditionalGeneration, PegasusTokenizer
)
import difflib
from google.colab import drive
import json
from collections import defaultdict
from transformers import pipeline
from datasets import load_dataset, Dataset, Features, Value
from evaluate import load as load_metric
from google.colab import drive
import spacy
from collections import Counter
from sklearn.metrics import precision_recall_fscore_support
import re
from collections import defaultdict


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [2]:
# Entity labels covered by the masking task, each mapped to the bracketed
# placeholder tag used for that entity type.
MASK_MAP = dict(
    PERSON="[PERSON]",
    ORG="[ORG]",
    GPE="[LOCATION]",
    DATE="[DATE]",
)

# The placeholder tags that count as valid masks when scoring.
ALLOWED_MASKS = {placeholder for placeholder in MASK_MAP.values()}

In [3]:
def normalize_tag(tag):
    """Return the allowed mask most similar to ``tag``, or ``None``.

    Similarity is judged by ``difflib.get_close_matches`` against
    ``ALLOWED_MASKS`` with a cutoff of 0.8; only the single best candidate
    is considered.
    """
    candidates = difflib.get_close_matches(tag, ALLOWED_MASKS, n=1, cutoff=0.8)
    if not candidates:
        return None
    return candidates[0]

def extract_mask_tags(text):
    """Collect the mask tags found in ``text``, mapped onto the allowed masks.

    Every bracketed run of ASCII letters and spaces is upper-cased and
    re-wrapped in brackets. Tags that are already in ``ALLOWED_MASKS`` are kept
    unchanged; any other tag is passed through ``normalize_tag`` and dropped
    when no close match is found. Order of appearance and duplicates are kept.
    """
    tags = []
    for inner in re.findall(r'\[([a-zA-Z ]+)\]', text):
        candidate = f"[{inner.upper()}]"
        # Exact hits skip the fuzzy lookup entirely.
        resolved = candidate if candidate in ALLOWED_MASKS else normalize_tag(candidate)
        if resolved:
            tags.append(resolved)
    return tags


In [4]:
def evaluate(predicted: str, reference: str):
    """Compare the mask tags of a predicted text against a reference text.

    Tags are extracted from both strings with ``extract_mask_tags`` and
    compared as multisets: per tag, the overlap of the two counts is a true
    positive, surplus predictions are false positives and surplus reference
    tags are false negatives.

    Returns:
        dict with micro-averaged ``precision``, ``recall`` and ``f1`` (0.0
        whenever the denominator is zero), ``exact_match`` (1 if the stripped
        strings are identical, else 0), the summed ``tp`` / ``fp`` / ``fn``
        and the per-tag ``confusion_matrix``.
    """
    predicted_counts = Counter(extract_mask_tags(predicted))
    reference_counts = Counter(extract_mask_tags(reference))

    # Per-tag counts; a Counter yields 0 for tags it has never seen.
    per_tag = {}
    for tag in set(predicted_counts.keys()).union(reference_counts.keys()):
        n_pred = predicted_counts[tag]
        n_ref = reference_counts[tag]
        per_tag[tag] = {
            "tp": min(n_pred, n_ref),
            "fp": max(n_pred - n_ref, 0),
            "fn": max(n_ref - n_pred, 0),
        }

    # Pool the counts over all tags (micro average).
    tp_sum = fp_sum = fn_sum = 0
    for counts in per_tag.values():
        tp_sum += counts["tp"]
        fp_sum += counts["fp"]
        fn_sum += counts["fn"]

    predicted_total = tp_sum + fp_sum
    reference_total = tp_sum + fn_sum
    precision = tp_sum / predicted_total if predicted_total else 0.0
    recall = tp_sum / reference_total if reference_total else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "exact_match": int(predicted.strip() == reference.strip()),
        "tp": tp_sum,
        "fp": fp_sum,
        "fn": fn_sum,
        "confusion_matrix": per_tag,
    }

In [5]:
# Mount Google Drive at /content/drive; the dataset file and the model
# directories referenced later in this notebook live under
# /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Read the role-aware SQuAD JSON from Drive; its rows are accessed through
# the "train" split of the returned dataset dict.
dataset = load_dataset("json", data_files="/content/drive/MyDrive/role_aware_squad.json", field=None)

# Keep rows 1000-1999 only, then split them 80% / 10% / 10% into
# train / validation / test: hold out 20% first, then halve the held-out part.
# Both splits use seed 42.
train_val = (
    dataset["train"]
    .select(range(1000, 2000))
    .train_test_split(test_size=0.2, seed=42)
)
val_test = train_val["test"].train_test_split(test_size=0.5, seed=42)

train_dataset = train_val["train"]
# validation during training / final testing
eval_dataset, test_dataset = val_test["train"], val_test["test"]

Generating train split: 0 examples [00:00, ? examples/s]

In [11]:
def gen(ds):
    """Flatten each example of ``ds`` into one row per role.

    For every example, yields three dicts (EMPLOYER, EMPLOYEE, CUSTOMER in
    that order) with the keys ``role``, ``context`` (the example's
    ``original_context``) and ``masked_context`` (the entry of
    ``role_contexts`` for that role).
    """
    roles = ("EMPLOYER", "EMPLOYEE", "CUSTOMER")
    for example in ds:
        original = example["original_context"]
        masked_by_role = example["role_contexts"]
        for role_name in roles:
            row = {"role": role_name, "context": original}
            row["masked_context"] = masked_by_role[role_name]
            yield row

# Schema of the flattened rows: all three columns are plain strings.
features = Features({
    column: Value("string")
    for column in ("role", "context", "masked_context")
})

# Flatten the train and validation splits into one row per (example, role).
format_train_dataset = Dataset.from_generator(
    generator=lambda: gen(train_dataset),
    features=features,
)
format_eval_dataset = Dataset.from_generator(
    generator=lambda: gen(eval_dataset),
    features=features,
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [12]:

# Models to evaluate, one table row per model:
# (display name, model class, tokenizer class, pretrained checkpoint id,
#  Drive directory the evaluation loop loads the model from).
model_configs = [
    {
        "name": name,
        "model_cls": model_cls,
        "tokenizer_cls": tokenizer_cls,
        "pretrained": pretrained,
        "out_dir": out_dir,
    }
    for name, model_cls, tokenizer_cls, pretrained, out_dir in (
        (
            "BART-Base",
            BartForConditionalGeneration,
            BartTokenizer,
            "facebook/bart-base",
            "/content/drive/MyDrive/models/role-aware-rag/bart-base",
        ),
        (
            "DistilBART",
            BartForConditionalGeneration,
            BartTokenizer,
            "sshleifer/distilbart-cnn-12-6",
            "/content/drive/MyDrive/models/role-aware-rag/distilbart",
        ),
        (
            "T5-Base",
            T5ForConditionalGeneration,
            T5Tokenizer,
            "t5-base",
            "/content/drive/MyDrive/models/role-aware-rag/t5-base",
        ),
        (
            "LED-Base",
            LEDForConditionalGeneration,
            LEDTokenizer,
            "allenai/led-base-16384",
            "/content/drive/MyDrive/models/role-aware-rag/led-base",
        ),
    )
]


In [13]:
def role_avg(metrics):
    """Average precision, recall, f1 and exact_match over a list of result dicts.

    Other keys in the result dicts are ignored. An empty ``metrics`` list
    raises ``ZeroDivisionError``.
    """
    averages = {}
    for key in ('precision', 'recall', 'f1', 'exact_match'):
        total = sum(entry[key] for entry in metrics)
        averages[key] = total / len(metrics)
    return averages



def print_confusion_matrix(confusion_matrix):
    """Print a left-aligned TAG / TP / FP / FN table, one row per tag.

    ``confusion_matrix`` maps each tag to a dict with ``tp``, ``fp`` and
    ``fn`` entries.
    """
    print(f"{'TAG':<10} {'TP':<5} {'FP':<5} {'FN':<5}")
    for tag in confusion_matrix:
        counts = confusion_matrix[tag]
        print(f"{tag:<10} {counts['tp']:<5} {counts['fp']:<5} {counts['fn']:<5}")


def evaluate_masked_context_prediction(model_path, dataset, tokenizer, model, batch_size=8, verbose=True):
    """Score a seq2seq checkpoint on role-aware entity masking.

    Loads the tokenizer and model from ``model_path``, generates a masked
    text for every row of ``dataset`` and scores it against the row's
    reference with ``evaluate``. The accumulated confusion matrix and the
    per-role averages are printed and also returned.

    Args:
        model_path: Location handed to ``from_pretrained`` for both the
            tokenizer and the model.
        dataset: Iterable of rows with ``role``, ``context`` and
            ``masked_context`` keys.
        tokenizer: Object exposing ``from_pretrained`` that yields the tokenizer
            (the caller in this notebook passes a tokenizer class).
        model: Object exposing ``from_pretrained`` that yields the model
            (the caller in this notebook passes a model class).
        batch_size: Number of prompts sent to the pipeline per call; also
            forwarded as the pipeline's own batch size so generation is
            actually batched on the device.
        verbose: When true (default), print every generated text.

    Returns:
        A JSON-serialisable dict with:
          * ``overall``: averages over all rows (empty dict for an empty
            dataset),
          * ``per_role``: averages grouped by the row's ``role``,
          * ``confusion_matrix``: per-tag ``tp`` / ``fp`` / ``fn`` totals.
    """
    # Keep the loaded instances under their own names rather than rebinding
    # the arguments they were loaded from.
    loaded_tokenizer = tokenizer.from_pretrained(model_path)
    loaded_model = model.from_pretrained(model_path)
    # device=0 pins generation to the first CUDA device.
    pipe = pipeline("text2text-generation", model=loaded_model, tokenizer=loaded_tokenizer, device=0)

    examples = list(dataset)
    all_scores = []
    role_scores = defaultdict(list)
    global_confusion = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})

    for start in tqdm(range(0, len(examples), batch_size)):
        batch = examples[start:start + batch_size]

        prompts = [
            f"Please mask all PERSON, ORG, GPE and DATE entities from the following text, considering the role {ex['role']}:\n{ex['context']}"
            for ex in batch
        ]

        # Without an explicit batch_size the pipeline runs the prompts one at
        # a time, even when it is handed a list.
        predictions = pipe(
            prompts,
            batch_size=batch_size,
            max_new_tokens=512,
            do_sample=False,
            num_beams=3,
        )

        for ex, pred in zip(batch, predictions):
            generated = pred["generated_text"]
            if verbose:
                print(generated)
            result = evaluate(generated, ex["masked_context"])

            # Fold this row's per-tag counts into the running totals.
            for tag, counts in result.get("confusion_matrix", {}).items():
                totals = global_confusion[tag]
                totals["tp"] += counts["tp"]
                totals["fp"] += counts["fp"]
                totals["fn"] += counts["fn"]

            role_scores[ex["role"]].append(result)
            all_scores.append(result)

    print_confusion_matrix(global_confusion)

    # Compute per-role averages
    role_results = {role: role_avg(scores) for role, scores in role_scores.items()}
    print(role_results)

    return {
        # role_avg divides by the number of rows, so guard the empty case.
        "overall": role_avg(all_scores) if all_scores else {},
        "per_role": role_results,
        "confusion_matrix": {tag: dict(counts) for tag, counts in global_confusion.items()},
    }


In [None]:
if __name__ == "__main__":
    # The evaluation function reads `role`, `context` and `masked_context`
    # from every row. Those columns are produced by `gen`; the split handed to
    # `gen` carries `original_context` / `role_contexts` instead, so the raw
    # test split has to be flattened the same way before it can be scored.
    format_test_dataset = Dataset.from_generator(
        lambda: gen(test_dataset),
        features=features,
    )

    for model in model_configs:
        print(model['name'])
        results = evaluate_masked_context_prediction(
            model['out_dir'],
            format_test_dataset,
            model['tokenizer_cls'],
            model['model_cls'],
        )

        print(json.dumps(results, indent=2))

BART-Base


Device set to use cuda:0
 26%|██▋       | 10/38 [02:25<06:15, 13.41s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
 95%|█████████▍| 36/38 [07:43<00:21, 10.64s/it]