<a href="https://colab.research.google.com/github/tubagokhan/RegNLP2025/blob/main/RIRAGEvaluationMetricFull.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!gdown 1BB21XL5geQ0Uw6gk0_uRFk6tthPQ0vSh
!gdown 1BGb3EYmhmTMVx3qTihCBpUrJ3WO2lM8H

Downloading...
From: https://drive.google.com/uc?id=1BB21XL5geQ0Uw6gk0_uRFk6tthPQ0vSh
To: /content/ObligationClassificationDataset.json
100% 545k/545k [00:00<00:00, 114MB/s]
Downloading...
From: https://drive.google.com/uc?id=1BGb3EYmhmTMVx3qTihCBpUrJ3WO2lM8H
To: /content/selected_test_samples.json
100% 135k/135k [00:00<00:00, 120MB/s]


In [2]:
import json
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Check if CUDA is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 1: Load and preprocess the data
json_path = "./ObligationClassificationDataset.json"
with open(json_path, 'r') as file:
    data = json.load(file)

texts = [item['Text'] for item in data]
labels = [1 if item['Obligation'] else 0 for item in data]  # Converting True/False to 1/0

# Step 2: Tokenization using LegalBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')

class ObligationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Splitting data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

train_dataset = ObligationDataset(X_train, y_train, tokenizer)
val_dataset = ObligationDataset(X_val, y_val, tokenizer)

# Step 3: Fine-tuning LegalBERT for sequence classification
model = AutoModelForSequenceClassification.from_pretrained('nlpaueb/legal-bert-base-uncased', num_labels=2)
model.to(device)  # Move model to the GPU

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Step 4: Train the model
trainer.train()

# Step 5: Evaluate the model
trainer.evaluate()

# Step 6: Save the model and tokenizer for future use
model.save_pretrained('./obligation-classifier-legalbert')
tokenizer.save_pretrained('./obligation-classifier-legalbert')

print("Model fine-tuning and evaluation completed.")


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4774,0.384716,0.85,0.88707,0.813814,0.97482
2,0.0512,0.080557,0.980435,0.983957,0.975265,0.992806
3,0.0662,0.02558,0.991304,0.992806,0.992806,0.992806
4,0.0002,0.034434,0.993478,0.994595,0.99639,0.992806
5,0.0208,0.100885,0.984783,0.98725,1.0,0.97482
6,0.0478,0.057023,0.991304,0.992754,1.0,0.985612


Model fine-tuning and evaluation completed.


In [3]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.0.1


In [91]:
import json
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from nltk.tokenize import sent_tokenize as sent_tokenize_uncached
import nltk
from functools import cache
import tqdm

nltk.download('punkt')

# Set up device for torch operations
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the tokenizer and model for obligation detection
model_name = './obligation-classifier-legalbert'
obligation_tokenizer = AutoTokenizer.from_pretrained(model_name)
obligation_model = AutoModelForSequenceClassification.from_pretrained(model_name)
obligation_model.to(device)
obligation_model.eval()

# Load NLI model and tokenizer for obligation coverage using Microsoft's model
coverage_nli_model = pipeline("text-classification", model="microsoft/deberta-large-mnli", device=device)

# Load NLI model and tokenizer for entailment and contradiction checks
nli_tokenizer = AutoTokenizer.from_pretrained('cross-encoder/nli-deberta-v3-xsmall')
nli_model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/nli-deberta-v3-xsmall')
nli_model.to(device)
nli_model.eval()

# Define a cached version of sentence tokenization
@cache
def sent_tokenize(passage: str):
  return sent_tokenize_uncached(passage)

def softmax(logits):
    e_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    return e_logits / np.sum(e_logits, axis=1, keepdims=True)

def get_nli_probabilities(premises, hypotheses):
    features = tokenizer(premises, hypotheses, padding=True, truncation=True, return_tensors="pt").to("cuda")
    nli_model.eval()
    with torch.no_grad():
        logits = nli_model(**features).logits.cpu().numpy()
    probabilities = softmax(logits)
    return probabilities

def get_nli_matrix(passages, answers):
    print(f"{len(passages)} passages and {len(answers)} answers.")
    entailment_matrix = np.zeros((len(passages), len(answers)))
    contradiction_matrix = np.zeros((len(passages), len(answers)))

    batch_size = 16
    for i, pas in enumerate(tqdm.tqdm(passages)):
      for b in range(0, len(answers), batch_size):
        e = b + batch_size
        probs = get_nli_probabilities([pas] * len(answers[b:e]), answers[b:e])  # Get NLI probabilities
        entailment_matrix[i, b:e] = probs[:, 1]
        contradiction_matrix[i, b:e] = probs[:, 0]
    return entailment_matrix, contradiction_matrix

def calculate_scores_from_matrix(nli_matrix, score_type='entailment'):
    if nli_matrix.size == 0:
        print("Warning: NLI matrix is empty. Returning default score of 0.")
        return 0.0  # or some other default score or handling as appropriate for your use case

    if score_type == 'entailment':
        reduced_vector = np.max(nli_matrix, axis=0)
    elif score_type == 'contradiction':
        reduced_vector = np.min(nli_matrix, axis=0)
    score = np.round(np.mean(reduced_vector), 5)
    return score

def calculate_obligation_coverage_score(passages, answers):
    obligation_sentences_source = [sent for passage in passages for sent in sent_tokenize(passage)]
    obligation_sentences_answer = [sent for answer in answers for sent in sent_tokenize(answer)]
    covered_count = 0

    for obligation in obligation_sentences_source:
        obligation_covered = False
        for answer_sentence in obligation_sentences_answer:
            nli_result = coverage_nli_model(f"{answer_sentence} [SEP] {obligation}")
            if nli_result[0]['label'].lower() == 'entailment' and nli_result[0]['score'] > 0.7:
                covered_count += 1
                obligation_covered = True
                break


    coverage_score = covered_count / len(obligation_sentences_source) if obligation_sentences_source else 0
    return coverage_score

def calculate_final_composite_score(passages, answers):
    passage_sentences = [sent for passage in passages for sent in sent_tokenize(passage)]
    answer_sentences = [sent for answer in answers for sent in sent_tokenize(answer)]
    entailment_matrix, contradiction_matrix = get_nli_matrix(passage_sentences, answer_sentences)
    entailment_score = calculate_scores_from_matrix(entailment_matrix, 'entailment')
    contradiction_score = calculate_scores_from_matrix(contradiction_matrix, 'contradiction')
    obligation_coverage_score = calculate_obligation_coverage_score(passages, answers)
    print(f"Entailment Score: {entailment_score}")
    print(f"Contradiction Score: {contradiction_score}")
    print(f"Obligation Coverage Score: {obligation_coverage_score}")


    # New formula: (O + E - C + 1) / 3
    composite_score = (obligation_coverage_score + entailment_score - contradiction_score + 1) / 3
    print(f"Final Composite Score: {composite_score}")
    return np.round(composite_score, 5)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [94]:
def main(input_file_path):
    with open(input_file_path, 'r') as file:
        test_data = json.load(file)

    # Prepare the data
    composite_scores = []
    entailment_scores = []
    contradiction_scores = []
    obligation_coverage_scores = []
    total_items = len(test_data)

    for index, item in enumerate(test_data):
        question = [item['QuestionID']]
        passages = [item['RetrievedPassage']]
        answers = [item['Answer']]
        print(f"Processing {index + 1}/{total_items}: QuestionID {question}")

        # Calculate and store scores
        passage_sentences = [sent for passage in passages for sent in sent_tokenize(passage)]
        answer_sentences = [sent for answer in answers for sent in sent_tokenize(answer)]
        entailment_matrix, contradiction_matrix = get_nli_matrix(passage_sentences, answer_sentences)
        entailment_score = calculate_scores_from_matrix(entailment_matrix, 'entailment')
        contradiction_score = calculate_scores_from_matrix(contradiction_matrix, 'contradiction')
        obligation_coverage_score = calculate_obligation_coverage_score(passages, answers)
        final_composite_score = (obligation_coverage_score + entailment_score - contradiction_score + 1) / 3

        # Append to respective lists
        entailment_scores.append(entailment_score)
        contradiction_scores.append(contradiction_score)
        obligation_coverage_scores.append(obligation_coverage_score)
        composite_scores.append(final_composite_score)

    # Calculate averages
    avg_entailment = np.mean(entailment_scores)
    avg_contradiction = np.mean(contradiction_scores)
    avg_obligation_coverage = np.mean(obligation_coverage_scores)
    avg_composite = np.mean(composite_scores)

    print("\n")
    print("Average Entailment Score:", avg_entailment)
    print("Average Contradiction Score:", avg_contradiction)
    print("Average Obligation Coverage Score:", avg_obligation_coverage)
    print("Average Final Composite Score:", avg_composite)

if __name__ == "__main__":
    main('/content/selected_test_samples.json')


Processing 1/100: QuestionID ['480fBQ']
2 passages and 3 answers.


100%|██████████| 2/2 [00:00<00:00, 41.04it/s]


Processing 2/100: QuestionID ['412Gsb']
2 passages and 2 answers.


100%|██████████| 2/2 [00:00<00:00, 44.16it/s]


Processing 3/100: QuestionID ['498QLc']
2 passages and 8 answers.


100%|██████████| 2/2 [00:00<00:00, 41.95it/s]


Processing 4/100: QuestionID ['717Ccv']
1 passages and 3 answers.


100%|██████████| 1/1 [00:00<00:00, 38.81it/s]


Processing 5/100: QuestionID ['444QbG']
1 passages and 4 answers.


100%|██████████| 1/1 [00:00<00:00, 40.58it/s]


Processing 6/100: QuestionID ['004VzI']
2 passages and 2 answers.


100%|██████████| 2/2 [00:00<00:00, 42.64it/s]


Processing 7/100: QuestionID ['604fIU']
1 passages and 4 answers.


100%|██████████| 1/1 [00:00<00:00, 40.77it/s]


Processing 8/100: QuestionID ['676pPm']
2 passages and 3 answers.


100%|██████████| 2/2 [00:00<00:00, 37.19it/s]


Processing 9/100: QuestionID ['363ksV']
1 passages and 4 answers.


100%|██████████| 1/1 [00:00<00:00, 37.60it/s]


Processing 10/100: QuestionID ['522OXp']
1 passages and 3 answers.


100%|██████████| 1/1 [00:00<00:00, 41.39it/s]


Processing 11/100: QuestionID ['374xcL']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 40.75it/s]


Processing 12/100: QuestionID ['068FmL']
1 passages and 1 answers.


100%|██████████| 1/1 [00:00<00:00, 43.41it/s]


Processing 13/100: QuestionID ['461TVG']
2 passages and 2 answers.


100%|██████████| 2/2 [00:00<00:00, 41.39it/s]


Processing 14/100: QuestionID ['963iPV']
1 passages and 1 answers.


100%|██████████| 1/1 [00:00<00:00, 39.67it/s]


Processing 15/100: QuestionID ['047Pro']
1 passages and 3 answers.


100%|██████████| 1/1 [00:00<00:00, 36.63it/s]


Processing 16/100: QuestionID ['668dtU']
2 passages and 3 answers.


100%|██████████| 2/2 [00:00<00:00, 38.26it/s]


Processing 17/100: QuestionID ['167EUP']
1 passages and 4 answers.


100%|██████████| 1/1 [00:00<00:00, 31.37it/s]


Processing 18/100: QuestionID ['416Qvz']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 38.84it/s]


Processing 19/100: QuestionID ['892Nnp']
2 passages and 4 answers.


100%|██████████| 2/2 [00:00<00:00, 37.55it/s]


Processing 20/100: QuestionID ['960tSW']
1 passages and 1 answers.


100%|██████████| 1/1 [00:00<00:00, 39.11it/s]


Processing 21/100: QuestionID ['879QAk']
2 passages and 5 answers.


100%|██████████| 2/2 [00:00<00:00, 39.58it/s]


Processing 22/100: QuestionID ['764gdj']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 41.56it/s]


Processing 23/100: QuestionID ['603Wpb']
1 passages and 3 answers.


100%|██████████| 1/1 [00:00<00:00, 39.67it/s]


Processing 24/100: QuestionID ['816SDV']
2 passages and 3 answers.


100%|██████████| 2/2 [00:00<00:00, 39.46it/s]


Processing 25/100: QuestionID ['961KBv']
2 passages and 4 answers.


100%|██████████| 2/2 [00:00<00:00, 40.37it/s]


Processing 26/100: QuestionID ['138hAv']
1 passages and 7 answers.


100%|██████████| 1/1 [00:00<00:00, 21.93it/s]


Processing 27/100: QuestionID ['484ZhU']
2 passages and 2 answers.


100%|██████████| 2/2 [00:00<00:00, 42.81it/s]


Processing 28/100: QuestionID ['758qWf']
2 passages and 2 answers.


100%|██████████| 2/2 [00:00<00:00, 40.20it/s]


Processing 29/100: QuestionID ['912NoL']
2 passages and 2 answers.


100%|██████████| 2/2 [00:00<00:00, 41.21it/s]


Processing 30/100: QuestionID ['025iMN']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 42.13it/s]


Processing 31/100: QuestionID ['279suk']
2 passages and 3 answers.


100%|██████████| 2/2 [00:00<00:00, 40.61it/s]


Processing 32/100: QuestionID ['032DMK']
2 passages and 2 answers.


100%|██████████| 2/2 [00:00<00:00, 41.56it/s]


Processing 33/100: QuestionID ['782Qku']
1 passages and 6 answers.


100%|██████████| 1/1 [00:00<00:00, 28.47it/s]


Processing 34/100: QuestionID ['815PqS']
1 passages and 7 answers.


100%|██████████| 1/1 [00:00<00:00, 25.55it/s]


Processing 35/100: QuestionID ['883Epz']
1 passages and 1 answers.


100%|██████████| 1/1 [00:00<00:00, 39.73it/s]


Processing 36/100: QuestionID ['458NMV']
1 passages and 7 answers.


100%|██████████| 1/1 [00:00<00:00, 22.11it/s]


Processing 37/100: QuestionID ['478ooN']
3 passages and 3 answers.


100%|██████████| 3/3 [00:00<00:00, 41.42it/s]


Processing 38/100: QuestionID ['583bFk']
3 passages and 5 answers.


100%|██████████| 3/3 [00:00<00:00, 42.18it/s]


Processing 39/100: QuestionID ['789xKp']
3 passages and 2 answers.


100%|██████████| 3/3 [00:00<00:00, 44.54it/s]


Processing 40/100: QuestionID ['806xAh']
1 passages and 1 answers.


100%|██████████| 1/1 [00:00<00:00, 41.42it/s]


Processing 41/100: QuestionID ['263MYD']
1 passages and 3 answers.


100%|██████████| 1/1 [00:00<00:00, 43.61it/s]


Processing 42/100: QuestionID ['512CCd']
1 passages and 4 answers.


100%|██████████| 1/1 [00:00<00:00, 38.55it/s]


Processing 43/100: QuestionID ['118NHW']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 41.11it/s]


Processing 44/100: QuestionID ['495bWI']
2 passages and 2 answers.


100%|██████████| 2/2 [00:00<00:00, 43.22it/s]


Processing 45/100: QuestionID ['721KPW']
2 passages and 8 answers.


100%|██████████| 2/2 [00:00<00:00, 38.93it/s]


Processing 46/100: QuestionID ['071LNb']
2 passages and 5 answers.


100%|██████████| 2/2 [00:00<00:00, 41.09it/s]


Processing 47/100: QuestionID ['933OCr']
1 passages and 1 answers.


100%|██████████| 1/1 [00:00<00:00, 42.25it/s]


Processing 48/100: QuestionID ['920ODC']
1 passages and 1 answers.


100%|██████████| 1/1 [00:00<00:00, 36.42it/s]


Processing 49/100: QuestionID ['273Xrh']
1 passages and 3 answers.


100%|██████████| 1/1 [00:00<00:00, 38.88it/s]


Processing 50/100: QuestionID ['740LnA']
1 passages and 1 answers.


100%|██████████| 1/1 [00:00<00:00, 41.02it/s]


Processing 51/100: QuestionID ['024WGu']
2 passages and 4 answers.


100%|██████████| 2/2 [00:00<00:00, 41.96it/s]


Processing 52/100: QuestionID ['235tie']
2 passages and 3 answers.


100%|██████████| 2/2 [00:00<00:00, 31.19it/s]


Processing 53/100: QuestionID ['763Wbw']
2 passages and 2 answers.


100%|██████████| 2/2 [00:00<00:00, 40.67it/s]


Processing 54/100: QuestionID ['479LCb']
2 passages and 4 answers.


100%|██████████| 2/2 [00:00<00:00, 38.94it/s]


Processing 55/100: QuestionID ['960LTI']
2 passages and 2 answers.


100%|██████████| 2/2 [00:00<00:00, 36.15it/s]


Processing 56/100: QuestionID ['178SZD']
3 passages and 4 answers.


100%|██████████| 3/3 [00:00<00:00, 39.57it/s]


Processing 57/100: QuestionID ['904Tvk']
1 passages and 4 answers.


100%|██████████| 1/1 [00:00<00:00, 26.52it/s]


Processing 58/100: QuestionID ['425NlW']
1 passages and 9 answers.


100%|██████████| 1/1 [00:00<00:00, 21.80it/s]


Processing 59/100: QuestionID ['317uvO']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 39.40it/s]


Processing 60/100: QuestionID ['209TfQ']
2 passages and 4 answers.


100%|██████████| 2/2 [00:00<00:00, 41.54it/s]


Processing 61/100: QuestionID ['486Tlx']
2 passages and 6 answers.


100%|██████████| 2/2 [00:00<00:00, 39.89it/s]


Processing 62/100: QuestionID ['653IYi']
2 passages and 4 answers.


100%|██████████| 2/2 [00:00<00:00, 41.35it/s]


Processing 63/100: QuestionID ['899UQp']
1 passages and 3 answers.


100%|██████████| 1/1 [00:00<00:00, 37.91it/s]


Processing 64/100: QuestionID ['404dcd']
4 passages and 4 answers.


100%|██████████| 4/4 [00:00<00:00, 42.30it/s]


Processing 65/100: QuestionID ['372XxU']
2 passages and 2 answers.


100%|██████████| 2/2 [00:00<00:00, 41.80it/s]


Processing 66/100: QuestionID ['111jCi']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 41.90it/s]


Processing 67/100: QuestionID ['598Qor']
1 passages and 1 answers.


100%|██████████| 1/1 [00:00<00:00, 42.74it/s]


Processing 68/100: QuestionID ['999Mea']
1 passages and 1 answers.


100%|██████████| 1/1 [00:00<00:00, 39.95it/s]


Processing 69/100: QuestionID ['400KWE']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 42.40it/s]


Processing 70/100: QuestionID ['567sGE']
5 passages and 3 answers.


100%|██████████| 5/5 [00:00<00:00, 41.93it/s]


Processing 71/100: QuestionID ['997Bdm']
1 passages and 3 answers.


100%|██████████| 1/1 [00:00<00:00, 43.07it/s]


Processing 72/100: QuestionID ['745UOH']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 43.82it/s]


Processing 73/100: QuestionID ['725dmx']
1 passages and 5 answers.


100%|██████████| 1/1 [00:00<00:00, 41.86it/s]


Processing 74/100: QuestionID ['468lyv']
1 passages and 6 answers.


100%|██████████| 1/1 [00:00<00:00, 42.82it/s]


Processing 75/100: QuestionID ['254Ntd']
1 passages and 3 answers.


100%|██████████| 1/1 [00:00<00:00, 41.25it/s]


Processing 76/100: QuestionID ['421nme']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 41.43it/s]


Processing 77/100: QuestionID ['030MYu']
1 passages and 10 answers.


100%|██████████| 1/1 [00:00<00:00, 15.76it/s]


Processing 78/100: QuestionID ['341lax']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 38.60it/s]


Processing 79/100: QuestionID ['529moD']
1 passages and 3 answers.


100%|██████████| 1/1 [00:00<00:00, 38.87it/s]


Processing 80/100: QuestionID ['026gPX']
1 passages and 4 answers.


100%|██████████| 1/1 [00:00<00:00, 41.38it/s]


Processing 81/100: QuestionID ['460Cff']
1 passages and 4 answers.


100%|██████████| 1/1 [00:00<00:00, 41.16it/s]


Processing 82/100: QuestionID ['362RPB']
1 passages and 6 answers.


100%|██████████| 1/1 [00:00<00:00, 28.95it/s]


Processing 83/100: QuestionID ['220DZp']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 40.43it/s]


Processing 84/100: QuestionID ['149sax']
1 passages and 3 answers.


100%|██████████| 1/1 [00:00<00:00, 36.87it/s]


Processing 85/100: QuestionID ['925ylk']
1 passages and 3 answers.


100%|██████████| 1/1 [00:00<00:00, 38.98it/s]


Processing 86/100: QuestionID ['267ELG']
1 passages and 8 answers.


100%|██████████| 1/1 [00:00<00:00, 20.82it/s]


Processing 87/100: QuestionID ['377uHs']
2 passages and 4 answers.


100%|██████████| 2/2 [00:00<00:00, 39.89it/s]


Processing 88/100: QuestionID ['607iAm']
2 passages and 5 answers.


100%|██████████| 2/2 [00:00<00:00, 40.55it/s]


Processing 89/100: QuestionID ['852bat']
2 passages and 5 answers.


100%|██████████| 2/2 [00:00<00:00, 40.74it/s]


Processing 90/100: QuestionID ['427eRp']
2 passages and 4 answers.


100%|██████████| 2/2 [00:00<00:00, 38.83it/s]


Processing 91/100: QuestionID ['535Les']
2 passages and 2 answers.


100%|██████████| 2/2 [00:00<00:00, 38.75it/s]


Processing 92/100: QuestionID ['227gxK']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 38.84it/s]


Processing 93/100: QuestionID ['918qXF']
4 passages and 3 answers.


100%|██████████| 4/4 [00:00<00:00, 42.16it/s]


Processing 94/100: QuestionID ['490lYX']
1 passages and 9 answers.


100%|██████████| 1/1 [00:00<00:00, 23.76it/s]


Processing 95/100: QuestionID ['665fcY']
2 passages and 3 answers.


100%|██████████| 2/2 [00:00<00:00, 41.63it/s]


Processing 96/100: QuestionID ['713MyI']
2 passages and 3 answers.


100%|██████████| 2/2 [00:00<00:00, 41.66it/s]


Processing 97/100: QuestionID ['812zsB']
2 passages and 2 answers.


100%|██████████| 2/2 [00:00<00:00, 42.48it/s]


Processing 98/100: QuestionID ['175pJE']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 39.32it/s]


Processing 99/100: QuestionID ['516QYq']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 38.86it/s]


Processing 100/100: QuestionID ['395PNP']
2 passages and 5 answers.


100%|██████████| 2/2 [00:00<00:00, 40.26it/s]




Average Entailment Score: 0.7024035
Average Contradiction Score: 0.1372604
Average Obligation Coverage Score: 0.34450000000000003
Average Final Composite Score: 0.6365476999999999


In [95]:
if __name__ == "__main__":
    main('/content/GoldenQuestionAnswerforEvaluationMetric.json')

Processing 1/21: QuestionID ['1']
3 passages and 4 answers.


100%|██████████| 3/3 [00:00<00:00, 25.40it/s]


Processing 2/21: QuestionID ['2']
6 passages and 7 answers.


100%|██████████| 6/6 [00:00<00:00, 23.80it/s]


Processing 3/21: QuestionID ['3']
11 passages and 12 answers.


100%|██████████| 11/11 [00:00<00:00, 24.43it/s]


Processing 4/21: QuestionID ['4']
3 passages and 4 answers.


100%|██████████| 3/3 [00:00<00:00, 30.56it/s]


Processing 5/21: QuestionID ['6']
4 passages and 5 answers.


100%|██████████| 4/4 [00:00<00:00, 34.40it/s]


Processing 6/21: QuestionID ['8']
1 passages and 2 answers.


100%|██████████| 1/1 [00:00<00:00, 35.74it/s]


Processing 7/21: QuestionID ['10']
9 passages and 10 answers.


100%|██████████| 9/9 [00:00<00:00, 11.98it/s]


Processing 8/21: QuestionID ['11']
2 passages and 3 answers.


100%|██████████| 2/2 [00:00<00:00, 36.49it/s]


Processing 9/21: QuestionID ['13']
92 passages and 93 answers.


100%|██████████| 92/92 [01:14<00:00,  1.23it/s]


Processing 10/21: QuestionID ['14']
3 passages and 4 answers.


100%|██████████| 3/3 [00:00<00:00, 39.40it/s]


Processing 11/21: QuestionID ['15']
19 passages and 28 answers.


100%|██████████| 19/19 [00:04<00:00,  4.50it/s]


Processing 12/21: QuestionID ['16']
19 passages and 20 answers.


100%|██████████| 19/19 [00:06<00:00,  2.97it/s]


Processing 13/21: QuestionID ['17']
36 passages and 40 answers.


100%|██████████| 36/36 [00:03<00:00,  9.82it/s]


Processing 14/21: QuestionID ['18']
4 passages and 8 answers.


100%|██████████| 4/4 [00:00<00:00, 25.68it/s]


Processing 15/21: QuestionID ['19']
6 passages and 7 answers.


100%|██████████| 6/6 [00:00<00:00, 31.89it/s]


Processing 16/21: QuestionID ['20']
15 passages and 19 answers.


100%|██████████| 15/15 [00:01<00:00, 10.44it/s]


Processing 17/21: QuestionID ['21']
4 passages and 6 answers.


100%|██████████| 4/4 [00:00<00:00, 15.05it/s]


Processing 18/21: QuestionID ['22']
4 passages and 10 answers.


100%|██████████| 4/4 [00:00<00:00,  8.93it/s]


Processing 19/21: QuestionID ['23']
7 passages and 9 answers.


100%|██████████| 7/7 [00:00<00:00,  8.92it/s]


Processing 20/21: QuestionID ['24']
11 passages and 17 answers.


100%|██████████| 11/11 [00:02<00:00,  4.39it/s]


Processing 21/21: QuestionID ['25']
3 passages and 4 answers.


100%|██████████| 3/3 [00:00<00:00, 39.39it/s]




Average Entailment Score: 0.8372866666666667
Average Contradiction Score: 0.06893619047619048
Average Obligation Coverage Score: 1.0
Average Final Composite Score: 0.9227834920634921
