<a href="https://colab.research.google.com/github/tubagokhan/RegNLP2025/blob/main/RePASSCorrectVersion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers




In [None]:
import json
import os
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Pick the compute device: CUDA when a GPU is visible to torch, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 1: Folder path input (dataset location; also the root for training outputs)
folder_path = "/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/TestSet"

# Step 2: Load and preprocess the data
json_path = os.path.join(folder_path, "ObligationClassificationDataset.json")
# Read as UTF-8 explicitly so decoding does not depend on the platform's default encoding.
with open(json_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Each record supplies the sentence under 'Text' and a truthy/falsy 'Obligation' flag.
texts = [item['Text'] for item in data]
labels = [1 if item['Obligation'] else 0 for item in data]  # Converting True/False to 1/0

# Step 3: Tokenization using LegalBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')

class ObligationDataset(Dataset):
    """Torch dataset that pairs raw texts with integer labels and tokenizes on access.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of class ids, index-aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer in this notebook).
        max_len: Length every example is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of examples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, its flattened token ids / attention mask, and the label tensor."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]

        # Fixed-length encoding; tensors come back with a leading batch axis of size 1.
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sample_text, **encode_options)

        item = {'text': sample_text}
        for field in ('input_ids', 'attention_mask'):
            # Drop the batch axis so each field is a 1-D tensor.
            item[field] = encoded[field].flatten()
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item

# Hold out 20% of the examples for validation; random_state fixes the shuffle so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Wrap both splits so examples are tokenized lazily, one item at a time.
train_dataset = ObligationDataset(X_train, y_train, tokenizer)
val_dataset = ObligationDataset(X_val, y_val, tokenizer)

# Step 4: Fine-tuning LegalBERT for sequence classification
# num_labels=2 matches the 0/1 labels derived from the 'Obligation' flag.
model = AutoModelForSequenceClassification.from_pretrained('nlpaueb/legal-bert-base-uncased', num_labels=2)
model.to(device)  # Move model to the selected device (GPU when available, otherwise CPU)

# Ensure the directories exist for checkpoints (results), logs, and the final saved classifier
output_dir = os.path.join(folder_path, 'results')
log_dir = os.path.join(folder_path, 'logs')
save_dir = os.path.join(folder_path, 'obligation-classifier-legalbert')

os.makedirs(output_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)
os.makedirs(save_dir, exist_ok=True)

# Evaluate and checkpoint once per epoch (the two strategies match, as load_best_model_at_end expects).
# No metric_for_best_model is set here, so "best" is whatever the Trainer defaults to.
# NOTE(review): warmup_steps=500 may exceed the total number of optimizer steps on a small dataset — confirm.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=log_dir,
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``; the last three
        are computed with ``average='binary'``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    prf = precision_recall_fscore_support(y_true, y_hat, average='binary')
    metrics = {'accuracy': accuracy_score(y_true, y_hat)}
    # prf is (precision, recall, f1, support); support is not reported.
    metrics['f1'] = prf[2]
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    return metrics

# Stop training when the monitored evaluation metric has not improved for 3 consecutive evaluations.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Step 5: Train the model
trainer.train()

# Step 6: Evaluate the model
# The call sits in the middle of the cell, so its return value would not be displayed;
# keep the metrics and print them explicitly.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Step 7: Save the model and tokenizer for future use
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print("Model fine-tuning and evaluation completed.")


In [None]:
import os
import json
import csv
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from nltk.tokenize import sent_tokenize as sent_tokenize_uncached
import nltk
from functools import cache

# Sentence splitting needs the Punkt models. Newer NLTK releases load them from the
# 'punkt_tab' package instead of the pickled 'punkt' one, so fetch both to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Set up device for torch operations
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Define the folder path
folder_path = "/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/TestSet"

# Load the tokenizer and model for obligation detection
# (the fine-tuned classifier saved under <folder_path>/obligation-classifier-legalbert)
model_name = os.path.join(folder_path, 'obligation-classifier-legalbert')
obligation_tokenizer = AutoTokenizer.from_pretrained(model_name)
obligation_model = AutoModelForSequenceClassification.from_pretrained(model_name)
obligation_model.to(device)
obligation_model.eval()

# Load NLI model and tokenizer for obligation coverage using Microsoft's model
coverage_nli_model = pipeline("text-classification", model="microsoft/deberta-large-mnli", device=device)

# Load NLI model and tokenizer for entailment and contradiction checks
nli_tokenizer = AutoTokenizer.from_pretrained('cross-encoder/nli-deberta-v3-xsmall')
nli_model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/nli-deberta-v3-xsmall')
nli_model.to(device)
nli_model.eval()

# NOTE(review): this tokenizer is not referenced by any scoring function in this cell — confirm it is still needed.
tokenizer = AutoTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')

# Define a cached version of sentence tokenization
@cache
def sent_tokenize(passage: str):
    """Split ``passage`` into sentences, memoizing the result per distinct input string.

    The cached list object is shared between calls with the same passage, so callers
    should treat it as read-only.
    """
    sentences = sent_tokenize_uncached(passage)
    return sentences

def softmax(logits):
    """Row-wise softmax of a 2-D array of logits.

    The per-row maximum is subtracted before exponentiation so large logits
    do not overflow; the result has the same shape as the input and each row sums to 1.
    """
    row_max = np.max(logits, axis=1, keepdims=True)
    exp_shifted = np.exp(logits - row_max)
    row_total = np.sum(exp_shifted, axis=1, keepdims=True)
    return exp_shifted / row_total

def get_nli_probabilities(premises, hypotheses):
    """Run the NLI cross-encoder on aligned premise/hypothesis pairs.

    Args:
        premises: List of premise strings.
        hypotheses: List of hypothesis strings, index-aligned with ``premises``.

    Returns:
        numpy array with one row per pair holding the softmax-normalized class probabilities.
    """
    nli_model.eval()
    encoded = nli_tokenizer(premises, hypotheses, padding=True, truncation=True, return_tensors="pt")
    encoded = encoded.to(device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        raw_logits = nli_model(**encoded).logits.cpu().numpy()
    return softmax(raw_logits)

def get_nli_matrix(passages, answers):
    """Build passage-by-answer matrices of entailment and contradiction probabilities.

    Every passage is used as the premise against every answer (hypothesis); answers are
    scored in chunks of 16 per model call.

    Returns:
        (entailment_matrix, contradiction_matrix), each of shape
        ``(len(passages), len(answers))``.
    """
    shape = (len(passages), len(answers))
    entailment_matrix = np.zeros(shape)
    contradiction_matrix = np.zeros(shape)

    batch_size = 16
    chunk_starts = range(0, len(answers), batch_size)
    for row, premise in enumerate(passages):
        for start in chunk_starts:
            stop = start + batch_size
            chunk = answers[start:stop]
            probs = get_nli_probabilities([premise] * len(chunk), chunk)  # Get NLI probabilities
            # Column 1 is read as entailment and column 0 as contradiction.
            entailment_matrix[row, start:stop] = probs[:, 1]
            contradiction_matrix[row, start:stop] = probs[:, 0]
    return entailment_matrix, contradiction_matrix

def calculate_scores_from_matrix(nli_matrix, score_type='entailment'):
    """Reduce an NLI probability matrix to a single score.

    For every column the maximum over rows is taken, and the column maxima are averaged.
    With the matrices built by ``get_nli_matrix`` (rows = passage sentences,
    columns = answer sentences) this is the mean, over answer sentences, of the
    strongest probability any passage sentence gives it.

    Args:
        nli_matrix: 2-D numpy array of probabilities.
        score_type: ``'entailment'`` or ``'contradiction'``. Both use the same reduction;
            the argument documents which matrix is being reduced.

    Returns:
        The score rounded to 5 decimals, or ``0.0`` when the matrix is empty.

    Raises:
        ValueError: If ``score_type`` is not one of the supported names (previously this
            surfaced as an ``UnboundLocalError``).
    """
    if nli_matrix.size == 0:
        return 0.0  # nothing to score (no passage or no answer sentences)

    if score_type not in ('entailment', 'contradiction'):
        raise ValueError(
            f"Unknown score_type {score_type!r}; expected 'entailment' or 'contradiction'"
        )

    best_per_column = np.max(nli_matrix, axis=0)
    return np.round(np.mean(best_per_column), 5)

def calculate_obligation_coverage_score(passages, answers):
    """Fraction of passage sentences that some answer sentence entails.

    Every sentence of every passage is treated as an obligation to be covered. A sentence
    counts as covered when, for at least one answer sentence, the coverage NLI pipeline's
    top label is ``entailment`` with a score above 0.7.

    Returns:
        ``covered / total`` passage sentences, or ``0`` when the passages contain no sentences.
    """
    source_sentences = [sent for passage in passages for sent in sent_tokenize(passage)]
    answer_sentences = [sent for answer in answers for sent in sent_tokenize(answer)]

    if not source_sentences:
        return 0

    def _is_covered(obligation):
        # Stop at the first answer sentence that entails this obligation.
        for answer_sentence in answer_sentences:
            top = coverage_nli_model(f"{answer_sentence} [SEP] {obligation}")[0]
            if top['label'].lower() == 'entailment' and top['score'] > 0.7:
                return True
        return False

    covered_count = sum(1 for obligation in source_sentences if _is_covered(obligation))
    return covered_count / len(source_sentences)

def calculate_final_composite_score(passages, answers):
    """Compute the composite score and its three components for one item.

    Returns:
        ``(composite, entailment, contradiction, obligation_coverage)`` where
        ``composite = (obligation_coverage + entailment - contradiction + 1) / 3``,
        rounded to 5 decimals.
    """
    # Sentence-level views of the passages and answers.
    passage_sentences = []
    for passage in passages:
        passage_sentences.extend(sent_tokenize(passage))
    answer_sentences = []
    for answer in answers:
        answer_sentences.extend(sent_tokenize(answer))

    # Pairwise NLI probabilities, then one score per matrix.
    nli_matrices = get_nli_matrix(passage_sentences, answer_sentences)
    entailment_score = calculate_scores_from_matrix(nli_matrices[0], 'entailment')
    contradiction_score = calculate_scores_from_matrix(nli_matrices[1], 'contradiction')

    # Coverage works on the original passages/answers and splits sentences itself.
    obligation_coverage_score = calculate_obligation_coverage_score(passages, answers)

    # Contradiction is a penalty; the +1 offset and /3 are the original formula's scaling.
    numerator = obligation_coverage_score + entailment_score - contradiction_score + 1
    composite_score = np.round(numerator / 3, 5)

    return composite_score, entailment_score, contradiction_score, obligation_coverage_score

def calculate_average_scores_from_csv(output_file_csv):
    """Average the four score columns of a results CSV.

    A row is counted only if all four score columns parse as floats. A row with any
    unparsable or missing value (for example a repeated header line or a truncated row)
    is reported and skipped as a whole, so the four averages always cover the same rows.

    Args:
        output_file_csv: Path to a CSV with a header containing ``entailment_score``,
            ``contradiction_score``, ``obligation_coverage_score`` and ``composite_score``.

    Returns:
        ``(avg_entailment, avg_contradiction, avg_obligation_coverage, avg_composite)``;
        each is ``0.0`` when no valid row exists.
    """
    score_columns = (
        'entailment_score',
        'contradiction_score',
        'obligation_coverage_score',
        'composite_score',
    )
    collected = {column: [] for column in score_columns}

    # newline='' is what the csv module expects for files it reads.
    with open(output_file_csv, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            try:
                # Parse the whole row first so a bad later column cannot leave a partial row behind.
                parsed = [float(row[column]) for column in score_columns]
            except (TypeError, ValueError):
                # ValueError: non-numeric text (e.g. header row); TypeError: short row (value is None).
                print(f"Skipping invalid row: {row}")
                continue
            for column, value in zip(score_columns, parsed):
                collected[column].append(value)

    avg_entailment, avg_contradiction, avg_obligation_coverage, avg_composite = (
        np.mean(collected[column]) if collected[column] else 0.0
        for column in score_columns
    )
    return avg_entailment, avg_contradiction, avg_obligation_coverage, avg_composite


def _load_processed_results(csv_path):
    """Return ``(processed_ids, row_count)`` for an existing results CSV (empty if absent)."""
    processed_ids = set()
    row_count = 0
    if not os.path.exists(csv_path):
        return processed_ids, row_count

    with open(csv_path, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            stored_id = row.get('QuestionID')
            # A repeated header line is not a result; do not treat it as a processed item.
            if stored_id is None or stored_id == 'QuestionID':
                continue
            processed_ids.add(stored_id)
            row_count += 1
    return processed_ids, row_count


def main(input_file_path, group_method_name):
    """Score every answered item of a JSON file and write per-item and average results.

    Results are appended to ``<folder_path>/<group_method_name>/results.csv`` so an
    interrupted run can be resumed: items whose QuestionID is already in the CSV are
    skipped, as are items with a missing or blank ``Answer``. The averages over the whole
    CSV are printed and written to ``results.txt`` in the same directory.

    Args:
        input_file_path: JSON file holding a list of items with ``QuestionID``,
            ``RetrievedPassages`` (string or list of strings) and ``Answer``.
        group_method_name: Name of the output sub-directory under ``folder_path``.
    """
    # Create a directory with the group_method_name in the folder path
    output_dir = os.path.join(folder_path, group_method_name)
    os.makedirs(output_dir, exist_ok=True)

    # Define the paths for result files
    output_file_csv = os.path.join(output_dir, 'results.csv')
    output_file_txt = os.path.join(output_dir, 'results.txt')

    # Resume support: collect what an earlier run already stored.
    processed_question_ids, saved_items_count = _load_processed_results(output_file_csv)

    # Write the header only into a new or empty file. Keying this on "no processed ids"
    # would add a second header to a file that holds a header but no rows yet.
    needs_header = (
        not os.path.exists(output_file_csv) or os.path.getsize(output_file_csv) == 0
    )

    with open(input_file_path, 'r', encoding='utf-8') as file:
        test_data = json.load(file)

    total_items = len(test_data)

    # Open the CSV file for appending results
    with open(output_file_csv, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if needs_header:
            writer.writerow(['QuestionID', 'entailment_score', 'contradiction_score', 'obligation_coverage_score', 'composite_score'])

        for item in test_data:
            question_id = item['QuestionID']

            # IDs read back from the CSV are strings, so compare in string form;
            # otherwise numeric IDs would never be recognized as processed.
            if str(question_id) in processed_question_ids:
                continue

            # Skip if the "Answer" is null or empty
            answer = item.get('Answer')
            if not answer or not answer.strip():
                continue

            # Merge "RetrievedPassages" if it's a list
            retrieved = item['RetrievedPassages']
            if isinstance(retrieved, list):
                retrieved = " ".join(retrieved)

            passages = [retrieved]
            answers = [answer]
            composite_score, entailment_score, contradiction_score, obligation_coverage_score = calculate_final_composite_score(passages, answers)

            # Write the result and flush so an interrupted run keeps every finished row.
            writer.writerow([question_id, entailment_score, contradiction_score, obligation_coverage_score, composite_score])
            csvfile.flush()

            # Increment the saved items count and print status
            saved_items_count += 1
            print(f"{saved_items_count}/{total_items}")

    # Calculate average scores from the CSV file
    avg_entailment, avg_contradiction, avg_obligation_coverage, avg_composite = calculate_average_scores_from_csv(output_file_csv)

    # Print and save results to a text file
    results = (
        f"Average Entailment Score: {avg_entailment}\n"
        f"Average Contradiction Score: {avg_contradiction}\n"
        f"Average Obligation Coverage Score: {avg_obligation_coverage}\n"
        f"Average Final Composite Score: {avg_composite}\n"
    )

    print(results)

    with open(output_file_txt, 'w') as txtfile:
        txtfile.write(results)

    print(f"Processing complete. Results saved to {output_dir}")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


config.json:   0%|          | 0.00/729 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/419 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/283M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:


# Score the sample file; main() writes its results into the 'SAMPLE' sub-directory of folder_path.
# In a notebook kernel __name__ is "__main__", so this guard always runs the cell body.
if __name__ == "__main__":
  group_methodName = 'SAMPLE'
  input_file = '/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/TestSet/sample.json'
  main(input_file, group_methodName)


Average Entailment Score: 0.852965
Average Contradiction Score: 0.18055700000000002
Average Obligation Coverage Score: 0.85
Average Final Composite Score: 0.840804

Processing complete. Results saved to /content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/TestSet/SAMPLE


In [None]:
# Score the golden question/answer file; results go to the 'REALGOLD2' sub-directory of folder_path.
if __name__ == "__main__":
  group_methodName = 'REALGOLD2'
  input_file = '/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/TestSet/GoldenQuestionAnswerforEvaluationMetric.json'
  main(input_file, group_methodName)
  # NOTE(review): the original note here read "correct tokenizer" — presumably this run
  # used a corrected tokenizer setup compared with the 'REALGOLD' run on the same input; confirm.

1/21
2/21
3/21
4/21
5/21
6/21
7/21
8/21
9/21
10/21
11/21
12/21
13/21
14/21
15/21
16/21
17/21
18/21
19/21
20/21
21/21
Average Entailment Score: 0.7638504761904761
Average Contradiction Score: 0.3336495238095238
Average Obligation Coverage Score: 1.0
Average Final Composite Score: 0.8100671428571428

Processing complete. Results saved to /content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/TestSet/REALGOLD2


In [None]:
# Score the golden question/answer file; results go to the 'REALGOLD' sub-directory of folder_path.
# NOTE(review): this uses the same input file as the 'REALGOLD2' run but the recorded averages
# differ, so the two runs were presumably produced by different versions of the scoring code — confirm.
if __name__ == "__main__":
  group_methodName = 'REALGOLD'
  input_file = '/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/TestSet/GoldenQuestionAnswerforEvaluationMetric.json'
  main(input_file, group_methodName)

1/21
2/21
3/21
4/21
5/21
6/21
7/21
8/21
9/21
10/21
11/21
12/21
13/21
14/21
15/21
16/21
17/21
18/21
19/21
20/21
21/21
Average Entailment Score: 0.8372866666666667
Average Contradiction Score: 0.2598147619047619
Average Obligation Coverage Score: 1.0
Average Final Composite Score: 0.8591576190476189

Processing complete. Results saved to /content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/TestSet/REALGOLD


In [10]:
# Score the rank-fusion answers for the hidden data; results go to the 'hidden-rankfusion'
# sub-directory of folder_path (under TestSet, not next to the HiddenData input).
if __name__ == "__main__":
  group_methodName = 'hidden-rankfusion'
  input_file = '/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/HiddenData/retrieval_results.rank_fusion_bm25_answers.json'
  main(input_file, group_methodName)

Average Entailment Score: 0.31177562780269064
Average Contradiction Score: 0.12474058295964126
Average Obligation Coverage Score: 0.20458789630112878
Average Final Composite Score: 0.46387426008968613

Processing complete. Results saved to /content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/TestSet/hidden-rankfusion


In [9]:
# Score the passage-only BM25 answers for the hidden data; results go to the 'hidden-onlybm25'
# sub-directory of folder_path.
if __name__ == "__main__":
  group_methodName = 'hidden-onlybm25'
  input_file = '/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/HiddenData/retrieval_results.passage_only_bm25_answers.json'
  main(input_file, group_methodName)

Average Entailment Score: 0.30961760089686097
Average Contradiction Score: 0.11972473094170405
Average Obligation Coverage Score: 0.22031420627805384
Average Final Composite Score: 0.47006894618834083

Processing complete. Results saved to /content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/TestSet/hidden-onlybm25


In [8]:
# Score the ObliQA test answers from the 'bm25-bm25_0.1' retrieval run; results go to the
# 'test-bm25_0.1' sub-directory of folder_path.
if __name__ == "__main__":
  group_methodName = 'test-bm25_0.1'
  input_file = '/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/TestSet/retrieval_results.ObliQA_test-bm25-bm25_0.1_answers.json'
  main(input_file, group_methodName)

Skipping invalid row: {'QuestionID': 'QuestionID', 'entailment_score': 'entailment_score', 'contradiction_score': 'contradiction_score', 'obligation_coverage_score': 'obligation_coverage_score', 'composite_score': 'composite_score'}
Average Entailment Score: 0.3203488980617373
Average Contradiction Score: 0.13155613065326632
Average Obligation Coverage Score: 0.22252020211885457
Average Final Composite Score: 0.4704376130653266

Processing complete. Results saved to /content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/TestSet/test-bm25_0.1


In [7]:
# Score the ObliQA test answers from the plain BM25 retrieval run; results go to the
# 'test-bm25' sub-directory of folder_path.
if __name__ == "__main__":
  group_methodName = 'test-bm25'
  input_file = '/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/TestSet/retrieval_results.ObliQA_test-bm25_answers.json'
  main(input_file, group_methodName)

Average Entailment Score: 0.30875058168761227
Average Contradiction Score: 0.123509644524237
Average Obligation Coverage Score: 0.21488916752033266
Average Final Composite Score: 0.4667100610412926

Processing complete. Results saved to /content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/TestSet/test-bm25


In [6]:
# Colab-only: run a JavaScript snippet in the browser that plays a short beep,
# presumably as an audible "finished" notification after the long-running cells — confirm.
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')