<a href="https://colab.research.google.com/github/tubagokhan/RegNLPDataset/blob/main/ForTedEvaluationMetric.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import json
import time
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.tokenize import sent_tokenize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import nltk
# Download the NLTK 'punkt' resource (presumably the data that sent_tokenize,
# used further down for sentence splitting, relies on).
nltk.download('punkt')

print("Dependencies imported and NLTK punkt downloaded.")

# Load the NLI sequence-classification model and its tokenizer from the same
# 'cross-encoder/nli-deberta-v3-xsmall' checkpoint. Both are module-level
# globals that get_nli_probabilities reads directly.
nli_model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/nli-deberta-v3-xsmall')
tokenizer = AutoTokenizer.from_pretrained('cross-encoder/nli-deberta-v3-xsmall')

print("NLI model and tokenizer loaded.")

def softmax(logits):
    """
    Turns each row of a 2D array of logits into a probability distribution.

    The row-wise maximum is subtracted before exponentiating so that large logits
    do not overflow; the subtraction cancels out in the final ratio.

    Parameters:
    logits (numpy.ndarray): 2D array of raw scores; normalization runs along axis 1.

    Returns:
    numpy.ndarray: Array of the same shape whose rows each sum to 1.
    """
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exponentials = np.exp(shifted)
    row_totals = exponentials.sum(axis=1, keepdims=True)
    return exponentials / row_totals

def get_nli_probabilities(premises, hypotheses):
    """
    Computes the Natural Language Inference (NLI) class probabilities for premise-hypothesis pairs.

    The pairs are tokenized together, passed through the module-level `nli_model`, and the
    resulting logits are converted to probabilities with `softmax` and rounded to three decimals.

    Parameters:
    premises (list of str): The premises, one per pair.
    hypotheses (list of str): The hypotheses, aligned index-by-index with `premises`.

    Returns:
    numpy.ndarray: A 2D array with one row per pair and one column per model label.
    The column order is whatever the model's label mapping defines. Other code in this
    file reads column 0 as contradiction and column 1 as entailment (column 2 is then
    presumably neutral - confirm against the model card).
    """

    # Tokenize the premises and hypotheses for input into the NLI model
    features = tokenizer(premises, hypotheses, padding=True, truncation=True, return_tensors="pt")

    # Keep the inputs on the same device as the model so the call also works
    # if the model has been moved off the CPU.
    features = features.to(nli_model.device)

    # Evaluation mode disables training-only behavior such as dropout
    nli_model.eval()

    # Disable gradient calculation to speed up the process and reduce memory usage
    with torch.no_grad():
        # Bring the logits back to the CPU before converting; .numpy() only works on CPU tensors
        logits = nli_model(**features).logits.detach().cpu().numpy()

    # Apply the softmax function to the logits to get probabilities
    probabilities = softmax(logits)

    # Round the probabilities to three decimal places for readability
    return np.round(probabilities, 3)


def get_nli_matrix(passages, answers):
    """
    Builds the entailment and contradiction probability matrices for every passage/answer pair.

    Each passage is used as the premise and each answer as the hypothesis of a single
    `get_nli_probabilities` call. Row i / column j of both matrices refers to
    passages[i] and answers[j]. Every pair and both final matrices are printed.

    Parameters:
    passages (list of str): The list of text passages (premises).
    answers (list of str): The list of answers (hypotheses) to evaluate against the passages.

    Returns:
    tuple: (entailment_matrix, contradiction_matrix), two numpy arrays of shape
    (len(passages), len(answers)).
    """

    # Report how many texts are about to be compared
    print(f"{len(passages)} passages and {len(answers)} answers.")

    # Both matrices share the same (passages x answers) shape and start at zero
    shape = (len(passages), len(answers))
    entailment_matrix = np.zeros(shape)
    contradiction_matrix = np.zeros(shape)

    for row, premise in enumerate(passages):
        for col, hypothesis in enumerate(answers):
            # One model call per pair; the result has a single row
            probs = get_nli_probabilities([premise], [hypothesis])

            # Debug trace of the pair and its probabilities
            for item in (premise, hypothesis, probs, "---"):
                print(item)

            # Column 1 holds the entailment probability, column 0 the contradiction probability
            entailment_matrix[row, col] = probs[0, 1]
            contradiction_matrix[row, col] = probs[0, 0]

    # Debug trace of the completed matrices
    for item in ("Entailment Matrix", entailment_matrix, "Contradiction Matrix", contradiction_matrix):
        print(item)

    return entailment_matrix, contradiction_matrix


def calculate_scores_from_matrix(nli_matrix, score_type='entailment'):
    """
    Collapses an NLI probability matrix into a single score.

    For every column the maximum over all rows is taken (axis 0), and the score is the mean
    of those column maxima, rounded to three decimal places. Both supported score types use
    this same reduction; `score_type` only states which kind of matrix is being scored.

    Parameters:
    nli_matrix (numpy.ndarray): The NLI matrix containing inference scores (array-likes are accepted).
    score_type (str): The type of score to calculate, 'entailment' or 'contradiction'. Defaults to 'entailment'.

    Returns:
    float: The calculated score, rounded to three decimal places, or 0.0 for an empty matrix.

    Raises:
    ValueError: If the matrix is non-empty and `score_type` is not one of the supported values.
    """

    nli_matrix = np.asarray(nli_matrix)

    # An empty matrix has nothing to reduce; keep the existing default of 0.0
    if nli_matrix.size == 0:
        print("Warning: NLI matrix is empty. Returning default score of 0.")
        return 0.0

    # Reject unknown score types explicitly instead of failing later on an unbound variable
    if score_type not in ('entailment', 'contradiction'):
        raise ValueError(
            f"Unsupported score_type {score_type!r}; expected 'entailment' or 'contradiction'."
        )

    # Best (maximum) value per column, then the average across columns
    reduced_vector = np.max(nli_matrix, axis=0)
    return np.round(np.mean(reduced_vector), 3)


# Default location of the labelled obligations dataset
_DEFAULT_OBLIGATION_JSON_PATH = "/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/Processed_Obligations.json"

# Fitted pipelines keyed by dataset path, so each dataset is loaded and trained only once
_OBLIGATION_PIPELINES = {}


def _get_obligation_pipeline(json_path):
    """Returns the fitted TF-IDF + Logistic Regression pipeline for `json_path`, training it on first use."""
    pipeline = _OBLIGATION_PIPELINES.get(json_path)
    if pipeline is None:
        # JSON is UTF-8 by specification; do not depend on the platform default encoding
        with open(json_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Extract texts and their corresponding obligation labels from the dataset
        texts = [item['Text'] for item in data]
        labels = [item['Obligation'] for item in data]

        # Only the 80% training part of the fixed-seed split is used; the rest is discarded
        X_train, _, y_train, _ = train_test_split(texts, labels, test_size=0.2, random_state=42)

        # TF-IDF vectorization followed by Logistic Regression
        pipeline = make_pipeline(TfidfVectorizer(stop_words='english'), LogisticRegression(max_iter=1000))
        pipeline.fit(X_train, y_train)

        _OBLIGATION_PIPELINES[json_path] = pipeline
    return pipeline


def check_obligation(text, json_path=None):
    """
    Predicts whether a given text contains an obligation.

    The classifier is a Logistic Regression model on TF-IDF features, trained on the texts and
    'Obligation' labels stored in a JSON dataset. The dataset is read and the model fitted the
    first time a given path is used; later calls reuse the fitted model instead of retraining.

    Parameters:
    text (str): The text to be checked for an obligation.
    json_path (str, optional): Path to the JSON dataset. Defaults to the project's
    Processed_Obligations.json location.

    Returns:
    int: The predicted label indicating the presence (1) or absence (0) of an obligation in the text.
    """

    path = _DEFAULT_OBLIGATION_JSON_PATH if json_path is None else json_path

    # predict() takes a batch, so wrap the text in a list and unwrap the single prediction
    return _get_obligation_pipeline(path).predict([text])[0]


def calculate_coverage_score(passages, answers):
    """
    Calculates the obligation coverage score for a set of passages and answers.

    Every passage sentence is classified once with `check_obligation`. The denominator is the
    sum of the predicted labels over all passage sentences. The numerator adds one for every
    (answer, passage) pair whose passage contains at least one obligation sentence.

    NOTE(review): the answer text itself is never compared with the obligation sentences, so
    the result depends only on how many answers there are, not on what they say, and it can
    exceed 1. The numeric behavior is left as it was; deciding how an answer "covers" an
    obligation (for example via NLI entailment) needs confirmation from the metric's owner.

    Parameters:
    passages (list of str): The text passages containing obligations.
    answers (list of str): The answers that are supposed to cover the obligations in the passages.

    Returns:
    float: The coverage score, rounded to three decimal places (0 when no obligations are found).
    """

    # Classify each passage sentence exactly once and keep the label next to the sentence,
    # instead of re-running the classifier for the total and again for every answer
    labelled_passages = [
        [(sent, check_obligation(sent)) for sent in sent_tokenize(passage)]
        for passage in passages
    ]

    # Total number of obligations: sum of predicted labels over all sentences
    total_obligations = sum(
        label for sentences in labelled_passages for _, label in sentences
    )

    # First obligation sentence of each passage, or None if the passage has none
    first_obligations = [
        next((sent for sent, label in sentences if label), None)
        for sentences in labelled_passages
    ]

    # One hit per (answer, passage) pair whose passage has an obligation sentence
    covered_obligations = 0
    for _ in answers:
        for sent in first_obligations:
            if sent is not None:
                print(f"Obligation: {sent}")
                covered_obligations += 1

    # Ratio of covered obligations to total obligations, guarding against division by zero
    coverage_score = (
        covered_obligations / total_obligations if total_obligations > 0 else 0
    )

    # Round the coverage score to three decimal places for reporting
    coverage_score_rounded = np.round(coverage_score, 3)
    print(f"Obligation Coverage score: {coverage_score_rounded}")

    return coverage_score_rounded


def calculate_final_composite_score(passages, answers, we=0.4, wc=0.3, wo=0.3):
    """
    Combines the entailment, contradiction and obligation coverage scores into one weighted score.

    Passages and answers are split into sentences, the sentence-level NLI matrices are built,
    and the result is we * entailment - wc * contradiction + wo * coverage. Each intermediate
    score and the final score are printed.

    Parameters:
    passages (list of str): The retrieved passages.
    answers (list of str): The answers to evaluate against the passages.
    we (float): Weight of the entailment score. Defaults to 0.4.
    wc (float): Weight of the contradiction score (subtracted). Defaults to 0.3.
    wo (float): Weight of the obligation coverage score. Defaults to 0.3.

    Returns:
    float: The composite score, rounded to three decimal places.
    """

    # Flatten passages and answers into sentence lists
    passage_sentences = []
    for passage in passages:
        passage_sentences.extend(sent_tokenize(passage))

    answer_sentences = []
    for answer in answers:
        answer_sentences.extend(sent_tokenize(answer))

    # Sentence-by-sentence NLI probabilities
    matrices = get_nli_matrix(passage_sentences, answer_sentences)
    entailment_matrix, contradiction_matrix = matrices

    entailment_score = calculate_scores_from_matrix(entailment_matrix, 'entailment')
    print(f"Entailment score: {np.round(entailment_score, 3)}")

    contradiction_score = calculate_scores_from_matrix(contradiction_matrix, 'contradiction')
    print(f"Contradiction score: {np.round(contradiction_score, 3)}")

    # Coverage works on the whole passages/answers, not on the flattened sentences
    coverage_score = calculate_coverage_score(passages, answers)

    # Entailment and coverage add to the score, contradiction subtracts from it
    reward = we * entailment_score
    penalty = wc * contradiction_score
    coverage_term = wo * coverage_score
    composite_score = reward - penalty + coverage_term

    print(f"Final composite score: {np.round(composite_score, 3)}")
    return np.round(composite_score, 3)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dependencies imported and NLTK punkt downloaded.




NLI model and tokenizer loaded.


In [21]:
import re
from collections import Counter

def calculate_conciseness_penalty(retrieved_passage, answer):
    """
    Calculates the overlap ratio between words in the retrieved passage and the answer.
    This ratio is used to penalize the conciseness score of the answer, based on the amount
    of text it shares with the source.

    Words are lowercased and matched as a multiset: a word counts as overlapping at most as
    many times as it occurs in the retrieved passage.

    Parameters:
    - retrieved_passage (str): The source text from which information is retrieved.
    - answer (str): The respondent's answer that needs to be evaluated for conciseness.

    Returns:
    - float: The overlap ratio, i.e. the number of overlapping word occurrences divided by
      the number of words in the answer (0 for an answer without words). A higher ratio
      indicates more copying.
    """

    # Tokenize words in both the retrieved passage and the answer, converting to lowercase
    retrieved_tokens = re.findall(r'\w+', retrieved_passage.lower())
    answer_tokens = re.findall(r'\w+', answer.lower())

    # Multiset intersection keeps, for each shared word, the smaller of its two counts.
    # This avoids rescanning the passage token list once per distinct word.
    shared_words = Counter(retrieved_tokens) & Counter(answer_tokens)
    overlap_count = sum(shared_words.values())

    # Calculate the total number of words in the answer
    total_words_in_answer = len(answer_tokens)

    # Calculate the overlap ratio, guarding against an empty answer
    overlap_ratio = overlap_count / total_words_in_answer if total_words_in_answer > 0 else 0

    print(f"OVerlap Ratio: {np.round(overlap_ratio, 3)}")
    return overlap_ratio

In [22]:
import re
from collections import Counter

def calculate_normalized_overlap_score(retrieved_passage, answer):
    """
    Calculates the share of the answer's words that also occur in the retrieved passage.

    Words are lowercased. Every occurrence of an answer word counts as overlapping if the
    word appears anywhere in the retrieved passage, no matter how often it appears there.

    Parameters:
    - retrieved_passage (str): The source text from which information is retrieved.
    - answer (str): The respondent's answer that needs to be evaluated for originality.

    Returns:
    - float: The number of overlapping answer words divided by the total number of words
      in the answer (0 for an answer without words). A higher score indicates more overlap.
    """

    # Vocabulary of the passage: only membership matters, so a set is enough
    passage_vocabulary = set(re.findall(r'\w+', retrieved_passage.lower()))
    answer_tokens = re.findall(r'\w+', answer.lower())

    if answer_tokens:
        # Count every answer token that is part of the passage vocabulary
        matching_tokens = sum(1 for token in answer_tokens if token in passage_vocabulary)
        normalized_overlap_score = matching_tokens / len(answer_tokens)
    else:
        # No words in the answer: avoid dividing by zero
        normalized_overlap_score = 0

    print(f'Normalized Overlap Score: {normalized_overlap_score:.2f}')
    return normalized_overlap_score



In [23]:
# Example 1: an answer that appears to paraphrase the retrieved passage without adding claims.
# Question is kept for reference only; none of the calls below receive it.
Question= "Can a Model Portfolio that follows an environmentally-conscious investment mandate use the ADGM Green Portfolio or ADGM Climate Transition Portfolio designation without obtaining the relevant designation?"
RetrievedPassage=["A Model Portfolio that follows an environmentally-conscious investment mandate is not required to become an ADGM Green Portfolio or an ADGM Climate Transition Portfolio. However, only those Model Portfolios which have obtained the relevant designation may use such terminology and designation marks in related documentation and promotional materials."]
Answer=["No, a Model Portfolio that follows an environmentally-conscious investment mandate is not permitted to use the ADGM Green Portfolio or ADGM Climate Transition Portfolio designation without obtaining the relevant designation. Only Model Portfolios that have received the official designation are allowed to use such terminology and designation marks in their documentation and promotional materials."]
# The composite score takes the lists; its return value is not stored here.
calculate_final_composite_score(RetrievedPassage, Answer)
# The word-overlap helpers take single strings, so the first (only) list element is passed.
calculate_conciseness_penalty(RetrievedPassage[0], Answer[0])
temp=calculate_normalized_overlap_score(RetrievedPassage[0], Answer[0])


2 passages and 2 answers.
A Model Portfolio that follows an environmentally-conscious investment mandate is not required to become an ADGM Green Portfolio or an ADGM Climate Transition Portfolio.
No, a Model Portfolio that follows an environmentally-conscious investment mandate is not permitted to use the ADGM Green Portfolio or ADGM Climate Transition Portfolio designation without obtaining the relevant designation.
[[0.988 0.003 0.01 ]]
---
A Model Portfolio that follows an environmentally-conscious investment mandate is not required to become an ADGM Green Portfolio or an ADGM Climate Transition Portfolio.
Only Model Portfolios that have received the official designation are allowed to use such terminology and designation marks in their documentation and promotional materials.
[[0.948 0.    0.052]]
---
However, only those Model Portfolios which have obtained the relevant designation may use such terminology and designation marks in related documentation and promotional materials.
No

In [24]:
# Example 2: an answer that appears to restate the retrieved passage on accreditation standards.
# Question (a one-element list here) is kept for reference only; the calls below do not use it.
Question=["In determining which economic activities are environmentally sustainable, what accreditation standards are considered credible by the Regulator?"]
RetrievedPassage=["The Regulator considers an Acceptable Green Taxonomy to be one which uses credible and independent accreditation standards published by a governmental body or a reputable industry association and made publicly available to allow determination of which economic activities are environmentally sustainable.  Examples of green taxonomies that are considered Acceptable Green Taxonomies include:a.The EU Green Taxonomy; b.The ASEAN Taxonomy for Sustainable Finance; and c.The Common Principles for Climate Mitigation Finance Tracking."]
Answer=["The Regulator considers accreditation standards published by a governmental body or a reputable industry association as credible for determining which economic activities are environmentally sustainable. These accreditation standards should also be made publicly available to allow transparency and enable stakeholders to assess the sustainability of economic activities effectively. Examples of green taxonomies that meet these criteria and are considered Acceptable Green Taxonomies by the Regulator include the EU Green Taxonomy, the ASEAN Taxonomy for Sustainable Finance, and the Common Principles for Climate Mitigation Finance Tracking."]
# The composite score takes the lists; its return value is not stored here.
calculate_final_composite_score(RetrievedPassage, Answer)
# The word-overlap helpers take single strings, so the first (only) list element is passed.
calculate_conciseness_penalty(RetrievedPassage[0], Answer[0])
temp=calculate_normalized_overlap_score(RetrievedPassage[0], Answer[0])

2 passages and 3 answers.
The Regulator considers an Acceptable Green Taxonomy to be one which uses credible and independent accreditation standards published by a governmental body or a reputable industry association and made publicly available to allow determination of which economic activities are environmentally sustainable.
The Regulator considers accreditation standards published by a governmental body or a reputable industry association as credible for determining which economic activities are environmentally sustainable.
[[0.    0.995 0.005]]
---
The Regulator considers an Acceptable Green Taxonomy to be one which uses credible and independent accreditation standards published by a governmental body or a reputable industry association and made publicly available to allow determination of which economic activities are environmentally sustainable.
These accreditation standards should also be made publicly available to allow transparency and enable stakeholders to assess the susta

In [25]:
# Example 3: same question and passage as the previous cell, but the answer adds material
# that does not occur in the retrieved passage (e.g. the "Atlantean Sustainability Framework").
# The commented-out assignments below are the inputs of the first example, kept for quick switching.
#Question= "Can a Model Portfolio that follows an environmentally-conscious investment mandate use the ADGM Green Portfolio or ADGM Climate Transition Portfolio designation without obtaining the relevant designation?"
#RetrievedPassage=["A Model Portfolio that follows an environmentally-conscious investment mandate is not required to become an ADGM Green Portfolio or an ADGM Climate Transition Portfolio. However, only those Model Portfolios which have obtained the relevant designation may use such terminology and designation marks in related documentation and promotional materials."]
#Answer=["No, a Model Portfolio that follows an environmentally-conscious investment mandate is not permitted to use the ADGM Green Portfolio or ADGM Climate Transition Portfolio designation without obtaining the relevant designation. Only Model Portfolios that have received the official designation are allowed to use such terminology and designation marks in their documentation and promotional materials."]

# Question is kept for reference only; the calls below do not use it.
Question=["In determining which economic activities are environmentally sustainable, what accreditation standards are considered credible by the Regulator?"]
RetrievedPassage=["The Regulator considers an Acceptable Green Taxonomy to be one which uses credible and independent accreditation standards published by a governmental body or a reputable industry association and made publicly available to allow determination of which economic activities are environmentally sustainable.  Examples of green taxonomies that are considered Acceptable Green Taxonomies include:a.The EU Green Taxonomy; b.The ASEAN Taxonomy for Sustainable Finance; and c.The Common Principles for Climate Mitigation Finance Tracking."]
Answer=["The Regulator considers accreditation standards published by a governmental body or a reputable industry association as credible for determining which economic activities are environmentally sustainable. In an effort to enhance transparency and inclusivity, the Regulator has also begun integrating elements from the Atlantean Sustainability Framework, a set of guidelines rumored to originate from an ancient, advanced civilization, focusing on harmonizing human activities with the natural world. These accreditation standards should be made publicly available to allow transparency and enable stakeholders to assess the sustainability of economic activities effectively. To further this goal, the Regulator has introduced the Ethereal Compliance System, an innovative digital platform that employs spectral analysis to measure the environmental impact of economic activities, providing a more intuitive understanding of sustainability metrics. Examples of green taxonomies that meet these criteria and are considered Acceptable Green Taxonomies by the Regulator include the EU Green Taxonomy, the ASEAN Taxonomy for Sustainable Finance, the Common Principles for Climate Mitigation Finance Tracking, and the newly added Arcadian Environmental Codex, believed to offer insights into ancient ecological wisdom that aligns modern economic practices with timeless natural principles."]
# The composite score takes the lists; its return value is not stored here.
calculate_final_composite_score(RetrievedPassage, Answer)
# The word-overlap helpers take single strings, so the first (only) list element is passed.
calculate_conciseness_penalty(RetrievedPassage[0], Answer[0])
temp=calculate_normalized_overlap_score(RetrievedPassage[0], Answer[0])

2 passages and 5 answers.
The Regulator considers an Acceptable Green Taxonomy to be one which uses credible and independent accreditation standards published by a governmental body or a reputable industry association and made publicly available to allow determination of which economic activities are environmentally sustainable.
The Regulator considers accreditation standards published by a governmental body or a reputable industry association as credible for determining which economic activities are environmentally sustainable.
[[0.    0.995 0.005]]
---
The Regulator considers an Acceptable Green Taxonomy to be one which uses credible and independent accreditation standards published by a governmental body or a reputable industry association and made publicly available to allow determination of which economic activities are environmentally sustainable.
In an effort to enhance transparency and inclusivity, the Regulator has also begun integrating elements from the Atlantean Sustainabil