Let's First import and download all the necessary dependencies

In [1]:
!pip install rouge_score
!pip install textstat
import os
import re
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
import textstat

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=7ac162efaa911578b2628e6dae967f54ef4e6231473b2c239ae80a7a5ebf08d7
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.16.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-

Let's write a function to load and read the contents of the .txt files

In [2]:
def load_data(data_folder):
    # Initialize an empty list to store the text data
    texts = []

    # Iterate over all files in the specified data folder
    for filename in os.listdir(data_folder):
        # Check if the file has a .txt extension
        if filename.endswith(".txt"):
            # Open the file in read mode with UTF-8 encoding
            with open(os.path.join(data_folder, filename), 'r', encoding='utf-8') as file:
                # Read the content of the file and append it to the texts list
                texts.append(file.read())

    # Return the list of text data
    return texts

Let's Load the text files

In [3]:
# Define the folder where the dataset of terms of service (ToS) documents is stored
data_folder = 'DataSet'

# Load the text data from the specified folder using the load_data function
input_texts = load_data(data_folder)

Let us train with legal_bert_uncased model

In [4]:
# Define the model name for the pre-trained Legal BERT model
legal_bert_uncased_model_name = "nlpaueb/legal-bert-base-uncased"

# Load the tokenizer for the Legal BERT model using the specified model name
legal_bert_uncased_tokenizer = AutoTokenizer.from_pretrained(legal_bert_uncased_model_name)

# Load the Legal BERT model for question answering using the specified model name
legal_bert_uncased_model = AutoModelForQuestionAnswering.from_pretrained(legal_bert_uncased_model_name)

# Define a list of sample questions relevant for Terms and Conditions (T&C) analysis
questions = [
    "What are the obligations of the user?",
    "What are the limitations of liability?",
    "What are the privacy terms?",
    "What are the data usage terms?",
]

# Define a function to answer questions based on the input text using the Legal BERT model
def answer_questions(text, questions):
    # Tokenize the input text and convert it to tensor format
    inputs = legal_bert_uncased_tokenizer(text, return_tensors="pt", truncation=True)

    # Initialize an empty list to store the results
    results = []

    # Iterate over each question
    for question in questions:
        # Tokenize the question and text together and convert them to tensor format
        question_inputs = legal_bert_uncased_tokenizer(question, text, return_tensors="pt", truncation=True)

        # Perform inference without gradient calculation
        with torch.no_grad():
            # Get the model outputs for the question and text inputs
            outputs = legal_bert_uncased_model(**question_inputs)

        # Get the start and end positions of the answer from the model outputs
        answer_start = torch.argmax(outputs.start_logits)
        answer_end = torch.argmax(outputs.end_logits) + 1

        # Convert the token IDs to a string to get the answer text
        answer = legal_bert_uncased_tokenizer.convert_tokens_to_string(
            legal_bert_uncased_tokenizer.convert_ids_to_tokens(question_inputs["input_ids"][0][answer_start:answer_end])
        )

        # Append the question and answer pair to the results list
        results.append((question, answer))

    # Return the list of question and answer pairs
    return results

# Process all input texts and get answers for each text using the answer_questions function
answered_texts = [answer_questions(text, questions) for text in input_texts]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Let's Wrtie a function to evaluate the metrics of the model

Evaluating legal_bert_uncased Model

In [5]:
def evaluate_metrics(original_texts, all_answers):
    # Initialize an empty list to store BLEU scores
    bleu_scores = []

    # Initialize the ROUGE scorer with ROUGE-1 and ROUGE-L metrics, using a stemmer
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

    # Initialize an empty list to store ROUGE scores
    rouge_scores = []

    # Iterate over pairs of original texts and their corresponding answers
    for orig, answers in zip(original_texts, all_answers):
        # Join all answers into a single string for evaluation purposes
        simplified_text = ' '.join([answer for question, answer in answers])

        # Calculate the BLEU score for the current pair and append it to the list
        bleu = sentence_bleu([orig.split()], simplified_text.split())
        bleu_scores.append(bleu)

        # Calculate the ROUGE score for the current pair and append it to the list
        rouge_score = rouge.score(orig, simplified_text)
        rouge_scores.append(rouge_score)

    # Calculate the average BLEU score
    avg_bleu = sum(bleu_scores) / len(bleu_scores)

    # Calculate the average ROUGE scores for each metric
    avg_rouge = {key: sum(score[key].fmeasure for score in rouge_scores) / len(rouge_scores) for key in rouge_scores[0]}

    # Return the average BLEU and ROUGE scores
    return avg_bleu, avg_rouge

def evaluate_readability(all_answers):
    # Join all answers into a single string for each set of answers
    simplified_texts = [' '.join([answer for question, answer in answers]) for answers in all_answers]

    # Calculate the Flesch Reading Ease score for each simplified text
    readability_scores = [textstat.flesch_reading_ease(text) for text in simplified_texts]

    # Calculate the average readability score
    avg_readability = sum(readability_scores) / len(readability_scores)

    # Return the average readability score
    return avg_readability

Let's Now Evaluate the model and calculate the metrics

In [6]:
avg_bleu, avg_rouge = evaluate_metrics(input_texts, answered_texts)
print(f"Average BLEU Score: {avg_bleu}")
print(f"Average ROUGE Score: {avg_rouge}")

avg_readability = evaluate_readability(answered_texts)
print(f"Average Readability Score: {avg_readability}")

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU Score: 0.0389344721083877
Average ROUGE Score: {'rouge1': 0.12027101724352414, 'rougeL': 0.09375199229820871}
Average Readability Score: 122.4279310344828
