Let's first install and import all the necessary dependencies.

In [1]:
!pip install rouge_score
!pip install textstat
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BartForConditionalGeneration, BartTokenizer, AutoTokenizer, AutoModelForQuestionAnswering
import torch
from sklearn.metrics import accuracy_score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
import textstat

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=e31249f5f3efe1c03c7c3a1e9e243f6ea78f4a2afa0064af34511b5b031201fb
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.16.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-

Let's write a function to load and read the contents of the .txt files

In [2]:
def load_data(data_folder):
    """Read every ``.txt`` file located directly inside ``data_folder``.

    Files are visited in sorted filename order. ``os.listdir`` returns entries
    in arbitrary order, so without sorting the position of each document in
    the returned list could differ between runs and machines.

    Args:
        data_folder: Path of the directory to scan (sub-directories are not
            searched).

    Returns:
        list[str]: The full contents of each ``.txt`` file, decoded as UTF-8,
        ordered by filename.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    for filename in sorted(os.listdir(data_folder)):
        if filename.endswith(".txt"):
            with open(os.path.join(data_folder, filename), 'r', encoding='utf-8') as file:
                texts.append(file.read())
    return texts

Now let's write a function to pre-process our text.

In [3]:
def preprocess_text(text):
    """Normalise raw text: strip punctuation, collapse whitespace, lowercase.

    Punctuation is removed *before* whitespace is collapsed. Doing it the
    other way round leaves a double space wherever a stand-alone symbol
    (e.g. a dash surrounded by spaces) sat between two words.

    Args:
        text: The raw document string.

    Returns:
        str: Lowercased text containing only word characters (letters,
        digits, underscore) separated by single spaces. A single leading or
        trailing space may remain if the input started or ended with
        whitespace.
    """
    # Drop everything that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    # Squeeze every run of whitespace (including newlines/tabs) to one space.
    text = re.sub(r'\s+', ' ', text)
    return text.lower()

Let's load the text files.

In [4]:
# Directory holding the raw .txt documents, relative to the working directory.
data_folder = 'tos'
texts = load_data(data_folder)
# Cleaned copy of every document, in the same order as `texts`.
processed_texts = list(map(preprocess_text, texts))

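Let us load the legal_bert_uncased model and use it to answer a fixed set of questions about each document. Note that no training or fine-tuning is performed in this notebook; the model is only used for inference.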

In [5]:
# Load the LEGAL-BERT checkpoint and its tokenizer from the Hugging Face Hub.
# NOTE(review): the checkpoint is wrapped in a question-answering head; the load log
# reports `qa_outputs.weight` / `qa_outputs.bias` as newly initialised, so the head has
# not been fine-tuned for QA and its answers should be treated as untrained output.
legal_bert_uncased_model_name = "nlpaueb/legal-bert-base-uncased"  # Hub model id
legal_bert_uncased_tokenizer = AutoTokenizer.from_pretrained(legal_bert_uncased_model_name)
legal_bert_uncased_model = AutoModelForQuestionAnswering.from_pretrained(legal_bert_uncased_model_name)

# Sample questions relevant for T&C analysis; each one is asked of every document.
questions = [
    "What are the obligations of the user?",
    "What are the limitations of liability?",
    "What are the privacy terms?",
    "What are the data usage terms?",
]

def answer_questions(text, questions):
    """Run extractive question answering over one document.

    Each question is encoded together with ``text`` as a sequence pair
    (truncated to the tokenizer's limit), passed through
    ``legal_bert_uncased_model``, and the answer is the token span from the
    highest-scoring start position to the highest-scoring end position
    (inclusive).

    Args:
        text: The document to search for answers.
        questions: Iterable of question strings.

    Returns:
        list[tuple[str, str]]: One ``(question, answer)`` pair per question,
        in input order. The answer is an empty string when the predicted end
        position falls before the predicted start position.

    Notes:
        The start/end positions index into the combined question + text
        encoding, so a predicted span may include question tokens or
        tokenizer-inserted special tokens.
    """
    results = []
    for question in questions:
        # The question/text pair is the only encoding the model needs; the
        # document is not tokenized separately.
        question_inputs = legal_bert_uncased_tokenizer(question, text, return_tensors="pt", truncation=True)
        with torch.no_grad():  # inference only, no gradients required
            outputs = legal_bert_uncased_model(**question_inputs)
        # Plain ints make the slice bounds explicit; +1 turns the inclusive
        # end index into an exclusive slice bound.
        answer_start = int(torch.argmax(outputs.start_logits))
        answer_end = int(torch.argmax(outputs.end_logits)) + 1
        answer_ids = question_inputs["input_ids"][0][answer_start:answer_end]
        answer = legal_bert_uncased_tokenizer.convert_tokens_to_string(
            legal_bert_uncased_tokenizer.convert_ids_to_tokens(answer_ids)
        )
        results.append((question, answer))
    return results

# Ask the full question set of every pre-processed document; the result keeps
# the same order as `processed_texts`.
answered_texts = [
    answer_questions(document, questions)
    for document in processed_texts
]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Let's write a function to evaluate the metrics of the model.

In [6]:
def evaluate_metrics(original_texts, answered_texts):
    """Average BLEU and ROUGE of generated texts against their originals.

    Args:
        original_texts: Iterable of reference strings.
        answered_texts: Iterable of generated strings, paired positionally
            with ``original_texts``. Pairing uses ``zip``, so surplus items
            in the longer input are ignored.

    Returns:
        tuple: ``(avg_bleu, avg_rouge)`` where ``avg_bleu`` is the mean
        sentence-level BLEU over all pairs and ``avg_rouge`` maps
        ``'rouge1'`` and ``'rougeL'`` to their mean F-measure.

    Raises:
        ValueError: If there is no text pair to score (previously this
            surfaced as an opaque ``ZeroDivisionError``).
    """
    pairs = list(zip(original_texts, answered_texts))
    if not pairs:
        raise ValueError("evaluate_metrics needs at least one (original, answered) text pair")

    rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    bleu_scores = []
    rouge_scores = []

    for orig, summ in pairs:
        # BLEU works on whitespace tokens; the original is the single reference.
        bleu_scores.append(sentence_bleu([orig.split()], summ.split()))
        # RougeScorer.score takes (target, prediction).
        rouge_scores.append(rouge.score(orig, summ))

    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_rouge = {key: sum(score[key].fmeasure for score in rouge_scores) / len(rouge_scores) for key in rouge_scores[0]}

    return avg_bleu, avg_rouge

def evaluate_readability(answered_texts):
    """Mean Flesch Reading Ease score over a collection of texts.

    Args:
        answered_texts: Iterable of strings to score with
            ``textstat.flesch_reading_ease``.

    Returns:
        float: Arithmetic mean of the per-text scores.

    Raises:
        ValueError: If ``answered_texts`` is empty (previously this surfaced
            as an opaque ``ZeroDivisionError``).
    """
    answered_texts = list(answered_texts)
    if not answered_texts:
        raise ValueError("evaluate_readability needs at least one text")

    readability_scores = [textstat.flesch_reading_ease(text) for text in answered_texts]
    avg_readability = sum(readability_scores) / len(readability_scores)
    return avg_readability

Evaluating legal_bert_uncased Model

In [7]:
def evaluate_legal_bert_uncased_metrics(original_texts, all_answers):
    """Average BLEU and ROUGE of the model's answers against the source documents.

    For each document, its answers are joined with single spaces into one
    text, and the joined texts are scored by ``evaluate_metrics`` rather than
    by a second copy of the same scoring loop.

    Args:
        original_texts: Iterable of reference document strings.
        all_answers: One entry per document, each an iterable of
            ``(question, answer)`` pairs.

    Returns:
        tuple: ``(avg_bleu, avg_rouge)`` exactly as returned by
        ``evaluate_metrics``.
    """
    # Join all answers for evaluation purposes
    simplified_texts = [' '.join(answer for _, answer in answers) for answers in all_answers]
    return evaluate_metrics(original_texts, simplified_texts)

def evaluate_legal_bert_uncased_readability(all_answers):
    """Mean Flesch Reading Ease of the model's answers, one text per document.

    For each document, its answers are joined with single spaces into one
    text, and the joined texts are scored by ``evaluate_readability``.

    Args:
        all_answers: One entry per document, each an iterable of
            ``(question, answer)`` pairs.

    Returns:
        float: Mean readability score as returned by ``evaluate_readability``.
    """
    simplified_texts = [' '.join(answer for _, answer in answers) for answers in all_answers]
    return evaluate_readability(simplified_texts)

Let's now evaluate the model and calculate the metrics.

In [8]:
# Overlap metrics: the joined answers of each document scored against that document.
avg_bleu, avg_rouge = evaluate_legal_bert_uncased_metrics(
    original_texts=processed_texts,
    all_answers=answered_texts,
)
print(f"Average BLEU Score: {avg_bleu}")
print(f"Average ROUGE Score: {avg_rouge}")

# Readability of the joined answers, averaged over all documents.
avg_readability = evaluate_legal_bert_uncased_readability(all_answers=answered_texts)
print(f"Average Readability Score: {avg_readability}")

Average BLEU Score: 0.08261734439601696
Average ROUGE Score: {'rouge1': 0.20077190830226765, 'rougeL': 0.14678698091805173}
Average Readability Score: -106.45862068965518
