Let's first install and import all the necessary dependencies.

In [None]:
!pip install rouge_score
!pip install textstat
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BartForConditionalGeneration, BartTokenizer, AutoTokenizer, AutoModelForQuestionAnswering
import torch
from sklearn.metrics import accuracy_score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
import textstat

Let's write a function to load and read the contents of the .txt files

In [2]:
def load_data(data_folder):
    """Read every ``.txt`` file directly inside ``data_folder``.

    Filenames are visited in sorted order so the returned list is the same on
    every run and platform; ``os.listdir`` alone gives no ordering guarantee.
    Directory entries that merely end in ``.txt`` are skipped.

    Args:
        data_folder: Path of the directory to scan (sub-directories are not
            searched).

    Returns:
        list[str]: The UTF-8 decoded contents of each ``.txt`` file, ordered
        by filename.
    """
    texts = []
    for filename in sorted(os.listdir(data_folder)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(data_folder, filename)
        # Only regular files can be opened for reading; skip anything else.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            texts.append(file.read())
    return texts

Now let's write a function to pre-process our text.

In [3]:
def preprocess_text(text):
    """Normalize raw text: drop punctuation, collapse whitespace, lowercase.

    Args:
        text: The raw document string.

    Returns:
        str: ``text`` with every character that is neither a word character
        nor whitespace removed, each run of whitespace replaced by a single
        space, and all letters lowercased.
    """
    # Strip punctuation first. Doing it after collapsing whitespace would
    # leave a double space wherever a symbol sat between two spaces
    # (e.g. "a - b" -> "a  b").
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower()

Let's load the text files and pre-process them.

In [4]:
# Directory that holds the terms-of-service .txt files.
data_folder = 'tos'

# Raw documents first, then their normalized counterparts in the same order.
texts = load_data(data_folder)
processed_texts = list(map(preprocess_text, texts))

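Let us load the legal_bert_uncased model and use it to answer a fixed set of questions about each document. Note that no training or fine-tuning is performed in this notebook; the model is used for inference only.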

In [None]:
# Load the legal_bert_uncased tokenizer and the model with a question-answering head.
# NOTE(review): "nlpaueb/legal-bert-base-uncased" looks like a base pretrained
# checkpoint rather than one fine-tuned for question answering. If so, the QA
# head created below starts from randomly initialized weights and the extracted
# spans will not be meaningful — confirm, or switch to a QA-fine-tuned checkpoint.
legal_bert_uncased_model_name = "nlpaueb/legal-bert-base-uncased"  # Hugging Face Hub model id shared by tokenizer and model
legal_bert_uncased_tokenizer = AutoTokenizer.from_pretrained(legal_bert_uncased_model_name)
legal_bert_uncased_model = AutoModelForQuestionAnswering.from_pretrained(legal_bert_uncased_model_name)

# Sample questions relevant for T&C analysis; every document is asked all of them.
questions = [
    "What are the obligations of the user?",
    "What are the limitations of liability?",
    "What are the privacy terms?",
    "What are the data usage terms?",
]

def answer_questions(text, questions):
    """Extract one answer span from ``text`` for each question.

    Relies on the module-level ``legal_bert_uncased_tokenizer`` and
    ``legal_bert_uncased_model``. Each question is encoded together with the
    document, the model is run without gradient tracking, and the tokens
    between the highest-scoring start position and the highest-scoring end
    position at or after it are decoded back to a string.

    Args:
        text: The document to search for answers.
        questions: Iterable of question strings.

    Returns:
        list[tuple[str, str]]: ``(question, answer)`` pairs in the order the
        questions were given.
    """
    # Pin truncation to the model's position-embedding limit instead of
    # relying on whatever maximum length the tokenizer metadata declares.
    max_length = legal_bert_uncased_model.config.max_position_embeddings

    results = []
    for question in questions:
        # "only_second" truncates the document only, never the question.
        question_inputs = legal_bert_uncased_tokenizer(
            question,
            text,
            return_tensors="pt",
            truncation="only_second",
            max_length=max_length,
        )
        with torch.no_grad():
            outputs = legal_bert_uncased_model(**question_inputs)

        # Batch size is 1, so index 0 selects the only sequence.
        start_logits = outputs.start_logits[0]
        end_logits = outputs.end_logits[0]
        answer_start = int(torch.argmax(start_logits))
        # Look for the end at or after the start so the span is never
        # inverted (an independent argmax can land before the start and
        # silently produce an empty answer).
        answer_end = answer_start + int(torch.argmax(end_logits[answer_start:])) + 1

        answer_ids = question_inputs["input_ids"][0][answer_start:answer_end].tolist()
        answer_tokens = legal_bert_uncased_tokenizer.convert_ids_to_tokens(answer_ids)
        answer = legal_bert_uncased_tokenizer.convert_tokens_to_string(answer_tokens)
        results.append((question, answer))
    return results

# Ask the full question set of every preprocessed document, keeping document order.
answered_texts = list(
    map(lambda document: answer_questions(document, questions), processed_texts)
)

Let's write functions to evaluate the metrics of the model.

In [7]:
def evaluate_metrics(original_texts, answered_texts):
    """Average BLEU and ROUGE scores of generated texts against their originals.

    For each ``(original, generated)`` pair, BLEU is computed with
    ``sentence_bleu`` using the whitespace-split original as the single
    reference, and ROUGE-1 / ROUGE-L are computed with stemming enabled using
    the original as the target.

    Args:
        original_texts: Reference strings, one per document.
        answered_texts: Generated strings, aligned one-to-one with
            ``original_texts``.

    Returns:
        tuple[float, dict[str, float]]: The mean BLEU score and a dict mapping
        each ROUGE type (``'rouge1'``, ``'rougeL'``) to its mean F-measure.

    Raises:
        ValueError: If either input is empty or the two inputs differ in
            length.
    """
    original_texts = list(original_texts)
    answered_texts = list(answered_texts)

    # Fail with a clear message instead of a ZeroDivisionError / IndexError
    # further down when there is nothing to average.
    if not original_texts or not answered_texts:
        raise ValueError("evaluate_metrics needs at least one (original, answered) text pair")
    # zip() would silently drop the unmatched tail and skew the averages.
    if len(original_texts) != len(answered_texts):
        raise ValueError(
            "original_texts and answered_texts must have the same length "
            f"(got {len(original_texts)} and {len(answered_texts)})"
        )

    rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    bleu_scores = []
    rouge_scores = []
    for orig, summ in zip(original_texts, answered_texts):
        bleu_scores.append(sentence_bleu([orig.split()], summ.split()))
        rouge_scores.append(rouge.score(orig, summ))

    pair_count = len(bleu_scores)
    avg_bleu = sum(bleu_scores) / pair_count
    avg_rouge = {
        key: sum(score[key].fmeasure for score in rouge_scores) / pair_count
        for key in rouge_scores[0]
    }
    return avg_bleu, avg_rouge

def evaluate_readability(answered_texts):
    """Average Flesch reading-ease score over a collection of texts.

    Args:
        answered_texts: Strings to score, one per document.

    Returns:
        float: The mean of ``textstat.flesch_reading_ease`` over all texts.

    Raises:
        ValueError: If ``answered_texts`` is empty.
    """
    answered_texts = list(answered_texts)
    # An average over nothing is undefined; say so instead of dividing by zero.
    if not answered_texts:
        raise ValueError("evaluate_readability needs at least one text")

    readability_scores = [textstat.flesch_reading_ease(text) for text in answered_texts]
    return sum(readability_scores) / len(readability_scores)

Evaluating legal_bert_uncased Model

In [12]:
def evaluate_legal_bert_uncased_metrics(original_texts, all_answers):
    """Average BLEU and ROUGE of the model's answers against the originals.

    The answers produced for one document are joined with single spaces into
    one string, and the resulting strings are scored by ``evaluate_metrics``
    so the scoring logic lives in one place.

    Args:
        original_texts: Reference strings, one per document.
        all_answers: One entry per document, each an iterable of
            ``(question, answer)`` pairs.

    Returns:
        tuple[float, dict[str, float]]: Whatever ``evaluate_metrics`` returns
        for the joined answers: the mean BLEU score and the mean F-measure
        per ROUGE type. Errors raised by ``evaluate_metrics`` propagate.
    """
    # Only the answer half of each pair is scored; the question is discarded.
    simplified_texts = [
        ' '.join(answer for _question, answer in answers)
        for answers in all_answers
    ]
    return evaluate_metrics(original_texts, simplified_texts)

def evaluate_legal_bert_uncased_readability(all_answers):
    """Average Flesch reading-ease score of the model's answers per document.

    The answers produced for one document are joined with single spaces into
    one string, and the resulting strings are scored by
    ``evaluate_readability`` so the scoring logic lives in one place.

    Args:
        all_answers: One entry per document, each an iterable of
            ``(question, answer)`` pairs.

    Returns:
        float: Whatever ``evaluate_readability`` returns for the joined
        answers. Errors raised by ``evaluate_readability`` propagate.
    """
    # Only the answer half of each pair is scored; the question is discarded.
    simplified_texts = [
        ' '.join(answer for _question, answer in answers)
        for answers in all_answers
    ]
    return evaluate_readability(simplified_texts)

Let's now evaluate the model and calculate the metrics.

In [None]:
# Overlap metrics: the joined answers of each document scored against its
# preprocessed source text.
avg_bleu, avg_rouge = evaluate_legal_bert_uncased_metrics(
    processed_texts,
    answered_texts,
)
print(f"Average BLEU Score: {avg_bleu}")
print(f"Average ROUGE Score: {avg_rouge}")

# Readability of the joined answers, averaged over documents.
avg_readability = evaluate_legal_bert_uncased_readability(
    answered_texts,
)
print(f"Average Readability Score: {avg_readability}")