In [6]:
## Data Loading
import zipfile
import os

# Define the input zip file and output directory
zip_file = "ToSDRData.zip"
output_dir = "DataSet"

def recreate_folder(folder_path):
    # If the folder exists, delete it
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
    # Recreate the folder
    os.makedirs(folder_path)

# Recreate the output folder
recreate_folder(output_dir)

# Open the zip file
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    # List all files in the zip
    all_files = zip_ref.namelist()

    # Filter files in the 'text' folder
    text_files = [f for f in all_files if f.startswith('text/') and not f.endswith('/')]

    # Extract only the files in the 'text' folder
    for file in text_files:
        # Determine the target path in the DataSet folder
        target_path = os.path.join(output_dir, os.path.relpath(file, 'text'))

        # Ensure the directory structure exists
        os.makedirs(os.path.dirname(target_path), exist_ok=True)

        # Extract the file
        with zip_ref.open(file) as source, open(target_path, 'wb') as target:
            target.write(source.read())

print(f"All files from 'text' folder in {zip_file} have been extracted to {output_dir}.")

All files from 'text' folder in ToSDRData.zip have been extracted to DataSet.


In [8]:
!pip install transformers torch
!pip install ftfy
!pip install spacy
# Load spaCy's English model
!python -m spacy download en_core_web_sm
# Import the spacy module
import spacy # This line is added to import spacy into the current scope

nlp = spacy.load("en_core_web_sm")

# imports
import os
import re
import ftfy
import shutil
import chardet
from bs4 import BeautifulSoup

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [12]:
# Pre-processing

# Load spaCy model for sentence segmentation
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # Add the sentencizer component

# Input and output folder paths
input_folder = "DataSet"
output_folder = "PreprocessedDataSet"

def recreate_folder(folder_path):
    # If the folder exists, delete it
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
    # Recreate the folder
    os.makedirs(folder_path)

# Recreate the output folder
recreate_folder(output_folder)

# Function to remove HTML tags but retain URLs
def remove_html_tags(text):
    # Parse HTML content using BeautifulSoup
    soup = BeautifulSoup(text, "html.parser")

    # Find and preserve URLs
    urls = [a['href'] for a in soup.find_all('a', href=True)]

    # Extract the plain text
    plain_text = soup.get_text(separator=" ")

    # Reinsert URLs into the text
    for url in urls:
        plain_text += f" {url}"

    return plain_text

# Function to preprocess a single file
def preprocess_text(text):
    # Step 1: Remove HTML tags but retain URLs
    text = remove_html_tags(text)

    # Step 2: Remove special characters and normalize white spaces
    text = re.sub(r"[^\x00-\x7F]+", " ", text)  # Remove non-ASCII characters
    text = re.sub(r"\s+", " ", text).strip()  # Normalize spaces and remove leading/trailing spaces

    # Step 3: Retain enumerations (e.g., 1.1, A, i., etc.)
    enumerations = re.findall(r"(^|\s)(\d+\.\d+|[A-Za-z]|[ivxIVX]+)(?=[\.\)]\s)", text)
    enumerations = {e[1] for e in enumerations}

    # Step 4: Break into sentences for better readability
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]

    # Step 5: Highlight enumerations as important points
    important_points = [sent for sent in sentences if any(e in sent for e in enumerations)]

    # Combine all text and important points
    processed_text = "\n".join(important_points + sentences)
    return processed_text

# Loop through all files in the input folder
for file_name in os.listdir(input_folder):
    input_path = os.path.join(input_folder, file_name)

    # Ensure we process only text files
    if os.path.isfile(input_path) and input_path.endswith(".txt"):
        # Read the file content
        with open(input_path, "r", encoding="utf-8") as file:
            content = file.read()

        # Preprocess the text
        preprocessed_content = preprocess_text(content)

        # Save the preprocessed text to the output folder
        output_path = os.path.join(output_folder, file_name)
        with open(output_path, "w", encoding="utf-8") as file:
            file.write(preprocessed_content)

print(f"Preprocessing completed. Files saved in '{output_folder}'.")


Preprocessing completed. Files saved in 'PreprocessedDataSet'.


In [11]:
import os
import re
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Initialize LegalBERT model and tokenizer
model_name = "nlpaueb/legal-bert-base-uncased"  # LegalBERT model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

# Input and output folders
input_folder = "DataSet"
preprocessed_folder = "PreprocessedDataSet"
summarized_folder = "SummarizedDataSet"

def recreate_folder(folder_path):
    # If the folder exists, delete it
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
    # Recreate the folder
    os.makedirs(folder_path)

# Recreate the folders
recreate_folder(preprocessed_folder)
recreate_folder(summarized_folder)

# Define keywords for extracting privacy, data, liability, etc.
keywords = ["privacy", "data", "liability", "third-party", "security", "cookies", "user rights"]

# Function to extract relevant clauses based on keywords
def extract_legal_clauses(text, keywords):
    clauses = []
    sentences = text.split(".")  # Split text into sentences

    for sentence in sentences:
        # Check if the sentence contains any keyword
        if any(keyword.lower() in sentence.lower() for keyword in keywords):
            clauses.append(sentence.strip())

    return clauses

# Function to summarize extracted clauses using LegalBERT
def summarize_text(clauses):
    summary = ""
    for clause in clauses:
        # Tokenize and encode the clause
        inputs = tokenizer(clause, return_tensors="pt", truncation=True, max_length=512)

        # Get model predictions (classification result)
        with torch.no_grad():
            outputs = model(**inputs)

        # You can process the outputs here if required (e.g., extract categories, confidence, etc.)
        # For now, we'll just append the sentence to the summary
        summary += clause + " "

    return summary.strip()

# Loop through all files in the input folder
for file_name in os.listdir(input_folder):
    input_path = os.path.join(input_folder, file_name)

    # Ensure we process only text files
    if os.path.isfile(input_path) and input_path.endswith(".txt"):
        # Read the file content
        with open(input_path, "r", encoding="utf-8") as file:
            content = file.read()

        # Preprocess the text
        preprocessed_content = preprocess_text(content)

        # Extract relevant clauses based on keywords
        relevant_clauses = extract_legal_clauses(preprocessed_content, keywords)

        # Generate summary from the extracted clauses
        summary = summarize_text(relevant_clauses)

        # Save the summary to the output folder
        output_path = os.path.join(output_folder, f"summary_{file_name}")
        with open(output_path, "w", encoding="utf-8") as file:
            file.write(summary)

print(f"Summarization completed. Summaries saved in '{output_folder}'.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Summarization completed. Summaries saved in 'PreprocessedDataSet'.
