In [None]:
# Import required libraries
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import drive

# Mount Google Drive so the dataset stored under MyDrive is readable from this runtime
drive.mount('/content/drive')

# Root directories of the training and test splits. load_data() reads the
# 'judgement' and 'summary' sub-folders found under each of these paths.
train_path = '/content/drive/MyDrive/LegalLoom/dataset/IN-Abs/train-data'
test_path = '/content/drive/MyDrive/LegalLoom/dataset/IN-Abs/test-data'

# Function to load documents and summaries from directories
def load_data(data_path):
    """Load paired judgement/summary text files into a DataFrame.

    Every ``*.txt`` file in ``<data_path>/judgement`` is read together with the
    file of the same name in ``<data_path>/summary``.

    Args:
        data_path: Directory containing the ``judgement`` and ``summary``
            sub-directories.

    Returns:
        pandas.DataFrame with columns ``document`` and ``summary``, one row per
        judgement file, ordered by file name.

    Raises:
        FileNotFoundError: If a directory is missing or a judgement has no
            summary file with the same name.
    """
    judgements_path = os.path.join(data_path, 'judgement')
    summaries_path = os.path.join(data_path, 'summary')

    docs = []
    summaries = []

    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting fixes the row order, so a seeded train/validation split of the
    # resulting frame selects the same rows on every run.
    for filename in sorted(os.listdir(judgements_path)):
        if not filename.endswith('.txt'):
            continue

        # Read the judgment (legal case document)
        with open(os.path.join(judgements_path, filename), 'r', encoding='utf-8') as f:
            document = f.read()

        # Read the corresponding summary (same file name, sibling directory)
        with open(os.path.join(summaries_path, filename), 'r', encoding='utf-8') as f:
            summary = f.read()

        # Append only once both reads succeeded so the two lists stay aligned
        docs.append(document)
        summaries.append(summary)

    return pd.DataFrame({'document': docs, 'summary': summaries})

# Read both splits from Drive into DataFrames (training first, then test)
train_data, test_data = load_data(train_path), load_data(test_path)

# Hold out 10% of the training rows for validation; the fixed seed keeps the
# shuffle repeatable for a given input frame
train_df, val_df = train_test_split(
    train_data,
    random_state=42,
    test_size=0.1,
)

# Report the (rows, columns) shape of each split
print("Training data loaded with shape:", train_df.shape)
print("Validation data loaded with shape:", val_df.shape)
print("Test data loaded with shape:", test_data.shape)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training data loaded with shape: (6327, 2)
Validation data loaded with shape: (703, 2)
Test data loaded with shape: (100, 2)


In [None]:
# NOTE(review): this cell repeats the import and mount performed in the first cell;
# the recorded output of that cell already reports the drive as mounted at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the datasets library (if not already installed)
!pip install datasets

# Import necessary libraries
import pandas as pd
from datasets import Dataset

# Convert the pandas splits (train_df, val_df and test_data, built in the loading cell)
# into Hugging Face Dataset objects. reset_index(drop=True) discards the shuffled index
# left by train_test_split and gives each frame a fresh 0..n-1 index before conversion.
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_data.reset_index(drop=True))



Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:0

In [None]:
!pip install transformers

from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "t5-base"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

print(f"Loaded {model_name} model and tokenizer successfully!")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Loaded t5-base model and tokenizer successfully!


In [None]:
from transformers import T5Tokenizer

# Re-create the tokenizer used by preprocess_function below.
# NOTE(review): this repeats the load done in the previous cell with the same
# "t5-base" checkpoint name.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Preprocessing function to tokenize inputs and outputs
def preprocess_function(examples):
    """Tokenize a batch of documents and summaries for T5 fine-tuning.

    Args:
        examples: Batch mapping with ``'document'`` and ``'summary'`` entries,
            each a list of strings (the form ``Dataset.map(..., batched=True)``
            passes in).

    Returns:
        The tokenizer output for the prefixed documents (truncated/padded to
        512 tokens) with an additional ``'labels'`` entry holding the summary
        token ids (truncated/padded to 128 tokens).
    """
    # Prefix the input with "summarize: " as per T5's training protocol
    inputs = ["summarize: " + doc for doc in examples['document']]

    # Tokenize the inputs (documents)
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    # Tokenize the summaries (outputs). `text_target=` is the supported replacement
    # for the deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples['summary'], max_length=128, truncation=True, padding='max_length')

    # Store the tokenized summaries as labels.
    # NOTE(review): labels are padded with the pad token id, so padded positions
    # presumably contribute to the loss; masking them with -100 is the usual remedy
    # but would require the code that decodes these labels to skip -100 first.
    model_inputs['labels'] = labels['input_ids']

    return model_inputs

# Apply the preprocessing function to every split. batched=True makes map() call
# preprocess_function with lists of examples rather than one example at a time.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Print a summary (feature names and row counts) of the tokenized data
print(tokenized_train)
print(tokenized_val)
print(tokenized_test)

Map:   0%|          | 0/6327 [00:00<?, ? examples/s]



Map:   0%|          | 0/703 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['document', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 6327
})
Dataset({
    features: ['document', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 703
})
Dataset({
    features: ['document', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})


In [None]:
# Import necessary libraries for model training
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch

# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the T5 model and move it to GPU if available
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",             # Output directory for model checkpoints
    per_device_train_batch_size=4,      # Batch size for training (adjust based on available memory)
    per_device_eval_batch_size=4,       # Batch size for evaluation
    num_train_epochs=3,                 # Number of training epochs
    # Evaluate at the end of each epoch. Set once: the deprecated
    # `evaluation_strategy` alias is intentionally not passed as well, since
    # supplying both is redundant and fails on releases that lack either name.
    eval_strategy="epoch",
    logging_dir='./logs',               # Log directory
    logging_steps=100,                  # Log every 100 steps
    save_strategy="epoch",              # Save checkpoints after each epoch
    save_total_limit=2,                 # Keep only the last 2 checkpoints
    fp16=torch.cuda.is_available(),     # Enable mixed precision only when a GPU is present
)

# Create the Data Collator for Seq2Seq tasks
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Ensure that trainer is initialized successfully
print("Trainer initialized successfully!")


Using device: cpu
Trainer initialized successfully!


In [None]:
# Report how many examples (rows) the tokenized training and validation splits contain
print(f"Number of training examples: {len(tokenized_train)}")
print(f"Number of validation examples: {len(tokenized_val)}")



Number of training examples: 6327
Number of validation examples: 703


# Training Model

In [None]:
# Start the training process: runs fine-tuning with the Trainer configured earlier
# (model, datasets and TrainingArguments all come from that cell).
trainer.train()

In [None]:
import streamlit as st
import easyocr
import nltk
# Fetch the NLTK resources the helpers below rely on: tokenizer models for
# sent_tokenize/word_tokenize, the stop-word list, and the POS tagger models.
# NOTE(review): both the plain and the '_tab' / '_eng' resource names are requested,
# presumably to cover different NLTK releases — confirm which one is actually needed.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from PIL import Image
import numpy as np

def extract_text(image, output_file):
    """OCR an image with EasyOCR and save the recognised text to a file.

    Args:
        image: Anything ``PIL.Image.open`` accepts for the source image.
        output_file: Path of the text file to (over)write with the result.

    Returns:
        Tuple ``(extracted_text, output_file)``: the recognised fragments
        joined by single spaces, and the path that was written.
    """
    # Normalise to 3-channel RGB before handing the pixel array to the OCR reader
    rgb_image = Image.open(image).convert("RGB")
    image_np = np.array(rgb_image)

    # English-only reader; the detail=0 results are joined as plain strings below
    reader = easyocr.Reader(['en'])
    results = reader.readtext(image_np, detail=0)
    extracted_text = " ".join(results)

    # Write as UTF-8 explicitly: OCR output can contain characters that the
    # platform's default encoding cannot represent, which would raise on write.
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(extracted_text)
    return extracted_text, output_file

def preprocess_text(text):
    """Split text into sentences and a filtered, lower-cased word list.

    Returns:
        Tuple ``(words, sentences)``: ``words`` holds every lower-cased token
        that is alphanumeric and not an English stop word; ``sentences`` is the
        unmodified sentence split of ``text``.
    """
    sentences = sent_tokenize(text)

    # Lower-case every token of every sentence, preserving document order
    lowered_tokens = []
    for sentence in sentences:
        lowered_tokens.extend(token.lower() for token in word_tokenize(sentence))

    stop_words = set(stopwords.words('english'))

    # Keep purely alphanumeric tokens that are not stop words
    words = list(filter(lambda tok: tok.isalnum() and tok not in stop_words, lowered_tokens))
    return words, sentences

def get_pos_tags(words):
    """Return the result of NLTK's ``pos_tag`` for the given word list."""
    tagged_words = nltk.pos_tag(words)
    return tagged_words

def get_key_sentences(sentences, pos_tags, top_n=3):
    """Rank sentences by the frequency of their content words.

    Args:
        sentences: Sentences to rank.
        pos_tags: Iterable of ``(word, tag)`` pairs; only words whose tag starts
            with ``N``, ``V`` or ``J`` are counted.
        top_n: Maximum number of sentences to return. Defaults to 3, the value
            that was previously hard-coded; negative values are treated as 0.

    Returns:
        Up to ``top_n`` sentences, highest score first. Sentences with equal
        scores keep their original relative order.
    """
    content_prefixes = ('N', 'V', 'J')
    word_freq = Counter(word for word, tag in pos_tags if tag.startswith(content_prefixes))

    # A sentence's score is the summed corpus frequency of its counted words
    scored = [
        (sentence, sum(word_freq[token] for token in word_tokenize(sentence.lower()) if token in word_freq))
        for sentence in sentences
    ]
    # list.sort is stable, so ties stay in document order
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [sentence for sentence, _ in scored[:max(top_n, 0)]]

def summarize_text(text):
    """Build an extractive summary by joining the key sentences of ``text``.

    The text is preprocessed, its filtered words are POS-tagged, and the
    sentences selected by ``get_key_sentences`` are joined with single spaces.
    """
    filtered_words, sentences = preprocess_text(text)
    top_sentences = get_key_sentences(sentences, get_pos_tags(filtered_words))
    return ' '.join(top_sentences)

# Evaluation

In [None]:
# Function to generate summaries on the test dataset
def generate_summary(batch, model, tokenizer, device, max_length=200, num_beams=4):
    """Generate one summary for a single tokenized example.

    Args:
        batch: Mapping with ``"input_ids"`` and ``"attention_mask"``. Values may
            be plain Python lists (what iterating a Hugging Face ``Dataset``
            yields by default) or tensors, with or without a batch dimension.
        model: Model exposing ``eval()`` and ``generate(...)``.
        tokenizer: Tokenizer exposing ``decode(ids, skip_special_tokens=...)``.
        device: Device the inputs are moved to before generation.
        max_length: Maximum length passed to ``generate``.
        num_beams: Beam width passed to ``generate``.

    Returns:
        The decoded text of the first generated sequence.
    """
    import torch  # local import so the function does not depend on another cell's imports

    model.eval()  # Set model to evaluation mode

    def _as_batched_tensor(values):
        # Rows read from a Dataset are lists and have no `.to()`; convert first,
        # then add the leading batch dimension that `generate` expects.
        tensor = torch.as_tensor(values)
        if tensor.dim() == 1:
            tensor = tensor.unsqueeze(0)
        return tensor.to(device)

    input_ids = _as_batched_tensor(batch["input_ids"])
    attention_mask = _as_batched_tensor(batch["attention_mask"])

    # Generate summaries (adjust max_length and num_beams as needed)
    generated_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True
    )

    # Decode generated IDs into readable text
    generated_summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return generated_summary

# Function to evaluate the model on the test dataset
def evaluate_model(model, tokenizer, test_dataset, device):
    """Generate a summary for every sample and collect its stored labels.

    Returns:
        Tuple ``(test_summaries, ground_truths)``: the generated text for each
        sample, and each sample's ``"labels"`` entry exactly as stored in the
        dataset, in iteration order.
    """
    model.eval()  # inference mode for the whole pass
    test_summaries, ground_truths = [], []

    for sample in test_dataset:
        # One generated summary per sample, paired with its reference labels
        test_summaries.append(generate_summary(sample, model, tokenizer, device))
        ground_truths.append(sample["labels"])

    return test_summaries, ground_truths

# Perform evaluation on the test dataset
test_summaries, ground_truths = evaluate_model(model, tokenizer, tokenized_test, device)

# Show up to 5 generated summaries next to their references. Slicing (instead of
# indexing range(5)) avoids an IndexError when the test set has fewer than 5 samples.
for i, (generated, reference_ids) in enumerate(zip(test_summaries[:5], ground_truths[:5])):
    print(f"Test Sample {i+1}:")
    print(f"Generated Summary: {generated}")
    # The references are stored as token ids, so decode them for display
    print(f"Ground Truth Summary: {tokenizer.decode(reference_ids, skip_special_tokens=True)}")
    print("\n")


# Testing

In [None]:
# After training, generate summaries on the test dataset
def generate_summary(test_dataset, model, tokenizer):
    """Generate a summary for every example of a tokenized dataset.

    NOTE: this redefines ``generate_summary`` from the evaluation cell with a
    different signature; once this cell has run, ``evaluate_model`` would call
    this version with arguments it does not accept.

    Args:
        test_dataset: Iterable of mappings with ``"input_ids"`` and
            ``"attention_mask"`` for a single example (lists or 1-D tensors).
        model: Torch model exposing ``eval()``, ``parameters()`` and
            ``generate(...)``.
        tokenizer: Tokenizer exposing ``decode(ids, skip_special_tokens=...)``.

    Returns:
        List of decoded summaries, one per example, in iteration order.
    """
    import torch  # local import so the function does not depend on another cell's imports

    model.eval()  # Set the model to evaluation mode

    # Send inputs to wherever the model's weights live rather than relying on a
    # global `device` variable defined in another cell.
    model_device = next(model.parameters()).device
    summaries = []

    for batch in test_dataset:
        # Dataset rows are plain lists by default; convert before adding the batch dim
        input_ids = torch.as_tensor(batch["input_ids"]).unsqueeze(0).to(model_device)
        attention_mask = torch.as_tensor(batch["attention_mask"]).unsqueeze(0).to(model_device)

        # Generate summary (you can adjust max_length and num_beams)
        summary_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=200, num_beams=4)

        # Decode and store the summary
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return summaries

# Generate summaries on the test set (one decoded string per test example)
test_summaries = generate_summary(tokenized_test, model, tokenizer)

# Output the first few (at most 5) summaries for inspection
for i, summary in enumerate(test_summaries[:5]):
    print(f"Summary {i+1}: {summary}")


/content/drive/MyDrive/LegalLoom/trained_model.joblib

In [None]:
import joblib
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Load the trained model from the saved file.
# SECURITY: joblib.load unpickles the file, and unpickling can execute arbitrary code;
# only load a file you created yourself or otherwise trust.
trained_model = joblib.load('/content/drive/MyDrive/LegalLoom/trained_model.joblib')

# Load the tokenizer (the base "t5-base" vocabulary, not one stored with the model)
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Set device for evaluation (GPU if available) and move the model there
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trained_model.to(device)

# Set the model to evaluation mode
trained_model.eval()

print("Model loaded successfully and set to evaluation mode.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Model loaded successfully and set to evaluation mode.


In [None]:
import os
from transformers import T5Tokenizer

# Define the test data directory (expected to contain 'judgement' and 'summary'
# sub-folders, which load_test_data reads)
test_dir = "/content/drive/MyDrive/LegalLoom/dataset/IN-Abs/test-data"

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Function to read judgement and summary pairs from test data
def load_test_data(test_dir):
    """Read judgement/summary text pairs from a test-data directory.

    Every ``*.txt`` file in ``<test_dir>/judgement`` is read together with the
    file of the same name in ``<test_dir>/summary``. This mirrors the ``.txt``
    filter and UTF-8 decoding used when loading the training data.

    Args:
        test_dir: Directory containing ``judgement`` and ``summary`` sub-folders.

    Returns:
        Tuple ``(judgements, summaries)`` of equally long lists of strings,
        ordered by file name.

    Raises:
        FileNotFoundError: If a directory is missing or a judgement has no
            summary file with the same name.
    """
    judgement_dir = os.path.join(test_dir, "judgement")
    summary_dir = os.path.join(test_dir, "summary")

    judgements = []
    summaries = []

    # Sorted for a reproducible order. The '.txt' filter skips stray entries such
    # as notebook checkpoint folders, which open() cannot read as files.
    for filename in sorted(os.listdir(judgement_dir)):
        if not filename.endswith('.txt'):
            continue

        # Explicit UTF-8 so decoding does not depend on the platform default
        with open(os.path.join(judgement_dir, filename), 'r', encoding='utf-8') as j_file:
            judgement_text = j_file.read()

        with open(os.path.join(summary_dir, filename), 'r', encoding='utf-8') as s_file:
            summary_text = s_file.read()

        judgements.append(judgement_text)
        summaries.append(summary_text)

    return judgements, summaries

# Load test judgements and summaries
test_judgements, test_summaries = load_test_data(test_dir)

# Tokenize the test data. The training preprocessing prefixes every document with
# "summarize: ", so inference inputs carry the same prefix; without it the model
# sees inputs formatted differently from the ones it was fine-tuned on.
# NOTE(review): assumes the loaded model was fine-tuned through preprocess_function — confirm.
tokenized_test = tokenizer(
    ["summarize: " + judgement for judgement in test_judgements],
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
tokenized_summaries = tokenizer(test_summaries, padding="max_length", truncation=True, max_length=150, return_tensors="pt")


In [None]:
# Set the model to evaluation mode
trained_model.eval()

# Move the model to the appropriate device
trained_model.to(device)

# Generate summaries for the test dataset, one judgement at a time
generated_summaries = []

for i in range(len(test_judgements)):
    # Select row i of the pre-tokenized tensors and restore the batch dimension
    input_ids = tokenized_test['input_ids'][i].unsqueeze(0).to(device)
    attention_mask = tokenized_test['attention_mask'][i].unsqueeze(0).to(device)

    # Generate summary (beam search with 4 beams, at most 150 tokens)
    generated_ids = trained_model.generate(input_ids, attention_mask=attention_mask, max_length=150, num_beams=4, early_stopping=True)

    # Decode generated summary
    generated_summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    generated_summaries.append(generated_summary)

# Print a sample comparison of reference text vs. model output
for i in range(3):  # Show a few examples
    print(f"Original Summary: {test_summaries[i]}")
    print(f"Generated Summary: {generated_summaries[i]}\n")


Original Summary: The respondent was required by the Chief Inspector of Shops and Establishments to register its establishment ' under the Delhi Shops and Establishments Act, 1954.
The respondent contended that it was not an 'establishment ' which is defined in section 2(9) as meaning 'a shop or a commercial estab lishment ' and did not comply with the direction.
Proceedings for prosecution of its Secretary were instituted, where upon the respondent filed a petition in the High Court for quashing the order of the appellant and for directing the Magistrate not to proceed with the complaint.
Before the High Court the appellant contended that the respondent was a 'commercial establishment ' because, the activity of the respondent amounted to a 'Profession ', and alternatively, that the case fell within the latter part of the definition of 'commercial establishment ' in section 2(5), inasmuch as its activities were connected with trade and business generally, but, it was not urged that the