In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries.
# The directory part becomes a sub-folder of KAGGLE_INPUT_PATH; the URL part is
# percent-decoded before it is fetched.
# NOTE(review): the URL is a signed link carrying X-Goog-Date / X-Goog-Expires
# parameters, so it is presumably time-limited -- regenerate it if the download
# fails.
DATA_SOURCE_MAPPING = 'legal-case-document-summarization:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4616972%2F7868867%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20241007%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20241007T102642Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D6c339aa36e5344ebbdb840e331fd5dd5c56da12f10c9412d9b96cf00bcc6e2f9519e235642aca081cfe0b94f3e0e71cbde6a30347a3ed18eb4cc3114e91605c811ee8dcb37758054b685d3e345014016410612ff5728bbbc4f5dbdc529728cb6150598ca43250e0c8dc59adbf217a93e372b0542a867608a8d75028e548641c9157d9c265b8447e85b3d9cb67ed01a514b3e8fd65c7436fb7ecd03bd168d01324587e82d3767342a10e51769caf2f3f5f002d2ef4feb0d8307ee5b8162c019e9350044855b53bfd34ca53aeebb2c73ca16e7019625b959af95bc059c4f411685c26ab283cf6963fbc46c2480ef81157c8531d74a03db1ba194cd271c51c49d57'

# Directories that are (re)created below; downloaded archives are extracted
# under KAGGLE_INPUT_PATH.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it under KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<percent-encoded URL>". Split only on the first
    # colon so a stray ':' in the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header lookup yields None when the server sends no
            # Content-Length; in that case skip the progress bar instead of
            # crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    # Clamp so a body longer than advertised cannot overflow the bar.
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk and rewind so the archive readers see
            # the complete download from its first byte.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # overwrite the module and break the next non-zip data source.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    # HTTPError is a subclass of OSError, so it has to be handled first.
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading legal-case-document-summarization, 98320130 bytes compressed
Downloaded and uncompressed: legal-case-document-summarization
Data source import complete.


In [None]:
!pip install transformers rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=8937dd31de51bf7a428688420c34c11528bfa436fb9d27b350a2ad433c78a4d1
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


## **Import Libraries and Load Models**

In [None]:
import os
import torch  # Import torch for tensor operations and model inference
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline, BertTokenizer, BertForSequenceClassification
from sklearn.metrics import classification_report

# Checkpoints used throughout the notebook.
SUMMARIZATION_CHECKPOINT = 'facebook/bart-large-cnn'
CLASSIFICATION_CHECKPOINT = 'bert-base-uncased'
SEED = 42

# Load the BART summarization model
summarization_tokenizer = BartTokenizer.from_pretrained(SUMMARIZATION_CHECKPOINT)
summarization_model = BartForConditionalGeneration.from_pretrained(SUMMARIZATION_CHECKPOINT)

# Load a BERT model for classification.
# The checkpoint has no classification head, so the classifier weights are
# newly initialised at load time. Seed torch first so that initialisation (and
# therefore every prediction made below) is the same on each run.
# NOTE: the head is still untrained -- fine-tune it before trusting its output.
torch.manual_seed(SEED)
classification_tokenizer = BertTokenizer.from_pretrained(CLASSIFICATION_CHECKPOINT)
classification_model = BertForSequenceClassification.from_pretrained(CLASSIFICATION_CHECKPOINT, num_labels=2)  # Adjust num_labels as needed

# Create pipelines
summarization_pipeline = pipeline("summarization", model=summarization_model, tokenizer=summarization_tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## **Load and Preprocess the Dataset**

In [None]:
# Define the dataset paths
train_judgement_path = '/kaggle/input/legal-case-document-summarization/dataset/UK-Abs/train-data/judgement'
test_judgement_path = '/kaggle/input/legal-case-document-summarization/dataset/UK-Abs/test-data/judgement'

# List the training judgement files. os.listdir returns entries in arbitrary
# order, so sort them to make the sample below the same on every run.
judgement_files_train = sorted(os.listdir(train_judgement_path))

# Ensure input text is within the model's maximum length
max_input_length = 1024  # BART's maximum input length

# Process a small sample of files from the training data with truncation
processed_texts_sample_train = []
for filename in judgement_files_train[:10]:  # Limiting to the first 10 files for summarization
    file_path = os.path.join(train_judgement_path, filename)
    # Only failures while reading the file are skipped; tokenizer errors are
    # left to surface instead of being reported as read errors.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading file {filename}: {e}")
        continue

    # Truncate at token level, then decode back to text for the pipeline.
    tokenized_text = summarization_tokenizer.encode(text, truncation=True, max_length=max_input_length)
    decoded_text = summarization_tokenizer.decode(tokenized_text, skip_special_tokens=True)
    processed_texts_sample_train.append(decoded_text)

## **Generate Summaries**

In [None]:
# Generate summaries for the loaded and truncated documents
summaries = []
for text in processed_texts_sample_train:
    # truncation=True lets the pipeline's own tokenizer clip the input to the
    # model limit. The texts were truncated earlier, but they are tokenized
    # again here and the decode/re-encode round trip is not guaranteed to give
    # the same token count, so guard against an over-long input.
    summary = summarization_pipeline(
        text,
        max_length=150,
        min_length=40,
        do_sample=False,
        truncation=True,
    )
    summaries.append(summary[0]['summary_text'])

# Display the summaries
for i, summary in enumerate(summaries):
    print(f"Document {i+1} Summary:")
    print(summary)
    print("\n")

Document 1 Summary:
The appeal raises the issue whether the daily vessel operating expenses of shipowners incurred while they were negotiating to reduce the ransom demands of pirates should be allowed in general average. The question is whether those expenses should be shared proportionately between all those whose property and entitlements were imperilled as a result of that seizure or whether they must be borne by the shipowner alone.


Document 2 Summary:
Appellant complained that he was not provided with appropriate rehabilitation courses following his recall to prison. He was released on licence after serving two thirds of the custodial term, but was recalled to custody after committing a further offence.


Document 3 Summary:
The six claimants were the owners and bill of lading holders for nine separate consignments of bagged Colombian green coffee beans. They were stowed in a total of 20 unventilated 20 foot containers. The bags in 18 of them were found to have suffered water da

## **Classification of Documents**

In [None]:
# Tokenize the generated summaries into padded/truncated PyTorch tensors.
classification_inputs = classification_tokenizer(
    summaries,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)

# Forward pass without gradient tracking; the predicted class is the index of
# the largest logit in each row.
# NOTE(review): the load-time warning says the classifier weights are newly
# initialised, so these predictions are placeholders until the model is trained.
with torch.no_grad():
    outputs = classification_model(**classification_inputs)
predictions = outputs.logits.argmax(dim=1)

# Translate class ids into display names (binary case: 0 = 'Type A', 1 = 'Type B').
labels = {0: 'Type A', 1: 'Type B'}
predicted_labels = [labels[class_id] for class_id in predictions.tolist()]

# Report one line per document.
for i, label in enumerate(predicted_labels):
    print(f"Document {i+1} Classified as: {label}")

Document 1 Classified as: Type A
Document 2 Classified as: Type A
Document 3 Classified as: Type A
Document 4 Classified as: Type A
Document 5 Classified as: Type A
Document 6 Classified as: Type A
Document 7 Classified as: Type A
Document 8 Classified as: Type A
Document 9 Classified as: Type A
Document 10 Classified as: Type A


## **Evaluate Classification Performance**

In [None]:
# Dummy reference labels (for demonstration purposes)
reference_labels = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]  # Replace with actual labels

# Hand scikit-learn plain Python ints rather than a torch tensor.
predicted_class_ids = predictions.tolist()

# The reference list is hard-coded, so fail with a clear message if the number
# of classified documents differs (e.g. because a file could not be read).
if len(predicted_class_ids) != len(reference_labels):
    raise ValueError(
        f"Got {len(predicted_class_ids)} predictions but "
        f"{len(reference_labels)} reference labels"
    )

# Generate a classification report.
# labels=[0, 1] keeps target_names aligned even when a class never occurs, and
# zero_division=0 reports 0.0 for a class with no predicted samples instead of
# emitting a warning for each undefined metric.
print("Classification Report:")
print(classification_report(
    reference_labels,
    predicted_class_ids,
    labels=[0, 1],
    target_names=['Type A', 'Type B'],
    zero_division=0,
))

Classification Report:
              precision    recall  f1-score   support

      Type A       0.50      1.00      0.67         5
      Type B       0.00      0.00      0.00         5

    accuracy                           0.50        10
   macro avg       0.25      0.50      0.33        10
weighted avg       0.25      0.50      0.33        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
