In [5]:
import os
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
import torch

# Set paths for original and annotated documents
original_docs_path = "/kaggle/input/para-splitting-data/data_splitting/original"
annotated_docs_path = "/kaggle/input/para-splitting-data/data_splitting/annotated"

# Load the 50 documents with the naming convention in mind
def load_documents_with_naming_convention(original_docs_path, annotated_docs_path):
    original_files = sorted(os.listdir(original_docs_path))
    original_documents = []
    annotated_documents = []
    
    for file in original_files:
        if file.endswith('.txt'):
            # Load the original document
            with open(os.path.join(original_docs_path, file), 'r', encoding='utf-8') as f:
                original_documents.append(f.read())
            
            # Load the corresponding annotated document
            annotated_file = 'cleaned_' + file
            with open(os.path.join(annotated_docs_path, annotated_file), 'r', encoding='utf-8') as f:
                annotated_documents.append(f.read())
    
    return original_documents, annotated_documents

# Load original and corresponding annotated documents
original_documents, annotated_documents = load_documents_with_naming_convention(original_docs_path, annotated_docs_path)

print(f"Loaded {len(original_documents)} original and {len(annotated_documents)} annotated documents.")

Loaded 50 original and 50 annotated documents.


In [6]:
class ParagraphT5Dataset(Dataset):
    def __init__(self, original_docs, annotated_docs, tokenizer, max_len=512):
        self.original_docs = original_docs
        self.annotated_docs = annotated_docs
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.original_docs)

    def __getitem__(self, idx):
        original_text = self.original_docs[idx]
        annotated_text = self.annotated_docs[idx]
        
        # Tokenize input and output
        input_encoding = self.tokenizer(original_text, max_length=self.max_len, truncation=True, padding='max_length', return_tensors="pt")
        target_encoding = self.tokenizer(annotated_text, max_length=self.max_len, truncation=True, padding='max_length', return_tensors="pt")
        
        input_ids = input_encoding['input_ids'].squeeze()  # Remove batch dimension
        attention_mask = input_encoding['attention_mask'].squeeze()
        labels = target_encoding['input_ids'].squeeze()

        return input_ids, attention_mask, labels

In [7]:
# Load pre-trained T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Create the dataset
dataset = ParagraphT5Dataset(original_documents, annotated_documents, tokenizer)

# Split into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)

In [8]:
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and learning rate
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [9]:
def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        
        # Clear the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [11]:
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [13]:
# Training process
epochs = 3
for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, device)
    val_loss = evaluate(model, val_loader, device)
    print(f"Epoch {epoch+1}/{epochs}:")
    print(f"Training Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")

Epoch 1/3:
Training Loss: 1.8890
Validation Loss: 0.7025
Epoch 2/3:
Training Loss: 1.0721
Validation Loss: 0.3939
Epoch 3/3:
Training Loss: 0.7206
Validation Loss: 0.2676


In [14]:
# Save the fine-tuned model
model.save_pretrained('t5_paragraph_splitter')
tokenizer.save_pretrained('t5_paragraph_splitter')

('t5_paragraph_splitter/tokenizer_config.json',
 't5_paragraph_splitter/special_tokens_map.json',
 't5_paragraph_splitter/spiece.model',
 't5_paragraph_splitter/added_tokens.json')

In [16]:
# Function to load a document from a .txt file
def load_test_document(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        document = file.read()
    return document

# Updated function to split paragraphs using a loaded .txt file
def split_paragraphs_from_file(model, tokenizer, file_path, device):
    # Load the document from the file
    document = load_test_document(file_path)
    
    # Tokenize and generate the output
    input_ids = tokenizer(document, return_tensors='pt', truncation=True, padding=True).input_ids.to(device)
    
    # Generate the segmented document
    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, max_length=512)
    
    # Decode the output
    segmented_document = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return segmented_document

In [17]:
# Specify the path to the test .txt file
test_file_path = "/kaggle/input/para-splitting-data/data_splitting/original/_judgment_judis_29890.txt"  # Replace with the actual file path

# Perform inference and print the segmented output
segmented_output = split_paragraphs_from_file(model, tokenizer, test_file_path, device)
print("Segmented Document with Paragraphs:")
print(segmented_output)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Segmented Document with Paragraphs:
the appeal has been preferred against the judgment and order dated 2.1.2007 of Gauhati High Court by which the appeal preferred by the appellants was disposed of with the modification that the sentence of five years R.I. and fine of Rs.7,000/- imposed upon each of the appellants under Section 313 read with Section 34 IPC by the learned Additional Sessions Judge, Kokrajhar, was reduced to three years R.I. and fine of Rs.5,000/- of both the appellants. The court took regard to the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact that the fact