In [None]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

def split_into_chunks(text, max_tokens, overlap, tokenizer, question, return_text=False):
    """
    Split ``text`` into sentence-aligned chunks formatted as model inputs.

    The text is split on blank lines into paragraphs and each paragraph into
    sentences with ``sent_tokenize``. Sentences are packed greedily into a chunk
    until the next sentence would push it past ``max_tokens - 50`` tokens (50
    tokens are reserved for the "question: ... context: " prefix). Each finished
    chunk is rendered as ``"question: {question} context: {chunk}"`` and encoded
    with truncation to ``max_tokens``, so no returned chunk exceeds ``max_tokens``
    tokens. A single sentence longer than the budget is therefore truncated.

    Parameters:
    - text (str): Document text to split.
    - max_tokens (int): Hard limit on the encoded length of a formatted chunk.
    - overlap (int): Overlap budget in tokens. Trailing sentences of a finished
      chunk are carried into the next chunk while their combined length stays
      within ``overlap // 2`` tokens.
    - tokenizer: Object exposing ``encode`` / ``decode`` in the Hugging Face style.
    - question (str): Text placed after "question: " in every chunk.
    - return_text (bool): If True, return decoded strings instead of token ids.

    Returns:
    - list: One entry per chunk; token-id lists, or strings when ``return_text``.
    """
    token_budget = max_tokens - 50  # 50 reserved for "question: ..." prefix
    overlap_budget = max(overlap // 2, 0)

    def render(chunk_sentences):
        # Format one chunk as a model input and truncate it to max_tokens.
        chunk_text = " ".join(chunk_sentences)
        formatted_text = f"question: {question} context: {chunk_text}"
        formatted_chunk = tokenizer.encode(
            formatted_text, add_special_tokens=True, truncation=True, max_length=max_tokens
        )
        if return_text:
            return tokenizer.decode(formatted_chunk, skip_special_tokens=False)
        return formatted_chunk

    chunks = []
    current_chunk = []    # sentences of the chunk being built
    current_lengths = []  # token length of each sentence in current_chunk
    token_count = 0       # always equals sum(current_lengths)

    for paragraph in text.split("\n\n"):
        for sentence in sent_tokenize(paragraph):
            sentence_length = len(tokenizer.encode(sentence, add_special_tokens=False))

            # Only flush a non-empty chunk: an over-long first sentence must not
            # produce a chunk that contains nothing but the question prefix.
            if current_chunk and token_count + sentence_length > token_budget:
                chunks.append(render(current_chunk))

                # Contextual overlap: keep whole trailing sentences while they
                # fit in the overlap budget AND still leave room for the new
                # sentence. Slicing by sentence count (the previous approach)
                # kept the entire chunk whenever it had fewer sentences than
                # overlap // 2, so the buffer never shrank and every later chunk
                # was truncated to the same leading tokens.
                room = min(overlap_budget, token_budget - sentence_length)
                keep_from = len(current_chunk)
                token_count = 0
                while keep_from > 0 and token_count + current_lengths[keep_from - 1] <= room:
                    keep_from -= 1
                    token_count += current_lengths[keep_from]
                current_chunk = current_chunk[keep_from:]
                current_lengths = current_lengths[keep_from:]

            current_chunk.append(sentence)
            current_lengths.append(sentence_length)
            token_count += sentence_length

    # Flush whatever is left after the last sentence.
    if current_chunk:
        chunks.append(render(current_chunk))

    return chunks


Second chunker: faster than the version above (sentences are tokenized in one batch per paragraph). Next step: try moving the work to the GPU.

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def split_into_chunks(text, max_tokens, overlap, tokenizer, question, return_text=False):
    """
    Split ``text`` into sentence-aligned chunks, tokenizing one paragraph per batch.

    Paragraphs are separated by blank lines and split into sentences with
    ``sent_tokenize``; all sentences of a paragraph are tokenized in a single
    ``batch_encode_plus`` call. Sentences are packed into a chunk until the next
    one would exceed ``max_tokens - 50`` tokens (50 reserved for the prefix).
    Finished chunks are rendered as ``"question: {question} context: {chunk}"``
    and encoded with truncation to ``max_tokens``.

    Parameters:
    - text (str): Document text to split.
    - max_tokens (int): Hard limit on the encoded length of a formatted chunk.
    - overlap (int): Overlap budget in tokens; trailing sentences totalling at
      most ``overlap // 2`` tokens are repeated at the start of the next chunk.
    - tokenizer: Object exposing ``batch_encode_plus`` / ``encode`` / ``decode``.
    - question (str): Text placed after "question: " in every chunk.
    - return_text (bool): If True, return decoded strings instead of token ids.

    Returns:
    - list: One entry per chunk; token-id lists, or strings when ``return_text``.
    """
    token_budget = max_tokens - 50  # 50 reserved for prefix
    overlap_budget = max(overlap // 2, 0)

    def emit(chunk_sentences):
        # Format, encode (truncated to max_tokens) and optionally decode a chunk.
        chunk_text = " ".join(chunk_sentences)
        formatted_text = f"question: {question} context: {chunk_text}"
        formatted_chunk = tokenizer.encode(
            formatted_text, add_special_tokens=True, truncation=True, max_length=max_tokens
        )
        return tokenizer.decode(formatted_chunk) if return_text else formatted_chunk

    chunks = []
    current_chunk = []    # sentences of the chunk being built
    current_lengths = []  # token lengths, parallel to current_chunk
    token_count = 0       # always equals sum(current_lengths)

    for paragraph in text.split("\n\n"):
        sentences = sent_tokenize(paragraph)
        if not sentences:
            # Nothing to tokenize; also avoids handing the tokenizer an empty batch.
            continue

        # Tokenize all sentences of the paragraph at once; only lengths are needed.
        encoded = tokenizer.batch_encode_plus(sentences, add_special_tokens=False)['input_ids']

        for sentence, sentence_ids in zip(sentences, encoded):
            sentence_length = len(sentence_ids)

            # Flush only a non-empty chunk, so an over-long first sentence does
            # not yield a chunk holding nothing but the question prefix.
            if current_chunk and token_count + sentence_length > token_budget:
                chunks.append(emit(current_chunk))

                # Contextual overlap. The lengths come from the sentences that
                # are actually retained (current_lengths), not from the tail of
                # the current paragraph, so token_count stays in sync with
                # current_chunk. The overlap is measured in tokens and must
                # leave room for the incoming sentence.
                room = min(overlap_budget, token_budget - sentence_length)
                keep_from = len(current_chunk)
                token_count = 0
                while keep_from > 0 and token_count + current_lengths[keep_from - 1] <= room:
                    keep_from -= 1
                    token_count += current_lengths[keep_from]
                current_chunk = current_chunk[keep_from:]
                current_lengths = current_lengths[keep_from:]

            current_chunk.append(sentence)
            current_lengths.append(sentence_length)
            token_count += sentence_length

    # Flush the final, possibly short, chunk.
    if current_chunk:
        chunks.append(emit(current_chunk))

    return chunks


In [None]:
# Function to assign a confidence score (New)
def compute_confidence(chunk_tokens, answers, tokenizer):
    """
    Score how much of the expected answers' tokens appear in a chunk.

    For every answer the token recall is computed: the fraction of the answer's
    tokens (duplicates counted) that occur anywhere in ``chunk_tokens``. The
    score is the mean recall over all answers, rounded to two decimals. Token
    order and adjacency are not checked.

    Parameters:
    - chunk_tokens (list[int]): Tokenized chunk (list of token IDs).
    - answers (list[str]): List of possible correct answers.
    - tokenizer: Tokenizer exposing ``encode(text, add_special_tokens=False)``.

    Returns:
    - confidence_score (float): 0.0 when ``answers`` is empty or no answer token
      occurs in the chunk, up to 1.0 when every token of every answer occurs.
    """
    if not answers:
        return 0.0  # No answer should be found

    # Set membership is O(1); scanning the chunk list for every answer token
    # was O(len(chunk)) per lookup.
    chunk_token_set = set(chunk_tokens)

    recall_sum = 0.0
    for answer in answers:
        answer_tokens = tokenizer.encode(answer, add_special_tokens=False)
        if not answer_tokens:
            continue  # contributes nothing; also avoids dividing by zero

        # Note: testing `answer_tokens in chunk_tokens` cannot detect an exact
        # match (it asks whether the list itself is an element of a list of
        # ints), so the score is defined directly as token recall.
        hits = sum(1 for token in answer_tokens if token in chunk_token_set)
        recall_sum += hits / len(answer_tokens)

    if recall_sum == 0:
        return 0.0  # No match found
    return round(recall_sum / len(answers), 2)

Next step: move `compute_confidence` to the GPU. The version below is already faster than the one above.

In [None]:
def compute_confidence(chunk_tokens, answers, tokenizer):
    """
    Computes a confidence score as the best token recall over all answers.

    For each answer, recall is the fraction of its distinct tokens that occur
    in the chunk (order and adjacency are not checked). The score is the
    highest recall of any answer, so a chunk that fully contains at least one
    answer scores 1.0 even if other answers are absent.

    Parameters:
    - chunk_tokens (iterable of int): Token IDs of the chunk.
    - answers (list[str]): Possible correct answers.
    - tokenizer: Tokenizer exposing ``encode(text, add_special_tokens=False)``.

    Returns:
    - confidence_score (float): Ranges from 0.0 (no match) to 1.0 (perfect match).
    """
    if not answers:
        return 0.0  # No answer should be found

    # Convert chunk tokens to a set for O(1) lookups
    chunk_token_set = set(chunk_tokens)

    best_recall = 0.0
    for answer in answers:
        answer_set = set(tokenizer.encode(answer, add_special_tokens=False))
        if not answer_set:
            # An answer that tokenizes to nothing is no evidence of a match
            # (an empty set is a subset of everything, which used to count as
            # a perfect match).
            continue

        recall = len(answer_set & chunk_token_set) / len(answer_set)
        if recall == 1.0:
            # A fully matched answer is the maximum possible recall. It must
            # take part in the max; otherwise one exact match plus one partial
            # match would report only the partial ratio.
            return 1.0
        best_recall = max(best_recall, recall)

    return round(best_recall, 2)


Old data-preprocessing cell

In [None]:
import json
from multiprocessing import Pool
from tqdm import tqdm

# Read the CUAD JSON file and keep the list stored under its top-level "data" key
with open('/content/drive/My Drive/Colab Notebooks/CUAD_v1.json', 'r', encoding='utf-8') as file:
    dataset = json.loads(file.read())["data"]

# Function to process a single contract
def process_contract(contract):
    """
    Turn one CUAD contract into a list of per-chunk training records.

    For every question attached to every paragraph of the contract, the
    paragraph context is cleaned, split into chunks with ``split_into_chunks``
    and each chunk is scored against the question's answers with
    ``compute_confidence``.

    Parameters:
    - contract (dict): Reads ``contract["title"]`` and ``contract["paragraphs"]``;
      each paragraph provides ``"context"`` and ``"qas"``, and each qa provides
      ``"question"``, ``"answers"`` (items with ``"text"``) and ``"is_impossible"``.

    Returns:
    - list[dict]: One record per (question, chunk) with keys ``doc_id``,
      ``input``, ``expected_output``, ``answer_present`` and ``confidence_score``.

    Raises:
    - IndexError: if a question does not contain ``related to "<clause>"``.
    """
    processed_contract_data = []
    doc_id = contract["title"]

    for paragraph in contract["paragraphs"]:
        context = clean_text(paragraph["context"])

        for qa in paragraph["qas"]:
            # The clause type is the quoted text that follows 'related to "'.
            clause_type = clean_text(qa["question"].split("related to \"")[1].split("\"")[0])
            answers = [clean_text(ans["text"]) for ans in qa["answers"]] if not qa["is_impossible"] else []

            # Generate refined instruction prompt
            # NOTE(review): the prompt is built from the full context, so every
            # chunk of this question gets the same "input" - confirm whether the
            # chunk text was meant to be used instead.
            instruction_prompt = generate_instruction_prompt(clause_type, context)

            # Split into chunks
            chunks = split_into_chunks(context, MAX_TOKENS, OVERLAP, tokenizer, clause_type)

            for chunk in chunks:
                # Compute confidence score. This must be the numeric scorer:
                # the boolean presence check would store True/False under
                # "confidence_score".
                confidence = compute_confidence(chunk, answers, tokenizer)
                answer_present = confidence > 0

                # Store structured data
                processed_contract_data.append({
                    "doc_id": doc_id,
                    "input": instruction_prompt,
                    "expected_output": answers if answer_present else [],
                    "answer_present": answer_present,
                    "confidence_score": confidence
                })

    return processed_contract_data

# Use multiprocessing with 8 workers and tqdm progress bar
if __name__ == "__main__":
    print("Starting dataset processing with 8 workers...\n")

    # Collect one list of records per contract, in completion order
    # (imap_unordered yields results as workers finish them).
    results = []
    with Pool(processes=8) as pool, tqdm(total=len(dataset)) as pbar:
        for contract_records in pool.imap_unordered(process_contract, dataset):
            results.append(contract_records)
            pbar.update(1)

    # Flatten the per-contract lists into a single list of examples
    processed_data = []
    for contract_records in results:
        processed_data.extend(contract_records)

    print("\nDataset processing complete! Saving results...")

    # Write the processed dataset to Drive as indented JSON
    save_path = "/content/drive/My Drive/Colab Notebooks/NLP_266_Project/chunked_datasets/enhanced_chunking_dataset.json"
    with open(save_path, "w", encoding="utf-8") as file:
        json.dump(processed_data, file, indent=4)

    print(f" Saved {len(processed_data)} processed examples to {save_path}")


In [None]:
import torch
from transformers import AutoTokenizer

# Pick the device string used for tensors created in this notebook.
# (Only the string is chosen here; the tokenizer is not moved to any device.)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the tokenizer for the FLAN-T5 base checkpoint (fast implementation requested)
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)


# Chunking parameters
MAX_TOKENS = 512  # Larger context for legal clauses
OVERLAP = 256  # More overlap to avoid clause splits

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def split_into_chunks(text, max_tokens, overlap, tokenizer, question, return_text=False):
    """
    Split ``text`` into sentence-aligned chunks formatted as model inputs.

    Paragraphs (separated by blank lines) are split into sentences with
    ``sent_tokenize`` and tokenized one paragraph per ``batch_encode_plus``
    call. Sentences are packed into a chunk until the next one would exceed
    ``max_tokens - 50`` tokens (50 reserved for the prefix). Finished chunks are
    rendered as ``"question: {question} context: {chunk}"`` and encoded with
    truncation to ``max_tokens``.

    Token ids are handled as plain Python lists. Copying them into device
    tensors and straight back with ``.tolist()`` did not change the result, so
    no tensors are created here.
    NOTE(review): creating CUDA tensors inside multiprocessing workers can also
    fail depending on the start method - confirm before reintroducing them.

    Parameters:
    - text (str): Document text to split.
    - max_tokens (int): Hard limit on the encoded length of a formatted chunk.
    - overlap (int): Overlap budget in tokens; trailing sentences totalling at
      most ``overlap // 2`` tokens are repeated at the start of the next chunk.
    - tokenizer: Object exposing ``batch_encode_plus`` / ``encode`` / ``decode``.
    - question (str): Text placed after "question: " in every chunk.
    - return_text (bool): If True, return decoded strings instead of token ids.

    Returns:
    - list: One entry per chunk; token-id lists, or strings when ``return_text``.
    """
    token_budget = max_tokens - 50  # 50 reserved for prefix
    overlap_budget = max(overlap // 2, 0)

    def build_chunk(chunk_sentences):
        # Format, encode (truncated to max_tokens) and optionally decode a chunk.
        chunk_text = " ".join(chunk_sentences)
        formatted_text = f"question: {question} context: {chunk_text}"
        formatted_chunk = tokenizer.encode(
            formatted_text, add_special_tokens=True, truncation=True, max_length=max_tokens
        )
        return tokenizer.decode(formatted_chunk) if return_text else formatted_chunk

    def overlap_start(lengths, room):
        # Index of the first trailing sentence to keep so the kept tail has at
        # most `room` tokens; returns (start_index, kept_token_count).
        start, kept = len(lengths), 0
        while start > 0 and kept + lengths[start - 1] <= room:
            start -= 1
            kept += lengths[start]
        return start, kept

    chunks = []
    current_chunk = []    # sentences of the chunk being built
    current_lengths = []  # token lengths, parallel to current_chunk
    token_count = 0       # always equals sum(current_lengths)

    for paragraph in text.split("\n\n"):
        sentences = sent_tokenize(paragraph)
        if not sentences:
            # Nothing to tokenize; also avoids handing the tokenizer an empty batch.
            continue

        # Tokenize all sentences of the paragraph at once; only lengths are needed.
        encoded = tokenizer.batch_encode_plus(sentences, add_special_tokens=False)['input_ids']

        for sentence, sentence_ids in zip(sentences, encoded):
            sentence_length = len(sentence_ids)

            # Flush only a non-empty chunk, so an over-long first sentence does
            # not yield a chunk holding nothing but the question prefix.
            if current_chunk and token_count + sentence_length > token_budget:
                chunks.append(build_chunk(current_chunk))

                # Contextual overlap, measured in tokens over the sentences that
                # are actually retained, and leaving room for the new sentence.
                # Slicing the last overlap // 2 sentences kept the whole chunk
                # for typical settings, and summing the paragraph's lengths put
                # token_count out of sync with current_chunk.
                room = min(overlap_budget, token_budget - sentence_length)
                keep_from, token_count = overlap_start(current_lengths, room)
                current_chunk = current_chunk[keep_from:]
                current_lengths = current_lengths[keep_from:]

            current_chunk.append(sentence)
            current_lengths.append(sentence_length)
            token_count += sentence_length

    # Flush the final, possibly short, chunk.
    if current_chunk:
        chunks.append(build_chunk(current_chunk))

    return chunks


def clean_text(text):
    """
    Cleans text by trimming it and collapsing whitespace runs.

    Leading/trailing whitespace is stripped and every run of whitespace
    (spaces, tabs, newlines) is replaced by a single space.

    Parameters:
    - text (str): Text to normalise.

    Returns:
    - str: The cleaned text.
    """
    # Imported locally so the function does not rely on some other cell having
    # imported `re` first.
    import re

    return re.sub(r'\s+', ' ', text.strip())



def generate_instruction_prompt(clause_type, context):
    """
    Build the extraction prompt for one clause type and one context.

    The prompt consists of a role/task sentence, five numbered rules, a
    ``---`` separator, and the ``QUESTION`` / ``CONTEXT`` lines, joined by
    newlines (no trailing newline).

    Parameters:
    - clause_type (str): Name of the clause to extract; also used as the question.
    - context (str): Text the clause should be extracted from.

    Returns:
    - str: The assembled prompt.
    """
    preamble = (
        "You are a legal document analyst specializing in contract clause identification. "
        f"Your task is to accurately extract the {clause_type} from the provided text. "
        "Follow these steps carefully:"
    )
    rules = [
        "1. Identify the exact clause without adding or removing words.",
        f"2. If the {clause_type} is missing or is not explicitly present, return [].",
        "3. Ensure no unrelated text is included in the extraction.",
        "4. If only part of the clause appears, extract the full statement if possible; otherwise, return [].",
        "5. Do not make assumptions or interpret missing text—strictly return what is present.",
    ]
    payload = [
        "---",
        f"QUESTION: {clause_type}",
        f"CONTEXT: {context}",
    ]
    return "\n".join([preamble] + rules + payload)


import torch

def compute_answer_presence(chunk_tokens, answers, tokenizer):
    """
    Checks whether all tokens of at least one answer occur in the chunk.

    This is a bag-of-tokens test: an answer counts as present when every one of
    its token ids appears somewhere in ``chunk_tokens``; order and adjacency
    are not checked. The chunk is turned into a Python set once, so each lookup
    is O(1). Testing ``token in tensor`` instead compares against the whole
    tensor for every single token.

    Parameters:
    - chunk_tokens (list[int] or tensor-like with ``tolist``): Chunk token ids.
    - answers (list[str]): Possible correct answers.
    - tokenizer: Tokenizer exposing ``encode(text, add_special_tokens=False)``.

    Returns:
    - answer_present (bool): True if any answer is found in chunk, else False.
      Answers that tokenize to nothing are ignored.
    """
    if not answers:
        return False  # No answer should be found

    # Accept tensors as well as plain lists of token ids.
    if hasattr(chunk_tokens, "tolist"):
        chunk_tokens = chunk_tokens.tolist()
    chunk_token_set = set(chunk_tokens)

    for answer in answers:
        answer_tokens = tokenizer.encode(answer, add_special_tokens=False)
        # Require at least one token: an empty answer would otherwise be
        # trivially "contained" in every chunk.
        if answer_tokens and chunk_token_set.issuperset(answer_tokens):
            return True  # Answer found, exit early

    return False  # No answer found

# Read the CUAD JSON file and keep the list stored under its top-level "data" key
with open('/content/drive/My Drive/Colab Notebooks/CUAD_v1.json', 'r', encoding='utf-8') as file:
    dataset = json.loads(file.read())["data"]


# Function to process a single contract
def process_contract(contract):
    """
    Turn one CUAD contract into a list of per-chunk training records.

    For every question attached to every paragraph of the contract, the
    paragraph context is cleaned, split into chunks with ``split_into_chunks``
    and each chunk is labelled with ``compute_answer_presence``.

    Parameters:
    - contract (dict): Reads ``contract["title"]`` and ``contract["paragraphs"]``;
      each paragraph provides ``"context"`` and ``"qas"``, and each qa provides
      ``"question"``, ``"answers"`` (items with ``"text"``) and ``"is_impossible"``.

    Returns:
    - list[dict]: One record per (question, chunk) with keys ``doc_id``,
      ``input``, ``expected_output`` and ``answer_present``.

    Raises:
    - IndexError: if a question does not contain ``related to "<clause>"``.
    """
    processed_contract_data = []
    doc_id = contract["title"]

    for paragraph in contract["paragraphs"]:
        context = clean_text(paragraph["context"])

        for qa in paragraph["qas"]:
            # The clause type is the quoted text that follows 'related to "'.
            clause_type = clean_text(qa["question"].split("related to \"")[1].split("\"")[0])
            answers = [clean_text(ans["text"]) for ans in qa["answers"]] if not qa["is_impossible"] else []

            # Generate refined instruction prompt
            # NOTE(review): the prompt is built from the full context, so every
            # chunk of this question gets the same "input" - confirm whether the
            # chunk text was meant to be used instead.
            instruction_prompt = generate_instruction_prompt(clause_type, context)

            # Split into chunks (lists of token ids)
            chunks = split_into_chunks(context, MAX_TOKENS, OVERLAP, tokenizer, clause_type)

            for chunk in chunks:
                # Binary flag: does this chunk contain one of the answers?
                answer_present = compute_answer_presence(chunk, answers, tokenizer)

                # Store structured data
                processed_contract_data.append({
                    "doc_id": doc_id,
                    "input": instruction_prompt,
                    "expected_output": answers if answer_present else [],
                    "answer_present": answer_present,
                })

    return processed_contract_data

In [1]:
import multiprocessing
import psutil

# Report the logical CPU count and the machine's total RAM in GB (10**9 bytes)
cpu_total = multiprocessing.cpu_count()
ram_total_gb = round(psutil.virtual_memory().total / 1e9, 2)
print("CPUs available:", cpu_total)
print("RAM available (GB):", ram_total_gb)

CPUs available: 12
RAM available (GB): 89.63


In [2]:
# Check the GPU: `!` runs nvidia-smi as a shell command and shows the GPU model,
# driver/CUDA versions and current memory usage.
!nvidia-smi


Wed Mar 26 03:36:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P0             44W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, the name of device 0
gpu_available = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if gpu_available else "None"
print("GPU available:", gpu_available)
print("GPU name:", gpu_name)


GPU available: True
GPU name: NVIDIA A100-SXM4-40GB
