In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import re
from docx import Document

def read_docx(file_path):
    """
    Extract text from a .docx file.
    """
    doc = Document(file_path)
    text = []
    for paragraph in doc.paragraphs:
        if paragraph.text.strip():  # Only include non-empty paragraphs
            text.append(paragraph.text.strip())
    return "\n".join(text)

def clean_text(text):
    """
    Clean text for better embedding generation.
    """
    # Remove unnecessary whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text.strip()

def chunk_text_semantically(text, chunk_size=400, chunk_overlap=50):
    """
    Chunk text using RecursiveCharacterTextSplitter to retain semantic structure.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""],  # Semantic priority
    )
    return text_splitter.split_text(text)

raw_text = read_docx("test_sample.pdf.docx")
cleaned_text = clean_text(raw_text)
chunks = chunk_text_semantically(cleaned_text, chunk_size=400, chunk_overlap=50)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
chunks

['EX1A-6 MAT CTRCT 11 ark7_ex6-10.htm EXHIBIT 6.10 Exhibit 6.10 RESIDENTIAL LEASE AGREEMENT This Lease Agreement (the  Agreement ) is made and entered on [CONTRACT_DATE] (the  Effective Date ) by and between ARK7 PROPERTIES LLC (the  Landlord ) and [TENANT1], [TENANT2] (the  Tenant )',
 '. Subject to the terms and conditions stated below the parties agree as follows: If you choose to pay your rent using personal check, money order, or cashier s check, please make your check payable to ARK7 INC. and mail it to our company address listed below, before the due date each month: Ark7 Inc',
 '. 535 Mission St, 14th Floor San Francisco, CA 94105 If any payment is returned for non-sufficient funds or because Tenant stops payments, then, after that, (i) Landlord may, in writing, require Tenant to pay Rent in cash for three months and (ii) all future Rent shall be paid by cashier s check or money order',
 '. In the event of roommates, or another form of joint or multiple occupancy, Tenant will b

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import json

model_name = "chentong00/propositionizer-wiki-flan-t5-large"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)



cuda


In [5]:
propositions = []
title = "Rental Lease Agreement"
for idx, chunk in enumerate(chunks):
    input_text = f"Title: {title}. Section: Chunk {idx + 1}. Content: {chunk}"
    input_ids = tokenizer(input_text, return_tensors="pt", truncation=True).input_ids.to(device)
    
    # Generate propositions
    outputs = model.generate(input_ids, max_new_tokens=512).cpu()
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Parse JSON output
    try:
        prop_list = json.loads(output_text)
        print(prop_list)
        propositions.extend(prop_list)
    except json.JSONDecodeError:
        print(f"[ERROR] Failed to parse output for Chunk {idx + 1}")
        
print(json.dumps(propositions, indent=2))

['EX1A-6 MAT CTRCT 11 ark7_ex6-10.htm EXHIBIT 6.10 Exhibit 6.10 RESIDENTIAL LEASE AGREEMENT This Lease Agreement is made and entered on [CONTRACT_DATE].', 'The Effective Date of the Lease Agreement is [CONTRACT_DATE].', 'ARK7 PROPERTIES LLC is the Landlord.', 'TENANT1 is the Tenant.', 'TENANT2 is the Tenant.']
['The parties agree to pay rent using personal check, money order, or cashier s check.', 'Make your check payable to ARK7 INC.', 'Mail your check to the company address listed below.', 'Pay your rent before the due date each month.']
['The address is 535 Mission St, 14th Floor, San Francisco, CA 94105.', 'If any payment is returned for non-sufficient funds or because Tenant stops payments, then Landlord may require Tenant to pay Rent in cash for three months.', 'After the return of a payment, Landlord may require Tenant to pay Rent by cashier s check or money order.']
['In the event of roommates, or another form of joint or multiple occupancy, Tenant will be responsible for colle

In [7]:
import json
from docx import Document

# Assuming `propositions` is the JSON object you want to save

# Define the file path
file_path = "propositions_output.docx"

# Create a new Word document
doc = Document()

# Add a title to the document (optional)
doc.add_heading("Propositions Output", level=1)

# Add the JSON content to the document
doc.add_paragraph(json.dumps(propositions, indent=2, ensure_ascii=False))

# Save the document
doc.save(file_path)

print(f"Propositions successfully saved to {file_path}")



Propositions successfully saved to propositions_output.docx
