In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import re
from docx import Document
# Sample lease text (you can replace this with actual lease content)
def read_docx(file_path):
    """
    Extract text from a .docx file.
    """
    doc = Document(file_path)
    text = []
    for paragraph in doc.paragraphs:
        if paragraph.text.strip():  # Only include non-empty paragraphs
            text.append(paragraph.text.strip())
    return "\n".join(text)

def clean_text(text):
    """
    Clean text for better embedding generation.
    """
    # Remove unnecessary whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text.strip()

raw_text = read_docx("propositions_output.docx")
cleaned_text = clean_text(raw_text)

# Initialize the RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,     # Maximum chunk size (in characters)
    chunk_overlap=20,   # Overlap between chunks (in characters)
    separators=["\n\n", "\n", ".", " ", ""],  # Prioritized separators
)

# Split the lease text into chunks
chunks = text_splitter.split_text(cleaned_text)

# Print the chunks
print("Text Chunks:")
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}:")
    print(chunk)
    print("---")
    

# Load a pre-trained Sentence Transformer model for embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings for each chunk
ner_embeddings = np.array(embedding_model.encode(chunks), dtype="float32")

# Create a FAISS vector index
ner_index = faiss.IndexFlatL2(ner_embeddings.shape[1])   
ner_index.add(ner_embeddings)

print(f"\nNumber of chunks indexed in FAISS: {ner_index.ntotal}")

  from .autonotebook import tqdm as notebook_tqdm


Text Chunks:
Chunk 1:
EX1A-6 MAT CTRCT 11 ark7_ex6-10.htm EXHIBIT 6.10 Exhibit 6
---
Chunk 2:
.10 RESIDENTIAL LEASE AGREEMENT This Lease Agreement (the  Agreement ) is made and entered on
---
Chunk 3:
made and entered on [CONTRACT_DATE] (the  Effective Date ) by and between ARK7 PROPERTIES LLC (the
---
Chunk 4:
LLC (the  Landlord ) and [TENANT1], [TENANT2] (the  Tenant )
---
Chunk 5:
. Subject to the terms and conditions stated below the parties agree as follows: If you choose to
---
Chunk 6:
If you choose to pay your rent using personal check, money order, or cashier s check, please make
---
Chunk 7:
check, please make your check payable to ARK7 INC
---
Chunk 8:
. and mail it to our company address listed below, before the due date each month: Ark7 Inc
---
Chunk 9:
. 535 Mission St, 14th Floor San Francisco, CA 94105 If any payment is returned for non-sufficient
---
Chunk 10:
for non-sufficient funds or because Tenant stops payments, then, after that, (i) Landlord may, in
---
Chunk 11

In [2]:
query = "What is the start date?"
query_embedding = embedding_model.encode([query], normalize_embeddings=True)

# Search for the top 3 most relevant chunks
D, I = ner_index.search(np.array(query_embedding, dtype="float32"), k=50)
retrieved_chunks = [chunks[i] for i in I[0]]

# Print the retrieved chunks
print("\nTop Matching Chunks for Query:")
for i, chunk in enumerate(retrieved_chunks):
    print(f"{i+1}. {chunk} (Distance: {D[0][i]:.2f})")
    print("---")


Top Matching Chunks for Query:
1. made and entered on [CONTRACT_DATE] (the  Effective Date ) by and between ARK7 PROPERTIES LLC (the (Distance: 1.20)
---
2. manner prescribed by law as of the Effective Date (Distance: 1.26)
---
3. will be present at the Premises for more than 14 consecutive days or 30 days in a calendar year (Distance: 1.35)
---
4. . and mail it to our company address listed below, before the due date each month: Ark7 Inc (Distance: 1.38)
---
5. Premises for not over 14 consecutive days or 30 days in a calendar year, and no more than two (Distance: 1.40)
---
6. .e. every two months) (Distance: 1.40)
---
7. .e., every two months) (Distance: 1.44)
---
8. . Mailed lease payments must be received on or before the due date (Distance: 1.48)
---
9. on a monthly or bi-monthly basis (i (Distance: 1.54)
---
10. .50, the payment of which will be due on the 1st day of the month following Tenant s receipt of an (Distance: 1.57)
---
11. the building at any time (Distance: 1.57)
---

In [3]:
#Trying NER with Spacy
import spacy
import json

# Load English language model
nlp = spacy.load("en_core_web_lg")

# Process the text with spaCy
doc = nlp(cleaned_text)

# Extract named entities and their labels
meta_data = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]

# Convert meta data to JSON format
meta_data_json = json.dumps(meta_data)
from IPython.display import JSON
JSON(meta_data)
meta_data

ModuleNotFoundError: No module named 'spacy'

In [4]:
from transformers import pipeline

# Load QA pipeline
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Questions for the rental agreement
questions = [
    {"question": "Name of the tenant", "context": cleaned_text},
    {"question": "What is the rent amount?", "context": cleaned_text},
    {"question": "Date of contract?", "context": cleaned_text},
]

# Extract answers
for q in questions:
    result = qa_pipeline(q)
    print(f"Q: {q['question']}\nA: {result['answer']} (Score: {result['score']:.2f})")


Device set to use cuda:0


Q: Name of the tenant
A: Landlord (Score: 0.00)
Q: What is the rent amount?
A: California (Score: 0.21)
Q: Date of contract?
A: LLC (Score: 0.00)
