# Part 1. Install dependencies

We install the necessary Python libraries for:
- Sentence embeddings (`sentence-transformers`)
- Vector search (`faiss-cpu`)
- Pre-trained transformers (`transformers`, `accelerate`)
- Datasets and text processing (`datasets`, `regex`)


In [None]:
!pip install -q sentence-transformers faiss-cpu transformers[torch] accelerate datasets regex


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m61.8 MB/s[0m eta [36m0:00:00[0m
[?25h

# Part 2. Generate synthetic rental contracts

We generate 20 synthetic rental contracts.  
Each contract has randomly chosen fields: rent, deposit, dates, whether pets are allowed, notice period, late fee, and repair responsibilities.  
We also write JSONL and CSV datasets containing both the text and the structured entities.


In [None]:
from pathlib import Path
import re, json, random, csv

def generate_contract(i, rng=None):
    """Build one synthetic residential lease and its gold-standard entities.

    Args:
        i: Contract number, rendered in the ``CONTRACT #<i>`` header.
        rng: Optional object providing ``choice`` and ``randint`` (for
            example ``random.Random(seed)``). Defaults to the module-level
            ``random`` generator, so existing callers are unaffected.

    Returns:
        A ``(text, entities)`` tuple: the contract text with surrounding
        whitespace stripped, and a dict of the structured values that were
        rendered into that text.
    """
    rng = random if rng is None else rng

    # Keep this draw order stable: changing it changes what a given seed
    # produces.
    deposit_months = rng.choice([1, 1.5, 2])
    rent = rng.choice([3500, 4500, 5000, 6000, 7500])
    deposit_amount = int(rent * deposit_months)
    currency = "ZAR"
    start_date = f"2024-{rng.randint(1,12):02d}-{rng.choice([1,5,10,15,20]):02d}"
    end_date = f"2025-{rng.randint(1,12):02d}-{rng.choice([1,5,10,15,20]):02d}"
    pet_allowed = rng.choice([True, False])
    notice = rng.choice([7,14,30,60])
    late_fee = rng.choice([0, 100, 200, 500])
    landlord_resp = rng.choice([
        "Landlord is responsible for major structural repairs.",
        "Tenant must handle minor repairs; landlord covers roofing and plumbing.",
        "All repairs under 500 ZAR are tenant responsibility."
    ])
    tenant_name = rng.choice(["Alice M.", "Thabo D.", "Zanele K.", "James B.", "Mpho R."])

    # Tenant names end in an abbreviated initial ("Alice M."), so adding a
    # sentence-ending full stop unconditionally would render "Alice M..".
    full_stop = "" if tenant_name.endswith(".") else "."
    # Singular possessive for exactly one month, plural otherwise.
    month_unit = "month's" if deposit_months == 1 else "months'"
    pets = "yes" if pet_allowed else "no"

    text = f"""
CONTRACT #{i}
This Residential Lease Agreement is entered into between the Landlord and the Tenant {tenant_name}{full_stop}
Rent is {rent} {currency} per month. Tenant pays a security deposit equal to {deposit_months} {month_unit} rent (R{deposit_amount}).
Deposit will be held in a segregated investment account and interest will be shared between the company and the Tenant as per clause 5.
Lease starts on {start_date} and ends on {end_date}.
Notice period for termination is {notice} days. Late fee for overdue rent is R{late_fee} per day.
Pets allowed: {pets}.
Repairs: {landlord_resp}
Upon termination deposit return is subject to normal wear and tear assessment.
"""
    entities = {
        "deposit_amount": deposit_amount,
        "currency": currency,
        "deposit_months": deposit_months,
        "start_date": start_date,
        "end_date": end_date,
        "pet_allowed": pet_allowed,
        "notice_period_days": notice,
        "late_fee": late_fee,
        "tenant_name": tenant_name
    }
    return text.strip(), entities

# Build the corpus: contract ids 1..20, each paired with its text and gold entities.
Path("contracts_data").mkdir(exist_ok=True)
contracts = []
for contract_id in range(1, 21):
    contract_text, gold_entities = generate_contract(contract_id)
    contracts.append(
        {"contract_id": contract_id, "text": contract_text, "entities": gold_entities}
    )

# JSON Lines export: one JSON object per contract, one contract per line.
with open("contracts_data/contracts.jsonl","w",encoding="utf8") as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + "\n" for record in contracts)

# CSV export: same columns, with the entities dict serialized to a JSON string.
with open("contracts_data/contracts.csv","w",encoding="utf8", newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=["contract_id","text","entities"])
    writer.writeheader()
    writer.writerows(
        {
            "contract_id": record["contract_id"],
            "text": record["text"],
            "entities": json.dumps(record["entities"]),
        }
        for record in contracts
    )
print("Generated", len(contracts), "contracts in contracts_data/")


Generated 20 contracts in contracts_data/


# Part 3. Rule-based extraction (baseline NER)

We build a very simple regex-based extractor.  
This mimics Named Entity Recognition (NER): finding key fields like deposit amount, dates, notice period.


In [None]:
def rule_extract(text):
    """Extract lease fields from contract text with regular expressions.

    Args:
        text: Contract text in the wording produced by ``generate_contract``.

    Returns:
        A dict containing whichever of ``deposit_amount`` (int),
        ``start_date`` / ``end_date`` (``YYYY-MM-DD`` strings),
        ``notice_period_days`` (int) and ``pet_allowed`` (bool) were found.
        Fields with no match are omitted.
    """
    out = {}

    # Only accept an R-amount that follows the word "deposit" on the same
    # line; a bare R(\d+) would also pick up the late fee ("R500 per day").
    # The trailing \b rejects numbers longer than 7 digits instead of
    # silently truncating them.
    m = re.search(r"(?i:deposit)\b[^\n]*?R(\d{3,7})\b", text)
    if m:
        out["deposit_amount"] = int(m.group(1))

    m = re.search(r"starts on (\d{4}-\d{2}-\d{2})", text)
    if m:
        out["start_date"] = m.group(1)

    m = re.search(r"ends on (\d{4}-\d{2}-\d{2})", text)
    if m:
        out["end_date"] = m.group(1)

    # Lazy match: take the first "is N days" after "Notice period", not the
    # last one on the line, and allow "Notice period is N days" directly.
    m = re.search(r"Notice period\b[^\n]*?\bis (\d{1,3}) days", text)
    if m:
        out["notice_period_days"] = int(m.group(1))

    m = re.search(r"Pets allowed: (yes|no)", text, flags=re.I)
    if m:
        out["pet_allowed"] = m.group(1).lower() == "yes"

    return out

# Show gold labels next to the regex output for the first three contracts.
for sample in contracts[:3]:
    contract_text = sample["text"]
    print("-----")
    print(contract_text[:300])  # preview only; extraction below sees the full text
    print("GOLD:", sample["entities"])
    print("EXTRACTED:", rule_extract(contract_text))


-----
CONTRACT #1
This Residential Lease Agreement is entered into between the Landlord and the Tenant Zanele K.. 
Rent is 7500 ZAR per month. Tenant pays a security deposit equal to 2 months' rent (R15000). 
Deposit will be held in a segregated investment account and interest will be shared between the c
GOLD: {'deposit_amount': 15000, 'currency': 'ZAR', 'deposit_months': 2, 'start_date': '2024-04-15', 'end_date': '2025-03-10', 'pet_allowed': True, 'notice_period_days': 14, 'late_fee': 500, 'tenant_name': 'Zanele K.'}
EXTRACTED: {'deposit_amount': 15000, 'start_date': '2024-04-15', 'end_date': '2025-03-10', 'notice_period_days': 14, 'pet_allowed': True}
-----
CONTRACT #2
This Residential Lease Agreement is entered into between the Landlord and the Tenant Alice M.. 
Rent is 5000 ZAR per month. Tenant pays a security deposit equal to 2 months' rent (R10000). 
Deposit will be held in a segregated investment account and interest will be shared between the co
GOLD: {'deposit_amount': 10000

# Part 4. Embeddings + FAISS (vector DB)

We create embeddings for each contract text and index them in FAISS.  
This lets us search semantically — not by keywords, but by meaning.


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Sentence-embedding model; the same `embedder` is reused later to embed questions.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# One text per contract, in `contracts` order, so a row id returned by the
# index can be used directly as a position in `texts`.
texts = [c["text"] for c in contracts]
embs = embedder.encode(texts, convert_to_numpy=True)

# The index dimensionality is read from the second axis of the embedding matrix.
dim = embs.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embs)
print("FAISS index size:", index.ntotal)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS index size: 20


# Part 5. RAG Question-Answering

We use Flan-T5 (small) as a generator.  
The pipeline:  
1. Embed the user’s question.  
2. Find top relevant contracts via FAISS.  
3. Pass retrieved context + question into the model.  
4. Generate an answer.


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Model id of the generator; the tokenizer and the seq2seq weights are both
# loaded from this one id so they stay matched.
gen_model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(gen_model_name)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_name)

def rag_answer(question, top_k=3):
    """Answer a question using the contracts nearest to it in the FAISS index.

    Args:
        question: Natural-language question.
        top_k: Number of nearest contracts to request from the index. Fewer
            are used when the index holds fewer than ``top_k`` vectors.

    Returns:
        ``(answer, retrieved_texts)``: the decoded model output and the list
        of contract texts placed in the prompt, in retrieval order.
    """
    q_emb = embedder.encode([question], convert_to_numpy=True)
    _, neighbour_ids = index.search(q_emb, top_k)
    # FAISS pads the id row with -1 when fewer than top_k neighbours exist.
    # Without this filter texts[-1] would silently add the last contract as
    # if it were a real hit.
    retrieved_texts = [texts[idx] for idx in neighbour_ids[0] if idx >= 0]

    context = "\n\n".join([f"Document {i+1}:\n{t}" for i,t in enumerate(retrieved_texts)])
    prompt = f"Use the following contract excerpts to answer the question.\n\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer concisely:"
    # NOTE(review): the question sits at the end of the prompt; assuming the
    # tokenizer truncates from the right, a context longer than 1024 tokens
    # (large top_k) would cut the question off. Confirm before raising top_k.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    out = gen_model.generate(**inputs, max_length=256)
    ans = tokenizer.decode(out[0], skip_special_tokens=True)
    return ans, retrieved_texts

# Smoke test: run one fixed question through the pipeline with the default
# top_k and print the question and the generated answer.
q = "How much deposit does tenant have to pay and in what currency?"
ans, docs = rag_answer(q)
print("Q:", q)
print("A:", ans)


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Q: How much deposit does tenant have to pay and in what currency?
A: ZAR


# Part 6. Interactive demo

A small CLI loop: ask questions about contracts and get answers.


In [None]:
# Interactive loop: read a question, answer it from the three nearest
# contracts, and show the first 300 characters of each retrieved contract.
# Type "exit" or "quit" (any case), or press Ctrl-C / send EOF, to stop.
while True:
    try:
        q = input("Enter question (or 'exit'): ")
    except (EOFError, KeyboardInterrupt):
        # Interrupted prompt or closed stdin: end the demo without a traceback.
        print()
        break
    command = q.strip().lower()
    if command in ("exit","quit"):
        break
    if not command:
        # Nothing typed: re-prompt instead of running retrieval and
        # generation on an empty question.
        continue
    ans, docs = rag_answer(q, top_k=3)
    print("\nAnswer:\n", ans)
    print("\nTop retrieved doc excerpts:\n")
    for i, d in enumerate(docs):
        print(f"--- doc {i+1} ---\n{d[:300]}\n")


Enter question (or 'exit'): What country

Answer:
 Canada

Top retrieved doc excerpts:

--- doc 1 ---
CONTRACT #15
This Residential Lease Agreement is entered into between the Landlord and the Tenant Thabo D.. 
Rent is 5000 ZAR per month. Tenant pays a security deposit equal to 1.5 months' rent (R7500). 
Deposit will be held in a segregated investment account and interest will be shared between the 

--- doc 2 ---
CONTRACT #11
This Residential Lease Agreement is entered into between the Landlord and the Tenant Thabo D.. 
Rent is 4500 ZAR per month. Tenant pays a security deposit equal to 1.5 months' rent (R6750). 
Deposit will be held in a segregated investment account and interest will be shared between the 

--- doc 3 ---
CONTRACT #3
This Residential Lease Agreement is entered into between the Landlord and the Tenant Thabo D.. 
Rent is 3500 ZAR per month. Tenant pays a security deposit equal to 1 months' rent (R3500). 
Deposit will be held in a segregated investment account and inter

KeyboardInterrupt: Interrupted by user