# Real Estate Synthetic Data Generator

This notebook generates synthetic data for a real estate application, including:
- **Buyers:** Prospective buyers with financial profiles.
- **Houses for Sale:** Listings in Central NJ.
- **Past Sales:** Historical sales records.
- **Basic Transcripts:** Simulated call transcripts (rule-based).
- **LLM Transcripts:** Simulated call transcripts (using OpenAI, if API key is configured).

Data will be saved to a specified output directory.


In [9]:
# Cell 1: Imports and Setup
import pandas as pd
import numpy as np
from faker import Faker
import random
import uuid
import datetime
import os
import time
from openai import OpenAI # Only if generating LLM transcripts
from IPython.display import display, Markdown # For Jupyter display
from dotenv import load_dotenv
load_dotenv()

# Initialize Faker
fake = Faker('en_US')

# Configuration
NUM_BUYERS = 5000
NUM_HOUSES_FOR_SALE = 1000
NUM_PAST_SALES = 500
NUM_TRANSCRIPTS_BASIC = 10000 # For basic rule-based transcripts
# WARNING: Setting NUM_TRANSCRIPTS_LLM high will take significant time and API cost.
# Start with a small number like 100 for testing.
NUM_TRANSCRIPTS_LLM = 100 # Set desired number for LLM transcripts

HOUSE_PRICE_MIN = 500_000
HOUSE_PRICE_MAX = 4_000_000

OUTPUT_DIR = "real_estate_synthetic_data"

# Central NJ Towns (Example - can be expanded)
CENTRAL_NJ_TOWNS = [
    "Princeton", "West Windsor", "Plainsboro", "Montgomery", "Hillsborough",
    "Bridgewater", "Edison", "Woodbridge", "East Brunswick", "South Brunswick",
    "Franklin Township", "Piscataway", "New Brunswick", "Hopewell", "Lawrenceville"
]

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory '{OUTPUT_DIR}' ensured.")

# OpenAI Client Setup (Conditional)
openai_api_key = os.getenv("OPENAI_API_KEY")
client = None
llm_enabled = False
if openai_api_key:
    try:
        client = OpenAI(api_key=openai_api_key)
        llm_enabled = True
        print("OpenAI client initialized successfully.")
    except Exception as e:
        print(f"Failed to initialize OpenAI client: {e}. LLM transcript generation will be skipped.")
        llm_enabled = False
else:
    print("OPENAI_API_KEY environment variable not set. LLM transcript generation will be skipped.")

# Helper function to get output dir (used by save functions)
def get_output_dir():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    return OUTPUT_DIR


Output directory 'real_estate_synthetic_data' ensured.
OpenAI client initialized successfully.


## Configuration Summary

- **Buyers:** {NUM_BUYERS}
- **Houses for Sale:** {NUM_HOUSES_FOR_SALE}
- **Past Sales:** {NUM_PAST_SALES}
- **Basic Transcripts:** {NUM_TRANSCRIPTS_BASIC}
- **LLM Transcripts:** {NUM_TRANSCRIPTS_LLM} (if enabled)
- **Output Directory:** `{OUTPUT_DIR}`

In [10]:

# Cell 2: Helper Functions

def generate_credit_score(income_bracket):
    """Generates a credit score, loosely correlated with income bracket."""
    if income_bracket == 'Very High':
        base, std = 780, 40
    elif income_bracket == 'High':
        base, std = 740, 60
    elif income_bracket == 'Medium':
        base, std = 680, 80
    else: # Low
        base, std = 620, 100

    # Add possibility of outliers
    if random.random() < 0.05: # 5% chance of a score outside the norm
        score = np.random.randint(300, 850)
    else:
        score = int(np.random.normal(base, std))

    return np.clip(score, 300, 850) # Clip to valid range

def generate_financials(income_bracket):
    """Generates correlated income, net worth, and debt based on bracket."""
    if income_bracket == 'Very High':
        income = np.random.randint(300_000, 1_500_000)
        net_worth_multiplier = random.uniform(2, 10)
        debt_ratio = random.uniform(0.1, 0.5)
    elif income_bracket == 'High':
        income = np.random.randint(150_000, 300_000)
        net_worth_multiplier = random.uniform(1.5, 7)
        debt_ratio = random.uniform(0.2, 0.6)
    elif income_bracket == 'Medium':
        income = np.random.randint(80_000, 150_000)
        net_worth_multiplier = random.uniform(0.5, 4)
        debt_ratio = random.uniform(0.3, 0.8)
    else: # Low
        income = np.random.randint(40_000, 80_000)
        net_worth_multiplier = random.uniform(0.1, 2)
        debt_ratio = random.uniform(0.4, 1.0) # Can exceed income

    net_worth = int(income * net_worth_multiplier + np.random.normal(0, income * 0.2)) # Add noise
    net_worth = max(0, net_worth) # Ensure non-negative net worth

    total_debt = int(income * debt_ratio + np.random.normal(0, income * 0.1))
    total_debt = max(0, total_debt)

    # Outlier check: high debt regardless of income
    if random.random() < 0.03:
        total_debt = int(income * random.uniform(1.0, 2.5))

    # Outlier check: Low net worth despite high income
    if income_bracket in ['High', 'Very High'] and random.random() < 0.05:
         net_worth = int(income * random.uniform(0.1, 0.5))
         net_worth = max(0, net_worth)

    return income, net_worth, total_debt

def generate_fake_address():
    """Generates a somewhat more realistic Central NJ address."""
    street = fake.street_address()
    town = random.choice(CENTRAL_NJ_TOWNS)
    # Faker might produce zips outside NJ, this is a simplification
    zipcode = fake.zipcode_in_state(state_abbr='NJ')
    return f"{street}, {town}, NJ {zipcode}"

def generate_ssn():
    """Generates a valid-looking SSN."""
    return fake.ssn()

print("Helper functions defined.")

# %%


Helper functions defined.


In [11]:

# Cell 3: Generate Buyers Dataset

def generate_buyers(num_buyers):
    buyers_data = []
    income_brackets = ['Low', 'Medium', 'High', 'Very High']
    bracket_probabilities = [0.15, 0.45, 0.30, 0.10] # Example distribution
    print(f"Generating {num_buyers} buyer records...")
    for i in range(num_buyers):
        if (i + 1) % 500 == 0:
            print(f"  Generated {i + 1}/{num_buyers} buyers...")
        buyer_id = str(uuid.uuid4())
        full_name = fake.name()
        address = generate_fake_address()
        ssn = generate_ssn()
        phone_number = fake.phone_number()
        email = fake.email()

        income_bracket = np.random.choice(income_brackets, p=bracket_probabilities)
        annual_income, net_worth, total_debt = generate_financials(income_bracket)
        credit_score = generate_credit_score(income_bracket)

        # Desired price range based on financials, with variation
        base_desired = annual_income * random.uniform(3, 7) # Simple heuristic
        desired_min = int(max(HOUSE_PRICE_MIN * 0.8, base_desired * 0.7))
        desired_max = int(min(HOUSE_PRICE_MAX * 1.2, base_desired * 1.3))
        # Ensure min < max and within bounds
        desired_min = min(desired_min, HOUSE_PRICE_MAX)
        desired_max = max(desired_min + 50000, desired_max) # Ensure some range
        desired_max = min(desired_max, HOUSE_PRICE_MAX*1.1) # Allow slight overshoot
        desired_min = max(desired_min, HOUSE_PRICE_MIN*0.9)

        # Simple Pre-Approval Model
        max_monthly_payment = (annual_income / 12) * random.uniform(0.35, 0.50)
        credit_factor = 1.0 + (credit_score - 700) / 1000
        estimated_loan = max(0, max_monthly_payment * 200 * credit_factor - total_debt * 0.5)
        pre_approved_amount = int(np.clip(estimated_loan, 0, HOUSE_PRICE_MAX * 1.5))

        # Introduce outliers in pre-approval
        if random.random() < 0.05:
            pre_approved_amount = int(pre_approved_amount * random.uniform(0.5, 2.0))
            pre_approved_amount = max(0, pre_approved_amount)

        buyers_data.append({
            "BuyerID": buyer_id,
            "FullName": full_name,
            "Address": address,
            "SSN": ssn,
            "PhoneNumber": phone_number,
            "Email": email,
            "IncomeBracket": income_bracket,
            "AnnualIncome": annual_income,
            "NetWorth": net_worth,
            "TotalDebt": total_debt,
            "CreditScore": credit_score,
            "DesiredPriceRange_Min": desired_min,
            "DesiredPriceRange_Max": desired_max,
            "PreApprovedAmount": pre_approved_amount
        })

    df = pd.DataFrame(buyers_data)
    print(f"Finished generating {len(df)} buyer records.")
    return df

def create_buyers_df():
    df_buyers = generate_buyers(NUM_BUYERS)
    # Save to CSV
    buyers_csv_path = os.path.join(get_output_dir(), "buyers.csv")
    df_buyers.to_csv(buyers_csv_path, index=False)
    print(f"Buyers data saved to: {buyers_csv_path}")
    return df_buyers, buyers_csv_path

# Execute buyer generation
df_buyers, buyers_csv_path = create_buyers_df()

# Display header
display(Markdown(f"### Buyers Dataset ({len(df_buyers)} records)"))
display(Markdown(f"Saved to: `{buyers_csv_path}`"))
display(df_buyers.head())

# %%


Generating 5000 buyer records...
  Generated 500/5000 buyers...
  Generated 1000/5000 buyers...
  Generated 1500/5000 buyers...
  Generated 2000/5000 buyers...
  Generated 2500/5000 buyers...
  Generated 3000/5000 buyers...
  Generated 3500/5000 buyers...
  Generated 4000/5000 buyers...
  Generated 4500/5000 buyers...
  Generated 5000/5000 buyers...
Finished generating 5000 buyer records.
Buyers data saved to: real_estate_synthetic_data/buyers.csv


### Buyers Dataset (5000 records)

Saved to: `real_estate_synthetic_data/buyers.csv`

Unnamed: 0,BuyerID,FullName,Address,SSN,PhoneNumber,Email,IncomeBracket,AnnualIncome,NetWorth,TotalDebt,CreditScore,DesiredPriceRange_Min,DesiredPriceRange_Max,PreApprovedAmount
0,84092373-60de-427b-a0a7-a9f096e107a3,Nathan Sanchez,"4009 Carter Alley Suite 217, West Windsor, NJ ...",702-78-5163,(307)773-5111x30556,owashington@example.com,Medium,117249,479189,272934,669,450000.0,633268.0,567860
1,42b70d65-9cfd-481c-9b12-4dc862dab833,Cindy Henderson,"635 Steven Meadows, Franklin Township, NJ 08498",455-07-1143,001-926-910-8836x670,adrienneedwards@example.com,Low,68657,10379,73615,723,450000.0,450000.0,480703
2,a680bff0-da43-4a9c-a9b8-e4fe4f4ec592,Jordan Rodriguez,"379 Stacey Inlet, Franklin Township, NJ 08422",735-86-8814,+1-289-254-9490x946,lisa00@example.net,Low,54635,49513,25304,558,450000.0,450000.0,367367
3,65f9b54d-0e0e-4c66-b23e-9816708bea71,Brandy Baker,"35865 Martin Tunnel Apt. 662, Woodbridge, NJ 0...",835-58-6431,(485)228-8333,calhounalexander@example.com,Low,40733,8809,37832,572,450000.0,450000.0,192680
4,f7f4d835-0c74-47e2-9798-4e1f5d435eaa,Allison Carrillo,"1885 Kristina Mills Apt. 888, Hopewell, NJ 07003",542-55-0929,854-927-5490x3640,victoria26@example.org,Low,75515,102709,46321,641,450000.0,450000.0,558887


In [12]:

# Cell 4: Generate Houses for Sale Dataset

def generate_houses(num_houses):
    houses_data = []
    property_types = ['Single Family', 'Townhouse', 'Condo', 'Multi-Family']
    type_probabilities = [0.65, 0.15, 0.10, 0.10]
    print(f"Generating {num_houses} house listings...")

    for i in range(num_houses):
        if (i + 1) % 200 == 0:
             print(f"  Generated {i + 1}/{num_houses} houses...")
        house_id = str(uuid.uuid4())
        address = generate_fake_address()
        listing_price = np.random.randint(HOUSE_PRICE_MIN, HOUSE_PRICE_MAX + 1)

        # Correlate features with price (loosely)
        price_factor = (listing_price - HOUSE_PRICE_MIN) / (HOUSE_PRICE_MAX - HOUSE_PRICE_MIN)

        bedrooms = np.random.randint(2, 7) + int(price_factor * 2)
        bedrooms = max(2, bedrooms)

        bathrooms = round(np.random.uniform(1.5, 5.0) + price_factor * 2, 1)
        bathrooms = max(1.5, min(bathrooms, 6.0))
        bathrooms = round(bathrooms * 2) / 2

        base_sqft = 1000
        sqft = int(base_sqft + (bedrooms * 250) + (bathrooms * 150) + (price_factor * 3000) + np.random.normal(0, 300))
        sqft = max(800, sqft)

        lot_size = round(np.random.uniform(0.05, 3.0) + price_factor * 2.0, 2)
        lot_size = max(0.05, lot_size)

        year_built = np.random.randint(1940, datetime.datetime.now().year + 1)
        if price_factor > 0.7 and random.random() < 0.6:
            year_built = np.random.randint(1990, datetime.datetime.now().year + 1)
        elif price_factor < 0.3 and random.random() < 0.6:
            year_built = np.random.randint(1940, 1985)

        property_type = np.random.choice(property_types, p=type_probabilities)
        if property_type == 'Condo' and listing_price > 1_000_000:
             listing_price = int(listing_price * random.uniform(0.6, 0.9))
        elif property_type == 'Single Family' and listing_price < 700_000:
             listing_price = int(listing_price * random.uniform(1.0, 1.3))
        listing_price = np.clip(listing_price, HOUSE_PRICE_MIN, HOUSE_PRICE_MAX)

        houses_data.append({
            "HouseID": house_id,
            "Address": address,
            "ListingPrice": listing_price,
            "Bedrooms": bedrooms,
            "Bathrooms": bathrooms,
            "SquareFootage": sqft,
            "LotSize_Acres": lot_size,
            "YearBuilt": year_built,
            "PropertyType": property_type,
            "Status": "For Sale" # Assuming this function generates 'For Sale' listings
        })

    df = pd.DataFrame(houses_data)
    print(f"Finished generating {len(df)} house listings.")
    return df

def create_houses_df():
    df_houses = generate_houses(NUM_HOUSES_FOR_SALE)
    # Save to CSV
    houses_csv_path = os.path.join(get_output_dir(), "houses_for_sale.csv")
    df_houses.to_csv(houses_csv_path, index=False)
    print(f"Houses data saved to: {houses_csv_path}")
    return df_houses, houses_csv_path

# Execute house generation
df_houses, houses_csv_path = create_houses_df()

# Display header
display(Markdown(f"### Houses for Sale Dataset ({len(df_houses)} records)"))
display(Markdown(f"Saved to: `{houses_csv_path}`"))
display(df_houses.head())


# %%


Generating 1000 house listings...
  Generated 200/1000 houses...
  Generated 400/1000 houses...
  Generated 600/1000 houses...
  Generated 800/1000 houses...
  Generated 1000/1000 houses...
Finished generating 1000 house listings.
Houses data saved to: real_estate_synthetic_data/houses_for_sale.csv


### Houses for Sale Dataset (1000 records)

Saved to: `real_estate_synthetic_data/houses_for_sale.csv`

Unnamed: 0,HouseID,Address,ListingPrice,Bedrooms,Bathrooms,SquareFootage,LotSize_Acres,YearBuilt,PropertyType,Status
0,e9d0206b-debd-4b8a-b68d-cc770fb32c7d,"840 Vega Spurs Apt. 726, Hillsborough, NJ 08761",1859911,2,4.0,3724,1.44,1965,Single Family,For Sale
1,b63d84bb-0fdb-467d-a93b-a4f8268dbd2c,"95522 Lawrence Island Apt. 349, Hillsborough, ...",3270629,6,6.0,6160,2.04,1994,Single Family,For Sale
2,42bbdaa3-e1ec-49d4-8226-b50937846af5,"511 Michael Spurs, Plainsboro, NJ 07056",2366576,7,3.0,4712,3.23,1972,Single Family,For Sale
3,da779b26-598e-4bd2-b253-627825fc024d,"99313 Eric Extensions Apt. 851, Hopewell, NJ 0...",3592286,4,6.0,5537,3.25,2023,Townhouse,For Sale
4,096d4fa5-e555-4109-8338-4c696dfc5552,"631 Savannah Mountain Apt. 500, Princeton, NJ ...",1183151,6,4.0,3981,0.89,1942,Multi-Family,For Sale


In [13]:

# Cell 5: Generate Past Sales Dataset

def generate_past_sales(num_sales, buyers_df, houses_df_structure_unused):
    """ Generates past sales, linking to buyers and simulating house details """
    # NOTE: houses_df_structure_unused is kept for signature compatibility but not used as intended in the original Marimo code.
    # The original code generated NEW house details for each past sale via generate_houses(1).
    sales_data = []
    sale_notes_categories = [
        "Normal", "Job Relocation", "Downsizing", "Upsizing",
        "Divorce", "Estate Sale", "Job Loss", "Bankruptcy/Foreclosure"
    ]
    sale_notes_probabilities = [0.65, 0.10, 0.05, 0.05, 0.04, 0.04, 0.035, 0.035]

    print(f"Generating {num_sales} past sales records...")

    # Ensure we have enough buyers to sample from
    if len(buyers_df) == 0:
        print("Warning: Buyers DataFrame is empty. Cannot generate past sales.")
        return pd.DataFrame()

    if len(buyers_df) < num_sales:
        print("Warning: Not enough unique buyers generated for the number of sales. Buyers will be reused.")
        buyer_indices = np.random.choice(buyers_df.index, num_sales, replace=True)
    else:
        buyer_indices = np.random.choice(buyers_df.index, num_sales, replace=False) # Unique buyers per sale if possible

    available_buyers = buyers_df.loc[buyer_indices].copy()

    for i in range(num_sales):
        if (i + 1) % 100 == 0:
             print(f"  Generated {i + 1}/{num_sales} past sales...")

        sale_id = str(uuid.uuid4())

        # Generate *new* house details for this past sale using the same logic
        temp_house_df = generate_houses(1) # This now prints "Generating 1 house listings..." each time. Consider suppressing print inside loop.
        house_details = temp_house_df.iloc[0]

        # Select a buyer for this sale
        buyer_info = available_buyers.iloc[i]
        buyer_id = buyer_info["BuyerID"]

        # Adjust generated house price towards buyer's capability
        listing_price = house_details["ListingPrice"]
        target_price = (buyer_info['DesiredPriceRange_Min'] + buyer_info['DesiredPriceRange_Max']) / 2
        target_price = min(max(target_price, HOUSE_PRICE_MIN), HOUSE_PRICE_MAX) # Clamp target

        adjusted_listing_price = int(listing_price * 0.5 + target_price * 0.5 + np.random.normal(0, listing_price * 0.1))
        adjusted_listing_price = np.clip(adjusted_listing_price, HOUSE_PRICE_MIN, HOUSE_PRICE_MAX)

        # Determine Sale Price
        sale_price_ratio = random.uniform(0.93, 1.07)
        sale_price = int(adjusted_listing_price * sale_price_ratio)
        if random.random() < 0.08:
            sale_price = int(adjusted_listing_price * random.uniform(0.85, 1.15))
        sale_price = np.clip(sale_price, int(HOUSE_PRICE_MIN*0.8), int(HOUSE_PRICE_MAX*1.1))

        # Sale Date
        sale_date = fake.date_between(start_date="-5y", end_date="today")

        # Sale Notes
        sale_category = np.random.choice(sale_notes_categories, p=sale_notes_probabilities)
        sale_details = f"Standard transaction."
        # (Logic for different sale categories and potential price adjustments)
        if sale_category == "Job Relocation":
            sale_details = f"Seller relocated for a new job opportunity in {fake.city()}."
            if random.random() < 0.3: sale_price = int(sale_price * random.uniform(0.92, 0.98))
        elif sale_category == "Downsizing":
            sale_details = "Seller downsizing after retirement/children moved out."
        elif sale_category == "Upsizing":
            sale_details = "Seller buying a larger home for growing family."
            if random.random() < 0.2: sale_price = int(sale_price * random.uniform(1.01, 1.05))
        elif sale_category == "Divorce":
            sale_details = "Sale resulting from divorce proceedings."
            if random.random() < 0.4: sale_price = int(sale_price * random.uniform(0.90, 0.97))
        elif sale_category == "Estate Sale":
            sale_details = "Property sold as part of an estate settlement."
            if random.random() < 0.5: sale_price = int(sale_price * random.uniform(0.88, 1.02))
        elif sale_category == "Job Loss":
            sale_details = "Forced sale due to unexpected job loss and financial hardship."
            sale_price = int(adjusted_listing_price * random.uniform(0.85, 0.95))
        elif sale_category == "Bankruptcy/Foreclosure":
            sale_details = "Sale managed through bankruptcy court or bank foreclosure process."
            sale_price = int(adjusted_listing_price * random.uniform(0.80, 0.93))

        sale_price = max(int(HOUSE_PRICE_MIN*0.75), sale_price) # Final price floor

        sales_data.append({
            "SaleID": sale_id,
            "HouseID": house_details["HouseID"], # Link to the generated house's ID
            "BuyerID": buyer_id,
            "SellerFullName": fake.name(), # Generate fake seller
            "SellerAddress": generate_fake_address(), # Fake seller address
            "ListingPrice": adjusted_listing_price, # Use adjusted price
            "SalePrice": sale_price,
            "SaleDate": sale_date,
            "SaleCategory": sale_category,
            "SaleDetails": sale_details,
            # Buyer snapshot
            "Buyer_AnnualIncome": buyer_info["AnnualIncome"],
            "Buyer_NetWorth": buyer_info["NetWorth"],
            "Buyer_TotalDebt": buyer_info["TotalDebt"],
            "Buyer_CreditScore": buyer_info["CreditScore"],
             # House snapshot
            "House_Address": house_details["Address"],
            "House_Bedrooms": house_details["Bedrooms"],
            "House_Bathrooms": house_details["Bathrooms"],
            "House_SquareFootage": house_details["SquareFootage"],
            "House_YearBuilt": house_details["YearBuilt"],
            "House_PropertyType": house_details["PropertyType"],
        })

    df = pd.DataFrame(sales_data)
    print(f"Finished generating {len(df)} past sales records.")
    return df

def create_sales_df():
    # Pass buyers df and the structure (columns) of houses df
    # Suppress the inner print from generate_houses(1) if desired, or ignore it.
    df_sales = generate_past_sales(NUM_PAST_SALES, df_buyers, df_houses) # df_houses is passed but not used as intended
    # Save to CSV
    sales_csv_path = os.path.join(get_output_dir(), "past_sales.csv")
    if not df_sales.empty:
        df_sales.to_csv(sales_csv_path, index=False)
        print(f"Past sales data saved to: {sales_csv_path}")
    else:
        print("Past sales DataFrame is empty. No file saved.")
        sales_csv_path = None
    return df_sales, sales_csv_path

# Execute past sales generation
df_sales, sales_csv_path = create_sales_df()

# Display header
if sales_csv_path:
    display(Markdown(f"### Past Sales Dataset ({len(df_sales)} records)"))
    display(Markdown(f"Saved to: `{sales_csv_path}`"))
    display(df_sales.head())
else:
    display(Markdown("### Past Sales Dataset Generation Skipped (likely no buyers)"))


# %%


Generating 500 past sales records...
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listing

### Past Sales Dataset (500 records)

Saved to: `real_estate_synthetic_data/past_sales.csv`

Unnamed: 0,SaleID,HouseID,BuyerID,SellerFullName,SellerAddress,ListingPrice,SalePrice,SaleDate,SaleCategory,SaleDetails,Buyer_AnnualIncome,Buyer_NetWorth,Buyer_TotalDebt,Buyer_CreditScore,House_Address,House_Bedrooms,House_Bathrooms,House_SquareFootage,House_YearBuilt,House_PropertyType
0,31a4b65a-dfa1-4f29-ae22-0c082e5070e6,e8a15cc0-ae28-4ee7-a436-8c13b0ae4948,c1206aa8-7991-4ff1-9c64-8c8c165826bf,Brian Christensen,"75627 Micheal Groves, Woodbridge, NJ 07089",1658266,1596812,2023-05-07,Normal,Standard transaction.,228326,1244831,147890,702,"3959 Ashley Village, Montgomery, NJ 07317",5,4.5,4482,2016,Single Family
1,0d72c875-922e-4a2b-9ea7-98bc2e7254f0,7f2362d3-41d4-42c9-bbfe-d48f233b44a8,ac36acd2-c6d8-4c48-89ac-4d80af9da158,Veronica Holmes,"13658 Cooper Summit Suite 308, Montgomery, NJ ...",1734526,1620621,2023-01-13,Normal,Standard transaction.,137784,549338,76540,690,"644 Oliver Brook, West Windsor, NJ 07529",6,4.5,5522,2015,Single Family
2,a6629385-f796-46fe-9f57-48d721ac3533,974d93a7-751f-4ace-ae3d-53cc3258f440,ac3499f5-2e96-4333-bff5-f72031d2a403,Deborah Smith,"3632 Johnson Road, Hopewell, NJ 08131",999092,1049371,2022-03-04,Normal,Standard transaction.,137230,226533,70811,723,"22068 Danielle Street Suite 083, West Windsor,...",5,4.5,3874,1993,Single Family
3,2f39c148-007c-4ce6-8fe4-2a7f9b5a3156,da91f6a9-0bf2-4ec8-9ef6-572994a518c4,ed911fd7-2642-412f-93a9-c609fcc920df,Courtney Armstrong,"397 Klein Wells, Piscataway, NJ 07203",1698622,1658207,2020-05-25,Normal,Standard transaction.,93932,151171,75024,674,"14022 Coffey Place Suite 440, Hopewell, NJ 08569",7,4.0,6435,2012,Single Family
4,0550868b-b0ad-49c1-a95b-8f82d77dd208,34701bf3-9abd-4db2-99dc-365b756adb3c,face0e40-4d88-4ee3-8335-b335cb19b885,Debra Green,"826 Conrad Views Suite 972, Hopewell, NJ 07353",3894709,3662492,2021-07-20,Downsizing,Seller downsizing after retirement/children mo...,852909,8180586,303332,705,"0834 Stacey Shoals Apt. 823, New Brunswick, NJ...",3,6.0,5290,1958,Townhouse


In [14]:

# Cell 6: Generate Basic Call Transcripts (Rule-Based)

def generate_basic_transcripts(num_transcripts, buyers_df):
    transcripts_data = []
    broker_names = [fake.name() for _ in range(25)] # Pool of brokers
    banker_names = [fake.name() for _ in range(25)] # Pool of bankers

    print(f"Generating {num_transcripts} basic call transcripts...")

    # Ensure we have buyers to sample from
    if len(buyers_df) == 0:
        print("Warning: Buyers DataFrame is empty. Cannot generate basic transcripts.")
        return pd.DataFrame()

    buyer_ids = buyers_df['BuyerID'].tolist()

    for i in range(num_transcripts):
        if (i + 1) % 1000 == 0:
             print(f"  Generated {i + 1}/{num_transcripts} basic transcripts...")

        transcript_id = str(uuid.uuid4())
        call_datetime = fake.date_time_between(start_date="-2y", end_date="now")
        buyer_id = random.choice(buyer_ids)

        try:
            buyer_info = buyers_df[buyers_df['BuyerID'] == buyer_id].iloc[0]
        except IndexError:
            print(f"Warning: BuyerID {buyer_id} not found in buyers_df. Skipping transcript.")
            continue # Skip if buyer_id not found

        broker_name = random.choice(broker_names)
        banker_name = random.choice(banker_names)

        # Extract PII
        buyer_name = buyer_info['FullName']
        buyer_address = buyer_info['Address']
        buyer_ssn = buyer_info['SSN']
        buyer_ssn_last4 = buyer_ssn.split('-')[-1]
        buyer_phone = buyer_info['PhoneNumber']
        buyer_income = buyer_info['AnnualIncome']
        buyer_desired_max = buyer_info['DesiredPriceRange_Max']

        # Simple transcript templates including PII
        templates = [
            f"MB: Hi {buyer_name}, this is {banker_name}. Just confirming your application details for the mortgage - is your SSN still {buyer_ssn}? \nBuyer: Yes, that's correct. \nMB: Great, and the address {buyer_address} is current?",
            f"Broker: Hello {buyer_name}, {broker_name} calling. Regarding houses around ${buyer_desired_max:,.0f}, I have a new listing you might like. \nBuyer: Oh really? Tell me more. \nBroker: It's on Maple St, let's connect later. Your number is {buyer_phone}, right?",
            f"MB: {banker_name} here for {buyer_name}. We need to verify income for the pre-approval. \nBuyer: Okay, what do you need? \nMB: Can you confirm your full SSN {buyer_ssn} and current residence at {buyer_address} for security?",
            f"Broker: {broker_name} checking in with {buyer_name}. Any thoughts on the properties we saw last week? \nBuyer: Still considering. The one near {random.choice(CENTRAL_NJ_TOWNS)} park was nice. \nBroker: Got it. Just confirming your details for updates: Name: {buyer_name}, Address: {buyer_address}, SSN: {buyer_ssn}.",
            f"MB: {buyer_name}, it's {banker_name}. The underwriter needs clarification on your debt-to-income ratio, given your stated income of ${buyer_income:,.0f}. \nBuyer: Okay, what specifically? \nMB: Let's review your file. Confirming SSN ending in {buyer_ssn_last4} and address {buyer_address}."
        ]

        transcript_text = random.choice(templates)
        # Replace actual newlines with escaped newlines to maintain proper CSV formatting
        transcript_text = transcript_text.replace("\n", "\\n")

        transcripts_data.append({
            "TranscriptID": transcript_id,
            "CallDateTime": call_datetime,
            "BuyerID": buyer_id,
            "BrokerName": broker_name,
            "MortgageBankerName": banker_name,
            "TranscriptText": transcript_text
        })

    df = pd.DataFrame(transcripts_data)
    print(f"Finished generating {len(df)} basic transcripts.")
    return df

def create_basic_transcripts_df():
    df_basic_transcripts = generate_basic_transcripts(NUM_TRANSCRIPTS_BASIC, df_buyers)
    # Save to CSV
    basic_transcripts_csv_path = os.path.join(get_output_dir(), "basic_call_transcripts.csv")
    if not df_basic_transcripts.empty:
        df_basic_transcripts.to_csv(basic_transcripts_csv_path, index=False, quoting=1)  # quoting=1 uses QUOTE_ALL for safety
        print(f"Basic transcripts saved to: {basic_transcripts_csv_path}")
    else:
        print("Skipping save for empty basic transcripts DataFrame.")
        basic_transcripts_csv_path = None
    return df_basic_transcripts, basic_transcripts_csv_path

# Execute basic transcript generation
df_basic_transcripts, basic_transcripts_csv_path = create_basic_transcripts_df()

# Display header
if basic_transcripts_csv_path:
    display(Markdown(f"### Basic Call Transcripts Dataset ({len(df_basic_transcripts)} records)"))
    display(Markdown(f"Saved to: `{basic_transcripts_csv_path}`"))
    display(df_basic_transcripts.head())
else:
    display(Markdown("### Basic Call Transcripts Generation Skipped or Resulted in Empty Data"))


Generating 10000 basic call transcripts...
  Generated 1000/10000 basic transcripts...
  Generated 2000/10000 basic transcripts...
  Generated 3000/10000 basic transcripts...
  Generated 4000/10000 basic transcripts...
  Generated 5000/10000 basic transcripts...
  Generated 6000/10000 basic transcripts...
  Generated 7000/10000 basic transcripts...
  Generated 8000/10000 basic transcripts...
  Generated 9000/10000 basic transcripts...
  Generated 10000/10000 basic transcripts...
Finished generating 10000 basic transcripts.
Basic transcripts saved to: real_estate_synthetic_data/basic_call_transcripts.csv


### Basic Call Transcripts Dataset (10000 records)

Saved to: `real_estate_synthetic_data/basic_call_transcripts.csv`

Unnamed: 0,TranscriptID,CallDateTime,BuyerID,BrokerName,MortgageBankerName,TranscriptText
0,a308472d-5055-4ece-b698-31d4c1885651,2023-09-13 22:48:06.736542,adb735a0-2433-4515-b3f3-8ccb19322c45,Harold Garcia,Tony Vazquez,Broker: Harold Garcia checking in with Brianna...
1,3ae10cbc-ab58-4945-97fe-7a1e078db431,2024-07-29 10:07:32.989926,c14e2182-d73f-48c3-a066-b5a7d28534f8,Jonathan Nguyen,Thomas Evans,MB: Thomas Evans here for Richard Anderson. We...
2,7cf5adb2-a8ae-4806-b194-eb2c17116288,2024-07-19 18:48:21.575248,0521fc80-0d9c-4959-99ce-3309e9cb34e8,Renee Johnson,Kelly Robles MD,"MB: Hi Jonathan Acosta, this is Kelly Robles M..."
3,303f3b5a-5e92-4014-8e78-df3d559df811,2025-04-04 15:19:43.089727,11613d7f-3296-4acb-a12a-bd75d8cfcffb,Benjamin Jenkins,Diane Mercado,Broker: Benjamin Jenkins checking in with Stev...
4,3536f3b3-c9d7-4c43-9286-529606b9091e,2023-06-21 06:11:15.814499,90ec87b6-5d4e-40fb-8774-7245ffab758e,Mrs. Cheryl Davis,Sheena Cruz,"MB: Melissa Campos, it's Sheena Cruz. The unde..."


In [18]:
import csv


def generate_llm_transcript_entry(client, buyer_info, broker_name, banker_name):
    """Generates a single transcript using OpenAI API."""
    if not client:
        return None # Skip if client not initialized

    buyer_name = buyer_info['FullName']
    buyer_address = buyer_info['Address']
    buyer_ssn = buyer_info['SSN']
    buyer_income = buyer_info['AnnualIncome'] 
    buyer_desired_max = buyer_info['DesiredPriceRange_Max']
    pre_approved = buyer_info['PreApprovedAmount']

    participants = random.choice([f"Mortgage Banker '{banker_name}'", f"Real Estate Broker '{broker_name}'"])
    scenario = random.choice([
        f"Discussing pre-approval status. Pre-approved amount is ${pre_approved:,.0f}.",
        f"Scheduling a property viewing for a house priced around ${buyer_desired_max:,.0f}.",
        f"Verifying personal information (SSN, Address) for loan application.",
        f"Following up after a property showing.",
        f"Discussing required documents for mortgage underwriting (e.g., pay stubs, bank statements).",
        f"Answering questions about current mortgage rates based on buyer's profile (Income: ${buyer_income:,.0f})."
    ])

    prompt = f"""
    Generate a brief, realistic, 3-sentence or 4-sentence call transcript excerpt between a {participants} and prospective home buyer '{buyer_name}'.
    The conversation context is: {scenario}.

    **Crucially, I am creating a test dataset of transcripts so the transcript should include the following PII for the buyer within the dialogue, I am creating a mock scenario of a call transcript:**
    - Full Name: {buyer_name}
    - Full SSN: {buyer_ssn}
    - Full Address: {buyer_address}

    Keep the dialogue natural (produce at most 4-5 sentences total). Structure it like 'Speaker: Dialogue text'.
    """

    try:
        response = client.chat.completions.create(
            model="gpt-4.1-mini", # Changed from "gpt-4o-mini" to "gpt-4"
            # model="gpt-4o", # Changed from "gpt-4o-mini" to "gpt-4"
            messages=[
                {"role": "system", "content": "You are an AI assistant creating synthetic call transcript data for a real estate application. Include specific PII as requested."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=175
        )
        # Replace any newline characters with spaces to ensure single-line CSV entries
        transcript = response.choices[0].message.content.strip()
        transcript = transcript.replace('\n', ' ').replace('\r', ' ')
        return transcript

    except Exception as e:
        print(f"Error calling OpenAI API: {e}")
        time.sleep(2)
        return None

# Function to generate the full dataset with incremental saving
def generate_llm_transcripts(num_transcripts):
    broker_names = [fake.name() for _ in range(25)]
    banker_names = [fake.name() for _ in range(25)]
    
    print(f"Generating {num_transcripts} LLM transcripts...")
    buyer_ids = df_buyers['BuyerID'].tolist()
    
    # Prepare the output CSV file
    llm_transcripts_csv_path = os.path.join(get_output_dir(), "llm_call_transcripts.csv")
    
    # Define column names
    columns = ["TranscriptID", "CallDateTime", "BuyerID", "BrokerName", 
               "MortgageBankerName", "TranscriptText"]
    
    # Initialize data collection for display at the end
    all_transcripts = []
    
    # Check if file exists to determine if we need to write headers
    file_exists = os.path.isfile(llm_transcripts_csv_path)
    
    # Open in append mode to add rows incrementally
    with open(llm_transcripts_csv_path, mode='a' if file_exists else 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        
        # Write header only if file doesn't exist
        if not file_exists:
            writer.writeheader()
        
        for i in range(num_transcripts):
            if (i + 1) % 10 == 0:
                print(f"  Generated {i + 1}/{num_transcripts} LLM transcripts...")
                
            transcript_id = str(uuid.uuid4())
            call_datetime = fake.date_time_between(start_date="-2y", end_date="now")
            buyer_id = random.choice(buyer_ids)
            broker_name = random.choice(broker_names)
            banker_name = random.choice(banker_names)
            
            try:
                buyer_info = df_buyers[df_buyers['BuyerID'] == buyer_id].iloc[0]
            except IndexError:
                print(f"Warning: BuyerID {buyer_id} not found in buyers_df. Skipping transcript.")
                continue
                
            transcript_text = generate_llm_transcript_entry(client, buyer_info, broker_name, banker_name)
            if transcript_text is None:
                transcript_text = "I'm sorry, but I can't assist with that."
                
            # Create record
            transcript_record = {
                "TranscriptID": transcript_id,
                "CallDateTime": call_datetime,
                "BuyerID": buyer_id,
                "BrokerName": broker_name,
                "MortgageBankerName": banker_name,
                "TranscriptText": transcript_text
            }
            
            # Write the single row
            writer.writerow(transcript_record)
            
            # Add to collection for display
            all_transcripts.append(transcript_record)
        
    # Create a DataFrame from all collected records for display
    df = pd.DataFrame(all_transcripts)
    print(f"Finished generating {len(df)} LLM transcripts.")
    return df

# Execute LLM transcript generation if enabled
if llm_enabled:
    # df_llm_transcripts = generate_llm_transcripts(NUM_TRANSCRIPTS_LLM)
    df_llm_transcripts = generate_llm_transcripts(4000)
    # No need to save here since we're saving incrementally
    llm_transcripts_csv_path = os.path.join(get_output_dir(), "llm_call_transcripts.csv")
    print(f"LLM transcripts saved to: {llm_transcripts_csv_path}")
    
    # Display header
    display(Markdown(f"### LLM Call Transcripts Dataset ({len(df_llm_transcripts)} records)"))
    display(Markdown(f"Saved to: `{llm_transcripts_csv_path}`"))
    display(df_llm_transcripts.head())
else:
    print("LLM transcript generation skipped (not enabled).")


Generating 4000 LLM transcripts...
  Generated 10/4000 LLM transcripts...
  Generated 20/4000 LLM transcripts...
  Generated 30/4000 LLM transcripts...
  Generated 40/4000 LLM transcripts...
  Generated 50/4000 LLM transcripts...
  Generated 60/4000 LLM transcripts...
  Generated 70/4000 LLM transcripts...
  Generated 80/4000 LLM transcripts...
  Generated 90/4000 LLM transcripts...
  Generated 100/4000 LLM transcripts...
  Generated 110/4000 LLM transcripts...
  Generated 120/4000 LLM transcripts...
  Generated 130/4000 LLM transcripts...
  Generated 140/4000 LLM transcripts...
  Generated 150/4000 LLM transcripts...
  Generated 160/4000 LLM transcripts...
  Generated 170/4000 LLM transcripts...
  Generated 180/4000 LLM transcripts...
  Generated 190/4000 LLM transcripts...
  Generated 200/4000 LLM transcripts...
  Generated 210/4000 LLM transcripts...
  Generated 220/4000 LLM transcripts...
  Generated 230/4000 LLM transcripts...
  Generated 240/4000 LLM transcripts...
  Generated 25

KeyboardInterrupt: 

In [19]:

# Cell 8: Summary

display(Markdown(f"""
## Data Generation Complete

The following datasets have been generated and saved in the `{OUTPUT_DIR}` directory:

1.  **Buyers:** `{os.path.basename(buyers_csv_path) if buyers_csv_path else '*Not Generated*'}` ({len(df_buyers)} records)
2.  **Houses for Sale:** `{os.path.basename(houses_csv_path) if houses_csv_path else '*Not Generated*'}` ({len(df_houses)} records)
3.  **Past Sales:** `{os.path.basename(sales_csv_path) if sales_csv_path else '*Not Generated*'}` ({len(df_sales)} records)
4.  **Basic Transcripts:** `{os.path.basename(basic_transcripts_csv_path) if basic_transcripts_csv_path else '*Not Generated*'}` ({len(df_basic_transcripts)} records)
5.  **LLM Transcripts:** {f'`{os.path.basename(llm_transcripts_csv_path)}` ({len(df_llm_transcripts)} records)' if llm_transcripts_csv_path else '*Skipped or Not Generated*'}
"""))

print("\nScript Finished.")

NameError: name 'llm_transcripts_csv_path' is not defined