In [8]:
# %% [markdown]
# # Real Estate Synthetic Data Generator
#
# This notebook generates synthetic data for a real estate application, including:
# - **Buyers:** Prospective buyers with financial profiles.
# - **Houses for Sale:** Listings in Central NJ.
# - **Past Sales:** Historical sales records.
# - **Basic Transcripts:** Simulated call transcripts (rule-based).
# - **LLM Transcripts:** Simulated call transcripts (using OpenAI, if API key is configured).
#
# Data will be saved to a specified output directory.

# %%
# Cell 1: Imports and Setup
import pandas as pd
import numpy as np
from faker import Faker
import random
import uuid
import datetime
import os
import time
from openai import OpenAI # Only if generating LLM transcripts
from IPython.display import display, Markdown # For Jupyter display
from dotenv import load_dotenv
load_dotenv()

# Initialize Faker
fake = Faker('en_US')

# Configuration
NUM_BUYERS = 5000
NUM_HOUSES_FOR_SALE = 1000
NUM_PAST_SALES = 500
NUM_TRANSCRIPTS_BASIC = 10000 # For basic rule-based transcripts
# WARNING: Setting NUM_TRANSCRIPTS_LLM high will take significant time and API cost.
# Start with a small number like 100 for testing.
NUM_TRANSCRIPTS_LLM = 100 # Set desired number for LLM transcripts

HOUSE_PRICE_MIN = 500_000
HOUSE_PRICE_MAX = 4_000_000

OUTPUT_DIR = "real_estate_synthetic_data"

# Central NJ Towns (Example - can be expanded)
CENTRAL_NJ_TOWNS = [
    "Princeton", "West Windsor", "Plainsboro", "Montgomery", "Hillsborough",
    "Bridgewater", "Edison", "Woodbridge", "East Brunswick", "South Brunswick",
    "Franklin Township", "Piscataway", "New Brunswick", "Hopewell", "Lawrenceville"
]

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory '{OUTPUT_DIR}' ensured.")

# OpenAI Client Setup (Conditional)
openai_api_key = os.getenv("OPENAI_API_KEY")
client = None
llm_enabled = False
if openai_api_key:
    try:
        client = OpenAI(api_key=openai_api_key)
        llm_enabled = True
        print("OpenAI client initialized successfully.")
    except Exception as e:
        print(f"Failed to initialize OpenAI client: {e}. LLM transcript generation will be skipped.")
        llm_enabled = False
else:
    print("OPENAI_API_KEY environment variable not set. LLM transcript generation will be skipped.")

# Helper function to get output dir (used by save functions)
def get_output_dir():
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    return OUTPUT_DIR

# %% [markdown]
# ## Configuration Summary
#
# - **Buyers:** {NUM_BUYERS}
# - **Houses for Sale:** {NUM_HOUSES_FOR_SALE}
# - **Past Sales:** {NUM_PAST_SALES}
# - **Basic Transcripts:** {NUM_TRANSCRIPTS_BASIC}
# - **LLM Transcripts:** {NUM_TRANSCRIPTS_LLM} (if enabled)
# - **Output Directory:** `{OUTPUT_DIR}`

# %%


Output directory 'real_estate_synthetic_data' ensured.
OpenAI client initialized successfully.


In [9]:

# Cell 2: Helper Functions

def generate_credit_score(income_bracket):
    """Generates a credit score, loosely correlated with income bracket."""
    if income_bracket == 'Very High':
        base, std = 780, 40
    elif income_bracket == 'High':
        base, std = 740, 60
    elif income_bracket == 'Medium':
        base, std = 680, 80
    else: # Low
        base, std = 620, 100

    # Add possibility of outliers
    if random.random() < 0.05: # 5% chance of a score outside the norm
        score = np.random.randint(300, 850)
    else:
        score = int(np.random.normal(base, std))

    return np.clip(score, 300, 850) # Clip to valid range

def generate_financials(income_bracket):
    """Generates correlated income, net worth, and debt based on bracket."""
    if income_bracket == 'Very High':
        income = np.random.randint(300_000, 1_500_000)
        net_worth_multiplier = random.uniform(2, 10)
        debt_ratio = random.uniform(0.1, 0.5)
    elif income_bracket == 'High':
        income = np.random.randint(150_000, 300_000)
        net_worth_multiplier = random.uniform(1.5, 7)
        debt_ratio = random.uniform(0.2, 0.6)
    elif income_bracket == 'Medium':
        income = np.random.randint(80_000, 150_000)
        net_worth_multiplier = random.uniform(0.5, 4)
        debt_ratio = random.uniform(0.3, 0.8)
    else: # Low
        income = np.random.randint(40_000, 80_000)
        net_worth_multiplier = random.uniform(0.1, 2)
        debt_ratio = random.uniform(0.4, 1.0) # Can exceed income

    net_worth = int(income * net_worth_multiplier + np.random.normal(0, income * 0.2)) # Add noise
    net_worth = max(0, net_worth) # Ensure non-negative net worth

    total_debt = int(income * debt_ratio + np.random.normal(0, income * 0.1))
    total_debt = max(0, total_debt)

    # Outlier check: high debt regardless of income
    if random.random() < 0.03:
        total_debt = int(income * random.uniform(1.0, 2.5))

    # Outlier check: Low net worth despite high income
    if income_bracket in ['High', 'Very High'] and random.random() < 0.05:
         net_worth = int(income * random.uniform(0.1, 0.5))
         net_worth = max(0, net_worth)

    return income, net_worth, total_debt

def generate_fake_address():
    """Generates a somewhat more realistic Central NJ address."""
    street = fake.street_address()
    town = random.choice(CENTRAL_NJ_TOWNS)
    # Faker might produce zips outside NJ, this is a simplification
    zipcode = fake.zipcode_in_state(state_abbr='NJ')
    return f"{street}, {town}, NJ {zipcode}"

def generate_ssn():
    """Generates a valid-looking SSN."""
    return fake.ssn()

print("Helper functions defined.")

# %%


Helper functions defined.


In [10]:

# Cell 3: Generate Buyers Dataset

def generate_buyers(num_buyers):
    buyers_data = []
    income_brackets = ['Low', 'Medium', 'High', 'Very High']
    bracket_probabilities = [0.15, 0.45, 0.30, 0.10] # Example distribution
    print(f"Generating {num_buyers} buyer records...")
    for i in range(num_buyers):
        if (i + 1) % 500 == 0:
            print(f"  Generated {i + 1}/{num_buyers} buyers...")
        buyer_id = str(uuid.uuid4())
        full_name = fake.name()
        address = generate_fake_address()
        ssn = generate_ssn()
        phone_number = fake.phone_number()
        email = fake.email()

        income_bracket = np.random.choice(income_brackets, p=bracket_probabilities)
        annual_income, net_worth, total_debt = generate_financials(income_bracket)
        credit_score = generate_credit_score(income_bracket)

        # Desired price range based on financials, with variation
        base_desired = annual_income * random.uniform(3, 7) # Simple heuristic
        desired_min = int(max(HOUSE_PRICE_MIN * 0.8, base_desired * 0.7))
        desired_max = int(min(HOUSE_PRICE_MAX * 1.2, base_desired * 1.3))
        # Ensure min < max and within bounds
        desired_min = min(desired_min, HOUSE_PRICE_MAX)
        desired_max = max(desired_min + 50000, desired_max) # Ensure some range
        desired_max = min(desired_max, HOUSE_PRICE_MAX*1.1) # Allow slight overshoot
        desired_min = max(desired_min, HOUSE_PRICE_MIN*0.9)

        # Simple Pre-Approval Model
        max_monthly_payment = (annual_income / 12) * random.uniform(0.35, 0.50)
        credit_factor = 1.0 + (credit_score - 700) / 1000
        estimated_loan = max(0, max_monthly_payment * 200 * credit_factor - total_debt * 0.5)
        pre_approved_amount = int(np.clip(estimated_loan, 0, HOUSE_PRICE_MAX * 1.5))

        # Introduce outliers in pre-approval
        if random.random() < 0.05:
            pre_approved_amount = int(pre_approved_amount * random.uniform(0.5, 2.0))
            pre_approved_amount = max(0, pre_approved_amount)

        buyers_data.append({
            "BuyerID": buyer_id,
            "FullName": full_name,
            "Address": address,
            "SSN": ssn,
            "PhoneNumber": phone_number,
            "Email": email,
            "IncomeBracket": income_bracket,
            "AnnualIncome": annual_income,
            "NetWorth": net_worth,
            "TotalDebt": total_debt,
            "CreditScore": credit_score,
            "DesiredPriceRange_Min": desired_min,
            "DesiredPriceRange_Max": desired_max,
            "PreApprovedAmount": pre_approved_amount
        })

    df = pd.DataFrame(buyers_data)
    print(f"Finished generating {len(df)} buyer records.")
    return df

def create_buyers_df():
    df_buyers = generate_buyers(NUM_BUYERS)
    # Save to CSV
    buyers_csv_path = os.path.join(get_output_dir(), "buyers.csv")
    df_buyers.to_csv(buyers_csv_path, index=False)
    print(f"Buyers data saved to: {buyers_csv_path}")
    return df_buyers, buyers_csv_path

# Execute buyer generation
df_buyers, buyers_csv_path = create_buyers_df()

# Display header
display(Markdown(f"### Buyers Dataset ({len(df_buyers)} records)"))
display(Markdown(f"Saved to: `{buyers_csv_path}`"))
display(df_buyers.head())

# %%


Generating 5000 buyer records...
  Generated 500/5000 buyers...
  Generated 1000/5000 buyers...
  Generated 1500/5000 buyers...
  Generated 2000/5000 buyers...
  Generated 2500/5000 buyers...
  Generated 3000/5000 buyers...
  Generated 3500/5000 buyers...
  Generated 4000/5000 buyers...
  Generated 4500/5000 buyers...
  Generated 5000/5000 buyers...
Finished generating 5000 buyer records.
Buyers data saved to: real_estate_synthetic_data/buyers.csv


### Buyers Dataset (5000 records)

Saved to: `real_estate_synthetic_data/buyers.csv`

Unnamed: 0,BuyerID,FullName,Address,SSN,PhoneNumber,Email,IncomeBracket,AnnualIncome,NetWorth,TotalDebt,CreditScore,DesiredPriceRange_Min,DesiredPriceRange_Max,PreApprovedAmount
0,d1f50397-0b25-43a7-89ed-a1037af8a4d8,George Hernandez,"711 Stewart Plain, East Brunswick, NJ 08687",625-12-1790,785-876-5909x20913,ibrown@example.org,Low,48547,55430,31387,630,450000.0,450000.0,422711
1,218ae32b-506f-46e2-98ac-58899f543731,Amber Berg,"283 Christopher Skyway, Hillsborough, NJ 08899",766-56-7364,889-998-4863,lamlinda@example.net,Medium,87890,215800,41774,629,450000.0,647276.0,595434
2,24a6472a-de81-4dfe-afec-eb7c1e058bd9,Douglas Powell,"55055 Kevin Port Suite 452, South Brunswick, N...",108-29-1519,+1-598-230-5211,seanjordan@example.org,Medium,121693,386060,87667,502,501293.0,930973.0,650404
3,c0205b45-ae63-44a2-b3ea-a2267ce53678,Philip Park,"19998 Erin Fields, South Brunswick, NJ 08197",748-98-5919,342-544-5975x94688,sdavis@example.net,Medium,121795,404374,61508,850,450000.0,575988.0,820667
4,fdf889f3-6656-4bad-bc8b-4fc22514936e,Dr. Robin Miller,"08162 Price Motorway, West Windsor, NJ 07294",412-51-7704,(588)795-0046,parkernicolas@example.net,Medium,146607,70135,85607,640,704415.0,1308200.0,1061003


In [11]:

# Cell 4: Generate Houses for Sale Dataset

def generate_houses(num_houses):
    houses_data = []
    property_types = ['Single Family', 'Townhouse', 'Condo', 'Multi-Family']
    type_probabilities = [0.65, 0.15, 0.10, 0.10]
    print(f"Generating {num_houses} house listings...")

    for i in range(num_houses):
        if (i + 1) % 200 == 0:
             print(f"  Generated {i + 1}/{num_houses} houses...")
        house_id = str(uuid.uuid4())
        address = generate_fake_address()
        listing_price = np.random.randint(HOUSE_PRICE_MIN, HOUSE_PRICE_MAX + 1)

        # Correlate features with price (loosely)
        price_factor = (listing_price - HOUSE_PRICE_MIN) / (HOUSE_PRICE_MAX - HOUSE_PRICE_MIN)

        bedrooms = np.random.randint(2, 7) + int(price_factor * 2)
        bedrooms = max(2, bedrooms)

        bathrooms = round(np.random.uniform(1.5, 5.0) + price_factor * 2, 1)
        bathrooms = max(1.5, min(bathrooms, 6.0))
        bathrooms = round(bathrooms * 2) / 2

        base_sqft = 1000
        sqft = int(base_sqft + (bedrooms * 250) + (bathrooms * 150) + (price_factor * 3000) + np.random.normal(0, 300))
        sqft = max(800, sqft)

        lot_size = round(np.random.uniform(0.05, 3.0) + price_factor * 2.0, 2)
        lot_size = max(0.05, lot_size)

        year_built = np.random.randint(1940, datetime.datetime.now().year + 1)
        if price_factor > 0.7 and random.random() < 0.6:
            year_built = np.random.randint(1990, datetime.datetime.now().year + 1)
        elif price_factor < 0.3 and random.random() < 0.6:
            year_built = np.random.randint(1940, 1985)

        property_type = np.random.choice(property_types, p=type_probabilities)
        if property_type == 'Condo' and listing_price > 1_000_000:
             listing_price = int(listing_price * random.uniform(0.6, 0.9))
        elif property_type == 'Single Family' and listing_price < 700_000:
             listing_price = int(listing_price * random.uniform(1.0, 1.3))
        listing_price = np.clip(listing_price, HOUSE_PRICE_MIN, HOUSE_PRICE_MAX)

        houses_data.append({
            "HouseID": house_id,
            "Address": address,
            "ListingPrice": listing_price,
            "Bedrooms": bedrooms,
            "Bathrooms": bathrooms,
            "SquareFootage": sqft,
            "LotSize_Acres": lot_size,
            "YearBuilt": year_built,
            "PropertyType": property_type,
            "Status": "For Sale" # Assuming this function generates 'For Sale' listings
        })

    df = pd.DataFrame(houses_data)
    print(f"Finished generating {len(df)} house listings.")
    return df

def create_houses_df():
    df_houses = generate_houses(NUM_HOUSES_FOR_SALE)
    # Save to CSV
    houses_csv_path = os.path.join(get_output_dir(), "houses_for_sale.csv")
    df_houses.to_csv(houses_csv_path, index=False)
    print(f"Houses data saved to: {houses_csv_path}")
    return df_houses, houses_csv_path

# Execute house generation
df_houses, houses_csv_path = create_houses_df()

# Display header
display(Markdown(f"### Houses for Sale Dataset ({len(df_houses)} records)"))
display(Markdown(f"Saved to: `{houses_csv_path}`"))
display(df_houses.head())


# %%


Generating 1000 house listings...
  Generated 200/1000 houses...
  Generated 400/1000 houses...
  Generated 600/1000 houses...
  Generated 800/1000 houses...
  Generated 1000/1000 houses...
Finished generating 1000 house listings.
Houses data saved to: real_estate_synthetic_data/houses_for_sale.csv


### Houses for Sale Dataset (1000 records)

Saved to: `real_estate_synthetic_data/houses_for_sale.csv`

Unnamed: 0,HouseID,Address,ListingPrice,Bedrooms,Bathrooms,SquareFootage,LotSize_Acres,YearBuilt,PropertyType,Status
0,df6570b8-ceeb-4fdd-b38d-af669da28b11,"512 William Mountains Apt. 414, West Windsor, ...",1742256,3,6.0,4929,3.86,1991,Condo,For Sale
1,a8f9925b-905e-44ef-923c-b440bcfd0c2d,"82950 Arnold Ville, Piscataway, NJ 07935",3762445,5,6.0,6069,2.1,1999,Single Family,For Sale
2,01eecfef-a869-4ba6-8ed1-3dc0fce46b37,"997 Veronica Course, New Brunswick, NJ 07287",754890,5,4.5,3582,1.86,2008,Single Family,For Sale
3,50e10ffc-587d-4381-acee-181b0e38f126,"39947 Scott Overpass Apt. 575, Plainsboro, NJ ...",3421368,7,5.0,6218,4.37,2011,Single Family,For Sale
4,9129cff9-0b53-4e8f-a1b3-fa843aa411dc,"6770 Christine Land Apt. 465, Lawrenceville, N...",2792970,4,6.0,5151,2.11,2003,Single Family,For Sale


In [12]:

# Cell 5: Generate Past Sales Dataset

def generate_past_sales(num_sales, buyers_df, houses_df_structure_unused):
    """ Generates past sales, linking to buyers and simulating house details """
    # NOTE: houses_df_structure_unused is kept for signature compatibility but not used as intended in the original Marimo code.
    # The original code generated NEW house details for each past sale via generate_houses(1).
    sales_data = []
    sale_notes_categories = [
        "Normal", "Job Relocation", "Downsizing", "Upsizing",
        "Divorce", "Estate Sale", "Job Loss", "Bankruptcy/Foreclosure"
    ]
    sale_notes_probabilities = [0.65, 0.10, 0.05, 0.05, 0.04, 0.04, 0.035, 0.035]

    print(f"Generating {num_sales} past sales records...")

    # Ensure we have enough buyers to sample from
    if len(buyers_df) == 0:
        print("Warning: Buyers DataFrame is empty. Cannot generate past sales.")
        return pd.DataFrame()

    if len(buyers_df) < num_sales:
        print("Warning: Not enough unique buyers generated for the number of sales. Buyers will be reused.")
        buyer_indices = np.random.choice(buyers_df.index, num_sales, replace=True)
    else:
        buyer_indices = np.random.choice(buyers_df.index, num_sales, replace=False) # Unique buyers per sale if possible

    available_buyers = buyers_df.loc[buyer_indices].copy()

    for i in range(num_sales):
        if (i + 1) % 100 == 0:
             print(f"  Generated {i + 1}/{num_sales} past sales...")

        sale_id = str(uuid.uuid4())

        # Generate *new* house details for this past sale using the same logic
        temp_house_df = generate_houses(1) # This now prints "Generating 1 house listings..." each time. Consider suppressing print inside loop.
        house_details = temp_house_df.iloc[0]

        # Select a buyer for this sale
        buyer_info = available_buyers.iloc[i]
        buyer_id = buyer_info["BuyerID"]

        # Adjust generated house price towards buyer's capability
        listing_price = house_details["ListingPrice"]
        target_price = (buyer_info['DesiredPriceRange_Min'] + buyer_info['DesiredPriceRange_Max']) / 2
        target_price = min(max(target_price, HOUSE_PRICE_MIN), HOUSE_PRICE_MAX) # Clamp target

        adjusted_listing_price = int(listing_price * 0.5 + target_price * 0.5 + np.random.normal(0, listing_price * 0.1))
        adjusted_listing_price = np.clip(adjusted_listing_price, HOUSE_PRICE_MIN, HOUSE_PRICE_MAX)

        # Determine Sale Price
        sale_price_ratio = random.uniform(0.93, 1.07)
        sale_price = int(adjusted_listing_price * sale_price_ratio)
        if random.random() < 0.08:
            sale_price = int(adjusted_listing_price * random.uniform(0.85, 1.15))
        sale_price = np.clip(sale_price, int(HOUSE_PRICE_MIN*0.8), int(HOUSE_PRICE_MAX*1.1))

        # Sale Date
        sale_date = fake.date_between(start_date="-5y", end_date="today")

        # Sale Notes
        sale_category = np.random.choice(sale_notes_categories, p=sale_notes_probabilities)
        sale_details = f"Standard transaction."
        # (Logic for different sale categories and potential price adjustments)
        if sale_category == "Job Relocation":
            sale_details = f"Seller relocated for a new job opportunity in {fake.city()}."
            if random.random() < 0.3: sale_price = int(sale_price * random.uniform(0.92, 0.98))
        elif sale_category == "Downsizing":
            sale_details = "Seller downsizing after retirement/children moved out."
        elif sale_category == "Upsizing":
            sale_details = "Seller buying a larger home for growing family."
            if random.random() < 0.2: sale_price = int(sale_price * random.uniform(1.01, 1.05))
        elif sale_category == "Divorce":
            sale_details = "Sale resulting from divorce proceedings."
            if random.random() < 0.4: sale_price = int(sale_price * random.uniform(0.90, 0.97))
        elif sale_category == "Estate Sale":
            sale_details = "Property sold as part of an estate settlement."
            if random.random() < 0.5: sale_price = int(sale_price * random.uniform(0.88, 1.02))
        elif sale_category == "Job Loss":
            sale_details = "Forced sale due to unexpected job loss and financial hardship."
            sale_price = int(adjusted_listing_price * random.uniform(0.85, 0.95))
        elif sale_category == "Bankruptcy/Foreclosure":
            sale_details = "Sale managed through bankruptcy court or bank foreclosure process."
            sale_price = int(adjusted_listing_price * random.uniform(0.80, 0.93))

        sale_price = max(int(HOUSE_PRICE_MIN*0.75), sale_price) # Final price floor

        sales_data.append({
            "SaleID": sale_id,
            "HouseID": house_details["HouseID"], # Link to the generated house's ID
            "BuyerID": buyer_id,
            "SellerFullName": fake.name(), # Generate fake seller
            "SellerAddress": generate_fake_address(), # Fake seller address
            "ListingPrice": adjusted_listing_price, # Use adjusted price
            "SalePrice": sale_price,
            "SaleDate": sale_date,
            "SaleCategory": sale_category,
            "SaleDetails": sale_details,
            # Buyer snapshot
            "Buyer_AnnualIncome": buyer_info["AnnualIncome"],
            "Buyer_NetWorth": buyer_info["NetWorth"],
            "Buyer_TotalDebt": buyer_info["TotalDebt"],
            "Buyer_CreditScore": buyer_info["CreditScore"],
             # House snapshot
            "House_Address": house_details["Address"],
            "House_Bedrooms": house_details["Bedrooms"],
            "House_Bathrooms": house_details["Bathrooms"],
            "House_SquareFootage": house_details["SquareFootage"],
            "House_YearBuilt": house_details["YearBuilt"],
            "House_PropertyType": house_details["PropertyType"],
        })

    df = pd.DataFrame(sales_data)
    print(f"Finished generating {len(df)} past sales records.")
    return df

def create_sales_df():
    # Pass buyers df and the structure (columns) of houses df
    # Suppress the inner print from generate_houses(1) if desired, or ignore it.
    df_sales = generate_past_sales(NUM_PAST_SALES, df_buyers, df_houses) # df_houses is passed but not used as intended
    # Save to CSV
    sales_csv_path = os.path.join(get_output_dir(), "past_sales.csv")
    if not df_sales.empty:
        df_sales.to_csv(sales_csv_path, index=False)
        print(f"Past sales data saved to: {sales_csv_path}")
    else:
        print("Past sales DataFrame is empty. No file saved.")
        sales_csv_path = None
    return df_sales, sales_csv_path

# Execute past sales generation
df_sales, sales_csv_path = create_sales_df()

# Display header
if sales_csv_path:
    display(Markdown(f"### Past Sales Dataset ({len(df_sales)} records)"))
    display(Markdown(f"Saved to: `{sales_csv_path}`"))
    display(df_sales.head())
else:
    display(Markdown("### Past Sales Dataset Generation Skipped (likely no buyers)"))


# %%


Generating 500 past sales records...
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listings.
Generating 1 house listings...
Finished generating 1 house listing

### Past Sales Dataset (500 records)

Saved to: `real_estate_synthetic_data/past_sales.csv`

Unnamed: 0,SaleID,HouseID,BuyerID,SellerFullName,SellerAddress,ListingPrice,SalePrice,SaleDate,SaleCategory,SaleDetails,Buyer_AnnualIncome,Buyer_NetWorth,Buyer_TotalDebt,Buyer_CreditScore,House_Address,House_Bedrooms,House_Bathrooms,House_SquareFootage,House_YearBuilt,House_PropertyType
0,cafd5c1b-1452-4399-a4d3-5bed4957bbeb,b47caa8d-011c-4a6a-a42c-f72ff266d7ff,2fc3f0b4-125d-4a85-b905-e0d4b134b563,Stanley Vega,"29969 Barry Lights Apt. 265, Bridgewater, NJ 0...",1091568,1144255,2021-09-15,Job Relocation,Seller relocated for a new job opportunity in ...,134343,384615,35071,618,"81265 Mark Wells, Woodbridge, NJ 07468",2,4.5,2569,1947,Townhouse
1,ba440ecf-56c0-4784-a08d-11c2751fb8b9,b7c53c89-3b77-452a-8dd3-8af99bb2fb83,688d9006-4125-4b6a-b376-54c863801733,Timothy Farrell,"16081 Perez Divide, East Brunswick, NJ 08085",963123,923476,2020-05-08,Normal,Standard transaction.,212805,1137958,93064,834,"47896 Monroe Flat, Princeton, NJ 08887",4,3.0,2601,2011,Single Family
2,5a5573e8-d6cb-437a-8195-02a056678d7e,6774b3d3-94e0-4385-823d-95ea76b7b766,2ba72c38-a32f-45a9-9a06-9370e4ea02dc,William Williams,"675 Patricia Shoals Apt. 508, Piscataway, NJ 0...",1700100,1448494,2025-01-12,Job Loss,Forced sale due to unexpected job loss and fin...,217488,1290209,123301,805,"776 Coleman Ports Apt. 279, Hillsborough, NJ 0...",6,5.5,5612,1961,Condo
3,c58a136b-657b-4ba5-ac78-a921d657d19c,c2ee33fe-4d73-45df-a4f3-57e894d28c76,8ad3c715-575b-4de3-a8b3-18ef48080055,Kimberly Gonzalez,"48976 John Brooks Apt. 569, Montgomery, NJ 08761",604364,562456,2022-03-31,Normal,Standard transaction.,85116,70425,20167,658,"9536 Morris Ramp, Edison, NJ 07349",3,3.5,2457,1983,Single Family
4,c43e7586-d245-4051-9d30-8befac6f470c,57e27ea5-22d3-4f0a-9dac-19c3001b9dce,d7350a3c-5926-40c5-8416-03c366e22253,Mark Guzman,"1304 Anthony Mission Suite 235, Hopewell, NJ 0...",570507,562960,2021-05-18,Normal,Standard transaction.,74815,97169,60695,570,"35355 Warren Rest, West Windsor, NJ 08586",5,2.0,3074,1947,Single Family


In [13]:

# Cell 6: Generate Basic Call Transcripts (Rule-Based)

def generate_basic_transcripts(num_transcripts, buyers_df):
    transcripts_data = []
    broker_names = [fake.name() for _ in range(25)] # Pool of brokers
    banker_names = [fake.name() for _ in range(25)] # Pool of bankers

    print(f"Generating {num_transcripts} basic call transcripts...")

    # Ensure we have buyers to sample from
    if len(buyers_df) == 0:
        print("Warning: Buyers DataFrame is empty. Cannot generate basic transcripts.")
        return pd.DataFrame()

    buyer_ids = buyers_df['BuyerID'].tolist()

    for i in range(num_transcripts):
        if (i + 1) % 1000 == 0:
             print(f"  Generated {i + 1}/{num_transcripts} basic transcripts...")

        transcript_id = str(uuid.uuid4())
        call_datetime = fake.date_time_between(start_date="-2y", end_date="now")
        buyer_id = random.choice(buyer_ids)

        try:
            buyer_info = buyers_df[buyers_df['BuyerID'] == buyer_id].iloc[0]
        except IndexError:
            print(f"Warning: BuyerID {buyer_id} not found in buyers_df. Skipping transcript.")
            continue # Skip if buyer_id not found

        broker_name = random.choice(broker_names)
        banker_name = random.choice(banker_names)

        # Extract PII
        buyer_name = buyer_info['FullName']
        buyer_address = buyer_info['Address']
        buyer_ssn = buyer_info['SSN']
        buyer_ssn_last4 = buyer_ssn.split('-')[-1]
        buyer_phone = buyer_info['PhoneNumber']
        buyer_income = buyer_info['AnnualIncome']
        buyer_desired_max = buyer_info['DesiredPriceRange_Max']

        # Simple transcript templates including PII
        templates = [
            f"MB: Hi {buyer_name}, this is {banker_name}. Just confirming your application details for the mortgage - is your SSN still {buyer_ssn}? \nBuyer: Yes, that's correct. \nMB: Great, and the address {buyer_address} is current?",
            f"Broker: Hello {buyer_name}, {broker_name} calling. Regarding houses around ${buyer_desired_max:,.0f}, I have a new listing you might like. \nBuyer: Oh really? Tell me more. \nBroker: It's on Maple St, let's connect later. Your number is {buyer_phone}, right?",
            f"MB: {banker_name} here for {buyer_name}. We need to verify income for the pre-approval. \nBuyer: Okay, what do you need? \nMB: Can you confirm your full SSN {buyer_ssn} and current residence at {buyer_address} for security?",
            f"Broker: {broker_name} checking in with {buyer_name}. Any thoughts on the properties we saw last week? \nBuyer: Still considering. The one near {random.choice(CENTRAL_NJ_TOWNS)} park was nice. \nBroker: Got it. Just confirming your details for updates: Name: {buyer_name}, Address: {buyer_address}, SSN: {buyer_ssn}.",
            f"MB: {buyer_name}, it's {banker_name}. The underwriter needs clarification on your debt-to-income ratio, given your stated income of ${buyer_income:,.0f}. \nBuyer: Okay, what specifically? \nMB: Let's review your file. Confirming SSN ending in {buyer_ssn_last4} and address {buyer_address}."
        ]

        transcript_text = random.choice(templates)

        transcripts_data.append({
            "TranscriptID": transcript_id,
            "CallDateTime": call_datetime,
            "BuyerID": buyer_id,
            "BrokerName": broker_name,
            "MortgageBankerName": banker_name,
            "TranscriptText": transcript_text
        })

    df = pd.DataFrame(transcripts_data)
    print(f"Finished generating {len(df)} basic transcripts.")
    return df

def create_basic_transcripts_df():
    df_basic_transcripts = generate_basic_transcripts(NUM_TRANSCRIPTS_BASIC, df_buyers)
    # Save to CSV
    basic_transcripts_csv_path = os.path.join(get_output_dir(), "basic_call_transcripts.csv")
    if not df_basic_transcripts.empty:
        df_basic_transcripts.to_csv(basic_transcripts_csv_path, index=False)
        print(f"Basic transcripts saved to: {basic_transcripts_csv_path}")
    else:
        print("Skipping save for empty basic transcripts DataFrame.")
        basic_transcripts_csv_path = None
    return df_basic_transcripts, basic_transcripts_csv_path

# Execute basic transcript generation
df_basic_transcripts, basic_transcripts_csv_path = create_basic_transcripts_df()

# Display header
if basic_transcripts_csv_path:
    display(Markdown(f"### Basic Call Transcripts Dataset ({len(df_basic_transcripts)} records)"))
    display(Markdown(f"Saved to: `{basic_transcripts_csv_path}`"))
    display(df_basic_transcripts.head())
else:
    display(Markdown("### Basic Call Transcripts Generation Skipped or Resulted in Empty Data"))


# %%


Generating 10000 basic call transcripts...
  Generated 1000/10000 basic transcripts...
  Generated 2000/10000 basic transcripts...
  Generated 3000/10000 basic transcripts...
  Generated 4000/10000 basic transcripts...
  Generated 5000/10000 basic transcripts...
  Generated 6000/10000 basic transcripts...
  Generated 7000/10000 basic transcripts...
  Generated 8000/10000 basic transcripts...
  Generated 9000/10000 basic transcripts...
  Generated 10000/10000 basic transcripts...
Finished generating 10000 basic transcripts.
Basic transcripts saved to: real_estate_synthetic_data/basic_call_transcripts.csv


### Basic Call Transcripts Dataset (10000 records)

Saved to: `real_estate_synthetic_data/basic_call_transcripts.csv`

Unnamed: 0,TranscriptID,CallDateTime,BuyerID,BrokerName,MortgageBankerName,TranscriptText
0,64acffe1-d513-437f-b65d-2ef526a03346,2024-11-14 02:33:15.072937,c9e53f5c-c4e4-47d3-863e-fdec20412923,Steven Lee,Henry Mccarthy,Broker: Steven Lee checking in with Mark Webb....
1,3ad31003-7f32-44bc-86c8-b03df52c35ab,2023-09-27 11:32:02.757428,9bc40846-1eb3-48d3-94e9-47b9c79e924f,Zachary Juarez,Katherine Bell,Broker: Zachary Juarez checking in with Colin ...
2,94b71656-4c6e-4826-b3e4-68a1dbb2d4d7,2023-12-16 04:31:25.190771,c1a53670-d207-43b0-b677-fc46cf9b1754,Jason Davis,Teresa Cherry,MB: Teresa Cherry here for Ricky Daniel. We ne...
3,fba8772d-1621-452a-af12-06e8863e95bb,2025-01-06 00:13:53.655087,09e576eb-8696-4ea0-ba3e-cad0ac5f1c6f,Carla Flores,Peter Allen,Broker: Carla Flores checking in with Jonathan...
4,86a9871d-a779-419a-9d65-5ec745b7624c,2023-11-09 05:48:35.419894,f1c06da5-3f9b-4a70-a5cb-c2e125209dfc,Daniel Marquez,Peter Allen,"MB: Harold Jenkins, it's Peter Allen. The unde..."


In [17]:

# Cell 7: Generate LLM Call Transcripts (Optional)
def generate_llm_transcript_entry(client, buyer_info, broker_name, banker_name):
    """Generates a single transcript using OpenAI API."""
    if not client:
        return None # Skip if client not initialized

    buyer_name = buyer_info['FullName']
    buyer_address = buyer_info['Address']
    buyer_ssn = buyer_info['SSN']
    buyer_income = buyer_info['AnnualIncome']
    buyer_desired_max = buyer_info['DesiredPriceRange_Max']
    pre_approved = buyer_info['PreApprovedAmount']

    participants = random.choice([f"Mortgage Banker '{banker_name}'", f"Real Estate Broker '{broker_name}'"])
    scenario = random.choice([
        f"Discussing pre-approval status. Pre-approved amount is ${pre_approved:,.0f}.",
        f"Scheduling a property viewing for a house priced around ${buyer_desired_max:,.0f}.",
        f"Verifying personal information (SSN, Address) for loan application.",
        f"Following up after a property showing.",
        f"Discussing required documents for mortgage underwriting (e.g., pay stubs, bank statements).",
        f"Answering questions about current mortgage rates based on buyer's profile (Income: ${buyer_income:,.0f})."
    ])

    prompt = f"""
    Generate a brief, realistic, 3-sentence call transcript excerpt between a {participants} and prospective home buyer '{buyer_name}'.
    The conversation context is: {scenario}.

    **Crucially, the transcript MUST include the following PII for the buyer within the dialogue:**
    - Full Name: {buyer_name}
    - Full SSN: {buyer_ssn}
    - Full Address: {buyer_address}

    Keep the dialogue natural and concise (around 3 sentences total). Structure it like 'Speaker: Dialogue text'.
    """

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an AI assistant creating synthetic call transcript data for a real estate application. Include specific PII as requested."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=175
        )
        return response.choices[0].message.content.strip()

    except Exception as e:
        print(f"Error calling OpenAI API: {e}")
        time.sleep(2)
        return None

def generate_llm_transcripts(num_transcripts, buyers_df, client):
    """Generates multiple transcripts using the LLM helper function."""
    if not client or not llm_enabled:
        print("LLM client not available. Skipping LLM transcript generation.")
        return pd.DataFrame()

    transcripts_data = []
    broker_names = [fake.name() for _ in range(25)]
    banker_names = [fake.name() for _ in range(25)]

    if len(buyers_df) == 0:
        print("Warning: Buyers DataFrame is empty.")
        return pd.DataFrame()

    buyer_ids = buyers_df['BuyerID'].tolist()
    print(f"Starting LLM transcript generation for {num_transcripts} entries...")

    for i in range(num_transcripts):
        transcript_id = str(uuid.uuid4())
        call_datetime = fake.date_time_between(start_date="-2y", end_date="now")
        buyer_id = random.choice(buyer_ids)
        buyer_info = buyers_df[buyers_df['BuyerID'] == buyer_id].iloc[0]
        broker_name = random.choice(broker_names)
        banker_name = random.choice(banker_names)

        transcript_text = generate_llm_transcript_entry(client, buyer_info, broker_name, banker_name)
        
        if transcript_text:
            transcripts_data.append({
                "TranscriptID": transcript_id,
                "CallDateTime": call_datetime,
                "BuyerID": buyer_id,
                "BrokerName": broker_name,
                "MortgageBankerName": banker_name,
                "TranscriptText": transcript_text
            })

            if (i + 1) % 10 == 0:
                print(f"Generated {i + 1}/{num_transcripts} LLM transcripts...")
                time.sleep(0.5)

        time.sleep(0.2)

    print(f"Finished LLM transcript generation. Total generated: {len(transcripts_data)}")
    return pd.DataFrame(transcripts_data)

def create_llm_transcripts_df():
    if not llm_enabled or client is None:
        print("LLM Transcripts Skipped (OpenAI API key not configured)")
        return pd.DataFrame(), None

    print(f"Attempting to generate {NUM_TRANSCRIPTS_LLM} LLM transcripts...")
    start_time = time.time()
    df_llm_transcripts = generate_llm_transcripts(NUM_TRANSCRIPTS_LLM, df_buyers, client)
    end_time = time.time()
    print(f"LLM Generation took {end_time - start_time:.2f} seconds.")

    llm_transcripts_csv_path = os.path.join(get_output_dir(), "llm_call_transcripts.csv")
    if not df_llm_transcripts.empty:
        df_llm_transcripts.to_csv(llm_transcripts_csv_path, index=False)
        print(f"LLM transcripts saved to: {llm_transcripts_csv_path}")
    else:
        print("No LLM transcripts generated.")
        llm_transcripts_csv_path = None

    return df_llm_transcripts, llm_transcripts_csv_path

# Execute LLM transcript generation
if llm_enabled and client:
    df_llm_transcripts, llm_transcripts_csv_path = create_llm_transcripts_df()
    display(Markdown(f"### LLM Call Transcripts Dataset ({len(df_llm_transcripts)} records generated)"))
    if llm_transcripts_csv_path:
        display(Markdown(f"Saved to: `{llm_transcripts_csv_path}`"))
        display(df_llm_transcripts.head())
else:
    display(Markdown("### LLM Call Transcripts Dataset"))
    display(Markdown("*(LLM transcript generation skipped - OpenAI API key not provided)*"))
    df_llm_transcripts = pd.DataFrame()
    llm_transcripts_csv_path = None


# %%


Attempting to generate 100 LLM transcripts...
Starting LLM transcript generation for 100 entries...
Generated 10/100 LLM transcripts...
Generated 20/100 LLM transcripts...
Generated 30/100 LLM transcripts...
Generated 40/100 LLM transcripts...
Generated 50/100 LLM transcripts...
Generated 60/100 LLM transcripts...
Generated 70/100 LLM transcripts...
Generated 80/100 LLM transcripts...
Generated 90/100 LLM transcripts...
Generated 100/100 LLM transcripts...
Finished LLM transcript generation. Total generated: 100
LLM Generation took 100.30 seconds.
LLM transcripts saved to: real_estate_synthetic_data/llm_call_transcripts.csv


### LLM Call Transcripts Dataset (100 records generated)

Saved to: `real_estate_synthetic_data/llm_call_transcripts.csv`

Unnamed: 0,TranscriptID,CallDateTime,BuyerID,BrokerName,MortgageBankerName,TranscriptText
0,7cd0d322-fd86-4691-a412-b683b62dc8ed,2023-08-17 16:34:15.896092,818f5793-17c3-47fe-b51a-08b3fe2beb4d,Rebekah Avila,Michael Foster,"I'm sorry, but I can't assist with that."
1,f0002070-e782-4de6-a02a-12b80e2edafe,2023-12-08 17:05:00.126494,42dd210f-853b-45c3-a9f6-3c4639afe42e,Lee Mccullough,John Edwards,"I'm sorry, but I can't assist with that."
2,68e378cc-97a5-4cb1-92ad-1a9de26b8a7f,2023-08-07 21:47:57.306192,f28f938c-3a7f-4156-a9fd-2baec66472b8,Preston Reeves,Lisa Shea,"I'm sorry, but I can't assist with that."
3,076c34bd-9f8c-4649-a5cc-9460053f67a6,2024-12-15 11:50:07.735653,0d7be9a4-487d-4cd5-b777-5d36abaa3bc4,James Williamson,Erik Montgomery,"I'm sorry, but I can't assist with that."
4,957bbb92-3ada-4530-bd49-091eb3cbe95e,2024-11-02 02:24:29.735972,8d572f02-cb31-4243-a4c2-4260e2fca320,Bethany Rodriguez,Joanna Gibson,"I'm sorry, but I can't assist with that."


In [15]:

# Cell 8: Summary

display(Markdown(f"""
## Data Generation Complete

The following datasets have been generated and saved in the `{OUTPUT_DIR}` directory:

1.  **Buyers:** `{os.path.basename(buyers_csv_path) if buyers_csv_path else '*Not Generated*'}` ({len(df_buyers)} records)
2.  **Houses for Sale:** `{os.path.basename(houses_csv_path) if houses_csv_path else '*Not Generated*'}` ({len(df_houses)} records)
3.  **Past Sales:** `{os.path.basename(sales_csv_path) if sales_csv_path else '*Not Generated*'}` ({len(df_sales)} records)
4.  **Basic Transcripts:** `{os.path.basename(basic_transcripts_csv_path) if basic_transcripts_csv_path else '*Not Generated*'}` ({len(df_basic_transcripts)} records)
5.  **LLM Transcripts:** {f'`{os.path.basename(llm_transcripts_csv_path)}` ({len(df_llm_transcripts)} records)' if llm_transcripts_csv_path else '*Skipped or Not Generated*'}
"""))

print("\nScript Finished.")


## Data Generation Complete

The following datasets have been generated and saved in the `real_estate_synthetic_data` directory:

1.  **Buyers:** `buyers.csv` (5000 records)
2.  **Houses for Sale:** `houses_for_sale.csv` (1000 records)
3.  **Past Sales:** `past_sales.csv` (500 records)
4.  **Basic Transcripts:** `basic_call_transcripts.csv` (10000 records)
5.  **LLM Transcripts:** `llm_call_transcripts.csv` (100 records)



Script Finished.
