In [1]:
import pandas as pd
import numpy as np
import random
import time
import os
from datetime import datetime
import argparse

def generate_house_pricing_data(num_records=500_000, price_noise_percent=5, feature_noise_percent=0, output_file="", seed=None):
    """Generate a synthetic house-pricing dataset and write it to a CSV file.

    Features are sampled independently, a linear "true" price is computed from
    the clean features, optional noise is then applied to the saved features,
    and optional Gaussian noise is added to the price.

    Args:
        num_records: Number of rows to generate.
        price_noise_percent: Standard deviation of the Gaussian price noise,
            expressed as a percentage of the standard deviation of the
            noise-free price.
        feature_noise_percent: Strength of the feature corruption. For the
            continuous columns it scales a per-column noise standard
            deviation; for the binary and categorical columns it is the
            per-row probability (in percent) of flipping / re-drawing the
            value. Values <= 0 disable feature noise.
        output_file: Destination CSV path. When empty, a timestamped file name
            is created in the current working directory. Missing parent
            directories are created.
        seed: Optional seed for the random generator. ``None`` (the default)
            draws fresh entropy, so every call produces a different dataset;
            pass an integer to make the dataset reproducible.

    Returns:
        None. The dataset is written to ``output_file`` and the path is
        printed.
    """
    # A private Generator keeps the global `np.random` / `random` state
    # untouched and avoids the old time-based seed, which produced identical
    # datasets for two calls made within the same second.
    rng = np.random.default_rng(seed)

    price_noise_multiplier = price_noise_percent / 100.0
    feature_noise_multiplier = feature_noise_percent / 100.0

    # Allowed values for the categorical columns.
    categorical_choices = {
        'ZipCode': ['90210', '10001', '60614', '94105', '77005', '30303', '98101'],
        'ExteriorQuality': ['Poor', 'Fair', 'Good', 'Excellent'],
        'FloorMaterial': ['Wood', 'Tile', 'Carpet', 'Mixed'],
        'SaleSeason': ['Spring', 'Summer', 'Fall', 'Winter'],
    }

    # (low, high) bounds of the continuous columns; used both when sampling
    # and again after feature noise so noisy values stay in the valid range.
    clip_bounds = {
        'TotalArea': (500, 5000),
        'GarageSize': (0, 1000),
        'PoolSize': (0, 1000),
        'LotSize': (2000, 20000),
        'HouseAge': (0, 99),
        'CrimeRate': (0, 10),
        'MonthlyHOAFee': (0, 1000),
        'DistanceToDowntown': (0, 50),
    }

    def clipped_normal(col, mean, std):
        low, high = clip_bounds[col]
        return rng.normal(mean, std, num_records).clip(low, high)

    # Column order is kept identical to the historical CSV layout.
    data = {
        'Bedrooms': rng.integers(1, 6, num_records),
        'Bathrooms': rng.integers(1, 5, num_records),
        'TotalArea': clipped_normal('TotalArea', 2000, 500),
        'GarageSize': clipped_normal('GarageSize', 300, 100),
        'HasPool': rng.choice([0, 1], num_records, p=[0.8, 0.2]),
        'PoolSize': clipped_normal('PoolSize', 400, 150),
        'LotSize': clipped_normal('LotSize', 8000, 3000),
        'HouseAge': rng.integers(0, 100, num_records),
        'RenovatedYear': rng.choice(list(range(1950, 2025)) + [0], num_records),
        'HasFireplace': rng.choice([0, 1], num_records, p=[0.6, 0.4]),
        'FireplaceCount': rng.integers(0, 4, num_records),
        'ZipCode': rng.choice(categorical_choices['ZipCode'], num_records),
        'SchoolRating': rng.integers(1, 11, num_records),
        'CrimeRate': clipped_normal('CrimeRate', 3, 1.5),
        'NearbyParks': rng.integers(0, 6, num_records),
        'DistanceToDowntown': clipped_normal('DistanceToDowntown', 10, 5),
        'MonthlyHOAFee': clipped_normal('MonthlyHOAFee', 150, 50),
        'ExteriorQuality': rng.choice(categorical_choices['ExteriorQuality'], num_records),
        'FloorMaterial': rng.choice(categorical_choices['FloorMaterial'], num_records),
        'SaleSeason': rng.choice(categorical_choices['SaleSeason'], num_records),
        'ListingDuration': rng.integers(5, 180, num_records)
    }

    df = pd.DataFrame(data)

    # Compute the noise-free price from the CLEAN features. This must happen
    # before feature noise is applied: otherwise the price is an exact function
    # of the corrupted features and the "noise" never degrades the
    # feature -> price relationship.
    base_price = (
        50000 * df['Bedrooms'] +
        40000 * df['Bathrooms'] +
        100 * df['TotalArea'] +
        50 * df['GarageSize'] +
        10000 * df['HasPool'] +
        20 * df['PoolSize'] +
        0.5 * df['LotSize'] -
        1000 * df['HouseAge'] +
        3000 * df['HasFireplace'] +
        2000 * df['FireplaceCount'] +
        5000 * df['SchoolRating'] -
        3000 * df['CrimeRate'] +
        1000 * df['NearbyParks'] -
        800 * df['DistanceToDowntown'] -
        2 * df['MonthlyHOAFee'] -
        100 * df['ListingDuration']
    )

    # Corrupt the saved features (measurement-error style noise).
    if feature_noise_percent > 0:
        numerical_noise_std = {
            'TotalArea': 50,
            'GarageSize': 20,
            'PoolSize': 30,
            'LotSize': 100,
            'HouseAge': 3,
            'CrimeRate': 0.5,
            'MonthlyHOAFee': 10,
            'DistanceToDowntown': 1,
        }

        for col, std in numerical_noise_std.items():
            low, high = clip_bounds[col]
            noisy = df[col] + rng.normal(0, std * feature_noise_multiplier, num_records)
            # Re-clip so noise cannot produce e.g. negative areas or ages.
            df[col] = noisy.clip(low, high)

        # Binary flags: flip each row with probability feature_noise_multiplier.
        for col in ['HasPool', 'HasFireplace']:
            flip_mask = rng.random(num_records) < feature_noise_multiplier
            current = df[col].to_numpy()
            df[col] = np.where(flip_mask, 1 - current, current)

        # Categoricals: re-draw each row with probability
        # feature_noise_multiplier (the new draw may equal the old value).
        for col, choices in categorical_choices.items():
            mask = rng.random(num_records) < feature_noise_multiplier
            redrawn = rng.choice(choices, num_records)
            df[col] = np.where(mask, redrawn, df[col].to_numpy())

    # Add price noise. base_price.std() is NaN for fewer than two rows; in that
    # case (and when no noise is requested) the price is left noise-free
    # instead of being turned into NaN.
    sale_price = base_price
    if price_noise_multiplier != 0:
        noise_scale = base_price.std() * price_noise_multiplier
        if np.isfinite(noise_scale):
            sale_price = base_price + rng.normal(0, noise_scale, num_records)
    df['SalePrice'] = sale_price

    # Determine output file path
    if not output_file:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"house_pricing_{num_records}_priceNoise{price_noise_percent}pct_fieldNoise{feature_noise_percent}pct_{timestamp}.csv"

    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_file, index=False)
    print(f"✅ Dataset saved at: {output_file}")



In [2]:
# Run parameters for this generation: price (target) noise percent, feature
# noise percent, and the number of rows to generate.
t_noise = 0
f_noise = 0
records = 100

# NOTE(review): the output file name appends a literal "k" after `records`, so
# this run is saved as "house_pricing_100k_..." although num_records is 100
# (one hundred rows, not 100,000). Confirm whether `records` was meant to be in
# thousands or the "k" suffix should be dropped — TODO verify against the
# notebooks that read this file before renaming anything.
generate_house_pricing_data(
    num_records=records,
    price_noise_percent=t_noise,
    feature_noise_percent=f_noise,
    output_file="../data/unzipped-data/NN/house_pricing_"+str(records)+"k_TN_"+str(t_noise)+"_FN_"+str(f_noise)+".csv"
)

✅ Dataset saved at: ../data/unzipped-data/NN/house_pricing_100k_TN_0_FN_0.csv
