In [16]:
import pandas as pd
import numpy as np
import random
import time
import os
from datetime import datetime

def generate_house_pricing_data(
    num_records=500_000,
    price_noise_percent=5,
    feature_noise_percent=0,
    data_type="linear",  # Options: 'linear', 'binary', 'multiclass'
    num_classes=3,       # Only used for 'multiclass'
    output_file=""
):
    assert data_type in ("linear", "binary", "multiclass"), "Invalid data_type. Choose from 'linear', 'binary', or 'multiclass'."
    if data_type == "multiclass":
        assert num_classes >= 2, "num_classes must be at least 2 for multiclass classification"

    seed = int(time.time())
    np.random.seed(seed)
    random.seed(seed)

    price_noise_multiplier = price_noise_percent / 100.0
    feature_noise_multiplier = feature_noise_percent / 100.0

    def generate_zipcode():
        return random.choice(['90210', '10001', '60614', '94105', '77005', '30303', '98101'])

    def generate_floor_material():
        return random.choice(['Wood', 'Tile', 'Carpet', 'Mixed'])

    def generate_exterior_quality():
        return random.choice(['Poor', 'Fair', 'Good', 'Excellent'])

    def generate_sale_season():
        return random.choice(['Spring', 'Summer', 'Fall', 'Winter'])

    data = {
        'Bedrooms': np.random.randint(1, 6, num_records),
        'Bathrooms': np.random.randint(1, 5, num_records),
        'TotalArea': np.random.normal(2000, 500, num_records).clip(500, 5000),
        'GarageSize': np.random.normal(300, 100, num_records).clip(0, 1000),
        'HasPool': np.random.choice([0, 1], num_records, p=[0.8, 0.2]),
        'PoolSize': np.random.normal(400, 150, num_records).clip(0, 1000),
        'LotSize': np.random.normal(8000, 3000, num_records).clip(2000, 20000),
        'HouseAge': np.random.randint(0, 100, num_records),
        'RenovatedYear': np.random.choice(list(range(1950, 2025)) + [0], num_records),
        'HasFireplace': np.random.choice([0, 1], num_records, p=[0.6, 0.4]),
        'FireplaceCount': np.random.randint(0, 4, num_records),
        'ZipCode': [generate_zipcode() for _ in range(num_records)],
        'SchoolRating': np.random.randint(1, 11, num_records),
        'CrimeRate': np.random.normal(3, 1.5, num_records).clip(0, 10),
        'NearbyParks': np.random.randint(0, 6, num_records),
        'DistanceToDowntown': np.random.normal(10, 5, num_records).clip(0, 50),
        'MonthlyHOAFee': np.random.normal(150, 50, num_records).clip(0, 1000),
        'ExteriorQuality': [generate_exterior_quality() for _ in range(num_records)],
        'FloorMaterial': [generate_floor_material() for _ in range(num_records)],
        'SaleSeason': [generate_sale_season() for _ in range(num_records)],
        'ListingDuration': np.random.randint(5, 180, num_records)
    }

    df = pd.DataFrame(data)

    if feature_noise_percent > 0:
        numerical_noise_std = {
            'TotalArea': 50,
            'GarageSize': 20,
            'PoolSize': 30,
            'LotSize': 100,
            'HouseAge': 3,
            'CrimeRate': 0.5,
            'MonthlyHOAFee': 10,
            'DistanceToDowntown': 1,
        }

        for col, std in numerical_noise_std.items():
            df[col] += np.random.normal(0, std * feature_noise_multiplier, num_records)

        for col in ['HasPool', 'HasFireplace']:
            flip_mask = np.random.rand(num_records) < feature_noise_multiplier
            df.loc[flip_mask, col] = 1 - df.loc[flip_mask, col]

        for col, generator in [('ZipCode', generate_zipcode),
                               ('ExteriorQuality', generate_exterior_quality),
                               ('FloorMaterial', generate_floor_material),
                               ('SaleSeason', generate_sale_season)]:
            mask = np.random.rand(num_records) < feature_noise_multiplier
            df.loc[mask, col] = [generator() for _ in range(mask.sum())]

    score = (
        50000 * df['Bedrooms'] +
        40000 * df['Bathrooms'] +
        100 * df['TotalArea'] +
        50 * df['GarageSize'] +
        10000 * df['HasPool'] +
        20 * df['PoolSize'] +
        0.5 * df['LotSize'] -
        1000 * df['HouseAge'] +
        3000 * df['HasFireplace'] +
        2000 * df['FireplaceCount'] +
        5000 * df['SchoolRating'] -
        3000 * df['CrimeRate'] +
        1000 * df['NearbyParks'] -
        800 * df['DistanceToDowntown'] -
        2 * df['MonthlyHOAFee'] -
        100 * df['ListingDuration']
    )

    noisy_score = score + np.random.normal(0, score.std() * price_noise_multiplier, num_records)

    if data_type == "linear":
        df["SalePrice"] = noisy_score

    elif data_type == "binary":
        threshold = np.percentile(noisy_score, 60)
        df["WillSell"] = (noisy_score >= threshold).astype(int)

    elif data_type == "multiclass":
        # Label as integers: 0 to num_classes-1
        df["PriceCategory"] = pd.qcut(
            noisy_score,
            q=num_classes,
            labels=[f"Class{i}" for i in range(num_classes)]
        )

    # Output file path
    if not output_file:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"house_data_{data_type}_{num_records}recs_{timestamp}.csv"

    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_file, index=False)
    print(f"✅ Dataset saved at: {output_file}")


In [17]:
def human_format(number, precision=1):
    """
    Convert a number into a human-readable format:
    1234 -> 1.2k
    1000000 -> 1m
    850 -> 850
    """
    suffixes = ['', 'k', 'm', 'b', 't']
    num = float(number)
    magnitude = 0

    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0

    if magnitude == 0:
        return str(int(number))  # show full number for values < 1000

    formatted = f"{num:.{precision}f}".rstrip('0').rstrip('.')
    return f"{formatted}{suffixes[magnitude]}"


In [20]:
t_noise = 0
f_noise = 0
records = 100
num_classes=100



# Linear regression
filePath = "../data/unzipped-data/NN/house_pricing_linear_"+str(human_format(records))+"_TN_"+str(t_noise)+"_FN_"+str(f_noise)+".csv"
generate_house_pricing_data(
    num_records=records,
    price_noise_percent=t_noise,
    feature_noise_percent=f_noise,
    output_file=filePath
)

# Binary classification
filePath = "../data/unzipped-data/NN/house_pricing_binary_"+str(human_format(records))+"_TN_"+str(t_noise)+"_FN_"+str(f_noise)+".csv"
generate_house_pricing_data(
    num_records=records,
    price_noise_percent=t_noise,
    feature_noise_percent=f_noise,
    output_file=filePath, 
    data_type="binary")

# Multi-class with 4 classes
filePath = "../data/unzipped-data/NN/house_pricing_ multiclass_"+str(num_classes)+"_"+str(human_format(records))+"_TN_"+str(t_noise)+"_FN_"+str(f_noise)+".csv"
filePath = filePath.replace('TYPE', 'multiclass_'+str(num_classes))
generate_house_pricing_data(    
    num_records=records,
    price_noise_percent=t_noise,
    feature_noise_percent=f_noise,
    output_file=filePath, 
    data_type="multiclass", num_classes=num_classes)





✅ Dataset saved at: ../data/unzipped-data/NN/house_pricing_linear_100_TN_0_FN_0.csv
✅ Dataset saved at: ../data/unzipped-data/NN/house_pricing_binary_100_TN_0_FN_0.csv
✅ Dataset saved at: ../data/unzipped-data/NN/house_pricing_ multiclass_100_100_TN_0_FN_0.csv
