In [6]:
import pandas as pd
import numpy as np

In [7]:
# Seed NumPy's global RNG so every run regenerates the same synthetic dataset.
np.random.seed(2)

# Number of synthetic houses (rows) to generate.
num_samples = 100

# Build the columns one at a time. All random draws come from the same seeded
# global stream, so the calls below must stay in this order: reordering them
# would change the generated values. Insertion order is also the column order.
data = {}
data["HouseID"] = [f"HOUSE{i:03d}" for i in range(1, num_samples + 1)]
# randint's upper bound is exclusive: bedrooms are 2-5, bathrooms 1-3.
data["Bedrooms"] = np.random.randint(2, 6, size=num_samples)
data["Bathrooms"] = np.random.randint(1, 4, size=num_samples)
data["SquareFootage"] = np.random.randint(800, 4000, size=num_samples)
data["Neighborhood"] = np.random.choice(["Downtown", "Suburban", "Rural"], size=num_samples)
# Years span 1950-2024 (2025 itself is never drawn).
data["YearBuilt"] = np.random.randint(1950, 2025, size=num_samples)
# Prices are uniform in [100000, 500000), rounded to 2 decimals.
data["SellingPrice"] = np.random.uniform(100000, 500000, size=num_samples).round(2)

# Show the dtype of YearBuilt: it is an integer array at this point, and an
# integer array cannot hold NaN, so it has to be converted to float before
# missing values can be injected.
data["YearBuilt"].dtype

dtype('int64')

In [8]:
# Introduce missing values.
# NaN can only be stored in a floating-point array, so both integer columns
# are converted to float first.
# NOTE: this cell mutates `data` and draws fresh random positions, so
# re-running it without re-running the generation cell stacks additional
# NaNs/outliers on top of the existing ones.
data["YearBuilt"] = data["YearBuilt"].astype(float)
data["SquareFootage"] = data["SquareFootage"].astype(float)
# replace=False picks distinct positions: 5 in SquareFootage, 3 in YearBuilt.
data["SquareFootage"][np.random.choice(num_samples, 5, replace=False)] = np.nan
data["YearBuilt"][np.random.choice(num_samples, 3, replace=False)] = np.nan

# Introduce outliers in SellingPrice: three implausibly low prices in
# [10000, 20000). They are rounded to 2 decimals so they match the precision
# of the rest of the column (the regular prices are rounded when generated).
# The values are drawn before the positions, which is the same order Python
# uses when evaluating `target[index] = value` (right-hand side first), so the
# seeded random stream is consumed exactly as before.
outlier_prices = np.round(np.random.uniform(10000, 20000, 3), 2)
data["SellingPrice"][np.random.choice(num_samples, 3, replace=False)] = outlier_prices

In [9]:
# Assemble the generated columns into a DataFrame, one row per house.
df = pd.DataFrame(data)

# Introduce inconsistent data: every house with more than 4 bedrooms gets an
# implausibly small SquareFootage (a random integer in 500-999).
# Gotcha: the assignment replaces whatever those rows held before, including
# any NaN that was injected into SquareFootage earlier.
many_bedrooms = df["Bedrooms"] > 4
df.loc[many_bedrooms, "SquareFootage"] = np.random.randint(500, 1000, many_bedrooms.sum())

# Preview the first rows of the result.
df.head()

Unnamed: 0,HouseID,Bedrooms,Bathrooms,SquareFootage,Neighborhood,YearBuilt,SellingPrice
0,HOUSE001,2,2,3526.0,Suburban,1963.0,356620.3
1,HOUSE002,5,2,922.0,Downtown,1979.0,417819.14
2,HOUSE003,3,1,912.0,Downtown,1969.0,261851.54
3,HOUSE004,2,2,3568.0,Suburban,1970.0,148262.19
4,HOUSE005,4,1,832.0,Suburban,2008.0,446589.6


In [10]:
# Write the generated dataset to disk as CSV; index=False keeps the DataFrame's
# row index out of the file.
# NOTE(review): the destination is an absolute, machine-specific path, so the
# cell only works where this directory exists. Consider a configurable
# location if the notebook is shared.
df.to_csv(
    path_or_buf="/Users/bez/Desktop/HousePricePrediction/house_price_prediction_with_anomalies2.csv",
    index=False,
)