In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global RNG so the synthetic dataset is reproducible.
np.random.seed(42)

# Number of synthetic listings; every column below is drawn with this length.
N_SAMPLES = 10000

locations = [
    'Los Angeles, CA', 'San Francisco, CA', 'Austin, TX', 'Miami, FL',
    'New York, NY', 'Denver, CO', 'Chicago, IL', 'Seattle, WA',
    'Atlanta, GA', 'Boston, MA',
]
conditions = ['Good', 'Excellent', 'Fair']
nearby_transport_options = ['Yes', 'No']

# Column order matters: each entry consumes the global random stream in turn,
# so reordering the keys changes the generated values.
# The categorical columns use one vectorised `choice(..., size=...)` draw
# instead of one Python-level call per row.
# `randint` excludes its upper bound (e.g. Bedrooms takes values 1..4).
data = {
    'Location': np.random.choice(locations, size=N_SAMPLES),
    'Bedrooms': np.random.randint(1, 5, size=N_SAMPLES),
    'Bathrooms': np.random.randint(1, 4, size=N_SAMPLES),
    'Size_SqFt': np.random.normal(loc=1500, scale=300, size=N_SAMPLES),
    'Lot_Size_SqFt': np.random.normal(loc=5000, scale=1000, size=N_SAMPLES),
    'Year_Built': np.random.randint(1950, 2024, size=N_SAMPLES),
    'Condition': np.random.choice(conditions, size=N_SAMPLES),
    'Days_on_Market': np.random.randint(10, 90, size=N_SAMPLES),
    'Interest_Rate (%)': np.random.uniform(2.5, 5.0, size=N_SAMPLES),
    'Median_Income ($)': np.random.normal(loc=80000, scale=15000, size=N_SAMPLES),
    'School_Rating': np.random.randint(5, 10, size=N_SAMPLES),
    'Walkability_Score': np.random.randint(50, 100, size=N_SAMPLES),
    'Nearby_Transport': np.random.choice(nearby_transport_options, size=N_SAMPLES),
}

df = pd.DataFrame(data)

# Deliberately make the sale price an (approximately) linear function of a few
# features so that the linear regression below yields meaningful results.
# Price = fixed base + per-feature contributions + Gaussian noise.
# The noise vector is sized from the frame itself so it always matches the
# number of rows rather than relying on a repeated literal.
df['Sale_Price ($)'] = (100000 +
                        df['Bedrooms'] * 30000 +
                        df['Bathrooms'] * 20000 +
                        df['Size_SqFt'] * 200 +
                        df['Lot_Size_SqFt'] * 10 +
                        df['Median_Income ($)'] * 0.5 +
                        np.random.normal(0, 20000, size=len(df)))

# Integer-encode the two multi-level categorical columns. `cat.codes` numbers
# the categories in sorted order, so the codes carry no real-world ranking.
# NOTE(review): a linear model treats these codes as numeric magnitudes;
# one-hot encoding is the usual choice if these columns are meant to be real
# predictors -- confirm intent before changing the schema.
df['Location'] = df['Location'].astype('category').cat.codes
df['Condition'] = df['Condition'].astype('category').cat.codes

# Yes/No -> 1/0 via a vectorised comparison instead of a per-row lambda.
df['Nearby_Transport'] = (df['Nearby_Transport'] == 'Yes').astype('int64')

# Year_Built is excluded from the model's features.
df = df.drop(columns=['Year_Built'])

# Separate predictors from the target before any preprocessing.
X = df.drop(columns=['Sale_Price ($)'])
y = df['Sale_Price ($)']

# Split first: the held-out rows must not influence the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Take explicit copies so the scaled values are written to independent frames
# (avoids chained-assignment warnings on the split results).
X_train = X_train.copy()
X_test = X_test.copy()

# Standardise the continuous / wide-range features. The scaler learns its mean
# and variance from the training rows only and the test rows are merely
# transformed; fitting on the full dataset would leak test-set statistics.
# Note: `df` and `X` are left unscaled; only X_train / X_test are standardised.
scaled_columns = ['Size_SqFt', 'Lot_Size_SqFt', 'Days_on_Market', 'Interest_Rate (%)', 'Median_Income ($)', 'School_Rating', 'Walkability_Score']
scaler = StandardScaler()
X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

# Fit an ordinary least-squares model on the training split
# (`fit` returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train, y_train)

# Score on the held-out split: RMSE is the square root of the mean squared
# error, expressed in the same units as the target.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error (RMSE): {rmse}")


Root Mean Squared Error (RMSE): 19294.26047974623
