In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

np.random.seed(42)
locations = ['Los Angeles, CA', 'San Francisco, CA', 'Austin, TX', 'Miami, FL', 'New York, NY', 'Denver, CO', 'Chicago, IL', 'Seattle, WA', 'Atlanta, GA', 'Boston, MA']
conditions = ['Good', 'Excellent', 'Fair']
nearby_transport_options = ['Yes', 'No']

data = {
    'Location': [np.random.choice(locations) for _ in range(11000)],
    'Bedrooms': np.random.randint(1, 5, size=11000),
    'Bathrooms': np.random.randint(1, 4, size=11000),
    'Size_SqFt': np.random.normal(loc=1500, scale=300, size=11000),
    'Lot_Size_SqFt': np.random.normal(loc=5000, scale=1000, size=11000),
    'Year_Built': np.random.randint(1950, 2024, size=11000),
    'Condition': [np.random.choice(conditions) for _ in range(11000)],
    'Days_on_Market': np.random.randint(10, 90, size=11000),
    'Interest_Rate (%)': np.random.uniform(2.5, 5.0, size=11000),
    'Median_Income ($)': np.random.normal(loc=80000, scale=15000, size=11000),
    'School_Rating': np.random.randint(5, 10, size=11000),
    'Walkability_Score': np.random.randint(50, 100, size=11000),
    'Nearby_Transport': [np.random.choice(nearby_transport_options) for _ in range(11000)],
}

df = pd.DataFrame(data)

df['Sale_Price ($)'] = (100000 +
                        df['Bedrooms'] * 30000 +
                        df['Bathrooms'] * 20000 +
                        df['Size_SqFt'] * 200 +
                        df['Lot_Size_SqFt'] * 10 +
                        df['Median_Income ($)'] * 0.5 +
                        np.random.normal(0, 20000, size=11000))

df['Location'] = df['Location'].astype('category').cat.codes
df['Condition'] = df['Condition'].astype('category').cat.codes
df['Nearby_Transport'] = df['Nearby_Transport'].apply(lambda x: 1 if x == 'Yes' else 0)

df.drop(columns=['Year_Built'], inplace=True)
scaler = StandardScaler()
scaled_columns = ['Size_SqFt', 'Lot_Size_SqFt', 'Days_on_Market', 'Interest_Rate (%)', 'Median_Income ($)', 'School_Rating', 'Walkability_Score']
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

X = df.drop(columns=['Sale_Price ($)'])
y = df['Sale_Price ($)']

X_initial_train, X_new_data = X[:10000], X[10000:]
y_initial_train, y_new_data = y[:10000], y[10000:]

new_data_chunks = np.array_split(X_new_data, 10)
new_target_chunks = np.array_split(y_new_data, 10)

model = SGDRegressor(max_iter=3, warm_start=True, learning_rate='constant', eta0=0.01)
model.fit(X_initial_train, y_initial_train)

y_pred_initial = model.predict(X_initial_train)
initial_rmse = np.sqrt(mean_squared_error(y_initial_train, y_pred_initial))
print(f"Initial RMSE after first 10,000 records: {initial_rmse:.2f}")

for i, (X_new_batch, y_new_batch) in enumerate(zip(new_data_chunks, new_target_chunks), start=1):
    model.partial_fit(X_new_batch, y_new_batch)
    y_pred = model.predict(X_initial_train)
    rmse = np.sqrt(mean_squared_error(y_initial_train, y_pred))
    print(f"RMSE after adding {i * 100} new records: {rmse:.2f}")

print("Continuous learning with 100-record updates finished.")


Initial RMSE after first 10,000 records: 22557.13
RMSE after adding 100 new records: 21409.20
RMSE after adding 200 new records: 28436.20
RMSE after adding 300 new records: 22059.19
RMSE after adding 400 new records: 21883.62
RMSE after adding 500 new records: 22779.52
RMSE after adding 600 new records: 24104.19
RMSE after adding 700 new records: 23584.44
RMSE after adding 800 new records: 21138.18
RMSE after adding 900 new records: 20786.38
RMSE after adding 1000 new records: 25226.12
Continuous learning with 100-record updates finished.


  return bound(*args, **kwds)
  return bound(*args, **kwds)
