# Model Experiments

In [1]:
import numpy as np
import pandas as pd
from typing import List
from typing import Tuple
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_absolute_error

In [2]:
SALES_PATH = "data/kc_house_data.csv"
DEMOGRAPHICS_PATH = "data/kc_house_data.csv"
SALES_COLUMN_SELECTION = [
    'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'sqft_above', 'sqft_basement', 'zipcode'
]

In [3]:
def load_data(
    sales_path: str, demographics_path: str, sales_column_selection: List[str]
) -> Tuple[pd.DataFrame, pd.Series]:
    """Load the target and feature data by merging sales and demographics.

    Args:
        sales_path: path to CSV file with home sale data
        demographics_path: path to CSV file with home sale data
        sales_column_selection: list of columns from sales data to be used as
            features

    Returns:
        Tuple containg with two elements: a DataFrame and a Series of the same
        length.  The DataFrame contains features for machine learning, the
        series contains the target variable (home sale price).

    """
    data = pd.read_csv(sales_path,
                           usecols=sales_column_selection,
                           dtype={'zipcode': str})
    demographics = pd.read_csv("data/zipcode_demographics.csv",
                                   dtype={'zipcode': str})

    merged_data = data.merge(demographics, how="left",
                             on="zipcode").drop(columns="zipcode")
    # Remove the target variable from the dataframe, features will remain
    y = merged_data.pop('price')
    x = merged_data

    return x, y

In [4]:
# Build the feature matrix and price target from the configured CSV paths.
x, y = load_data(
    sales_path=SALES_PATH,
    demographics_path=DEMOGRAPHICS_PATH,
    sales_column_selection=SALES_COLUMN_SELECTION,
)

In [5]:
# A fixed random_state keeps the train/test partition identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
)

In [6]:
# Baseline: robust-scaled k-nearest-neighbours regressor, scored here on the
# same rows it was fitted on (training-set metrics).
pipe = make_pipeline(RobustScaler(), KNeighborsRegressor())
pipe.fit(x_train, y_train)
pred = pipe.predict(x_train)
r2, mae, mape = (
    score(y_train, pred)
    for score in (r2_score, mean_absolute_error, mean_absolute_percentage_error)
)
print(f"""
    Train Summary KNN:
    R2    - {round(r2, 4)}
    MAE   - {round(mae, 4)}
    MAPE  - {round(mape, 4)}
    """)


    Train Summary KNN:
    R2    - 0.8414
    MAE   - 76232.2497
    MAPE  - 0.1408
    


In [7]:
# Score the pipeline fitted in the previous cell on the held-out test rows.
pred = pipe.predict(x_test)
r2, mae, mape = (
    score(y_test, pred)
    for score in (r2_score, mean_absolute_error, mean_absolute_percentage_error)
)
print(f"""
    Test Summary KNN:
    R2    - {round(r2, 4)}
    MAE   - {round(mae, 4)}
    MAPE  - {round(mape, 4)}
    """)


    Test Summary KNN:
    R2    - 0.7281
    MAE   - 102044.6962
    MAPE  - 0.179
    


In [8]:
# Fit a robust-scaled random-forest regressor and score it on its own
# training rows. random_state pins the forest's internal randomness so the
# metrics printed below are reproducible from run to run (same seed value as
# the train/test split).
pipe = make_pipeline(
    RobustScaler(), RandomForestRegressor(random_state=42)
).fit(x_train, y_train)
pred = pipe.predict(x_train)
r2 = r2_score(y_true=y_train, y_pred=pred)
mae = mean_absolute_error(y_true=y_train, y_pred=pred)
mape = mean_absolute_percentage_error(y_true=y_train, y_pred=pred)
print(f"""
    Train Summary RandomForest:
    R2    - {round(r2, 4)}
    MAE   - {round(mae, 4)}
    MAPE  - {round(mape, 4)}
    """)


    Train Summary RandomForest:
    R2    - 0.972
    MAE   - 32923.7664
    MAPE  - 0.0623
    


In [9]:
# Score the pipeline fitted in the previous cell on the held-out test rows.
pred = pipe.predict(x_test)
r2, mae, mape = (
    score(y_test, pred)
    for score in (r2_score, mean_absolute_error, mean_absolute_percentage_error)
)
print(f"""
    Test Summary RandomForest:
    R2    - {round(r2, 4)}
    MAE   - {round(mae, 4)}
    MAPE  - {round(mape, 4)}
    """)


    Test Summary RandomForest:
    R2    - 0.7873
    MAE   - 93513.0673
    MAPE  - 0.1692
    


---

# Model Unseen Data Prediction

In [10]:
!pip install requests



In [11]:
import pandas as pd
import requests
import pickle
import json

In [12]:
# Column list used below to select the model's inputs from a merged frame.
# The context manager closes the file handle as soon as the JSON is parsed.
with open("model/model_features.json", "r") as features_file:
    features = json.load(features_file)

In [13]:
# Listings the model will be queried with; the displayed frame has no
# `price` column.
df = pd.read_csv(filepath_or_buffer="data/future_unseen_examples.csv")
df

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,4,1.00,1680,5043,1.5,0,0,4,6,1680,0,1911,0,98118,47.5354,-122.273,1560,5765
1,3,2.50,2220,6380,1.5,0,0,4,8,1660,560,1931,0,98115,47.6974,-122.313,950,6380
2,3,2.25,1630,10962,1.0,0,0,4,8,1100,530,1977,0,98030,47.3801,-122.166,1830,8470
3,5,2.50,1710,9720,2.0,0,0,4,8,1710,0,1974,0,98005,47.5903,-122.157,2270,9672
4,2,1.00,850,6370,1.0,0,0,3,6,850,0,1951,0,98126,47.5198,-122.373,850,5170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3,2.50,2430,54059,2.0,0,0,3,10,2430,0,1987,0,98027,47.4664,-121.992,2910,49658
96,2,2.50,1240,1249,3.0,0,0,3,8,1240,0,2006,0,98107,47.6718,-122.386,1240,2500
97,4,1.75,1860,9750,1.0,0,0,3,7,1460,400,1969,0,98034,47.7097,-122.202,1900,8913
98,5,1.75,2330,3800,1.5,0,0,3,7,1360,970,1927,0,98115,47.6835,-122.308,2100,3800


In [14]:
# Sales data with every column (no usecols filter), including `price`.
train = pd.read_csv(filepath_or_buffer="data/kc_house_data.csv")
train

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.00,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.00,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.00,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3,2.50,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,20150223T000000,400000.0,4,2.50,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,20140623T000000,402101.0,2,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,20150116T000000,400000.0,3,2.50,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


In [15]:
# Per-zipcode demographics, merged onto listings on the `zipcode` column in
# the prediction cells below. No dtype is forced here, matching how `df` and
# `train` are read in this section.
zipcode = pd.read_csv(filepath_or_buffer="data/zipcode_demographics.csv")
zipcode

Unnamed: 0,ppltn_qty,urbn_ppltn_qty,sbrbn_ppltn_qty,farm_ppltn_qty,non_farm_qty,medn_hshld_incm_amt,medn_incm_per_prsn_amt,hous_val_amt,edctn_less_than_9_qty,edctn_9_12_qty,...,per_farm,per_non_farm,per_less_than_9,per_9_to_12,per_hsd,per_some_clg,per_assoc,per_bchlr,per_prfsnl,zipcode
0,38249.0,37394.0,0.0,0.0,855.0,66051.0,25219.0,192000.0,437.0,2301.0,...,0.0,2.0,1.0,6.0,18.0,20.0,5.0,12.0,4.0,98042
1,22036.0,22036.0,0.0,0.0,0.0,91904.0,53799.0,573900.0,149.0,404.0,...,0.0,0.0,0.0,1.0,6.0,12.0,3.0,27.0,22.0,98040
2,18194.0,18194.0,0.0,0.0,0.0,61813.0,31765.0,246600.0,269.0,905.0,...,0.0,0.0,1.0,4.0,13.0,20.0,6.0,19.0,9.0,98028
3,21956.0,21956.0,0.0,0.0,0.0,47461.0,22158.0,175400.0,925.0,1773.0,...,0.0,0.0,4.0,8.0,20.0,21.0,5.0,12.0,4.0,98178
4,22814.0,22814.0,0.0,0.0,0.0,48606.0,28398.0,252600.0,599.0,1148.0,...,0.0,0.0,2.0,5.0,13.0,17.0,5.0,23.0,12.0,98007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,35140.0,35021.0,0.0,0.0,119.0,81929.0,41856.0,335900.0,212.0,865.0,...,0.0,0.0,0.0,2.0,8.0,15.0,4.0,27.0,15.0,98006
66,23926.5,23298.0,0.0,0.0,0.0,56933.0,27639.5,239850.0,406.0,1213.0,...,0.0,0.0,1.0,5.0,15.0,19.0,5.0,19.0,7.5,98074
67,23926.5,23298.0,0.0,0.0,0.0,56933.0,27639.5,239850.0,406.0,1213.0,...,0.0,0.0,1.0,5.0,15.0,19.0,5.0,19.0,7.5,98077
68,23926.5,23298.0,0.0,0.0,0.0,56933.0,27639.5,239850.0,406.0,1213.0,...,0.0,0.0,1.0,5.0,15.0,19.0,5.0,19.0,7.5,98030


In [16]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load model artifacts from a trusted source.
# The context manager closes the file handle once the model is deserialized.
with open("model/model.pkl", 'rb') as model_file:
    model = pickle.load(model_file)
model

In [17]:
# Ask the service which input features a caller must supply.
# NOTE(review): this GET carries the first listing as a JSON body; the recorded
# response does not look like it depends on that body. Confirm whether the
# endpoint needs it.
first_listing = df.iloc[0].to_dict()
response = requests.get("http://localhost:8000/features/required", json=first_listing)
print(
    f"Status Code: {response.status_code}",
    f"Response Text: {response.text}",
    sep="\n",
)

Status Code: 200
Response Text: {"required_features":["bedrooms","bathrooms","sqft_living","sqft_lot","floors","sqft_above","sqft_basement","zipcode"],"description":"Features that must be provided by the user for prediction (excludes zipcode demographic features)"}


In [18]:
# Local reference prediction: attach zipcode demographics to the first sales
# row, keep only the model's feature columns, and predict with the pickled model.
first_sale = train.iloc[[0]]
first_sale_inputs = first_sale.merge(zipcode, how="left", on="zipcode")[features]
model.predict(first_sale_inputs)

array([239141.6])

In [19]:
# Send the same sales row (all of its columns) to the API so the served
# prediction can be compared with the local one above.
first_sale_payload = train.iloc[0].to_dict()
response = requests.post("http://localhost:8000/predict", json=first_sale_payload)
print(
    f"Status Code: {response.status_code}",
    f"Response Text: {response.text}",
    sep="\n",
)

Status Code: 200
Response Text: {"predicted_price":239141.6,"metadata":{"prediction_id":"2061685e","timestamp":"2025-10-24T17:57:28.547625","processing_time_ms":67.80147552490234,"model_name":"KNeighborsRegressor"},"features_used":["bedrooms","bathrooms","sqft_living","sqft_lot","floors","sqft_above","sqft_basement","ppltn_qty","urbn_ppltn_qty","sbrbn_ppltn_qty","farm_ppltn_qty","non_farm_qty","medn_hshld_incm_amt","medn_incm_per_prsn_amt","hous_val_amt","edctn_less_than_9_qty","edctn_9_12_qty","edctn_high_schl_qty","edctn_some_clg_qty","edctn_assoc_dgre_qty","edctn_bchlr_dgre_qty","edctn_prfsnl_qty","per_urbn","per_sbrbn","per_farm","per_non_farm","per_less_than_9","per_9_to_12","per_hsd","per_some_clg","per_assoc","per_bchlr","per_prfsnl"]}


In [20]:
# Local reference prediction for the first unseen listing: attach zipcode
# demographics, keep only the model's feature columns, then predict.
first_unseen = df.iloc[[0]]
first_unseen_inputs = first_unseen.merge(zipcode, how="left", on="zipcode")[features]
model.predict(first_unseen_inputs)

array([458520.])

In [30]:
# Re-display the unseen listings before they are sent to the API.
# NOTE(review): this cell's execution count (30) is out of sequence with its
# neighbours; re-run with Restart & Run All before sharing.
df

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,4,1.00,1680,5043,1.5,0,0,4,6,1680,0,1911,0,98118,47.5354,-122.273,1560,5765
1,3,2.50,2220,6380,1.5,0,0,4,8,1660,560,1931,0,98115,47.6974,-122.313,950,6380
2,3,2.25,1630,10962,1.0,0,0,4,8,1100,530,1977,0,98030,47.3801,-122.166,1830,8470
3,5,2.50,1710,9720,2.0,0,0,4,8,1710,0,1974,0,98005,47.5903,-122.157,2270,9672
4,2,1.00,850,6370,1.0,0,0,3,6,850,0,1951,0,98126,47.5198,-122.373,850,5170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3,2.50,2430,54059,2.0,0,0,3,10,2430,0,1987,0,98027,47.4664,-121.992,2910,49658
96,2,2.50,1240,1249,3.0,0,0,3,8,1240,0,2006,0,98107,47.6718,-122.386,1240,2500
97,4,1.75,1860,9750,1.0,0,0,3,7,1460,400,1969,0,98034,47.7097,-122.202,1900,8913
98,5,1.75,2330,3800,1.5,0,0,3,7,1360,970,1927,0,98115,47.6835,-122.308,2100,3800


In [21]:
# Predict the first unseen listing through the API, sending every column of the row.
first_unseen_payload = df.iloc[0].to_dict()
response = requests.post("http://localhost:8000/predict", json=first_unseen_payload)
print(
    f"Status Code: {response.status_code}",
    f"Response Text: {response.text}",
    sep="\n",
)

Status Code: 200
Response Text: {"predicted_price":458520.0,"metadata":{"prediction_id":"89272001","timestamp":"2025-10-24T17:57:28.572384","processing_time_ms":6.589651107788086,"model_name":"KNeighborsRegressor"},"features_used":["bedrooms","bathrooms","sqft_living","sqft_lot","floors","sqft_above","sqft_basement","ppltn_qty","urbn_ppltn_qty","sbrbn_ppltn_qty","farm_ppltn_qty","non_farm_qty","medn_hshld_incm_amt","medn_incm_per_prsn_amt","hous_val_amt","edctn_less_than_9_qty","edctn_9_12_qty","edctn_high_schl_qty","edctn_some_clg_qty","edctn_assoc_dgre_qty","edctn_bchlr_dgre_qty","edctn_prfsnl_qty","per_urbn","per_sbrbn","per_farm","per_non_farm","per_less_than_9","per_9_to_12","per_hsd","per_some_clg","per_assoc","per_bchlr","per_prfsnl"]}


In [22]:
# Same listing, but sending only the columns the service reported as required.
required_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'sqft_above', 'sqft_basement', 'zipcode',
]
minimal_payload = df[required_columns].iloc[0].to_dict()
response = requests.post("http://localhost:8000/predict", json=minimal_payload)
print(
    f"Status Code: {response.status_code}",
    f"Response Text: {response.text}",
    sep="\n",
)

Status Code: 200
Response Text: {"predicted_price":458520.0,"metadata":{"prediction_id":"242c7a20","timestamp":"2025-10-24T17:57:28.588374","processing_time_ms":4.784345626831055,"model_name":"KNeighborsRegressor"},"features_used":["bedrooms","bathrooms","sqft_living","sqft_lot","floors","sqft_above","sqft_basement","ppltn_qty","urbn_ppltn_qty","sbrbn_ppltn_qty","farm_ppltn_qty","non_farm_qty","medn_hshld_incm_amt","medn_incm_per_prsn_amt","hous_val_amt","edctn_less_than_9_qty","edctn_9_12_qty","edctn_high_schl_qty","edctn_some_clg_qty","edctn_assoc_dgre_qty","edctn_bchlr_dgre_qty","edctn_prfsnl_qty","per_urbn","per_sbrbn","per_farm","per_non_farm","per_less_than_9","per_9_to_12","per_hsd","per_some_clg","per_assoc","per_bchlr","per_prfsnl"]}


In [23]:
# Batch prediction: one record per unseen listing, with every column included.
batch_payload = df.to_dict(orient="records")
response = requests.post("http://localhost:8000/predict-batch", json=batch_payload)
print(
    f"Status Code: {response.status_code}",
    f"Response Text: {response.text}",
    sep="\n",
)

Status Code: 200
Response Text: {"predictions":[{"predicted_price":458520.0,"metadata":{"prediction_id":"83cfb25a","timestamp":"2025-10-24T17:57:28.949820","processing_time_ms":6.686687469482422,"model_name":"KNeighborsRegressor"},"features_used":["bedrooms","bathrooms","sqft_living","sqft_lot","floors","sqft_above","sqft_basement","ppltn_qty","urbn_ppltn_qty","sbrbn_ppltn_qty","farm_ppltn_qty","non_farm_qty","medn_hshld_incm_amt","medn_incm_per_prsn_amt","hous_val_amt","edctn_less_than_9_qty","edctn_9_12_qty","edctn_high_schl_qty","edctn_some_clg_qty","edctn_assoc_dgre_qty","edctn_bchlr_dgre_qty","edctn_prfsnl_qty","per_urbn","per_sbrbn","per_farm","per_non_farm","per_less_than_9","per_9_to_12","per_hsd","per_some_clg","per_assoc","per_bchlr","per_prfsnl"]},{"predicted_price":612800.0,"metadata":{"prediction_id":"1594c4c9","timestamp":"2025-10-24T17:57:28.949837","processing_time_ms":6.007671356201172,"model_name":"KNeighborsRegressor"},"features_used":["bedrooms","bathrooms","sqft_li

In [24]:
# Batch prediction again, restricted to the columns the service reported as required.
required_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'sqft_above', 'sqft_basement', 'zipcode',
]
minimal_batch_payload = df[required_columns].to_dict(orient="records")
response = requests.post("http://localhost:8000/predict-batch", json=minimal_batch_payload)
print(
    f"Status Code: {response.status_code}",
    f"Response Text: {response.text}",
    sep="\n",
)

Status Code: 200
Response Text: {"predictions":[{"predicted_price":458520.0,"metadata":{"prediction_id":"0cbee4ac","timestamp":"2025-10-24T17:57:29.305102","processing_time_ms":6.556510925292969,"model_name":"KNeighborsRegressor"},"features_used":["bedrooms","bathrooms","sqft_living","sqft_lot","floors","sqft_above","sqft_basement","ppltn_qty","urbn_ppltn_qty","sbrbn_ppltn_qty","farm_ppltn_qty","non_farm_qty","medn_hshld_incm_amt","medn_incm_per_prsn_amt","hous_val_amt","edctn_less_than_9_qty","edctn_9_12_qty","edctn_high_schl_qty","edctn_some_clg_qty","edctn_assoc_dgre_qty","edctn_bchlr_dgre_qty","edctn_prfsnl_qty","per_urbn","per_sbrbn","per_farm","per_non_farm","per_less_than_9","per_9_to_12","per_hsd","per_some_clg","per_assoc","per_bchlr","per_prfsnl"]},{"predicted_price":612800.0,"metadata":{"prediction_id":"9a311891","timestamp":"2025-10-24T17:57:29.305118","processing_time_ms":4.347562789916992,"model_name":"KNeighborsRegressor"},"features_used":["bedrooms","bathrooms","sqft_li

In [25]:
# Call the service's /retrain-model endpoint (no request body); the bare
# `response` on the last line displays the resulting status.
retrain_url = "http://localhost:8000/retrain-model"
response = requests.post(retrain_url)
response

<Response [200]>

In [26]:
# Repeat the single-listing prediction after the retrain call; `model_name`
# in the response shows which model served it.
first_unseen_payload = df.iloc[0].to_dict()
response = requests.post("http://localhost:8000/predict", json=first_unseen_payload)
print(
    f"Status Code: {response.status_code}",
    f"Response Text: {response.text}",
    sep="\n",
)

Status Code: 200
Response Text: {"predicted_price":402015.5,"metadata":{"prediction_id":"35d00953","timestamp":"2025-10-24T17:57:38.496700","processing_time_ms":4.944562911987305,"model_name":"RandomForestRegressor"},"features_used":["bedrooms","bathrooms","sqft_living","sqft_lot","floors","sqft_above","sqft_basement","ppltn_qty","urbn_ppltn_qty","sbrbn_ppltn_qty","farm_ppltn_qty","non_farm_qty","medn_hshld_incm_amt","medn_incm_per_prsn_amt","hous_val_amt","edctn_less_than_9_qty","edctn_9_12_qty","edctn_high_schl_qty","edctn_some_clg_qty","edctn_assoc_dgre_qty","edctn_bchlr_dgre_qty","edctn_prfsnl_qty","per_urbn","per_sbrbn","per_farm","per_non_farm","per_less_than_9","per_9_to_12","per_hsd","per_some_clg","per_assoc","per_bchlr","per_prfsnl"]}


In [27]:
# Call the service's /reload-model endpoint (no request body); the bare
# `response` on the last line displays the resulting status.
reload_url = "http://localhost:8000/reload-model"
response = requests.post(reload_url)
response

<Response [200]>

In [28]:
# Call the service's /rollback-model endpoint (no request body); the bare
# `response` on the last line displays the resulting status.
rollback_url = "http://localhost:8000/rollback-model"
response = requests.post(rollback_url)
response

<Response [200]>

In [29]:
# Repeat the single-listing prediction after the rollback call; `model_name`
# in the response shows which model served it.
first_unseen_payload = df.iloc[0].to_dict()
response = requests.post("http://localhost:8000/predict", json=first_unseen_payload)
print(
    f"Status Code: {response.status_code}",
    f"Response Text: {response.text}",
    sep="\n",
)

Status Code: 200
Response Text: {"predicted_price":458520.0,"metadata":{"prediction_id":"692012c7","timestamp":"2025-10-24T17:57:38.674618","processing_time_ms":9.67860221862793,"model_name":"KNeighborsRegressor"},"features_used":["bedrooms","bathrooms","sqft_living","sqft_lot","floors","sqft_above","sqft_basement","ppltn_qty","urbn_ppltn_qty","sbrbn_ppltn_qty","farm_ppltn_qty","non_farm_qty","medn_hshld_incm_amt","medn_incm_per_prsn_amt","hous_val_amt","edctn_less_than_9_qty","edctn_9_12_qty","edctn_high_schl_qty","edctn_some_clg_qty","edctn_assoc_dgre_qty","edctn_bchlr_dgre_qty","edctn_prfsnl_qty","per_urbn","per_sbrbn","per_farm","per_non_farm","per_less_than_9","per_9_to_12","per_hsd","per_some_clg","per_assoc","per_bchlr","per_prfsnl"]}
