# PyCaret Regression — King County House Sales

Dataset: `kc_house_data.csv`. Target: `price`.

## Environment
Using Python 3.11 with `pycaret==3.3.0` inside the shared `.venv`. Notebook assumes data CSVs are present in `../data/` and GPU is available; if not, PyCaret will fall back to CPU.

In [1]:
import pandas as pd
from pycaret.regression import *

csv_path = "../data/kc_house_data.csv"
df = pd.read_csv(csv_path)
print(df.head())

           id             date     price  bedrooms  bathrooms  sqft_living  \
0  7129300520  20141013T000000  221900.0         3       1.00         1180   
1  6414100192  20141209T000000  538000.0         3       2.25         2570   
2  5631500400  20150225T000000  180000.0         2       1.00          770   
3  2487200875  20141209T000000  604000.0         4       3.00         1960   
4  1954400510  20150218T000000  510000.0         3       2.00         1680   

   sqft_lot  floors  waterfront  view  ...  grade  sqft_above  sqft_basement  \
0      5650     1.0           0     0  ...      7        1180              0   
1      7242     2.0           0     0  ...      7        2170            400   
2     10000     1.0           0     0  ...      6         770              0   
3      5000     1.0           0     0  ...      7        1050            910   
4      8080     1.0           0     0  ...      8        1680              0   

   yr_built  yr_renovated  zipcode      lat     lo

In [2]:
s = setup(
    data=df,
    target='price',
    session_id=42,
    train_size=0.8,
    fold=5,
    normalize=True,
    use_gpu=False,
    log_experiment=True,
    experiment_name='pycaret_regression_kc_house',
)
compare_models();

Unnamed: 0,Description,Value
0,Session id,42
1,Target,price
2,Target type,Regression
3,Original data shape,"(21613, 21)"
4,Transformed data shape,"(21613, 21)"
5,Transformed train set shape,"(17290, 21)"
6,Transformed test set shape,"(4323, 21)"
7,Numeric features,19
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,62611.5346,12912707259.2779,113122.6126,0.9019,0.1634,0.121,0.904
xgboost,Extreme Gradient Boosting,68128.3922,14505757696.0,120279.5156,0.8885,0.1752,0.1305,0.356
et,Extra Trees Regressor,68985.4548,15153357492.8279,122878.196,0.884,0.1794,0.1321,1.076
lightgbm,Light Gradient Boosting Machine,67426.5954,15556568146.6924,123944.6944,0.8821,0.1728,0.129,0.39
rf,Random Forest Regressor,69753.6413,16480298905.786,128080.7795,0.8741,0.1789,0.1321,2.734
gbr,Gradient Boosting Regressor,77166.736,17441894753.2918,131934.9131,0.8661,0.1925,0.1488,0.794
knn,K Neighbors Regressor,95018.1547,30035138969.6,172634.85,0.7716,0.2317,0.1775,0.114
dt,Decision Tree Regressor,101885.1947,36463421114.3718,190634.4087,0.7197,0.2556,0.1894,0.13
lar,Least Angle Regression,125628.7529,39783281886.8975,198998.1793,0.6963,0.4157,0.2571,0.018
lasso,Lasso Regression,125628.5957,39783246329.8266,198998.0939,0.6963,0.4151,0.2571,1.192


In [3]:
best = compare_models()
tuned = tune_model(best)
plot_model(tuned, plot='residuals', save=True)
plot_model(tuned, plot='feature', save=True)
plot_model(tuned, plot='error', save=True)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,62611.5346,12912707259.2779,113122.6126,0.9019,0.1634,0.121,0.844
xgboost,Extreme Gradient Boosting,68128.3922,14505757696.0,120279.5156,0.8885,0.1752,0.1305,0.328
et,Extra Trees Regressor,68985.4548,15153357492.8279,122878.196,0.884,0.1794,0.1321,0.72
lightgbm,Light Gradient Boosting Machine,67426.5954,15556568146.6924,123944.6944,0.8821,0.1728,0.129,0.404
rf,Random Forest Regressor,69753.6413,16480298905.786,128080.7795,0.8741,0.1789,0.1321,1.83
gbr,Gradient Boosting Regressor,77166.736,17441894753.2918,131934.9131,0.8661,0.1925,0.1488,0.636
knn,K Neighbors Regressor,95018.1547,30035138969.6,172634.85,0.7716,0.2317,0.1775,0.04
dt,Decision Tree Regressor,101885.1947,36463421114.3718,190634.4087,0.7197,0.2556,0.1894,0.048
lar,Least Angle Regression,125628.7529,39783281886.8975,198998.1793,0.6963,0.4157,0.2571,0.012
lasso,Lasso Regression,125628.5957,39783246329.8266,198998.0939,0.6963,0.4151,0.2571,0.09


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,80356.8235,21013810473.1231,144961.4103,0.8631,0.2002,0.1548
1,77269.1745,16138133161.0463,127035.9522,0.8797,0.1967,0.151
2,77850.165,16073300568.5668,126780.5213,0.8769,0.1948,0.1514
3,74963.6277,14434442262.7849,120143.4237,0.8825,0.1907,0.1469
4,75286.3371,14402315347.1422,120009.6469,0.8716,0.1907,0.1477
Mean,77145.2255,16412400362.5327,127786.1909,0.8748,0.1946,0.1504
Std,1951.4866,2421402328.7086,9115.3598,0.0069,0.0036,0.0029


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


'Prediction Error.png'

In [4]:
final_best = finalize_model(tuned)
preds = predict_model(final_best)
print(preds[['prediction_label']].head())
save_model(final_best, 'regression_kc_house_model')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,52572.2553,6059746036.1977,77844.3706,0.9599,0.1432,0.1065


       prediction_label
735        3.893285e+05
2830       9.415220e+05
4106       1.139526e+06
16218      1.642343e+06
19964      7.469582e+05
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['id', 'bedrooms', 'bathrooms',
                                              'sqft_living', 'sqft_lot',
                                              'floors', 'waterfront', 'view',
                                              'condition', 'grade', 'sqft_above',
                                              'sqft_basement', 'yr_built',
                                              'yr_renovated', 'zipcode', 'lat',
                                              'long', 'sqft_living15',
                                              'sqft_lot15'],
                                     transformer=SimpleImputer())),
                 ('categorical_im...
                  TransformerWrapper(include=['date'],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('rest_encoding',
                  TransformerWrapper(