In [None]:
from assess_property.read_data import ReadAllYears
from assess_property.preprocess import RemoveOutlier, ScaleTotalValue
from assess_property.fit_learner import Experiment

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy.stats import describe

# Read and format CSVs from disk

In [None]:
# Years to load: 2014 through 2023 (the range end is exclusive).
yrs = range(2014, 2024)

# Columns requested from every yearly file.
vnames = [
    "PID",
    "YR_REMODEL",
    "YR_BUILT",
    "LIVING_AREA",
    "LU",
    "ZIPCODE",
    "TOTAL_VALUE",
    "YEAR",
]

# The reader exposes its result as `reader.df` once `read_all_years()` has run.
reader = ReadAllYears(yrs, vnames)
reader.read_all_years()

In [None]:
# Inspect the frame produced by the reader (assuming `reader.df` is a pandas
# DataFrame, `.info()` lists columns, dtypes and non-null counts).
reader.df.info()

In [None]:
# Leave the frame as the cell's last expression so the notebook renders it with
# its rich (HTML table) display instead of the plain-text `print` form.
reader.df.head()

# Remove outliers

In [None]:
# Hand the loaded frame to the project's outlier remover; the result is read back
# from `remover.df` in the scaling step.
# NOTE(review): the outlier rule itself lives in assess_property.preprocess and is
# not visible from this notebook.
remover = RemoveOutlier(reader.df)
remover.run()

# Change unit of total value

Originally, total value is in dollars. We rescale it to units of one hundred thousand dollars.

In [None]:
# Rescale the total value on the outlier-free frame; the result is read back from
# `scaler.df` below.
# NOTE(review): presumably this is what adds the TOTAL_VALUE_IN_HUNDRED_GRAND column
# used as the response later — confirm in assess_property.preprocess.
scaler = ScaleTotalValue(remover.df)
scaler.run()

# Focus on R1

In [None]:
# Keep only the rows whose LU column equals "R1". The explicit copy makes `df`
# independent of `scaler.df`, so later edits to `df` never touch the source frame.
is_r1 = scaler.df["LU"] == "R1"
df = scaler.df.loc[is_r1].copy()
df.info()

# Cross-validation experiment

There is a clear benefit to using LightGBM over linear regression.

In [None]:
# Settings for the cross-validation experiment.
var_names = ["YR_BUILT", "YEAR", "YR_REMODEL", "LIVING_AREA"]  # predictors
response_name = "TOTAL_VALUE_IN_HUNDRED_GRAND"                 # target column
random_seed = 0
n_fold = 10

# Build the experiment on the R1-only frame and prepare it for fitting.
experiment = Experiment(
    df,
    var_names,
    response_name,
    n_fold=n_fold,
    random_seed=random_seed,
)
experiment.set_up()

## Linear regression

In [None]:
# Cross-validate the "lm" learner. The result is indexed by score name in the
# following cells (e.g. "test_neg_mean_absolute_error").
lm_cs_res = experiment.cv_fit(learner = "lm")

Average absolute error is about $\$270,000$.

In [None]:
# Scores for the linear model. The "neg_" prefix marks them as negated errors
# (the aggregation cell further down flips the sign back), so values closer to
# zero are better. Units follow the response column — presumably hundred-thousand
# dollars; confirm against the scaling step.
print(lm_cs_res["test_neg_mean_absolute_error"])

Average absolute percentage error is about $50\%$.

In [None]:
# Negated absolute-percentage-error scores for the linear model. They are
# presumably fractions rather than percentages (-0.5 ~ 50%) — confirm against the
# scorer used by Experiment.cv_fit.
print(lm_cs_res["test_neg_mean_absolute_percentage_error"])

## LightGBM

In [None]:
# Cross-validate the "lgbm" learner with the same experiment object, so it shares
# the data and settings used for the "lm" run.
lgbm_cs_res = experiment.cv_fit(learner = "lgbm")

Average absolute error is about $\$160,000$.
There is a clear benefit compared to using the linear model.

In [None]:
# Negated absolute-error scores for LightGBM; compare with the linear-model
# values printed above (closer to zero is better).
print(lgbm_cs_res["test_neg_mean_absolute_error"])

Average absolute percentage error is about $27\%$.

In [None]:
# Negated absolute-percentage-error scores for LightGBM (presumably fractions,
# not percentages — confirm against the scorer used by Experiment.cv_fit).
print(lgbm_cs_res["test_neg_mean_absolute_percentage_error"])

### Visualize errors as boxplots

In [None]:
# Gather the cross-validation results of both learners into one long-form frame
# with one row per (learner, fold, score type), ready for faceted plotting.

# Map each learner label to its results explicitly rather than looking variables
# up by name through locals(); the labels match the ones used for the single-split
# comparison later in the notebook ("lm" / "lgbm").
cv_results_by_learner = {"lm": lm_cs_res, "lgbm": lgbm_cs_res}

dflist = []
for learner_name, cv_res in cv_results_by_learner.items():
    tempdf = pd.DataFrame.from_dict(cv_res)
    tempdf["learner"] = learner_name
    tempdf["fold_idx"] = tempdf.index  # row position within one learner's results
    dflist.append(tempdf)
errdf = pd.concat(dflist, ignore_index=True)

# Melt every "test_<score>" column into rows: the part after "test_" becomes
# `score_type`, and the values land in a column named after the stub ("test").
errdf = pd.wide_to_long(
    df=errdf,
    stubnames="test",
    i=["learner", "fold_idx"],
    j="score_type",
    sep="_",
    suffix=r"\w+",
).reset_index()
errdf = errdf.rename(columns={"test": "error"})

# The scores are stored negated ("neg_..."); flip the sign to get the errors.
errdf["error"] = -errdf["error"]

# Drop the first "_"-separated token (the "neg" prefix) from the score name,
# e.g. "neg_mean_absolute_error" -> "mean_absolute_error".
errdf["error_type"] = errdf["score_type"].str.partition("_")[2]

In [None]:
sns.set(font_scale=1.0)

# One panel per error type. The x axes are not shared because the error types
# are plotted in separate panels with their own value ranges.
facet_options = dict(
    col="error_type",
    height=3,
    aspect=1.7,
    sharex=False,
)
g = sns.FacetGrid(data=errdf, **facet_options)

# Horizontal boxplots of the per-fold errors, one box per learner.
g.map_dataframe(sns.boxplot, x="error", y="learner", hue="learner")
g.add_legend()
plt.show()

# One train-test split experiment

In [None]:
# Rebuild the experiment for the single train/test split comparison. The settings
# repeat the ones used for the cross-validation experiment.
random_seed, n_fold = 0, 10
response_name = "TOTAL_VALUE_IN_HUNDRED_GRAND"
var_names = [
    "YR_BUILT",
    "YEAR",
    "YR_REMODEL",
    "LIVING_AREA",
]

experiment = Experiment(df, var_names, response_name,
                        n_fold=n_fold, random_seed=random_seed)
experiment.set_up()


## Error distribution

In [None]:
def lm_vs_lgbm(fold_idx):
    """Compare per-sample absolute percentage errors of "lm" and "lgbm" on one fold.

    Each learner is fitted through the module-level ``experiment`` object on the
    requested fold, and the absolute percentage error of every test sample is
    collected into a single long-form frame.

    Parameters
    ----------
    fold_idx : int
        Fold passed to ``experiment.fit``. (Previously ignored: fold 0 was always
        used regardless of the argument.)

    Returns
    -------
    pandas.DataFrame
        Columns ``error`` (|ypred - ytest| / max(|ytest|, eps)), ``test_idx``
        (position of the sample within its learner's test set) and ``learner``
        ("lm" or "lgbm"). The index is a fresh 0..n-1 range.
    """
    # Floor for the denominator so a zero target cannot cause a division by zero.
    epsilon = np.finfo(np.float64).eps

    per_learner = []
    for name in ["lm", "lgbm"]:
        # assumes fit returns a 4-tuple whose 2nd and 4th items are the test
        # targets and test predictions — TODO confirm against Experiment.fit
        _, ytest, _, ypred = experiment.fit(learner=name, fold_idx=fold_idx)

        # Work on flat float arrays so the arithmetic is positional, whether the
        # experiment hands back numpy arrays, lists or pandas objects.
        ytest = np.asarray(ytest, dtype=np.float64).ravel()
        ypred = np.asarray(ypred, dtype=np.float64).ravel()
        abs_pct_err = np.abs(ypred - ytest) / np.maximum(np.abs(ytest), epsilon)

        learner_df = pd.DataFrame({"error": abs_pct_err})
        learner_df["test_idx"] = learner_df.index
        learner_df["learner"] = name
        per_learner.append(learner_df)

    # ignore_index avoids duplicated index labels across the two learners;
    # the per-learner position is still available in `test_idx`.
    return pd.concat(per_learner, ignore_index=True)

In [None]:
# Per-sample errors of both learners for fold 0. This rebinds `errdf`, replacing
# the cross-validation summary frame built earlier.
errdf = lm_vs_lgbm(0)

In [None]:
# Overlaid histograms of the per-sample error, one colour per learner, drawn on a
# log-scaled axis.
hist_options = {"x": "error", "hue": "learner", "log_scale": True}
sns.histplot(data=errdf, **hist_options)
plt.show()

In [None]:
# Horizontal boxplots of the per-sample error, one box per learner.
box_options = {"x": "error", "y": "learner", "hue": "learner"}
g = sns.boxplot(data=errdf, **box_options)
g.legend()
plt.show()

In [None]:
# Mean per-sample absolute percentage error for each learner on this split.
errdf.groupby(by = "learner")["error"].mean()

## Feature importance

In [None]:
import shap