In [3]:
import json
import pickle

import numpy as np
import pandas as pd

from lightgbm import LGBMRegressor

from sklearn.pipeline import Pipeline
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [4]:
def mean_average_percentage_error(y_true, y_pred, epsilon=0.01):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    return np.mean(abs(y_true - y_pred) / (y_true + epsilon))

In [55]:
def get_metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_average_percentage_error(y_true, y_pred)
    return {"RMSE": rmse, "MAE": mae, "MAPE": mape}

In [56]:
results = pd.DataFrame(columns=["RMSE", "MAE", "MAPE"])

# Data

In [36]:
sklearn_data = load_boston()

In [37]:
print(sklearn_data["DESCR"])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [38]:
data = pd.DataFrame(sklearn_data["data"], columns=sklearn_data["feature_names"])
data["MEDV"] = sklearn_data["target"]

In [39]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [40]:
data_train, data_val = train_test_split(data, test_size=0.2, shuffle=True, random_state=10)
data_train.reset_index(drop=True, inplace=True)
data_val.reset_index(drop=True, inplace=True)

In [41]:
data_train.to_csv("../data/boston_house_prices_train.csv", index=False)
data_val.to_csv("../data/boston_house_prices_val.csv", index=False)

In [42]:
X_train = data_train.drop("MEDV", axis=1)
y_train = data_train["MEDV"]

X_val = data_val.drop("MEDV", axis=1)
y_val = data_val["MEDV"]

In [43]:
with open("../models/feature_sequence.txt", "w") as f:
    f.write(json.dumps(X_train.columns.tolist()))

# Baseline: random

In [57]:
class RandomRegressor(object):
    
    def __init__(self):
        super().__init__()
        self._max = 100
        self._min = 0
    
    def fit(self, X, y):
        self._max, self._min = np.quantile(y, (0.01, 0.99))
        return self
    
    def predict(self, X):
        np.random.seed(10)
        return np.random.rand(len(X)) * (self._max - self._min) + self._min

In [58]:
model = RandomRegressor()

In [59]:
model.fit(X_train, y_train)

<__main__.RandomRegressor at 0x1a200a9a10>

In [60]:
preds_train = model.predict(X_train)
preds_val = model.predict(X_val)

In [68]:
metrics_train = get_metrics(y_train, preds_train)
metrics_val = get_metrics(y_val, preds_val)

results.loc["Random (Baseline)"] = [metrics_val[k] for k in ["RMSE", "MAE", "MAPE"]]

In [69]:
results

Unnamed: 0,RMSE,MAE,MAPE
Random (Baseline),16.942223,14.275515,0.70892


# Mean model

In [81]:
class MeanRegressor(object):
    
    def __init__(self):
        super().__init__()
        self._mean = 0
    
    def fit(self, X, y):
        self._mean = np.mean(y)
        return self
    
    def predict(self, X):
        return np.full(len(X), self._mean)

In [82]:
model = MeanRegressor()

In [83]:
model.fit(X_train, y_train)

<__main__.MeanRegressor at 0x1a1f4efa50>

In [84]:
preds_train = model.predict(X_train)
preds_val = model.predict(X_val)

In [85]:
metrics_train = get_metrics(y_train, preds_train)
metrics_val = get_metrics(y_val, preds_val)

results.loc["Mean"] = [metrics_val[k] for k in ["RMSE", "MAE", "MAPE"]]
results

Unnamed: 0,RMSE,MAE,MAPE
Random (Baseline),16.942223,14.275515,0.70892
Mean,10.785657,7.554703,0.31322


In [86]:
with open("../models/MeanRegressor.pkl", "wb") as f:
    pickle.dump(model, f)

# Linear regression model

In [87]:
model = Pipeline([
    ("Scaler", StandardScaler()),
    ("Model", LinearRegression())
])

In [88]:
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('Scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('Model',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [89]:
preds_train = model.predict(X_train)
preds_val = model.predict(X_val)

In [90]:
metrics_train = get_metrics(y_train, preds_train)
metrics_val = get_metrics(y_val, preds_val)

results.loc["Linear Regression"] = [metrics_val[k] for k in ["RMSE", "MAE", "MAPE"]]
results

Unnamed: 0,RMSE,MAE,MAPE
Random (Baseline),16.942223,14.275515,0.70892
Mean,10.785657,7.554703,0.31322
Linear Regression,5.866342,4.061419,0.179058


In [91]:
with open("../models/LinearRegressor.pkl", "wb") as f:
    pickle.dump(model, f)

# LightGBM model

In [92]:
model = LGBMRegressor()

In [93]:
model.fit(X_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [94]:
preds_train = model.predict(X_train)
preds_val = model.predict(X_val)

In [95]:
metrics_train = get_metrics(y_train, preds_train)
metrics_val = get_metrics(y_val, preds_val)

results.loc["LightGBM"] = [metrics_val[k] for k in ["RMSE", "MAE", "MAPE"]]
results

Unnamed: 0,RMSE,MAE,MAPE
Random (Baseline),16.942223,14.275515,0.70892
Mean,10.785657,7.554703,0.31322
Linear Regression,5.866342,4.061419,0.179058
LightGBM,3.967011,2.545855,0.101984


In [34]:
with open("../models/LGBMRegressor.pkl", "wb") as f:
    pickle.dump(model, f)