# Regression
***

# Import Packages

In [1]:
# for creating dataset (load_boston was removed in scikit-learn 1.2, so this notebook needs scikit-learn < 1.2)
from sklearn.datasets import load_boston

# general import
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error

# this package
from stack import StackModel, StackMaster

# models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor

# Create dataset

In [2]:
# Load the Boston housing data and build one DataFrame holding the
# feature columns plus the target as a trailing 'PRICE' column.
boston = load_boston()
all_df = pd.DataFrame(data=boston.data, columns=boston.feature_names).assign(
    PRICE=boston.target)

# 80% of the rows are sampled for training (fixed seed, so the split is
# reproducible); every row that was not sampled forms the holdout set.
train = all_df.sample(frac=0.8, random_state=0)
holdout = all_df.drop(index=train.index)

# Keep the holdout target separately and strip it from the test features.
test_y = holdout['PRICE']
test = holdout.drop(columns='PRICE')

# Fit stage 1 models

In [3]:
# Stage-1 learners, described as data and then turned into StackModel objects.
# Each linear model is given its own group of four feature columns.
linear_specs = [
    ('LinearRegression', LinearRegression, ['CRIM', 'ZN', 'INDUS', 'CHAS']),
    ('Ridge', Ridge, ['NOX', 'RM', 'AGE', 'DIS']),
    ('Lasso', Lasso, ['RAD', 'TAX', 'PTRATIO', 'B']),
]

# Both tree ensembles are given every feature column and the same settings.
tree_specs = [
    ('RandomForestRegressor', RandomForestRegressor),
    ('ExtraTreesRegressor', ExtraTreesRegressor),
]
tree_params = {'random_state': 0, 'n_jobs': -1, 'n_estimators': 100, 'max_depth': 3}

models_1 = [
    StackModel(model_name=name, model=estimator, x_names=columns, y_names='PRICE')
    for name, estimator, columns in linear_specs
]
models_1 += [
    StackModel(
        model_name=name,
        model=estimator,
        x_names=boston.feature_names,
        y_names='PRICE',
        # each model receives its own copy of the shared settings
        params=dict(tree_params))
    for name, estimator in tree_specs
]

# Fit every stage-1 model on the training frame.
# NOTE(review): refit=True presumably forces a fresh fit instead of reusing a
# previously pickled one (the log shows "save fit pkl") - confirm in `stack`.
master = StackMaster(models=models_1)
master.fit(train, refit=True)

2019-09-21 13:05:54,145 - stack - INFO - LinearRegression start fit
2019-09-21 13:05:54,225 - stack - INFO - LinearRegression end fit
2019-09-21 13:05:54,238 - stack - INFO - LinearRegression save fit pkl
2019-09-21 13:05:54,244 - stack - INFO - Ridge start fit
2019-09-21 13:05:54,339 - stack - INFO - Ridge end fit
2019-09-21 13:05:54,346 - stack - INFO - Ridge save fit pkl
2019-09-21 13:05:54,349 - stack - INFO - Lasso start fit
2019-09-21 13:05:54,551 - stack - INFO - Lasso end fit
2019-09-21 13:05:54,554 - stack - INFO - Lasso save fit pkl
2019-09-21 13:05:54,555 - stack - INFO - RandomForestRegressor start fit
2019-09-21 13:05:55,846 - stack - INFO - RandomForestRegressor end fit
2019-09-21 13:05:55,929 - stack - INFO - RandomForestRegressor save fit pkl
2019-09-21 13:05:55,929 - stack - INFO - ExtraTreesRegressor start fit
2019-09-21 13:05:57,207 - stack - INFO - ExtraTreesRegressor end fit
2019-09-21 13:05:57,242 - stack - INFO - ExtraTreesRegressor save fit pkl


In [4]:
# Preview the first five rows of the stage-1 predictions on the training data
# (one column per stage-1 model).
master.train_pred.head(n=5)

Unnamed: 0,LinearRegression,Ridge,Lasso,RandomForestRegressor,ExtraTreesRegressor
329,26.463475,27.33891,22.085124,24.806773,24.959569
371,17.27753,20.400825,17.734009,22.576476,21.373683
219,26.502925,21.642398,28.449293,21.442048,22.897222
403,13.508535,13.813801,19.194976,13.365689,14.505905
78,21.516007,23.795082,21.022313,21.005219,21.899043


# Fit stage 2 model

In [5]:
# Stage-2 training frame: the stage-1 predictions for the training rows placed
# side by side with the true target column.
train_2 = pd.concat([master.train_pred, train['PRICE']], axis=1)

# The stage-2 model is an XGBoost regressor whose inputs are the stage-1
# prediction columns.
model_2 = StackModel(
    model_name='XGBRegressor',
    model=XGBRegressor,
    x_names=master.train_pred.columns,
    y_names='PRICE',
    params=dict(random_state=0, n_jobs=-1, n_estimators=100, max_depth=3))

# Fit the stage-2 model on the combined frame.
model_2.fit(train_2, refit=True)

2019-09-21 13:05:59,222 - stack - INFO - XGBRegressor start fit
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \
2019-09-21 13:05:59,406 - stack - INFO - XGBRegressor end fit
2019-09-21 13:05:59,411 - stack - INFO - XGBRegressor save fit pkl




# Predict test data

In [6]:
# Order matters: the stage-1 predictions are produced first because the
# stage-2 call on the next line reads them from master.test_pred.
# NOTE(review): repredict=True presumably forces new predictions instead of
# reusing previously pickled ones (the log shows "save pred pkl") - confirm.
master.predict(test, repredict=True)
# Feed the stage-1 test predictions to the stage-2 model.
model_2.predict(master.test_pred, repredict=True)

2019-09-21 13:06:00,410 - stack - INFO - LinearRegression start predict
2019-09-21 13:06:00,424 - stack - INFO - LinearRegression end predict
2019-09-21 13:06:00,426 - stack - INFO - LinearRegression save pred pkl
2019-09-21 13:06:00,427 - stack - INFO - Ridge start predict
2019-09-21 13:06:00,440 - stack - INFO - Ridge end predict
2019-09-21 13:06:00,447 - stack - INFO - Ridge save pred pkl
2019-09-21 13:06:00,453 - stack - INFO - Lasso start predict
2019-09-21 13:06:00,473 - stack - INFO - Lasso end predict
2019-09-21 13:06:00,477 - stack - INFO - Lasso save pred pkl
2019-09-21 13:06:00,479 - stack - INFO - RandomForestRegressor start predict
2019-09-21 13:06:01,051 - stack - INFO - RandomForestRegressor end predict
2019-09-21 13:06:01,053 - stack - INFO - RandomForestRegressor save pred pkl
2019-09-21 13:06:01,054 - stack - INFO - ExtraTreesRegressor start predict
2019-09-21 13:06:01,592 - stack - INFO - ExtraTreesRegressor end predict
2019-09-21 13:06:01,594 - stack - INFO - ExtraT

In [7]:
# Report the mean absolute error on the test data for every stage-1 model and
# then for the stage-2 model, in the same order the models were defined.
stage1_names = (
    'LinearRegression',
    'Ridge',
    'Lasso',
    'RandomForestRegressor',
    'ExtraTreesRegressor',
)

# (heading, predictions) pairs: one per stage-1 column, then the stage-2 output.
mae_report = [('Stage 1 : ' + name, master.test_pred[name]) for name in stage1_names]
mae_report.append(('Stage 2 : XGBRegressor', model_2.test_pred))

for heading, predictions in mae_report:
    print(heading)
    print('                    {:.4f}'.format(mean_absolute_error(test_y, predictions)))

Stage 1 : LinearRegression
                    5.2088
Stage 1 : Ridge
                    4.0497
Stage 1 : Lasso
                    5.4300
Stage 1 : RandomForestRegressor
                    2.7109
Stage 1 : ExtraTreesRegressor
                    2.8769
Stage 2 : XGBRegressor
                    2.4902
