# Regression
***

# Import Packages

In [1]:
# for creating dataset
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# general import
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error

# this package
from stack import StackModel, StackMaster

# models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor

# Create dataset

In [2]:
# Load the Boston housing data as plain arrays: feature matrix and target vector.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell presumably needs an older scikit-learn - confirm the pinned version.
boston = load_boston()
X = boston.data
y = boston.target

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
# As in a typical machine learning task we work with X_train, y_train and X_test;
# y_test is kept only to score the predictions at the end of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Wrap the arrays in pandas objects: features become DataFrames with named
# columns, targets become Series named 'PRICE'.
X_train, X_test = (
    pd.DataFrame(part, columns=boston.feature_names) for part in (X_train, X_test)
)
y_train, y_test = (pd.Series(part, name='PRICE') for part in (y_train, y_test))

# Fit Stage 1 models

In [3]:
# Hyper-parameters shared by the two tree ensembles; every model receives its
# own copy of the dict so the models never share a mutable object.
tree_params = {'random_state': 0, 'n_jobs': -1, 'n_estimators': 100, 'max_depth': 3}

# Stage-1 models. Each linear model is given its own group of four feature
# columns via x_names; the tree ensembles are given no x_names
# (presumably they then use every column - confirm against StackModel).
models_1 = [
    StackModel(model_name='LinearRegression', model=LinearRegression,
               x_names=['CRIM', 'ZN', 'INDUS', 'CHAS']),
    StackModel(model_name='Ridge', model=Ridge,
               x_names=['NOX', 'RM', 'AGE', 'DIS']),
    StackModel(model_name='Lasso', model=Lasso,
               x_names=['RAD', 'TAX', 'PTRATIO', 'B']),
    StackModel(model_name='RandomForestRegressor', model=RandomForestRegressor,
               params=dict(tree_params)),
    StackModel(model_name='ExtraTreesRegressor', model=ExtraTreesRegressor,
               params=dict(tree_params)),
]

# Fit all stage-1 models on the training data through a single StackMaster.
# NOTE(review): refit=True presumably forces a fresh fit instead of reusing a
# previously saved pickle (the log reports "save fit pkl") - confirm.
master = StackMaster(models=models_1)
master.fit(X_train, y_train, refit=True)

2019-09-21 18:14:41,559 - stack - INFO - LinearRegression start fit
2019-09-21 18:14:41,613 - stack - INFO - LinearRegression end fit
2019-09-21 18:14:41,615 - stack - INFO - LinearRegression save fit pkl
2019-09-21 18:14:41,627 - stack - INFO - Ridge start fit
2019-09-21 18:14:41,699 - stack - INFO - Ridge end fit
2019-09-21 18:14:41,708 - stack - INFO - Ridge save fit pkl
2019-09-21 18:14:41,711 - stack - INFO - Lasso start fit
2019-09-21 18:14:41,818 - stack - INFO - Lasso end fit
2019-09-21 18:14:41,836 - stack - INFO - Lasso save fit pkl
2019-09-21 18:14:41,837 - stack - INFO - RandomForestRegressor start fit
2019-09-21 18:14:43,156 - stack - INFO - RandomForestRegressor end fit
2019-09-21 18:14:43,240 - stack - INFO - RandomForestRegressor save fit pkl
2019-09-21 18:14:43,241 - stack - INFO - ExtraTreesRegressor start fit
2019-09-21 18:14:44,600 - stack - INFO - ExtraTreesRegressor end fit
2019-09-21 18:14:44,627 - stack - INFO - ExtraTreesRegressor save fit pkl


In [4]:
# First five rows of the stage-1 train predictions (one column per model).
master.train_pred.head(n=5)

Unnamed: 0,LinearRegression,Ridge,Lasso,RandomForestRegressor,ExtraTreesRegressor
0,30.292733,27.57247,27.15151,27.096734,27.393825
1,22.065163,25.415399,23.100898,22.015542,22.187602
2,27.173329,28.54862,28.016604,27.244074,26.924984
3,24.710997,19.933657,29.612183,21.289207,21.726326
4,12.631813,12.883161,12.867917,11.822341,12.782818


# Fit Stage 2 model

In [5]:
# Stage-2 learner: an XGBRegressor trained on the stage-1 train predictions
# rather than on the raw features.
xgb_params = {'random_state': 0, 'n_jobs': -1, 'n_estimators': 100, 'max_depth': 3}
model_2 = StackModel(model_name='XGBRegressor', model=XGBRegressor, params=xgb_params)

# Fit the stacker: inputs are master.train_pred, target is y_train.
model_2.fit(master.train_pred, y_train, refit=True)

2019-09-21 18:14:44,738 - stack - INFO - XGBRegressor start fit
2019-09-21 18:14:44,945 - stack - INFO - XGBRegressor end fit




2019-09-21 18:14:44,948 - stack - INFO - XGBRegressor save fit pkl


In [6]:
# First five stage-2 (XGBRegressor) predictions for the training data.
model_2.train_pred.head(n=5)

0    27.499838
1    22.727331
2    29.140097
3    23.769829
4    11.781913
Name: XGBRegressor, dtype: float64

# Predict test data

In [7]:
# Stage 1: predict the test set with the first-stage models held by `master`.
# master.test_pred (read on the next statement) is presumably populated by this
# call - confirm against the stack package.
# NOTE(review): repredict=True presumably forces fresh predictions instead of
# reusing saved ones (the log reports "save pred pkl") - confirm.
master.predict(X_test, repredict=True)
# Stage 2: feed the stage-1 test predictions to the XGBRegressor stacker.
model_2.predict(master.test_pred, repredict=True)

2019-09-21 18:14:44,965 - stack - INFO - LinearRegression start predict
2019-09-21 18:14:44,976 - stack - INFO - LinearRegression end predict
2019-09-21 18:14:44,978 - stack - INFO - LinearRegression save pred pkl
2019-09-21 18:14:44,979 - stack - INFO - Ridge start predict
2019-09-21 18:14:44,989 - stack - INFO - Ridge end predict
2019-09-21 18:14:44,991 - stack - INFO - Ridge save pred pkl
2019-09-21 18:14:44,992 - stack - INFO - Lasso start predict
2019-09-21 18:14:45,003 - stack - INFO - Lasso end predict
2019-09-21 18:14:45,004 - stack - INFO - Lasso save pred pkl
2019-09-21 18:14:45,005 - stack - INFO - RandomForestRegressor start predict
2019-09-21 18:14:45,536 - stack - INFO - RandomForestRegressor end predict
2019-09-21 18:14:45,539 - stack - INFO - RandomForestRegressor save pred pkl
2019-09-21 18:14:45,540 - stack - INFO - ExtraTreesRegressor start predict
2019-09-21 18:14:46,076 - stack - INFO - ExtraTreesRegressor end predict
2019-09-21 18:14:46,078 - stack - INFO - ExtraT

In [8]:
# First five rows of the stage-1 test predictions (one column per model).
master.test_pred.head(n=5)

Unnamed: 0,LinearRegression,Ridge,Lasso,RandomForestRegressor,ExtraTreesRegressor
0,25.695274,27.370086,23.19061,23.891939,24.79883
1,16.645482,20.260903,17.051835,25.157991,20.954086
2,27.287732,21.684029,28.715042,21.549445,23.154109
3,12.542779,12.194879,17.663391,11.674114,13.451735
4,21.416366,24.053299,21.697774,21.378585,21.978077


In [9]:
# First five stage-2 (XGBRegressor) predictions for the test data.
model_2.test_pred.head(n=5)

0    22.907314
1    24.427757
2    21.578939
3    11.538013
4    21.501339
Name: XGBRegressor, dtype: float32

In [10]:
# MAE for test data
# Collect (stage, model name, predictions) triples instead of repeating the
# same two print statements for every model. Stage-1 entries come from the
# columns of master.test_pred, so the report follows whatever stage-1 models
# are configured; the stage-2 entry is labelled with the name carried by the
# prediction Series.
stage_preds = [
    (1, model_name, master.test_pred[model_name])
    for model_name in master.test_pred.columns
]
stage_preds.append((2, model_2.test_pred.name, model_2.test_pred))

# Two lines per model: a "Stage N : <model>" header, then the MAE against
# y_test formatted to four decimals behind a fixed indent.
for stage, model_name, y_pred in stage_preds:
    print('Stage {} : {}'.format(stage, model_name))
    print('                    {:.4f}'.format(mean_absolute_error(y_test, y_pred)))

Stage 1 : LinearRegression
                    6.0037
Stage 1 : Ridge
                    4.0124
Stage 1 : Lasso
                    5.9507
Stage 1 : RandomForestRegressor
                    3.0999
Stage 1 : ExtraTreesRegressor
                    3.5053
Stage 2 : XGBRegressor
                    2.9169
