# Model stacking example 
Author: Andrew Szwec


In [1]:
import numpy as np
import pandas as pd

## Load data / add dummies

In [2]:
# read in vehicle data
vehicles = pd.read_csv('used_vehicles.csv')

In [3]:
vehicles

Unnamed: 0,price,year,miles,doors,type
0,22000,2012,13000,2,car
1,14000,2010,30000,2,car
2,13000,2010,73500,4,car
3,9500,2009,78000,4,car
4,9000,2007,47000,4,car
5,4000,2006,124000,2,car
6,3000,2004,177000,4,car
7,2000,2004,209000,4,truck
8,3000,2003,138000,2,car
9,1900,2003,160000,4,car


In [4]:
# convert car to 0 and truck to 1
# vehicles['type'] = vehicles.type.map({'car':0, 'truck':1})
# OR

# create three dummy variables, drop the first dummy variable, and store this as a DataFrame
type_dummies = pd.get_dummies(vehicles.type, prefix='type')

In [5]:
# concatenate the two dummy variable columns onto the original DataFrame
# note: axis=0 means rows, axis=1 means columns
vehicles = pd.concat([vehicles, type_dummies], axis=1)
vehicles.drop(['type'], axis=1, inplace=True)

In [7]:
vehicles

Unnamed: 0,price,year,miles,doors,type_car,type_truck
0,22000,2012,13000,2,1,0
1,14000,2010,30000,2,1,0
2,13000,2010,73500,4,1,0
3,9500,2009,78000,4,1,0
4,9000,2007,47000,4,1,0
5,4000,2006,124000,2,1,0
6,3000,2004,177000,4,1,0
7,2000,2004,209000,4,0,1
8,3000,2003,138000,2,1,0
9,1900,2003,160000,4,1,0


In [49]:
vehicles.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 6 columns):
price         14 non-null int64
year          14 non-null int64
miles         14 non-null int64
doors         14 non-null int64
type_car      14 non-null uint8
type_truck    14 non-null uint8
dtypes: int64(4), uint8(2)
memory usage: 556.0 bytes


In [61]:
vehicles.shape

(14, 6)

## Train test split

In [11]:
from sklearn.model_selection import train_test_split
X = vehicles.drop(['price'], axis=1)
y = vehicles['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,  random_state=121)

## Random Forest

In [12]:
# import class, instantiate estimator, fit with all data
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

rfreg = RandomForestRegressor(n_estimators=100, max_features='auto', oob_score=True, random_state=121)
rfreg.fit(X_train, y_train)
rf_mse = mean_squared_error( rfreg.predict(X_test), y_test)
print('Random forest MSE =',rf_mse)

Random forest MSE = 560586.0


## Ridge Regression

In [61]:
from sklearn.linear_model import RidgeCV

# 5 fold cross validation
ridge = RidgeCV(cv=5, normalize=True)
ridge.fit(X_train, y_train)
rd_mse = mean_squared_error( ridge.predict(X_test), y_test)
print('Ridge Regression MSE =',rd_mse)

Ridge Regression MSE = 1950454.73929


## Model Stacking
1. Concatenate predictions from random forest and ridge regression into one df, using training set

In [62]:
df2 = pd.DataFrame( {'rf':rfreg.predict(X_train), 'rdg':ridge.predict(X_train)})
df2

Unnamed: 0,rdg,rf
0,1984.476787,2897.0
1,6330.955657,5291.0
2,-740.3639,2272.0
3,17283.995315,18395.0
4,11048.044106,9930.0
5,1280.749586,2735.0
6,14926.52937,15000.0
7,11383.592636,8180.0
8,3954.331852,2915.0
9,6845.329385,4756.0


'2. Use a linear model to stack the RF and Ridge together. This is like a weighted average of models

In [63]:
from sklearn.linear_model import LinearRegression

lin = LinearRegression()
lin.fit(df2, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

'3. Now predict on test set to get MSE

In [64]:
# Now predict on test set
testdf = pd.DataFrame( {'rf':rfreg.predict(X_test), 'rdg':ridge.predict(X_test)})
stacked_mse = mean_squared_error( lin.predict(testdf) , y_test)
print('Stacked Model MSE =',stacked_mse)

Stacked Model MSE = 108156.973398


In [65]:
if stacked_mse < rd_mse and stacked_mse < rf_mse:
    print('Stacked Model MSE less than both RF and Ridge')

Stacked Model MSE less than both RF and Ridge


In [None]:
# Test Data
# 4000	2006	124000	2	1	0

In [66]:
newcar = pd.DataFrame({'year':2006, 'miles':124000, 'doors':2, 'type_car':1, 'type_truck':0}, index=[0])
newcar['type_car'] = newcar.type_car.astype('uint8')
newcar['type_truck'] = newcar.type_truck.astype('uint8')

In [70]:
newcar.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 0 to 0
Data columns (total 5 columns):
doors         1 non-null int64
miles         1 non-null int64
type_car      1 non-null uint8
type_truck    1 non-null uint8
year          1 non-null int64
dtypes: int64(3), uint8(2)
memory usage: 34.0 bytes


In [85]:
newcar = X_train.loc[5].values.reshape(1,5)

In [86]:
x1 = rfreg.predict(newcar)
x2 = ridge.predict(newcar)

In [91]:
x1

array([ 4756.])

In [89]:
zz = pd.DataFrame({'rdg': x1, 'rf': x2}, index=[0])

In [90]:
lin.predict(zz)

array([ 7000.6267653])

In [73]:
X_train.loc[5]

year            2006
miles         124000
doors              2
type_car           1
type_truck         0
Name: 5, dtype: int64