In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# data

In [2]:
train_df = pd.read_csv('../big_data/train.csv')

In [3]:
input_cols = ['GrLivArea', 'YearBuilt']
target = 'SalePrice'
train_df[target] = np.log(train_df[target])

In [4]:
train_set, test_set = train_test_split(train_df, test_size=0.2,
                                       shuffle=True, random_state=42)

# model training

In [5]:
metrics = {'linear_reg':[],
           'svr':[],
           'dt':[]}

models = [LinearRegression(), DecisionTreeRegressor(), SVR()]

model_names = ['linear_reg', 'svr', 'dt',]
preds = {}
oofs = []
for name, model in zip(model_names, models):
    reg = model.fit(train_set[input_cols], train_set[target])
    oofs.append(reg.predict(train_set[input_cols]))
    pred = reg.predict(test_set[input_cols])
    score = mean_squared_error(test_set[target], pred)
    metrics[name].append(score)
    preds[name] = pred

In [6]:
#### result ####
for name in metrics.keys():
    print(f'Model {name}:', np.round(np.mean(metrics[name]), 3))

Model linear_reg: 0.052
Model svr: 0.063
Model dt: 0.054


# Ensemble

In [7]:
### blending ####
weights = [0.4, 0.2, 0.4]

final_pred = None
for i, name in enumerate(model_names):
    if i==0:
        final_pred = weights[i]*preds[name]
    else:
        final_pred = final_pred + weights[i]*preds[name]

score = mean_squared_error(test_set[target], final_pred)
print(f'Blending:', np.round(np.mean(score), 3))

Blending: 0.048


# Stacked Ensemble

In [9]:
### stacking ####
stacking_model = LinearRegression()
X_pred = np.asarray(oofs).T
X_test_pred = np.zeros((test_set.shape[0], 3))
for i, name in enumerate(model_names):
    X_test_pred[:, i] = preds[name]

reg = stacking_model.fit(X_pred, train_set[target])
pred_stack = reg.predict(X_test_pred)

score = mean_squared_error(test_set[target], pred_stack)
print(f'Stacking:', np.round(np.mean(score), 3))

Stacking: 0.063


In [10]:
# ensemble 
#### bagging ###
bagging = 5

metrics = {'linear_reg': [],
           'svr': [],
           'dt': []}

models = [LinearRegression(), DecisionTreeRegressor(),   SVR()]
model_names = ['linear_reg', 'svr', 'dt', ]
preds = {}

for i in range(bagging):
    frac = np.random.randint(80, 90)
    train_ = train_set.sample(int(train_set.shape[0]*frac/100))
    for name, model in zip(model_names, models):
        reg = model.fit(train_[input_cols], train_[target])
        pred = reg.predict(test_set[input_cols])
        score = mean_squared_error(test_set[target], pred)
        metrics[name].append(score)
        if i==0:
            preds[name] = pred/bagging
        else:
            preds[name] += pred / bagging

In [11]:
#### result ####
for name in metrics.keys():
    print(f'Model {name}:', np.round(np.mean(metrics[name]), 3))

weights = [0.4, 0.2, 0.4]

final_pred = None
for i, name in enumerate(model_names):
    if i==0:
        final_pred = weights[i]*preds[name]
    else:
        final_pred = final_pred + weights[i]*preds[name]

score = mean_squared_error(test_set[target], final_pred)
print(f'Bagging and Blending:', np.round(np.mean(score), 3))

Model linear_reg: 0.052
Model svr: 0.077
Model dt: 0.055
Bagging and Blending: 0.049
