# Sprint アンサンブル学習

In [24]:
import math
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

%matplotlib inline

In [25]:

df = pd.read_csv("train.csv")
X = df[["GrLivArea", "YearBuilt"]].values
y = df["SalePrice"].values

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [27]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_y_pred = lr.predict(X_test)
print("線形回帰 MSE : {}".format(mean_squared_error(y_test, lr_y_pred)))

線形回帰 MSE : 2495554898.6683216


In [28]:
svm = SVR()
svm.fit(X_train, y_train)
svm_y_pred = svm.predict(X_test)
print("SVM MSE : {}".format(mean_squared_error(y_test, svm_y_pred)))

SVM MSE : 7861854841.842987




In [29]:
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)
tree_y_pred = tree.predict(X_test)
print("決定木 MSE : {}".format(mean_squared_error(y_test, tree_y_pred)))

決定木 MSE : 2115019610.5217845


# 【問題1】ブレンディングのスクラッチ実装
ブレンディング をスクラッチ実装し、単一モデルより精度があがる例を 最低3つ 示してください。精度があがるとは、検証用データに対する平均二乗誤差（MSE）が小さくなることを指します。

In [30]:
def blend(X_train, X_test, y_train, model1, model2):
    model1.fit(X_train, y_train)
    model1_pred = model1.predict(X_test)
    
    model2.fit(X_train, y_train)
    model2_pred = model2.predict(X_test)
    y_pred = (model1_pred + model2_pred) / 2
    return y_pred

In [31]:

# 線形回帰と決定木の予測結果の平均を予測値とする
y_pred = blend(X_train, X_test, y_train, lr, tree)
print("   線形回帰     MSE : {}".format(mean_squared_error(y_test, lr_y_pred)))
print("      SVM       MSE : {}".format(mean_squared_error(y_test, svm_y_pred)))
print("    決定木      MSE : {}".format(mean_squared_error(y_test, tree_y_pred)))
print("線形回帰+決定木 MSE : {}".format(mean_squared_error(y_test, y_pred)))

   線形回帰     MSE : 2495554898.6683216
      SVM       MSE : 7861854841.842987
    決定木      MSE : 2115019610.5217845
線形回帰+決定木 MSE : 1890511222.7814226


In [32]:
# SVMと決定木の予測結果の平均を予測値とする
y_pred = blend(X_train, X_test, y_train, svm, tree)
print("線形回帰   MSE : {}".format(mean_squared_error(y_test, lr_y_pred)))
print("SVM        MSE : {}".format(mean_squared_error(y_test, svm_y_pred)))
print("決定木     MSE : {}".format(mean_squared_error(y_test, tree_y_pred)))
print("SVM+決定木 MSE : {}".format(mean_squared_error(y_test, y_pred)))

線形回帰   MSE : 2495554898.6683216
SVM        MSE : 7861854841.842987
決定木     MSE : 2115019610.5217845
SVM+決定木 MSE : 3376643804.7117834




In [33]:
# SVMの線形カーネルと多項式カーネルのモデルの予測の平均
model1 = SVR(gamma='scale', kernel='linear')
model2 = SVR(gamma='scale') 
y_pred = blend(X_train, X_test, y_train, model1, model2)
print("   線形回帰     MSE : {}".format(mean_squared_error(y_test, lr_y_pred)))
print("      SVM       MSE : {}".format(mean_squared_error(y_test, svm_y_pred)))
print("    決定木      MSE : {}".format(mean_squared_error(y_test, tree_y_pred)))
print("SVM 線形+多項式 MSE : {}".format(mean_squared_error(y_test, y_pred)))

   線形回帰     MSE : 2495554898.6683216
      SVM       MSE : 7861854841.842987
    決定木      MSE : 2115019610.5217845
SVM 線形+多項式 MSE : 4450951532.463163


In [11]:
print("LogisticRegression\n",mean_squared_error(t_test, y_lr))
print("DecisionTree\n", mean_squared_error(t_test, y_dt))
print("SVR\n", mean_squared_error(t_test, y_svr))
print("Blend\n",mean_squared_error(t_test, blending_1))

LogisticRegression
 3214243459.650467
DecisionTree
 3009856794.286149
SVR
 9343844380.188307
Blend
 3756024787.423033


# 【問題2】バギングのスクラッチ実装
バギング をスクラッチ実装し、単一モデルより精度があがる例を 最低1つ 示してください。

In [34]:
def bagging(X_train, X_test, y_train, model, n=2):
    y_pred = np.zeros(X_test.shape[0])
    for i in range(n):
        X_divided = train_test_split(X_train, random_state=random.randint(0, i))[0]
        y_divided = train_test_split(X_train, random_state=random.randint(0, i))[0]
        model.fit(X_train, y_train)
        y_pred += model.predict(X_test)
    y_pred = y_pred / n
    return y_pred

y_pred = bagging(X_train, X_test, y_train, tree, n=100)

print("    決定木      MSE : {}".format(mean_squared_error(y_test, tree_y_pred)))
print("bagging 決定木  MSE : {}".format(mean_squared_error(y_test, y_pred)))

    決定木      MSE : 2115019610.5217845
bagging 決定木  MSE : 2164721843.0078597


# 【問題3】スタッキングのスクラッチ実装¶
スタッキング をスクラッチ実装し、単一モデルより精度があがる例を 最低1つ 示してください。

In [35]:
class Stacking():
    def __init__(self, split_n=3, model_n=2):
        self.split_n = split_n
        self.model_n = model_n

    def fit(self, X_train, y_train, X_test, y_test, models):
        # K個に分割するdividerを作る
        divider = np.zeros(self.split_n)
        vol = X_train.shape[0]
        num = self.split_n
        for i in range(self.split_n):
            divider[i] = math.ceil(vol/num)
            num -= 1
            vol = vol-divider[i]
        
        self.divider = divider.astype(int)
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.models = models
        print(self.divider)
    
    def predict(self, X_test):
        for m in range(self.model_n):
            divide_point = 0
            for n in range(self.split_n):
                idx = np.zeros(X_train.shape[0], dtype=bool)
                idx[divide_point:divide_point+self.divider[n]]= True
                self.X_test_divided = X_train[idx, :]
                self.X_train_divided = X_train[~idx, :]
                self.y_test_divided = y_train[idx]
                self.y_train_divided = y_train[~idx]                    
                
                models[m].fit(self.X_train_divided, self.y_train_divided)
                if n == 0:
                    blend = models[m].predict(self.X_test_divided)
                    pred_data = models[m].predict(X_test)
                else:
                    blend = np.r_[blend, models[m].predict(self.X_test_divided)]
                    pred_data = np.c_[pred_data, models[m].predict(X_test)]
            
                divide_point += self.divider[n]
            if m ==0:
                blend_data =blend.reshape(-1, 1)
                blend_pred_data = np.mean(pred_data, axis=1)
            else:
                blend_data = np.c_[blend_data, blend.reshape(-1, 1) ]
                blend_pred_data = np.c_[blend_pred_data, np.mean(pred_data, axis=1)]
        
        models[0].fit(blend_data, y_train)
        y_pred = models[0].predict(blend_pred_data)                               
        return y_pred

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
models = [LinearRegression(), DecisionTreeRegressor()]

stacking = Stacking()
stacking.fit(X_train, y_train, X_test, y_test, models)
y_pred = stacking.predict(X_test)

print("線形回帰 MSE : {}".format(mean_squared_error(y_test, lr_y_pred)))
print("SVM      MSE : {}".format(mean_squared_error(y_test, svm_y_pred)))
print("決定木   MSE : {}".format(mean_squared_error(y_test, tree_y_pred)))
print("Stacking MSE : {}".format(mean_squared_error(y_test, y_pred)))

[390 389 389]
線形回帰 MSE : 2495554898.6683216
SVM      MSE : 7861854841.842987
決定木   MSE : 2115019610.5217845
Stacking MSE : 1905881409.1172748
