# Sprint アンサンブル学習

In [1]:
import numpy as np
import random
import pandas as pd
from numpy import linalg as LA
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split # testとtrain分割
from sklearn.preprocessing import StandardScaler #　標準化モジュール
from sklearn.linear_model import LinearRegression # 線形回帰
from sklearn.metrics import mean_squared_error # 平均二乗誤差
from sklearn.tree import DecisionTreeRegressor # 決定木
from sklearn.neighbors import KNeighborsClassifier # 最近傍法
from sklearn.svm import SVR # こっちはSVM回帰　最適化すると強いが手間がかかる、重い
from sklearn.svm import SVC # SVM
from sklearn.preprocessing import MinMaxScaler # データが0〜1の間に収まるよう変換
from sklearn.model_selection import GridSearchCV

### データセットの用意

In [2]:
data_raw = pd.read_csv("../../data/house-prices-advanced-regression-techniques/train.csv")

In [3]:
train_data = data_raw.copy()

In [4]:
X = train_data.loc[:, ["GrLivArea", "YearBuilt"]]
t = train_data.loc[:, ["SalePrice"]]

In [5]:
X_train, X_test, t_train, t_test = train_test_split(X, t, test_size=0.2)

### 【問題1】ブレンディングのスクラッチ実装
単一モデルで学習、推定をする。  
その後MSEで評価する。  
単一モデルの推定結果を水増し（ブレンディング）し、改めてMSEを求める。  
このMSEが単一モデルと比べて下がっている事を確認する。MSEが下がると精度が高いと言える。  
今回のブレンディングはデータに対して0.1を掛ける等を実施する。  

In [6]:
lr = LinearRegression()
dt = DecisionTreeRegressor()
# svr = SVR(gamma="scale") # こける。。
svr = SVR(gamma=0.001, C=1.0, epsilon=0.2)
# svr = SVR(kernel='rbf', gamma='scale') # こける。。
# svr = SVR(gamma='auto') # mseでこける。。
# svr = SVC(gamma="scale") # fitでこける。。

### 1回目

In [7]:
lr.fit(X_train, t_train)
dt.fit(X_train, t_train)
svr.fit(X_train, t_train)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma=0.001,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [8]:
y_lr = lr.predict(X_test)
y_dt = dt.predict(X_test)
y_svr = svr.predict(X_test)
# shapeを強引に整えた
y_blend_1 = (y_lr.reshape(292,) + y_dt.reshape(292,) + y_svr.reshape(292,)) / 3

In [9]:
print("LogisticRegression\n", mean_squared_error(t_test, y_lr))
print("DecisionTree\n", mean_squared_error(t_test, y_dt))
print("SVR\n", mean_squared_error(t_test, y_svr))
print("Blend\n", mean_squared_error(t_test, y_blend_1))

LogisticRegression
 2589510516.639211
DecisionTree
 2916423941.3056507
SVR
 7540865788.495035
Blend
 2675414248.1508765


予測：ブレンディングをすると精度が上がる（MSEの値が小さくなる）と思っている。  
考察：決定木の単一モデルの方が精度が良かった。

In [10]:
# 型がバラバラ。。
print(t_test.shape)
print(y_lr.shape)
print(y_dt.shape)
print(y_svr.shape)
print(y_blend_1.shape)

(292, 1)
(292, 1)
(292,)
(292,)
(292,)


### 2回目

In [11]:
lr2 = LinearRegression(normalize = True)
dt2 = DecisionTreeRegressor(max_depth=5)
svr2 = SVR(gamma="scale", kernel="linear")

lr2.fit(X_train, t_train)
dt2.fit(X_train, t_train)
svr2.fit(X_train, t_train)

y_lr2 = lr2.predict(X_test)
y_dt2 = dt2.predict(X_test)
y_svr2 = svr2.predict(X_test)
blending_2 = ((y_lr2) + (y_dt2) + (y_svr2)) / 3

print(mean_squared_error(t_test, y_lr2))
print(mean_squared_error(t_test, y_dt2))
print(mean_squared_error(t_test, y_svr2))
print(mean_squared_error(t_test, blending_2))

### 3回目

In [12]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)

lr3_sc = LinearRegression()
dt3_sc = DecisionTreeRegressor(max_depth=3)
svr3_sc = SVR(gamma="scale", kernel="linear")

lr3 = LinearRegression()
dt3 = DecisionTreeRegressor(max_depth=3)
svr3 = SVR(gamma="scale", kernel="linear")

lr3_sc.fit(X_train_sc, t_train)
dt3_sc.fit(X_train_sc, t_train)
svr3_sc.fit(X_train_sc, t_train)

lr3.fit(X_train, t_train)
dt3.fit(X_train, t_train)
svr3.fit(X_train, t_train)

y_lr3_sc = lr3_sc.predict(X_test) 
y_dt3_sc = dt3_sc.predict(X_test) 
y_svr3_sc = svr3_sc.predict(X_test)

blending_3 = ((y_lr3_sc*0.0001) + (y_dt3_sc*0.9) + (y_svr3_sc*0.0009)) / 3

print(mean_squared_error(t_test, y_lr3_sc))
print(mean_squared_error(t_test, y_dt3_sc))
print(mean_squared_error(t_test, y_svr3_sc))

print(mean_squared_error(t_test, blending_3))

### 【問題2】バギングのスクラッチ実装
バギング をスクラッチ実装し、単一モデルより精度があがる例を 最低1つ 示してください。

In [13]:
X_train_b, X_test_b, t_train_b, t_test_b = train_test_split(X, t, test_size=0.2, shuffle=True)

In [14]:
# データ作成
X_train_b_1 = X_train_b.iloc[:292, :]
t_train_b_1 = t_train_b.iloc[:292]

X_train_b_2 = X_train_b.iloc[292:584 , :]
t_train_b_2 = t_train_b.iloc[292:584]

X_train_b_3 = X_train_b.iloc[584:876 , :]
t_train_b_3 = t_train_b.iloc[584:876]

X_train_b_4 = X_train_b.iloc[876:1168 , :]
t_train_b_4 = t_train_b.iloc[876:1168]

In [15]:
# インスタンス作成
dt_b_1 = DecisionTreeRegressor(max_depth=5)
dt_b_2 = DecisionTreeRegressor(max_depth=5)
dt_b_3 = DecisionTreeRegressor(max_depth=5)
dt_b_4 = DecisionTreeRegressor(max_depth=5)

In [16]:
# 学習
dt_b_1.fit(X_train_b_1, t_train_b_1)
dt_b_2.fit(X_train_b_2, t_train_b_2)
dt_b_3.fit(X_train_b_3, t_train_b_3)
dt_b_4.fit(X_train_b_4, t_train_b_4)

DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [17]:
# 推定
y_dt_b_1 = dt_b_1.predict(X_test_b)
y_dt_b_2 = dt_b_2.predict(X_test_b)
y_dt_b_3 = dt_b_3.predict(X_test_b)
y_dt_b_4 = dt_b_4.predict(X_test_b)

baging_1 = (y_dt_b_1+y_dt_b_2+y_dt_b_3+y_dt_b_4) / 4

In [18]:
# mse算出
print(mean_squared_error(t_test_b, y_dt_b_1))
print(mean_squared_error(t_test_b, y_dt_b_2))
print(mean_squared_error(t_test_b, y_dt_b_3))
print(mean_squared_error(t_test_b, y_dt_b_4))

print(mean_squared_error(t_test_b, baging_1))

2487209595.46346
2265302861.3943405
3177284015.9451513
2275345894.218206
1644376789.6520386


バギングをするとmseが下がった。

### 【問題3】スタッキングのスクラッチ実装
スタッキング をスクラッチ実装し、単一モデルより精度があがる例を 最低1つ 示してください。
### 学習フェーズ
#### データを分割しデータセットを3つ作る

In [19]:
# データを分割する
X_train_1 = X_train.iloc[:389, :].values
t_train_1 = t_train.iloc[:389].values

X_train_2 = X_train.iloc[389:778, :].values
t_train_2 = t_train.iloc[389:778].values

X_train_3 = X_train.iloc[778:, :].values
t_train_3 = t_train.iloc[778:].values

In [20]:
# トレインデータとして用意する。
train_dataset_1_x = np.concatenate([X_train_2, X_train_3], axis=0)
train_dataset_1_t = np.concatenate([t_train_2, t_train_3], axis=0)

train_dataset_2_x = np.concatenate([X_train_1, X_train_3], axis=0)
train_dataset_2_t = np.concatenate([t_train_1, t_train_3], axis=0)

train_dataset_3_x = np.concatenate([X_train_1, X_train_2], axis=0)
train_dataset_3_t = np.concatenate([t_train_1, t_train_2], axis=0)
# X_train_1は train_dataset_1_x のテストデータという関係になる。

### 学習、推定、ブレンドデータ作成（モデル１）

In [21]:
DTR_1 = DecisionTreeRegressor(max_depth=5)
DTR_2 = DecisionTreeRegressor(max_depth=5)
DTR_3 = DecisionTreeRegressor(max_depth=5)

DTR_1.fit(train_dataset_1_x, train_dataset_1_t)
DTR_2.fit(train_dataset_2_x, train_dataset_2_t)
DTR_3.fit(train_dataset_3_x, train_dataset_3_t)

y_DTR_1 = DTR_1.predict(X_train_1)
y_DTR_2 = DTR_2.predict(X_train_2)
y_DTR_3 = DTR_3.predict(X_train_3)

#ブレンドデータ作成
DTR_blend_data = np.concatenate([y_DTR_1, y_DTR_2, y_DTR_3], axis=0)
DTR_blend_data.shape

(1168,)

In [22]:
SVR_1 = SVR(gamma=0.001, C=1.0, epsilon=0.2)
SVR_2 = SVR(gamma=0.001, C=1.0, epsilon=0.2)
SVR_3 = SVR(gamma=0.001, C=1.0, epsilon=0.2)

SVR_1.fit(train_dataset_1_x, train_dataset_1_t)
SVR_2.fit(train_dataset_2_x, train_dataset_2_t)
SVR_3.fit(train_dataset_3_x, train_dataset_3_t)

y_SVR_1 = SVR_1.predict(X_train_1)
y_SVR_2 = SVR_2.predict(X_train_2)
y_SVR_3 = SVR_3.predict(X_train_3)

SVR_blend_data = np.concatenate([y_SVR_1, y_SVR_2, y_SVR_3], axis=0)
SVR_blend_data.shape

  y = column_or_1d(y, warn=True)


(1168,)

In [23]:
DTR_SVR_blend = np.stack([DTR_blend_data , SVR_blend_data], 1)
DTR_SVR_blend.shape

(1168, 2)

### 学習、推定、ブレンドデータ作成（モデル2）

In [24]:
LR_1 = LinearRegression()
LR_2 = LinearRegression()
LR_3 = LinearRegression()

LR_1.fit(train_dataset_1_x, train_dataset_1_t)
LR_2.fit(train_dataset_2_x, train_dataset_2_t)
LR_3.fit(train_dataset_3_x, train_dataset_3_t)

y_LR_1 = LR_1.predict(X_train_1)
y_LR_2 = LR_1.predict(X_train_2)
y_LR_3 = LR_1.predict(X_train_3)

LR_blend_data = np.concatenate([y_LR_1, y_LR_2, y_LR_3], axis=0)
LR_blend_data.shape

(1168, 1)

In [25]:
# ブレンドデータ結合
blend_data = np.concatenate([DTR_blend_data.reshape(1168, 1), LR_blend_data], axis = 1)
blend_data.shape

(1168, 2)

### 学習フェーズのラストステージ

In [26]:
print(t_train_1.shape)

(389, 1)


In [27]:
DTR_last = DecisionTreeRegressor(max_depth=5)

DTR_last.fit(train_dataset_1_x, train_dataset_1_t)

DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [28]:
DTR_last = DecisionTreeRegressor(max_depth=5)

DTR_last.fit(blend_data, t_train)

DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

### 推定フェーズ

#### 推定パート　1回目

In [29]:
y_test_DTR_1 = DTR_1.predict(X_test)
y_test_DTR_2 = DTR_2.predict(X_test)
y_test_DTR_3 = DTR_3.predict(X_test)

y_test_SVR_1 = SVR_1.predict(X_test)
y_test_SVR_2 = SVR_2.predict(X_test)
y_test_SVR_3 = SVR_3.predict(X_test)

blend_test_DTR = np.stack([y_test_DTR_1, y_test_DTR_2, y_test_DTR_3], 1)
blend_test_DTR_av = np.average(blend_test_DTR , 1)
blend_test_DTR_av.shape

(292,)

In [30]:
blend_test_SVR = np.stack([y_test_SVR_1, y_test_SVR_2, y_test_SVR_3], 1)
blend_test_SVR_av = np.average(blend_test_SVR , 1)
blend_test_SVR_av.shape

(292,)

In [31]:
blend_test_DTSVR = np.stack([blend_test_DTR_av, blend_test_SVR_av], 1)
blend_test_DTSVR.shape

(292, 2)

#### 推定パート　2回目

In [32]:
y_test_DTR_2_1 = DTR_1.predict(blend_test_DTSVR)
y_test_DTR_2_2 = DTR_2.predict(blend_test_DTSVR)
y_test_DTR_2_3 = DTR_3.predict(blend_test_DTSVR)

y_test_SVR_2_1 = SVR_1.predict(blend_test_DTSVR)
y_test_SVR_2_2 = SVR_2.predict(blend_test_DTSVR)
y_test_SVR_2_3 = SVR_3.predict(blend_test_DTSVR)

In [33]:
blend_test_DTR_2 = np.stack([y_test_DTR_2_1, y_test_DTR_2_2, y_test_DTR_2_3], 1)
blend_test_DTR_2_av = np.average(blend_test_DTR_2 , 1)
blend_test_DTR_2_av.shape

(292,)

In [34]:
blend_test_SVR_2 = np.stack([y_test_SVR_2_1, y_test_SVR_2_2, y_test_SVR_2_3], 1)
blend_test_SVR_2_av = np.average(blend_test_SVR_2 , 1)
blend_test_SVR_2_av.shape

(292,)

In [35]:
blend_test_DTSVR_2 = np.stack([blend_test_DTR_2_av, blend_test_SVR_2_av], 1)
blend_test_DTSVR_2.shape

(292, 2)

**推定　最終**

In [36]:
last_y = DTR_last.predict(blend_test_DTSVR_2)

In [37]:
print(mean_squared_error(t_test, last_y))

8132164091.2765045
