# アンサンブル学習

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [2]:
All = pd.read_csv("house-train.csv")
X = All.loc[:, ["GrLivArea", "YearBuilt"]]
y = All.loc[:, ["SalePrice"]]

In [3]:
All = pd.read_csv("house-train.csv")
X = All.loc[:, ["GrLivArea", "YearBuilt"]]
y = All.loc[:, ["SalePrice"]]
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

# データの標準化処理
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

## 小さなデータセットの用意

## scikit-learn

In [4]:
# MSE

from sklearn.metrics import mean_squared_error
#mean_squared_error(y_true, y_pred)


# ブレンディング

## 【問題1】ブレンディングのスクラッチ実装


In [5]:
#　線形回帰
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train_std, y_train)
y_pred_lr = lr.predict(X_test_std)

print("{:,}".format(mean_squared_error(y_test, y_pred_lr)))

2,942,066,921.672107


In [6]:
# SVM
from sklearn.svm import SVR
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_lin = SVR(kernel='linear', C=1e3)
svr_poly = SVR(kernel='poly', C=1e3, degree=2)
y_rbf = svr_rbf.fit(X_train_std, y_train).predict(X_test_std)
y_lin = svr_lin.fit(X_train_std, y_train).predict(X_test_std)
y_poly = svr_poly.fit(X_train_std, y_train).predict(X_test_std)

y_rbf = np.reshape(y_rbf, (-1,1))
y_lin = np.reshape(y_lin, (-1,1))
y_poly = np.reshape(y_poly, (-1,1))
print("{:,}".format(mean_squared_error(y_test, y_rbf)))
print("{:,}".format(mean_squared_error(y_test, y_lin)))
print("{:,}".format(mean_squared_error(y_test, y_poly)))

3,747,597,967.329022
2,958,209,367.9421964
7,461,818,210.812097


  y = column_or_1d(y, warn=True)


In [7]:
# 決定木
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(max_depth = 3)
tree.fit(X_train_std, y_train)
y_pred_tree = tree.predict(X_test_std)

y_pred_tree = np.reshape(y_pred_tree, (-1,1))

print("{:,}".format(mean_squared_error(y_test, y_pred_tree)))

2,695,315,830.674088


In [8]:
# ニューラルネットワーク
from sklearn.neural_network import MLPRegressor
MLP= MLPRegressor(random_state=42)
MLP.fit(X_train_std, y_train)
y_pred_MLP = MLP.predict(X_test_std)

print("{:,}".format(mean_squared_error(y_test, y_pred_MLP)))

  y = column_or_1d(y, warn=True)


39,657,496,867.119156




In [9]:
#線形回帰,SVM＿Lin,決定木の平均を取ってみる
y_pred_lrSVMtree = np.concatenate((y_pred_lr,y_lin, y_pred_tree), axis=1)
y_pred_mean = np.reshape(np.mean(y_pred_lrSVMtree,axis=1),(-1,1))

print("{:,}".format(mean_squared_error(y_test, y_pred_mean)))
#少し改善された

2,620,935,356.8208737


In [10]:
#ニューラルネットワークのパラメーター変更、隠れ層を(100,)→（1000,1000,1000）に
MLP= MLPRegressor(hidden_layer_sizes=(1000, 1000,1000),activation='relu',random_state=42)
MLP.fit(X_train_std, y_train)
y_pred_MLP = MLP.predict(X_test_std)
y_pred_MLP = np.reshape(y_pred_MLP, (-1,1))
print("{:,}".format(mean_squared_error(y_test, y_pred_MLP)))

#ニューラルネットワーク同士の比較では、かなり良くなったが、決定木と比べて少し悪い

  y = column_or_1d(y, warn=True)


2,926,497,284.5278735


In [11]:
#線形回帰,SVM＿Lin,決定木に重みをつける
y_pred_lrSVMtree = np.concatenate((y_pred_lr*0.2,y_lin*0.3, y_pred_tree*0.5), axis=1)
y_pred_mean = np.reshape(np.mean(y_pred_lrSVMtree,axis=1),(-1,1))

print("{:,}".format(mean_squared_error(y_test, y_pred_mean)))
#大幅に悪くなった

19,537,542,480.66832


In [12]:
#線形回帰、ニューラルネットワーク、決定木の平均をとってみた
y_pred_lrMLPtree = np.concatenate((y_pred_lr,y_pred_MLP, y_pred_tree), axis=1)
y_pred_mean2 = np.reshape(np.mean(y_pred_lrMLPtree,axis=1),(-1,1))

print("{:,}".format(mean_squared_error(y_test, y_pred_mean)))
#大幅に退化

19,537,542,480.66832


In [13]:
#線形回帰と決定木の平均をとってみた
y_pred_lrtree = np.concatenate((y_pred_lr, y_pred_tree), axis=1)
y_pred_mean3 = np.reshape(np.mean(y_pred_lrtree,axis=1),(-1,1))

print("{:,}".format(mean_squared_error(y_test, y_pred_mean3)))
#少し改善された

2,537,034,970.345986


In [14]:
# 決定木のdepth=10に変更した
from sklearn.tree import DecisionTreeRegressor
tree2 = DecisionTreeRegressor(max_depth = 10)
tree2.fit(X_train_std, y_train)
y_pred_tree2 = tree2.predict(X_test_std)

y_pred_tree2 = np.reshape(y_pred_tree2, (-1,1))

print("{:,}".format(mean_squared_error(y_test, y_pred_tree2)))
#depth = 3から一気に改善された

2,595,987,936.5740814


# バギング

## 【問題2】バギングのスクラッチ実装

In [15]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(
        X, y, test_size=0.2, random_state=0)

# データの標準化処理
sc = StandardScaler()
sc.fit(X_train1)
X_train_std1 = sc.transform(X_train1)
X_test_std1 = sc.transform(X_test1)

m = 10
y_pred_lr_list  = np.zeros((X_test.shape[0], m))
for i in range(m):
    X_train2, X_test2, y_train2, y_test2 = train_test_split(
        X, y, test_size=0.2, shuffle = True)
    
    # データの標準化処理
    sc = StandardScaler()
    sc.fit(X_train2)
    X_train_std2 = sc.transform(X_train2)

    #　線形回帰
    from sklearn.linear_model import LinearRegression
    lr1 = LinearRegression()
    lr1.fit(X_train_std2, y_train2)
    y_temp = lr1.predict(X_test_std1)#X_test_std"1"
    y_pred_lr_list[:, i] = y_temp[:, 0]
    
y_pred_mean_lr = np.mean(y_pred_lr_list,axis= 1)    
print("MSE : {:,}".format(mean_squared_error(y_test1, y_pred_mean_lr)))

# 上記の線形回帰（学習１度: 2,942,066,921.672107）より多少改善された

MSE : 2,919,717,092.893424


# スタッキング

## 【問題3】スタッキングのスクラッチ実装

In [16]:
class ScratchStacking():
    """
    ステージ0とステージ1のみのスタッキング
    """
    def __init__(self):
        self.k0 = 0
        self.m0 = 0
        self.list_models = None
    
    def fit(self, X, y, k0, m0, *models, model_z):
        self.k0 = k0
        self.m0 = m0
        n_split_samples = int(len(X)/k0)+1
        idx = np.arange(X.shape[0])
        idx_split = [idx[i*n_split_samples:(i+1)*n_split_samples] for i in range(k0)]

        y_pred_ndarray0 = np.empty((X.shape[0], m0))
        list_models = []
        for i in range(m0):
            model = models[i]
            y_pred_ndarray = np.empty(X.shape[0])
            _list_model = []
            for j in range(k0):
                idx_bl_j = np.full(idx.shape, False)
                idx_bl_j[idx_split[j]] = True
                idx_bl_j_c = idx_bl_j==np.full(idx.shape, False)  # idx_bl_j_c = ~idx_bl_j
                _reg = model().fit(X[idx_bl_j_c], y[idx_bl_j_c])
                y_pred_ndarray[idx_split[j]] = _reg.predict(X[idx_bl_j])
                _list_model.append(_reg)
            y_pred_ndarray0[:, i] = y_pred_ndarray
            list_models.append(_list_model)
        self.list_models = [list_models]
        model0 = model_z().fit(y_pred_ndarray0, y)
        self.list_models.append(model0)
        
    def predict(self, X):
        y_pred_ndarray0 = np.empty((X.shape[0], self.m0, self.k0))
        
        for i in range(self.m0):
            for j in range(self.k0):
                y_pred_ndarray0[:, i, j] = self.list_models[0][i][j].predict(X)
        y_pred_ndarray1 = np.average(y_pred_ndarray0, axis=2)
        
        y_pred = self.list_models[1].predict(y_pred_ndarray1)
        
        return y_pred

In [17]:
y_train = y_train.values

In [18]:
y_train = np.reshape(y_train, [-1,])

In [19]:
y_test = y_test.values

In [20]:
y_test = np.reshape(y_test, [-1,])

In [21]:
y_train.shape

(1168,)

In [22]:
stacking = ScratchStacking()
stacking.fit(X_train, y_train, 3, 2, LinearRegression, DecisionTreeRegressor, model_z=LinearRegression)
y_pred_stacking = stacking.predict(X_test)
print("MSE(stack) = ", mean_squared_error(y_pred_stacking, y_test), sep="")
#上記のどの場合と比べても改善された

MSE(stack) = 2490045433.670812
