### 【問題1】ブレンディングのスクラッチ実装

In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

# Load the housing data and keep two features plus the target column.
df = pd.read_csv("train.csv")
df_Xy = df[["GrLivArea", "YearBuilt", "SalePrice"]].to_numpy()
X, y = df_Xy[:, :2], df_Xy[:, 2]

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Log-transform the features, then standardize them using statistics
# computed on the training split only.
X_train, X_test = np.log(X_train), np.log(X_test)
scaler = StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

# Support vector regression.
clf = SVR()
clf_y_pred = clf.fit(X_train_transformed, y_train).predict(X_test_transformed)
mse_svr = mean_squared_error(y_test, clf_y_pred)
print('MSE_SVR:', f'{mse_svr:.3g}')

# Decision tree regression.
tree = DecisionTreeRegressor(max_depth=3)
tree_y_pred = tree.fit(X_train_transformed, y_train).predict(X_test_transformed)
mse_tree = mean_squared_error(y_test, tree_y_pred)
print('MSE_TREE:', f'{mse_tree:.3g}')

# Linear regression.
lr = LinearRegression()
lr_y_pred = lr.fit(X_train_transformed, y_train).predict(X_test_transformed)
mse_lr = mean_squared_error(y_test, lr_y_pred)
print('MSE_LR:', f'{mse_lr:.3g}')

# Blending: turn each prediction vector into a column, place the columns
# side by side, and take a weighted average across the three models.
clf_y_pred, tree_y_pred, lr_y_pred = (
    pred[:, np.newaxis] for pred in (clf_y_pred, tree_y_pred, lr_y_pred)
)
pred_array = np.concatenate((clf_y_pred, tree_y_pred, lr_y_pred), axis=1)

# Weights are given in column order: SVR, decision tree, linear regression.
w = np.array([0.05, 0.5, 0.45])
blending_y_pred = np.average(pred_array, axis=1, weights=w)
mse_blending = mean_squared_error(y_test, blending_y_pred)
print('MSE_BLENDING:', f'{mse_blending:.3g}')


MSE_SVR: 9.64e+09
MSE_TREE: 3.59e+09
MSE_LR: 3.38e+09
MSE_BLENDING: 3.34e+09


### 【問題2】バギングのスクラッチ実装

In [58]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

# Load the housing data and keep two features plus the target column.
df = pd.read_csv("train.csv")
df_Xy = df[["GrLivArea", "YearBuilt", "SalePrice"]].to_numpy()
X, y = df_Xy[:, :2], df_Xy[:, 2]

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Log-transform the features, then standardize them using statistics
# computed on the training split only.
X_train, X_test = np.log(X_train), np.log(X_test)
scaler = StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

# Dataset sizes.
n_samples, n_features = X_train_transformed.shape
n_test_data = y_test.shape[0]

# Number of subsets (and therefore trees) in the ensemble.
n_devided = 10

# Each tree is trained on this many rows.
bagg_size = n_samples // n_devided

# Train one tree per subset, keeping every tree's test predictions
# and a running sum that is averaged afterwards.
y_pred_list = {}
sum_array = np.zeros((n_test_data, 1))
for i in range(n_devided):

    # Draw distinct row indices; random.sample picks without replacement.
    bagg_idx = random.sample(range(n_samples), bagg_size)
    X_bagg, y_bagg = X_train_transformed[bagg_idx], y_train[bagg_idx]

    # Fit on the subset and predict the whole test split as a column vector.
    tree = DecisionTreeRegressor(max_depth=5)
    y_pred = tree.fit(X_bagg, y_bagg).predict(X_test_transformed)[:, np.newaxis]

    y_pred_list[i] = y_pred
    sum_array += y_pred

# Report the test MSE of each individual tree.
for i, single_pred in y_pred_list.items():
    mse_tree = mean_squared_error(y_test, single_pred)
    print('MSE_TREE:', i, f'{mse_tree:.3g}')

# Report the test MSE of the averaged (ensemble) prediction.
y_pred_mean = sum_array / n_devided
mse_bagg_tree = mean_squared_error(y_test, y_pred_mean)
print('MSE_BAGGING_TREE:', f'{mse_bagg_tree:.3g}')

MSE_TREE: 0 3.93e+09
MSE_TREE: 1 3.91e+09
MSE_TREE: 2 3.87e+09
MSE_TREE: 3 3.07e+09
MSE_TREE: 4 3.58e+09
MSE_TREE: 5 3.1e+09
MSE_TREE: 6 2.48e+09
MSE_TREE: 7 3.46e+09
MSE_TREE: 8 3.34e+09
MSE_TREE: 9 2.57e+09
MSE_BAGGING_TREE: 2.12e+09


### 【問題3】スタッキングのスクラッチ実装

In [98]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression


class base_model():
    """Level-1 (base) regressor selected by an integer model index.

    ``idx_models`` chooses the estimator:

    * ``0`` -- ``SVR()``, stored in ``self.clf``
    * ``1`` -- ``DecisionTreeRegressor(max_depth=3)``, stored in ``self.tree``
    * ``2`` -- ``LinearRegression()``, stored in ``self.lr``

    ``idx_splits`` is only stored on the instance; it records which
    cross-validation split the instance belongs to.
    """

    # Name of the attribute that holds the estimator for each model index.
    _MODEL_ATTRS = {0: "clf", 1: "tree", 2: "lr"}

    def __init__(self,  idx_splits, idx_models):
        self.idx_splits = idx_splits
        self.idx_models = idx_models

        # Exactly one of these is populated by fit(), depending on idx_models.
        self.clf = None
        self.tree = None
        self.lr = None

    def _model_attr(self):
        """Return the attribute name for ``self.idx_models``.

        Raises:
            ValueError: if ``idx_models`` is not one of the supported indices.
        """
        attr = self._MODEL_ATTRS.get(self.idx_models)
        if attr is None:
            raise ValueError(
                f"unsupported idx_models={self.idx_models!r}; "
                f"expected one of {sorted(self._MODEL_ATTRS)}"
            )
        return attr

    def fit(self, X_train, y_train):
        """Create the selected estimator and fit it on the given data.

        Raises:
            ValueError: if ``idx_models`` is not a supported index (instead of
                silently leaving the instance unfitted).
        """
        attr = self._model_attr()

        # Estimator classes are referenced here, not at class level, so only
        # the selected one is ever instantiated.
        if attr == "clf":
            estimator = SVR()
        elif attr == "tree":
            estimator = DecisionTreeRegressor(max_depth=3)
        else:
            estimator = LinearRegression()

        setattr(self, attr, estimator)
        estimator.fit(X_train, y_train)

        return

    def predict(self, X_test):
        """Predict with the selected estimator.

        Returns:
            The estimator's predictions reshaped to a single column,
            i.e. shape ``(-1, 1)``.

        Raises:
            ValueError: if ``idx_models`` is not a supported index.
        """
        estimator = getattr(self, self._model_attr())
        y_pred = estimator.predict(X_test)

        return y_pred.reshape(-1, 1)


# Load the data: two features (GrLivArea, YearBuilt) and the target (SalePrice).
df = pd.read_csv("train.csv")
df_Xy = df.loc[:, ["GrLivArea", "YearBuilt", "SalePrice"]].values
X = df_Xy[:,:2]
y = df_Xy[:,2]

# Split into training and test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Log-transform the features, then standardize them using statistics
# computed on the training split only.
X_train = np.log(X_train)
X_test = np.log(X_test)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)

# Dataset sizes.
n_samples = X_train_transformed.shape[0]
n_features = X_train_transformed.shape[1]
n_test_data = y_test.shape[0]

# Level-1 configuration: number of CV splits and number of base model types.
n_splits = 4
n_models = 3

# --- Training phase ---
# n_models x n_splits base-model instances are created and fitted.

# Out-of-fold predictions of every base model, keyed by (model, split).
fold_pred = {}

# Grid of base-model instances, indexed as base[model][split].
base = [[None for _ in range(n_splits)] for _ in range(n_models)]

# Held-out row indices of each split, in split order. They are used to align
# y_train with the rows of the stacked out-of-fold predictions.
held_out_indices = []

# KFold is used because the target is a regression value. StratifiedKFold
# stratifies on class labels and does not apply to a continuous target.
kf = KFold(n_splits=n_splits)
for spt, (train_index, test_index) in enumerate(kf.split(X_train_transformed)):
    for mdl in range(n_models):
        base[mdl][spt] = base_model(idx_models=mdl, idx_splits=spt)
        base[mdl][spt].fit(X_train_transformed[train_index], y_train[train_index])
        # The held-out parts of all splits together cover every training row once.
        fold_pred[(mdl, spt)] = base[mdl][spt].predict(X_train_transformed[test_index])
    held_out_indices.append(test_index)
index_last_stage = np.concatenate(held_out_indices)

# Build the last-stage features: for each model, stack its per-split
# out-of-fold predictions vertically, then put the models side by side
# (one column per model).
OOF = np.hstack([
    np.vstack([fold_pred[(mdl, spt)] for spt in range(n_splits)])
    for mdl in range(n_models)
])

# Fit the last-stage model (a decision tree) on the out-of-fold predictions.
last_stage = DecisionTreeRegressor(max_depth = 3)
last_stage.fit(OOF, y_train[index_last_stage])

# --- Inference phase ---
# For each base model type, average the test predictions of its n_splits
# instances; the averages form one column per model.
y_pred_mean = np.hstack([
    np.mean(
        [base[mdl][spt].predict(X_test_transformed) for spt in range(n_splits)],
        axis=0,
    )
    for mdl in range(n_models)
])

# Last-stage prediction and evaluation.
y_pred = last_stage.predict(y_pred_mean)
mse_stacking = mean_squared_error(y_test, y_pred)
print('MSE_STACKING:', f'{mse_stacking:.3g}')



MSE_STACKING: 1.55e+09


