In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (GridSearchCV, KFold, cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

%matplotlib inline

In [2]:
# Read the training data from the local CSV file.
df = pd.read_csv("train.csv")
# Explanatory variables: the two selected feature columns.
X = df.loc[:, ['GrLivArea', 'YearBuilt']]
# Target variable, kept as a single-column DataFrame (not a Series).
y = df.loc[:, ['SalePrice']]
X.shape

(1460, 2)

In [3]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split (and every metric computed from it below) reproducible across
# "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=0
)

In [4]:
# Apply a natural-log transform to features and target, for both splits.
# Note that every feature column is transformed, YearBuilt included.
X_train_log, X_test_log = np.log(X_train), np.log(X_test)
y_train_log, y_test_log = np.log(y_train), np.log(y_test)

In [5]:
# Support vector regression (SVR) on the log-transformed data.
# fit() returns the estimator itself, so construction and training are chained.
# The target is flattened to 1-D because y_train_log is a one-column DataFrame.
svm = SVR(gamma='scale', C=1.0, epsilon=0.2).fit(
    X_train_log, y_train_log.values.ravel()
)
# Predict the (log) target for the held-out rows.
svm_predict = svm.predict(X_test_log)
# Mean squared error, measured in log space.
svm_mse = mean_squared_error(y_test_log, svm_predict)

In [6]:
# Decision tree regressor (depth limited to 3) on the log-transformed data.
# fit() returns the estimator itself, so construction and training are chained.
dtr = DecisionTreeRegressor(max_depth=3).fit(X_train_log, y_train_log)
# Predict the (log) target for the held-out rows.
dtr_predict = dtr.predict(X_test_log)
# Mean squared error, measured in log space.
dtr_mse = mean_squared_error(y_test_log, dtr_predict)

In [7]:
# Ordinary least-squares linear regression on the log-transformed data.
# fit() returns the estimator itself, so construction and training are chained.
lr = LinearRegression().fit(X_train_log, y_train_log)
# Predict the (log) target for the held-out rows.
lr_predict = lr.predict(X_test_log)
# Mean squared error, measured in log space.
lr_mse = mean_squared_error(y_test_log, lr_predict)

In [8]:
# Number of folds the data is split into
n = 4
# Number of base models
m = 2
# Split the row positions (not the DataFrames themselves) into n consecutive
# chunks, then slice with .iloc. Calling np.array_split directly on a
# DataFrame goes through DataFrame.swapaxes, which newer pandas deprecates.
row_chunks = np.array_split(np.arange(len(X)), n)
X_0 = [X.iloc[rows] for rows in row_chunks]
y_0 = [y.iloc[rows] for rows in row_chunks]

# `axis` must be passed by keyword: pandas 2.x no longer accepts it positionally.
pd.concat([X_0[0], X_0[1]], axis=0).head()

Unnamed: 0,GrLivArea,YearBuilt
0,1710,2003
1,1262,1976
2,1786,2001
3,1717,1915
4,2198,2000


In [9]:
# Two row vectors of shape (1, 10): the values 0-9 and 10-19.
a = np.arange(10).reshape(1, -1)
b = np.arange(10, 20).reshape(1, -1)

# Join them along axis 0, giving a (2, 10) array.
np.concatenate((a, b), axis=0)

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]])

In [10]:
# Registry of base models, keyed by a short name (insertion order is kept).
models = dict(
    dtr=DecisionTreeRegressor(max_depth=3),
    lr=LinearRegression(),
)
len(models)

2

In [11]:
# Name of the first registered model (dicts iterate in insertion order).
key = next(iter(models))
key

'dtr'

In [12]:
# Display the view of the registered model names.
models.keys()

dict_keys(['dtr', 'lr'])

In [13]:
# Display the view of the registered estimator instances.
models.values()

dict_values([DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best'), LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)])

In [14]:
# Enumerating a dict yields (position, key) pairs, not (key, value) pairs.
for position, name in enumerate(models):
    print(position, name)

0 dtr
1 lr


In [15]:
# Look up the estimator registered under the name stored in `key`.
models[key]

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [16]:
# Echo the current model name.
key

'dtr'

In [46]:
# Number of folds the training data is split into
n = 4
# Number of base models
m = 2
# Samples per fold. Floor division keeps this an integer without a float
# round-trip; it assumes len(X) divides evenly into n folds.
x_size = len(X) // n
# Uninitialised (n, x_size) buffer for a single model's fold predictions.
blend_data_0 = np.empty((n, x_size))
# One buffer per model, stacked on a new axis 1 -> shape (n, m, x_size).
# Stacking all m copies in one call works for any m >= 1; repeatedly stacking
# the growing array with a 2-D buffer only works for m == 2 and raises a
# shape-mismatch error for m >= 3.
blend_data = np.stack([blend_data_0] * m, axis=1)

print(blend_data.shape)

(4, 2, 365)


In [79]:
# Two plain lists: the values 0-9 and 10-19.
aa = list(range(10))
bb = list(range(10, 20))
# Collect both lists (by reference) into an outer list.
cc = [aa, bb]
print(cc[0])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [18]:
# Number of folds the data is split into
n = 4
# Number of base models (not used in this cell: only a decision tree is fitted)
m = 2
# Do not truncate displayed rows (up to 2000)
pd.set_option('display.max_rows', 2000)
# Samples per fold, derived from n rather than a hard-coded 4. The 2-D buffer
# below assumes len(X) divides evenly into n folds.
x_size = len(X) // n
# Row i holds the out-of-fold predictions for fold i.
blend_data = np.zeros((n, x_size))

# The split does not change between iterations, so compute it once. Row
# positions are split (rather than the DataFrames) and applied with .iloc,
# because np.array_split on a DataFrame relies on the deprecated
# DataFrame.swapaxes.
fold_rows = np.array_split(np.arange(len(X)), n)

for i, holdout_rows in enumerate(fold_rows):
    # Train on every fold except the i-th one.
    train_rows = np.concatenate(fold_rows[:i] + fold_rows[i + 1:])
    # A fresh estimator is created, trained, and used for prediction per fold.
    dtr = DecisionTreeRegressor(max_depth=3)
    dtr.fit(X.iloc[train_rows], y.iloc[train_rows])
    dtr_predict = dtr.predict(X.iloc[holdout_rows])
    blend_data[i] = dtr_predict

In [19]:
# Check which container type holds the log-transformed training features.
type(X_train_log)

pandas.core.frame.DataFrame

# クラス化

In [97]:
class Stacking():
    '''
    Scratch implementation of the out-of-fold prediction step of stacking.

    Parameters
    ----------
    divi_num : int
        Number of consecutive folds the data is split into (must be >= 2).

    Attributes
    ----------
    blend_data : list of list of float
        Filled by ``fit``. Holds one flat list of predictions per
        (model, fold) pair, ordered model by model and, within a model,
        fold by fold.
    '''
    def __init__(self, divi_num):
        self.divi_num = divi_num  # number of folds
        self.blend_data = []  # out-of-fold predictions, filled by fit()

    def fit(self, X, y, models):
        '''
        Collect out-of-fold predictions from every model.

        For each model and each fold, the model is fitted on all other folds
        and then predicts the held-out fold. The passed estimator instances
        are refitted in place, so after the call each one is left fitted on
        the data excluding the last fold.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature rows; split into ``divi_num`` consecutive chunks.
        y : pandas.DataFrame or pandas.Series
            Targets aligned row-by-row with ``X``.
        models : dict
            Maps a model name to an estimator exposing ``fit(X, y)`` and
            ``predict(X)``.

        Returns
        -------
        self : Stacking
            The instance, with ``blend_data`` replaced by the new predictions.

        Raises
        ------
        ValueError
            If ``divi_num`` is smaller than 2, because no training rows would
            remain once a fold is held out.
        '''
        if self.divi_num < 2:
            raise ValueError(
                "divi_num must be at least 2, got {}".format(self.divi_num)
            )

        # The split is identical for every model and fold, so compute it once.
        # Row positions are split (rather than the DataFrames) and applied
        # with .iloc, because np.array_split on a DataFrame relies on the
        # deprecated DataFrame.swapaxes.
        fold_rows = np.array_split(np.arange(len(X)), self.divi_num)

        # Build into a fresh list so that calling fit() again replaces the
        # previous results instead of appending to them.
        blend_data = []
        for clf in models.values():
            for j, holdout_rows in enumerate(fold_rows):
                # Train on every fold except the j-th one.
                train_rows = np.concatenate(fold_rows[:j] + fold_rows[j + 1:])
                clf.fit(X.iloc[train_rows], y.iloc[train_rows])
                clf_predict = clf.predict(X.iloc[holdout_rows])
                # Flatten (n,) or (n, 1) predictions into a plain list.
                blend_data.append(np.ravel(clf_predict).tolist())

        self.blend_data = blend_data
        return self

In [98]:
# Base models to stack, keyed by a short name.
models = {
    "dtr": DecisionTreeRegressor(max_depth=3),
    "lr": LinearRegression(),
}

# Collect out-of-fold predictions over 4 folds of the log-transformed training set.
stk = Stacking(divi_num=4)
stk.fit(X_train_log, y_train_log, models)