# AdaBoost

In [7]:
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [8]:
# Load the scikit-learn breast-cancer dataset and hold out 20% of it for testing.
cancer = datasets.load_breast_cancer()
x = cancer.data
y = cancer.target
# Seed the shuffle so the split, and the accuracy reported from it, is the same
# on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [9]:
# Train a 100-estimator AdaBoost classifier (fixed seed) on the training split,
# then print its accuracy on the held-out test split.
clf = AdaBoostClassifier(n_estimators=100, random_state=0).fit(x_train, y_train)
y_pred = clf.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.9912280701754386


# Blending

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Read the cleaned housing data and separate the regression target
# (the unit-price column) from the feature columns.
target_col = '單價(元/平方公尺)'
df_train = pd.read_csv('data/output/house_train_clean.csv')

df_train_y = df_train[target_col]
df_train = df_train.drop(columns=[target_col])
# NOTE(review): the displayed head includes an 'Unnamed: 0' column, which looks
# like a saved row index being kept as a feature -- confirm whether it belongs.
display(df_train.head())

# 80/20 split with a fixed seed. This rebinds x_train/x_test/y_train/y_test,
# which the AdaBoost classifier section earlier in the notebook also assigns.
x_train, x_test, y_train, y_test = train_test_split(
    df_train,
    df_train_y,
    test_size=0.2,
    random_state=1,
)

Unnamed: 0,土地移轉總面積(平方公尺),建物現況格局-廳,建物現況格局-房,建物現況格局-衛,建物移轉總面積(平方公尺),總價(元),車位移轉總面積(平方公尺),車位總價(元),num_of_bus_stations_in_100m,income_avg,...,土地區段位置/建物區段門牌,建物型態,建物現況格局-隔間,有無管理組織,車位類別,都市土地使用分區,鄉鎮市區,非都市土地使用分區,location_type,nearest_tarin_station
0,2.364366,-1.567971,-1.41939,-1.495833,-1.353538,-0.91014,-0.539096,-0.389257,-1.429543,0.102731,...,933,2,0,1,1,1,6,1,1,40
1,0.631359,0.810838,-0.732958,-0.343334,-0.343538,-0.661491,-0.539096,-0.389257,1.77241,-0.432691,...,12,1,0,1,1,0,1,1,3,39
2,2.364366,0.810838,1.326338,2.537915,2.297815,2.232775,2.427249,-0.389257,1.132019,0.450769,...,1090,0,0,0,4,0,5,1,2,23
3,0.297232,0.810838,0.639906,0.809166,-0.095638,-0.443924,-0.539096,-0.389257,-0.148762,-0.435469,...,717,1,0,1,1,0,9,1,3,52
4,-0.977402,0.810838,-0.046526,-0.343334,-0.67882,-0.3196,-0.539096,-0.389257,-1.429543,-0.729956,...,55,0,0,0,1,2,0,1,3,82


In [11]:
# Three base models for blending: linear regression, gradient boosting
# and random forest.
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
lr = LinearRegression()
# Row subsampling (subsample < 1) and per-split feature sampling make both
# ensembles stochastic; random_state pins them so the reported errors are
# reproducible across runs.
# NOTE(review): `tol` is an early-stopping tolerance; it presumably has no
# effect here because n_iter_no_change is not set -- confirm against the docs.
gdbt = GradientBoostingRegressor(tol=0.1, subsample=0.37, n_estimators=200, max_features=20,
                                 max_depth=6, learning_rate=0.03, random_state=0)
rf = RandomForestRegressor(n_estimators=300, min_samples_split=9, min_samples_leaf=10,
                           max_features='sqrt', max_depth=8, bootstrap=False, random_state=0)

In [18]:
# Linear regression: fit on the training split, predict the test split and
# report the test-set RMSE.
model_lr = lr.fit(x_train, y_train)
lr_pred = model_lr.predict(x_test)
print(lr_pred)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6; taking
# the square root explicitly gives the same RMSE on every version. Arguments
# follow the documented (y_true, y_pred) order.
# NOTE: despite its name, mse_lr holds the RMSE; the name is kept because the
# blending cells read it.
mse_lr = np.sqrt(mean_squared_error(y_test, lr_pred))
print(mse_lr)

[182791.01032123  14222.87217784 120810.20320323 ...  91932.25542118
 203975.87374086 164058.65312487]
88045.6769070151


In [13]:
# Random forest: fit on the training split, predict the test split and
# report the test-set RMSE.
model_rf = rf.fit(x_train, y_train)
rf_pred = model_rf.predict(x_test)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6; taking
# the square root explicitly gives the same RMSE on every version. Arguments
# follow the documented (y_true, y_pred) order.
# NOTE: despite its name, mse_rf holds the RMSE; the name is kept because the
# blending cells read it.
mse_rf = np.sqrt(mean_squared_error(y_test, rf_pred))
print(mse_rf)

86606.3117036469


In [14]:
# Gradient boosting: fit on the training split, predict the test split and
# report the test-set RMSE.
model_gdbt = gdbt.fit(x_train, y_train)
# Predict through the fitted handle, matching the linear-regression and
# random-forest cells (fit() returns the estimator itself).
gdbt_pred = model_gdbt.predict(x_test)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6; taking
# the square root explicitly gives the same RMSE on every version. Arguments
# follow the documented (y_true, y_pred) order.
# NOTE: despite its name, mse_gdbt holds the RMSE; the name is kept because the
# blending cells read it.
mse_gdbt = np.sqrt(mean_squared_error(y_test, gdbt_pred))
print(mse_gdbt)

92482.81759417226


In [15]:
# Blended prediction: a weighted average of the three models in which each
# weight is proportional to the inverse of that model's RMSE, so the more
# accurate models contribute more.
# NOTE: the mse_* values were measured on the test split, so the weights are
# derived from the same data that scores the blend; the figure below is
# therefore optimistic. A separate validation split would be needed for an
# unbiased estimate.
w_lr, w_gdbt, w_rf = 1 / mse_lr, 1 / mse_gdbt, 1 / mse_rf
mse_sum = w_gdbt + w_rf + w_lr  # normaliser so the weights sum to 1
blending_pred = (
    lr_pred * (w_lr / mse_sum)
    + gdbt_pred * (w_gdbt / mse_sum)
    + rf_pred * (w_rf / mse_sum)
)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6; the
# explicit square root yields the same RMSE on every version.
np.sqrt(mean_squared_error(y_test, blending_pred))

73304.58299696611

#### 注意，Blending 的前提是：個別單模效果都很好（有調參）並且模型差異大，單模要好尤其重要，如果單模效果差異太大，Blending 的效果提升就相當有限

In [16]:
from sklearn.ensemble import AdaBoostRegressor
# AdaBoost regressor with 50 estimators and a fixed seed, used as a fourth
# candidate for the blend.
adb = AdaBoostRegressor(n_estimators=50,random_state=0)

model_adb = adb.fit(x_train, y_train)
# Predict through the fitted handle, matching the other model cells
# (fit() returns the estimator itself).
adb_pred = model_adb.predict(x_test)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6; taking
# the square root explicitly gives the same RMSE on every version. Arguments
# follow the documented (y_true, y_pred) order.
# NOTE: despite its name, mse_adb holds the RMSE; the name is kept because the
# following blending cell reads it.
mse_adb = np.sqrt(mean_squared_error(y_test, adb_pred))
print(mse_adb)

340878.6545812134


In [17]:
# Adding AdaBoost to the blend makes the result worse.
# Same inverse-RMSE weighting as the three-model blend, now with four models.
# NOTE: the mse_* values were measured on the test split, so the weights are
# derived from the same data that scores the blend.
w_lr, w_gdbt, w_rf, w_adb = 1 / mse_lr, 1 / mse_gdbt, 1 / mse_rf, 1 / mse_adb
mse_sum = w_gdbt + w_rf + w_lr + w_adb  # normaliser so the weights sum to 1
blending_pred = (
    lr_pred * (w_lr / mse_sum)
    + gdbt_pred * (w_gdbt / mse_sum)
    + rf_pred * (w_rf / mse_sum)
    + adb_pred * (w_adb / mse_sum)
)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6; the
# explicit square root yields the same RMSE on every version.
np.sqrt(mean_squared_error(y_test, blending_pred))

82110.93196756986