# Adaboost

In [1]:
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
cancer = datasets.load_breast_cancer()
x = cancer.data 
y = cancer.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [3]:
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(accuracy_score(y_pred, y_test))

0.9649122807017544


# blending

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

df_train = pd.read_csv('data/output/house_train_clean.csv')

df_train_y = df_train['單價(元/平方公尺)']
df_train = df_train.drop(['單價(元/平方公尺)'], axis=1)
display(df_train.head())

x_train, x_test, y_train, y_test = train_test_split(df_train, df_train_y, test_size=0.2, random_state=1)

Unnamed: 0,土地移轉總面積(平方公尺),建物現況格局-廳,建物現況格局-房,建物現況格局-衛,建物移轉總面積(平方公尺),總價(元),車位移轉總面積(平方公尺),車位總價(元),num_of_bus_stations_in_100m,income_avg,...,建物型態,建物現況格局-隔間,有無管理組織,車位類別,都市土地使用分區,鄉鎮市區,非都市土地使用分區,location_type,low_use_electricity,nearest_tarin_station
0,2.361522,-1.567592,-1.421213,-1.496114,-1.352965,-0.910604,0.0,0.0,-1.4251,0.105592,...,2,0,1,1,1,6,1,1,228,40
1,0.62873,0.812083,-0.734407,-0.343028,-0.344173,-0.66232,0.0,0.0,1.769935,-0.430513,...,1,0,1,1,0,1,1,3,252,39
2,2.361522,0.812083,1.326014,2.539689,2.294019,2.227697,0.0,0.0,1.130928,0.454073,...,0,0,0,4,0,5,1,2,335,23
3,0.294644,0.812083,0.639207,0.810059,-0.09657,-0.445073,0.0,0.0,-0.147086,-0.433294,...,1,0,1,1,0,9,1,3,76,52
4,-0.979831,0.812083,-0.0476,-0.343028,-0.679053,-0.320931,0.0,0.0,-1.4251,-0.728156,...,0,0,0,1,2,0,1,3,306,82


In [5]:
# 使用三種模型 : 線性迴歸 / 梯度提升機 / 隨機森林, 參數使用 Random Search 尋找
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
lr = LinearRegression(normalize=False, fit_intercept=True, copy_X=True)
gdbt = GradientBoostingRegressor(tol=0.1, subsample=0.37, n_estimators=200, max_features=20, 
                                 max_depth=6, learning_rate=0.03)
rf = RandomForestRegressor(n_estimators=300, min_samples_split=9, min_samples_leaf=10, 
                           max_features='sqrt', max_depth=8, bootstrap=False)

In [6]:
# 線性迴歸預測
model_lr = lr.fit(x_train, y_train)
lr_pred = model_lr.predict(x_test)
mse_lr = mean_squared_error(lr_pred,y_test, squared=False)
print(mse_lr)

88171.56857232089


In [7]:
# 隨機森林預測
model_rf = rf.fit(x_train, y_train)
rf_pred = model_rf.predict(x_test)
mse_rf = mean_squared_error(rf_pred,y_test, squared=False)
print(mse_rf)

87275.93511369871


In [8]:
# 梯度提升機預測
model_gdbt = gdbt.fit(x_train, y_train)
gdbt_pred = gdbt.predict(x_test)
mse_gdbt = mean_squared_error(gdbt_pred,y_test, squared=False)
print(mse_gdbt)

87522.71193572339


In [9]:
# 混合泛化預測
mse_sum = mse_gdbt + mse_rf + mse_lr
blending_pred = lr_pred*(mse_lr/mse_sum) + gdbt_pred*(mse_gdbt/mse_sum) + rf_pred*(mse_rf/mse_sum)
mean_squared_error(blending_pred,y_test, squared=False)

72094.14137134096

#### 注意，Blending 的前提是 : 個別單模效果都很好(有調參)並且模型差異⼤，單模要好尤其重要，如果單模效果差異太⼤，Blending 的效果提升就相當有限

In [10]:
from sklearn.ensemble import AdaBoostRegressor
adb = AdaBoostRegressor(n_estimators=50,random_state=0)

model_adb = adb.fit(x_train, y_train)
adb_pred = adb.predict(x_test)
mse_adb = mean_squared_error(adb_pred,y_test, squared=False)
print(mse_adb)

304948.95791727374


In [11]:
mse_sum = mse_gdbt + mse_rf + mse_lr + mse_adb
blending_pred = lr_pred*(mse_lr/mse_sum) + gdbt_pred*(mse_gdbt/mse_sum) + rf_pred*(mse_rf/mse_sum) + adb_pred*(mse_adb/mse_sum)
mean_squared_error(blending_pred,y_test, squared=False)

180502.68597362222