In [1]:
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from gbdt import GradBoostOnDT
from catboost import CatBoostRegressor

In [2]:
X, y = make_regression(n_samples=100, n_features=3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
trees = []

In [3]:
train_df = pd.DataFrame(X_train)
train_df["y_true"] = y_train
test_df = pd.DataFrame(X_test)
train_df

Unnamed: 0,0,1,2,y_true
0,-0.322062,1.549934,-0.783253,5.641086
1,0.751933,-1.168678,1.142823,43.758637
2,0.954002,0.570891,1.135566,107.859291
3,-0.730367,-0.715304,0.679598,-62.937683
4,-1.220844,-1.057711,0.822545,-106.874231
...,...,...,...,...
85,-0.919424,-1.062304,0.473592,-90.570574
86,-0.714351,0.232254,0.293072,-41.869649
87,-0.469474,1.579213,0.767435,22.917148
88,0.714000,0.482472,-0.223463,63.236207


In [4]:
train_df["y_pred"] = train_df["y_true"].mean()
train_df

Unnamed: 0,0,1,2,y_true,y_pred
0,-0.322062,1.549934,-0.783253,5.641086,3.124915
1,0.751933,-1.168678,1.142823,43.758637,3.124915
2,0.954002,0.570891,1.135566,107.859291,3.124915
3,-0.730367,-0.715304,0.679598,-62.937683,3.124915
4,-1.220844,-1.057711,0.822545,-106.874231,3.124915
...,...,...,...,...,...
85,-0.919424,-1.062304,0.473592,-90.570574,3.124915
86,-0.714351,0.232254,0.293072,-41.869649,3.124915
87,-0.469474,1.579213,0.767435,22.917148,3.124915
88,0.714000,0.482472,-0.223463,63.236207,3.124915


In [5]:
k = 400
nu = 0.1

for i in range(k):
    train_df["residual"] = train_df["y_true"] - train_df["y_pred"]
    tree = DecisionTreeRegressor(max_depth=1)
    tree.fit(train_df[[0, 1, 2]], train_df["residual"])
    train_df["y_pred"] += nu * tree.predict(train_df[[0, 1, 2]])
    trees.append(tree)

In [6]:
train_df

Unnamed: 0,0,1,2,y_true,y_pred,residual
0,-0.322062,1.549934,-0.783253,5.641086,10.497093,-4.833189
1,0.751933,-1.168678,1.142823,43.758637,43.406094,0.375362
2,0.954002,0.570891,1.135566,107.859291,115.487178,-7.741980
3,-0.730367,-0.715304,0.679598,-62.937683,-63.625089,0.710225
4,-1.220844,-1.057711,0.822545,-106.874231,-102.360703,-4.490709
...,...,...,...,...,...,...
85,-0.919424,-1.062304,0.473592,-90.570574,-91.835885,1.288130
86,-0.714351,0.232254,0.293072,-41.869649,-46.417779,4.570948
87,-0.469474,1.579213,0.767435,22.917148,23.199281,-0.259315
88,0.714000,0.482472,-0.223463,63.236207,64.143774,-0.884748


In [7]:
test_df["y_pred"] = train_df["y_true"].mean()
for tree in trees:
    test_df["y_pred"] += nu * tree.predict(test_df[[0, 1, 2]])

In [8]:
test_df["y_true"] = y_test
test_df

Unnamed: 0,0,1,2,y_pred,y_true
0,0.917862,0.404982,-1.260884,56.160093,57.944889
1,-1.724918,0.241962,-1.91328,-139.282542,-156.599862
2,1.057122,-0.719844,-0.460639,72.08788,50.873505
3,3.852731,0.513786,0.515048,222.062477,312.821232
4,-0.471932,0.68626,-1.612716,-28.552168,-44.684949
5,1.765454,-0.474945,-0.653329,138.235876,107.529856
6,-0.46573,0.54256,-0.463418,-16.1989,-27.879156
7,-0.839218,1.031,0.93128,-29.205791,-17.390562
8,-1.463515,-0.327662,-0.392108,-115.967629,-126.052345
9,0.852433,0.186454,-0.661786,41.55853,57.500851


In [9]:
mean_squared_error(test_df["y_pred"], y_test)

1082.5289744498045

In [10]:
model = GradBoostOnDT(400, 0.1, 1)

In [11]:
model.fit(X_train, y_train)

In [12]:
model.train()

In [13]:
predict = model.predict(X_test)
predict

array([  56.16009274, -139.28254151,   72.08787993,  222.06247709,
        -28.55216837,  138.23587603,  -16.19889983,  -29.20579064,
       -115.9676291 ,   41.5585299 ])

In [14]:
mean_squared_error(predict, y_test)

1082.5289744498045

In [15]:
catboost_model = CatBoostRegressor(learning_rate=0.1, depth=1, n_estimators=400)

In [16]:
catboost_model.fit(X_train, y_train)

0:	learn: 73.1586964	total: 133ms	remaining: 52.9s
1:	learn: 70.0009046	total: 133ms	remaining: 26.5s
2:	learn: 66.6811370	total: 133ms	remaining: 17.7s
3:	learn: 64.0871410	total: 134ms	remaining: 13.2s
4:	learn: 61.7846644	total: 134ms	remaining: 10.6s
5:	learn: 59.4842871	total: 134ms	remaining: 8.8s
6:	learn: 57.2471223	total: 134ms	remaining: 7.53s
7:	learn: 55.1801277	total: 134ms	remaining: 6.58s
8:	learn: 53.3055986	total: 135ms	remaining: 5.85s
9:	learn: 51.6303330	total: 135ms	remaining: 5.26s
10:	learn: 50.0432520	total: 135ms	remaining: 4.77s
11:	learn: 48.6576510	total: 135ms	remaining: 4.37s
12:	learn: 47.4925261	total: 135ms	remaining: 4.02s
13:	learn: 46.4132840	total: 135ms	remaining: 3.73s
14:	learn: 45.0816878	total: 135ms	remaining: 3.47s
15:	learn: 43.8505871	total: 135ms	remaining: 3.25s
16:	learn: 42.9749076	total: 136ms	remaining: 3.05s
17:	learn: 41.9934652	total: 136ms	remaining: 2.88s
18:	learn: 41.0436189	total: 136ms	remaining: 2.72s
19:	learn: 40.3750962	t

<catboost.core.CatBoostRegressor at 0x1b5a2191b50>

In [17]:
c_predict = catboost_model.predict(X_test)

In [18]:
mean_squared_error(c_predict, y_test)

877.0233338811662