In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

%matplotlib inline

def print_regression_metrics(y_true, y_pred):
    """Print the MSE and RMSE of a set of predictions.

    Both values are written on a single line, formatted with six decimals.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, passed to ``mean_squared_error`` together
            with ``y_true``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    root_error = np.sqrt(squared_error)
    report = f'MSE = {squared_error:.6f}, RMSE = {root_error:.6f}'
    print(report)

# Read the whitespace-separated data file, skipping its first 22 lines.
# The separator is a regex, so it must be a raw string: a plain "\s+" contains
# an invalid escape sequence and triggers a warning on current Python versions.
# NOTE(review): the slicing below presumes each record is split across two
# consecutive physical rows of the file -- confirm against the actual CSV.
raw_df = pd.read_csv('../../data/boston.csv', sep=r"\s+", skiprows=22, header=None)

# Features: every column of the even rows followed by the first two columns of
# the odd rows. Target: the third column of the odd rows.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# shuffle=False keeps the file order: the first 75% of the rows become the
# training set and the remaining 25% the test set.
X_train, X_test, y_train, y_test = train_test_split(data, target, train_size=0.75, shuffle=False)

# Ensemble state read by gbm_predict: the fitted base models and the weight
# applied to each one. Both are (re)initialised empty here.
base_algorithms_list = []
coefficients_list = []

def L(p, y):
    """Return the residual ``y - p`` between targets and predictions.

    Args:
        p: Current prediction(s).
        y: True target value(s).

    Returns:
        The result of ``y - p`` (element-wise when the operands support it).
    """
    residual = y - p
    return residual

def gbm_predict(X, algorithms=None, coefficients=None):
    """Predict with the weighted sum of the base models.

    Each base model is asked for the whole batch ``X`` in a single
    ``predict`` call (instead of one call per sample), and the weighted
    predictions are accumulated in ensemble order.

    Args:
        X: Sequence or 2-D array of samples, one row per sample.
        algorithms: Fitted base models exposing ``predict(X)``. Defaults to
            the module-level ``base_algorithms_list``.
        coefficients: Weights paired with ``algorithms`` via ``zip`` (extra
            items on either side are ignored). Defaults to the module-level
            ``coefficients_list``.

    Returns:
        A list with one prediction per row of ``X``; all zeros when the
        ensemble is empty, and an empty list when ``X`` is empty.
    """
    if algorithms is None:
        algorithms = base_algorithms_list
    if coefficients is None:
        coefficients = coefficients_list

    n_samples = len(X)
    if n_samples == 0:
        # Nothing to predict; also avoids calling predict() on an empty batch.
        return []

    total = np.zeros(n_samples)
    for algo, coeff in zip(algorithms, coefficients):
        total += coeff * np.asarray(algo.predict(X), dtype=float)
    return total.tolist()

trees_count = 50
depth = 5
random_state = 139
base_coeff = 0.9  # tree i (0-based) is weighted by base_coeff / (1 + i)

# Running prediction of the current ensemble on the training set. It is
# computed once up front (all zeros while the ensemble is empty) and then
# updated with each new tree's weighted output, instead of re-evaluating every
# previously fitted tree on every iteration. The additions happen in the same
# order as in gbm_predict, so the residuals fed to each tree are unchanged.
y_pred = np.asarray(gbm_predict(X_train), dtype=float)
for i in range(trees_count):
    # Fit the next tree to the residuals of the current ensemble.
    tree = DecisionTreeRegressor(max_depth=depth, random_state=random_state)
    tree.fit(X_train, L(y_pred, y_train))

    coeff = base_coeff / (1.0 + i)
    base_algorithms_list.append(tree)
    coefficients_list.append(coeff)

    y_pred = y_pred + coeff * tree.predict(X_train)

# To draw learning curves, record inside the loop the RMSE of y_pred against
# y_train and of gbm_predict(X_test) against y_test, then plot both series.
print_regression_metrics(y_test, gbm_predict(X_test))

MSE = 22.632176, RMSE = 4.757329


In [52]:
from sklearn.linear_model import LinearRegression

# Baseline for comparison: ordinary least squares fitted on the same split and
# scored on the same test set.
reg = LinearRegression().fit(X_train, y_train)
print_regression_metrics(y_true=y_test, y_pred=reg.predict(X_test))

MSE = 68.144691, RMSE = 8.254980
