In [92]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor


In [2]:
import pandas as pd
import numpy as np

In [9]:
import os
project_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath('.')), '../'))
import sys
sys.path.append(project_path)

In [37]:
def evaluate(d_test, pred):
    new_d_test = d_test.copy()
    new_d_test['pred'] = pred
    new_d_test['err'] = new_d_test.apply(lambda row: row.price - row.pred, axis=1)
    new_d_test['abserr'] = new_d_test.err.map(lambda x: abs(x))
    new_d_test['ape'] = new_d_test.apply(lambda row: row.abserr / row.price, axis=1)
    mape = np.mean(new_d_test.ape)
    print(f'MAPE: {round(mape * 100, 2)}%')
    accuracy5p = len(list(filter(lambda x: x <= 0.05, new_d_test.ape))) / len(new_d_test)
    print(f'5%: {round(accuracy5p, 4)*100}%')
    mse = np.mean(new_d_test.err * new_d_test.err)
    print(f'mse: {round(mse, 4)}')
    rmse = np.sqrt(mse)
    print(f'rmse: {round(rmse, 4)}')

In [10]:
project_path

'/Users/lemon/PycharmProjects/way_to_ml'

In [11]:
from util.data_process import DataProcess

In [12]:

data_train = pd.read_csv(f'{project_path}/data/car_price_train.201908.csv')
data_test = pd.read_csv(f'{project_path}/data/car_price_test.201908.csv')
###
series_name = '宝马5系'
d_train = data_train[data_train.model_series == series_name]
d_test = data_test[data_test.model_series == series_name]

In [13]:
with open('car_price_feat.txt') as f:
    feat_list = list(filter(lambda x: x[0] != '#', f.read().split('\n')))

In [14]:
label_encode_map, f_map = DataProcess.gencode(pd.concat([data_train, data_test]), feat_list)

In [18]:
en_train, en_test = DataProcess.encode_process(d_train[feat_list], feat_list, label_encode_map), DataProcess.encode_process(d_test[feat_list], feat_list, label_encode_map)

In [63]:
est = HistGradientBoostingRegressor(max_iter=200, learning_rate=0.3, max_depth=6, min_samples_leaf=20, max_leaf_nodes=40)
est.fit(en_train, d_train.price)

HistGradientBoostingRegressor(l2_regularization=0.0, learning_rate=0.3,
                              loss='least_squares', max_bins=256, max_depth=6,
                              max_iter=200, max_leaf_nodes=40,
                              min_samples_leaf=20, n_iter_no_change=None,
                              random_state=None, scoring=None, tol=1e-07,
                              validation_fraction=0.1, verbose=0)

In [64]:
pred = est.predict(en_test)
evaluate(d_test, pred)

MAPE: 6.23%
5%: 51.28%
mse: 4.6821
rmse: 2.1638


In [65]:
est.score(en_train, d_train.price), est.score(en_test, d_test.price)

(0.9932548245376968, 0.8364660061612945)

In [66]:
from sklearn.ensemble import GradientBoostingRegressor

In [87]:
gbt = GradientBoostingRegressor(min_samples_leaf=5, min_samples_split=11, max_depth=3, max_leaf_nodes=30, subsample=0.8, n_estimators=250)
gbt.fit(en_train, d_train.price)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=30,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=5, min_samples_split=11,
                          min_weight_fraction_leaf=0.0, n_estimators=250,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=0.8, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [88]:
pred1= gbt.predict(en_test)
evaluate(d_test, pred1)

MAPE: 5.76%
5%: 53.38%
mse: 4.0399
rmse: 2.01


In [90]:
### 以下是grid search + CV
from sklearn.model_selection import GridSearchCV

In [106]:

param_grid = {
        "learning_rate": [0.1, 0.2, 0.05, 0.03, 0.02, 0.01],
        "max_depth": [3, 4],
        "subsample": [0.8, 0.9, 1],
        "max_leaf_nodes": [20, 25, 30]
    }
gbt = GradientBoostingRegressor(min_samples_leaf=5, min_samples_split=11, max_depth=3, max_leaf_nodes=30,
                                    subsample=0.8, n_estimators=250)
gscv = GridSearchCV(gbt, param_grid, cv=5)
gscv.fit(en_train, d_train.price)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingRegressor(alpha=0.9,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=30,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=5,
                                                 min_samples_split=11,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=250,
                                                 n_iter_...None,
                        

In [104]:
print("Test set score:{:.2f}".format(gscv.score(en_test,d_test.price)))

Test set score:0.86


In [105]:

print("Best parameters:{}".format(gscv.best_params_))
print("Best score on train set:{:.2f}".format(gscv.best_score_))

Best parameters:{'learning_rate': 0.05, 'max_depth': 3, 'subsample': 0.9}
Best score on train set:0.91
