In [1]:
import copy
import plotly
from plotly import graph_objs as go
import pyecharts
import pandas as pd

In [2]:
import lightgbm as lgb


Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.
This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.



In [3]:
import pandas as pd
import numpy as np

In [4]:
import os
project_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath('.')), '../'))
import sys
sys.path.append(project_path)
from util.data_process import DataProcess

In [5]:
def evaluate(d_test, pred, silent=1):
    new_d_test = d_test.copy()
    new_d_test['pred'] = pred
    new_d_test['err'] = new_d_test.apply(lambda row: row.price - row.pred, axis=1)
    new_d_test['abserr'] = new_d_test.err.map(lambda x: abs(x))
    new_d_test['ape'] = new_d_test.apply(lambda row: row.abserr / row.price, axis=1)
    mape = np.mean(new_d_test.ape)
    accuracy5p = len(list(filter(lambda x: x <= 0.05, new_d_test.ape))) / len(new_d_test)
    mse = np.mean(new_d_test.err * new_d_test.err)
    rmse = np.sqrt(mse)
    if silent:
        print(f'5%: {round(accuracy5p, 4)*100}%')
        print(f'mse: {round(mse, 4)}')
        print(f'MAPE: {round(mape * 100, 2)}%')
        print(f'rmse: {round(rmse, 4)}')
    return rmse
def gen_line_plotly(img_path, df, attrs=None, names=None, consult_cols=None, **kwargs):
    if isinstance(df, dict):
        df = pd.DataFrame(df)
    keys = list(df.columns) if names is None else names
    if attrs is None:
        attrs = [f'att{i}' for i in range(len(df))]
    elif len(attrs) != len(df):
        raise Exception('attrs length error.')
    else:
        attrs = [f'_{i}' for i in attrs]
    ####
    data = [go.Scatter(x=attrs, y=df[keys[i]], name=keys[i]) for i in range(len(keys))]
    if consult_cols:
        data += [go.Scatter(x=attrs, y=line, name='-', line={'dash': 'dot'}) for line in consult_cols]
    plotly.offline.plot({
        "data": data,
        "layout": go.Layout(title=kwargs.get('graph_name', '-'))
    },
        filename=img_path,
        auto_open=kwargs.get('auto_open', False)
    )


In [6]:
project_path

'/Users/lemon/PycharmProjects/way_to_ml'

In [7]:
data_train = pd.read_csv(f'{project_path}/data/car_price_train.201908.csv')
data_test = pd.read_csv(f'{project_path}/data/car_price_test.201908.csv')
###
series_name = '宝马5系'
d_train = data_train[data_train.model_series == series_name]
d_test = data_test[data_test.model_series == series_name]

In [8]:
with open('car_price_feat.txt') as f:
    feat_list = list(filter(lambda x: x[0] != '#', f.read().split('\n')))

In [9]:
label_encode_map, f_map = DataProcess.gencode(pd.concat([data_train, data_test]), feat_list)
en_train, en_test = DataProcess.encode_process(d_train[feat_list], feat_list, label_encode_map), DataProcess.encode_process(d_test[feat_list], feat_list, label_encode_map)

In [10]:
train_data = lgb.Dataset(en_train, label=d_train.price)
test_data = lgb.Dataset(en_test)

In [None]:
# objective regression, type = enum, options: regression, regression_l1, huber, fair, poisson, quantile, mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda, lambdarank
# boosting default = gbdt, type = enum, options: gbdt, rf, dart, goss, aliases: boosting_type, boost
# num_iterations  aliases: num_iteration, n_iter, num_tree, num_trees, num_round, num_rounds, num_boost_round, n_estimators

In [11]:
def run(param, num_round=200):
    bst = lgb.train(param, train_data, num_round)
    pred = bst.predict(en_test)
    r = evaluate(d_test, pred, 0)
    return r

In [12]:
core_params = {
    #'task': train, predict, convert_model, refit
    'objective': 'mape',
    'boosting': 'gbdt',  # gbdt, rf, dart, goss
    'learning_rate': 0.12, # 0.1
    'num_leaves': 6,
    'num_iterations': 270, # 100
    'tree_learner': 'data', # serial, feature, data, voting
}
learning_params = {
    'max_depth': 3, # -1
    'min_data_in_leaf': 21, # 20
    'min_sum_hessian_in_leaf': 0.001,  # 0.001
    'bagging_fraction': 0.9, # 采样 subsample
    'feature_fraction': 0.24, # 1
    'max_delta_step': 0, # 0 更保守, max_tree_output max_leaf_output
    'lambda_l1': 0.7, 
    'lambda_l2': 2.5,
    'min_gain_to_split': 0.0001,
}
io_params = {
    'verbosity': 2,
    'max_bin': 255, # 
}
metric_params = {
    'metric': 'rmse',
}
lgb_params = {
    **core_params,
    **learning_params,
    **metric_params,
    **io_params,
}
run(lgb_params)


Found `num_iterations` in params. Will use it instead of argument



2.2250486964004565

In [25]:
train_data1 = lgb.Dataset(en_train, label=d_train.price, categorical_feature=['deal_city', ])
bst = lgb.train(lgb_params, train_data1)
pred = bst.predict(en_test)
r = evaluate(d_test, pred, 0)
r

2.2308714577480253

In [20]:
fidf = pd.DataFrame({'feat': en_train.columns, 'importance': bst.feature_importance()})
fidf.sort_values(by='importance', ascending=False)

Unnamed: 0,feat,importance
61,used_months,87
10,deal_city,86
63,mileage,86
23,edition,52
48,power_displacement_cartesian,46
28,fender_flaw,45
39,model_age,44
49,region,42
6,car_height,36
68,far_beam,32


In [225]:
### metric
### 默认0
rmse_arr = {}
for v in ['l1', 'l2', 'rmse', 'quantile', 'mape', 'huber', 'fair']:

    d = copy.deepcopy(lgb_params)
    d['metric'] = v
    x = run(param=d)
    rmse_arr[v] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./metric.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.112436812913914


In [223]:
### max_bin 没啥效果
### 默认0
rmse_arr = {}
for v in [10, 20, 50, 100, 200, 300, 500]:

    d = copy.deepcopy(lgb_params)
    d['max_bin'] = v
    x = run(param=d)
    rmse_arr[v] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./max_bin.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.112436812913914


In [210]:
### min_gain_to_split
### 默认0
rmse_arr = {}
for i in range(1, 10):
    v = i / 1000
    d = copy.deepcopy(lgb_params)
    d['min_gain_to_split'] = v
    x = run(param=d)
    rmse_arr[v] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./min_gain_to_split.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.155366020865733


In [210]:
### min_gain_to_split
### 默认0
rmse_arr = {}
for i in range(1, 10):
    v = i / 1000
    d = copy.deepcopy(lgb_params)
    d['min_gain_to_split'] = v
    x = run(param=d)
    rmse_arr[v] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./min_gain_to_split.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.155366020865733


In [205]:
### lambda_l2
### 默认0
rmse_arr = {}
for i in range(2, 30):
    v = i / 10
    d = copy.deepcopy(lgb_params)
    d['lambda_l2'] = v
    x = run(param=d)
    rmse_arr[v] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./lambda_l2.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.09158677773926


In [201]:
### lambda_l1
### 默认0
rmse_arr = {}
for i in range(1, 30, 2):
    v = i / 10
    d = copy.deepcopy(lgb_params)
    d['lambda_l1'] = v
    x = run(param=d)
    rmse_arr[v] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./lambda_l1.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.021951449582994


In [199]:
### max_delta_step
### 默认0
rmse_arr = {}
for i in range(1, 30, 2):
    v = i / 10
    d = copy.deepcopy(lgb_params)
    d['max_delta_step'] = v
    x = run(param=d)
    rmse_arr[v] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./max_delta_step.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.0206829090623253


In [195]:
### feature_fraction
### 默认1
rmse_arr = {}
#for i in range(10, 50, 2):
for i in range(20, 30):
    v = i / 100
    d = copy.deepcopy(lgb_params)
    d['feature_fraction'] = v
    x = run(param=d, num_round=270)
    rmse_arr[v] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./feature_fraction1.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.0206829090623253


In [194]:
### feature_fraction
### 默认1
rmse_arr = {}
for i in range(5, 100, 5):
    v = i / 100
    d = copy.deepcopy(lgb_params)
    d['feature_fraction'] = v
    x = run(param=d, num_round=270)
    rmse_arr[v] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./feature_fraction.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.0312264776264244


In [196]:
### bagging_fraction
### 默认1
rmse_arr = {}
for i in range(5, 100, 5):
    v = i / 100
    d = copy.deepcopy(lgb_params)
    d['bagging_fraction'] = v
    x = run(param=d, num_round=270)
    rmse_arr[v] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./bagging_fraction.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.091472588979605


In [183]:
### min_sum_hessian_in_leaf
### 默认0.001
rmse_arr = {}
#for i in range(10, 50, 2):
for i in range(1, 20):
    v = i / 10000
    d = copy.deepcopy(lgb_params)
    d['min_sum_hessian_in_leaf'] = v
    x = run(param=d, num_round=270)
    rmse_arr[v] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./min_sum_hessian_in_leaf.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.0818824202517483


In [182]:
### min_data_in_leaf
rmse_arr = {}
for i in range(15, 27):
    d = copy.deepcopy(lgb_params)
    d['min_data_in_leaf'] = i 
    x = run(param=d, num_round=270)
    rmse_arr[i] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./min_data_in_leaf1.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.0818824202517483


In [181]:
### min_data_in_leaf
### 20
rmse_arr = {}
for i in range(10, 50, 2):
    d = copy.deepcopy(lgb_params)
    d['min_data_in_leaf'] = i 
    x = run(param=d, num_round=270)
    rmse_arr[i] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./min_data_in_leaf.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.087939630373224


In [171]:
### max_depth
### 3 就行, 再大就过拟合了
rmse_arr = {}
for i in range(1, 10):
    d = copy.deepcopy(lgb_params)
    d['max_depth'] = i 
    x = run(param=d, num_round=270)
    rmse_arr[i] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./max_depth.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.0836279020979886


In [167]:
### learning_rate
rmse_arr = {}
for i in range(1, 100):
    d = copy.deepcopy(lgb_params)
    d['learning_rate'] = i/100 
    x = run(param=d, num_round=270)
    rmse_arr[i/100] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./learning_rate.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.0836279020979886


In [166]:
### num_round
rmse_arr = {}
for i in range(50, 350, 5):
    d = copy.deepcopy(lgb_params)
    x = run(param=d, num_round=i)
    rmse_arr[i] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./num_round.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.1055735970613205


In [159]:
### num_leaves
rmse_arr = {}
for i in range(2, 30):
    d = copy.deepcopy(lgb_params)
    d['num_leaves'] = i
    x = run(param=d)
    rmse_arr[i] = x

print(f'best_rmse: {min(list(rmse_arr.values()))}')
gen_line_plotly(
    img_path='./num_leaves.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)

best_rmse: 2.113846860364541


2.113846860364541