In [15]:
import plotly
from plotly import graph_objs as go
import pyecharts
import pandas as pd
import numpy as np

In [2]:
import os
import sys

# Project root: take the parent of the current working directory and go one
# more level up from there ('../'), then normalise to an absolute path.
project_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath('.')), '../'))
# Only register the root once: re-running this cell previously appended a
# duplicate entry to sys.path on every execution.
if project_path not in sys.path:
    sys.path.append(project_path)
from util.data_process import DataProcess

In [8]:
from sklearn.ensemble import RandomForestRegressor

In [3]:
def evaluate(d_test, pred, silent=1):
    """Score price predictions against a test frame and return the RMSE.

    Works on a copy of ``d_test`` (the caller's frame is not modified) to which
    the columns ``pred``, ``err`` (price - pred), ``abserr`` and ``ape``
    (absolute error / price) are added.

    Parameters
    ----------
    d_test : pandas.DataFrame
        Test rows; must contain a ``price`` column.
    pred : array-like
        Predicted prices, one per row of ``d_test``.
    silent : int or bool, default 1
        NOTE: despite its name, a truthy value PRINTS the metric summary and a
        falsy value suppresses it. The behaviour is kept as-is because callers
        already pass ``0`` to mute the output.

    Returns
    -------
    float
        Root mean squared error of the predictions.
    """
    scored = d_test.copy()
    scored['pred'] = pred
    # Column-wise arithmetic instead of row-by-row ``apply``: same values, no
    # Python-level loop over the rows.
    scored['err'] = scored['price'] - scored['pred']
    scored['abserr'] = scored['err'].abs()
    scored['ape'] = scored['abserr'] / scored['price']

    mape = np.mean(scored['ape'])
    # Share of rows whose absolute percentage error is within 5%.
    accuracy5p = float((scored['ape'] <= 0.05).mean())
    mse = np.mean(scored['err'] * scored['err'])
    rmse = np.sqrt(mse)
    if silent:
        # Scale to percent BEFORE rounding; rounding first and multiplying after
        # leaked float artefacts such as "7.000000000000001%".
        print(f'5%: {round(accuracy5p * 100, 2)}%')
        print(f'mse: {round(mse, 4)}')
        print(f'MAPE: {round(mape * 100, 2)}%')
        print(f'rmse: {round(rmse, 4)}')
    return rmse
def gen_line_plotly(img_path, df, attrs=None, names=None, consult_cols=None, **kwargs):
    """Write an offline plotly line chart with one trace per column.

    Parameters
    ----------
    img_path : str
        Output file passed to ``plotly.offline.plot`` as ``filename``.
    df : pandas.DataFrame or dict
        Data to plot; a dict is converted with ``pd.DataFrame``.
    attrs : sequence, optional
        X-axis labels, one per row of ``df``. Each label is prefixed with
        ``'_'``. When omitted, ``att0``, ``att1``, ... are generated.
    names : sequence, optional
        Columns to plot (also used as trace names); defaults to every column.
    consult_cols : iterable, optional
        Extra y-series, each drawn as a dotted trace named ``'-'``.
    **kwargs
        ``graph_name`` (chart title, default ``'-'``) and ``auto_open``
        (forwarded to plotly, default ``False``).

    Raises
    ------
    Exception
        If ``attrs`` is given and its length differs from the row count.
    """
    frame = pd.DataFrame(df) if isinstance(df, dict) else df
    series_names = names if names is not None else list(frame.columns)
    row_count = len(frame)

    if attrs is not None and len(attrs) != row_count:
        raise Exception('attrs length error.')
    if attrs is None:
        x_labels = [f'att{pos}' for pos in range(row_count)]
    else:
        x_labels = [f'_{label}' for label in attrs]

    traces = [go.Scatter(x=x_labels, y=frame[col], name=col) for col in series_names]
    if consult_cols:
        for ref_line in consult_cols:
            traces.append(go.Scatter(x=x_labels, y=ref_line, name='-', line={'dash': 'dot'}))

    figure = {
        "data": traces,
        "layout": go.Layout(title=kwargs.get('graph_name', '-')),
    }
    plotly.offline.plot(
        figure,
        filename=img_path,
        auto_open=kwargs.get('auto_open', False),
    )


In [4]:
# Train / test splits live under <project_path>/data.
train_csv = f'{project_path}/data/small_car_price_train.201908.csv'
test_csv = f'{project_path}/data/small_car_price_test.201908.csv'
data_train = pd.read_csv(train_csv)
data_test = pd.read_csv(test_csv)
###

In [6]:
# Feature names are listed one per line; a leading '#' disables a feature.
with open('car_price_feat.txt') as f:
    feat_list = [
        line
        for line in f.read().split('\n')
        # Skip blank lines too: a trailing newline yields '' and the previous
        # `x[0] != '#'` check raised IndexError on it.
        if line.strip() and not line.startswith('#')
    ]

# Encoders are generated from train and test concatenated, then applied to each
# split separately. NOTE(review): presumably so that values seen only in the
# test split still get a code; confirm against DataProcess.gencode.
label_encode_map, f_map = DataProcess.gencode(pd.concat([data_train, data_test]), feat_list)
en_train = DataProcess.encode_process(data_train[feat_list], feat_list, label_encode_map)
en_test = DataProcess.encode_process(data_test[feat_list], feat_list, label_encode_map)




In [27]:
# Sweep min_samples_leaf over 2..9 with the other hyper-parameters held fixed
# and record the test-set RMSE for each value.
# Original note: "5" (presumably the value picked from an earlier run).
rmse_arr = {}
for v in range(2, 10):
    # Fixed seed: without it every fit draws different bootstrap samples, so
    # RMSE differences between settings are mixed with seed noise and the
    # sweep is not reproducible.
    regr = RandomForestRegressor(
        max_depth=7,
        n_estimators=305,
        min_samples_split=7,
        min_samples_leaf=v,
        random_state=42,
    )
    regr.fit(en_train, data_train.price)

    pred = regr.predict(en_test)
    # Third argument 0 mutes evaluate()'s metric printout.
    rmse_arr[v] = evaluate(data_test, pred, 0)

print(f'best_rmse: {min(rmse_arr.values())}')
gen_line_plotly(
    img_path='./min_samples_leaf.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)


best_rmse: 2.2713637272825937


In [26]:
# Sweep min_samples_split over 2..19 with max_depth and n_estimators held fixed
# and record the test-set RMSE for each value.
# Original note: "7" (presumably the value picked from an earlier run).
rmse_arr = {}
for v in range(2, 20):
    # Fixed seed: without it every fit draws different bootstrap samples, so
    # RMSE differences between settings are mixed with seed noise and the
    # sweep is not reproducible.
    regr = RandomForestRegressor(
        max_depth=7,
        n_estimators=305,
        min_samples_split=v,
        random_state=42,
    )
    regr.fit(en_train, data_train.price)

    pred = regr.predict(en_test)
    # Third argument 0 mutes evaluate()'s metric printout.
    rmse_arr[v] = evaluate(data_test, pred, 0)

print(f'best_rmse: {min(rmse_arr.values())}')
gen_line_plotly(
    img_path='./min_samples_split.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)


best_rmse: 2.2375070754003303


In [25]:
# Sweep n_estimators over 300..329 at max_depth=7 and record the test-set RMSE
# for each value.
rmse_arr = {}
for v in range(300, 330):
    # Fixed seed: adjacent settings differ by a single tree, so without a seed
    # the curve is dominated by run-to-run randomness rather than by the
    # parameter being swept.
    regr = RandomForestRegressor(
        max_depth=7,
        n_estimators=v,
        random_state=42,
    )
    regr.fit(en_train, data_train.price)

    pred = regr.predict(en_test)
    # Third argument 0 mutes evaluate()'s metric printout.
    rmse_arr[v] = evaluate(data_test, pred, 0)

print(f'best_rmse: {min(rmse_arr.values())}')
gen_line_plotly(
    img_path='./n_estimators1.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)


best_rmse: 2.2340290582796682


In [21]:
# Sweep max_depth over 6..13 at n_estimators=200 and record the test-set RMSE
# for each value.
rmse_arr = {}
for v in range(6, 14):
    # Fixed seed: without it every fit draws different bootstrap samples, so
    # RMSE differences between depths are mixed with seed noise and the sweep
    # is not reproducible.
    regr = RandomForestRegressor(
        max_depth=v,
        n_estimators=200,
        random_state=42,
    )
    regr.fit(en_train, data_train.price)

    pred = regr.predict(en_test)
    # Third argument 0 mutes evaluate()'s metric printout.
    rmse_arr[v] = evaluate(data_test, pred, 0)

print(f'best_rmse: {min(rmse_arr.values())}')
gen_line_plotly(
    img_path='./max_depth.html',
    df={'a': list(rmse_arr.values())},
    attrs=list(rmse_arr.keys()),
    auto_open=1,
)


best_rmse: 2.188000395773692


In [10]:
# Pair every feature name with the importance reported by the fitted forest.
# NOTE(review): `regr` is whatever model the most recently executed sweep cell
# left behind (its last loop iteration); confirm that is the intended model.
feat_importance = pd.DataFrame(
    {
        'feat': feat_list,
        'imp': list(regr.feature_importances_),
    }
)

In [11]:
# Most important features first; left as the cell's last expression so the
# sorted frame is displayed.
feat_importance.sort_values('imp', ascending=False)

Unnamed: 0,feat,imp
61,used_months,0.601675
39,model_age,0.088310
64,used_ages,0.058751
7,car_length,0.055457
42,model_year,0.054354
62,transmission_case,0.048033
8,car_width,0.043680
67,dipped_headlight,0.011197
37,max_horse_power,0.009730
23,edition,0.009580


In [31]:
# Predicted vs. actual price per test row, ordered by actual price so the
# 'price' trace rises monotonically and the 'pred' trace can be compared to it.
# NOTE(review): `pred` comes from the last loop iteration of whichever sweep
# cell ran most recently; confirm it belongs to the model you mean to plot.
vdf = (
    pd.DataFrame({'pred': pred, 'price': data_test.price, 'id': data_test.id})
    .sort_values(by='price')
)
gen_line_plotly(
    './rf_car_price.html',
    df=vdf[['pred', 'price']],
    attrs=vdf.id,
    auto_open=1,
)