In [1]:
import plotly
from plotly import graph_objs as go
import pyecharts
import pandas as pd
import numpy as np

In [4]:
import os
project_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath('.')), '../'))
import sys
sys.path.append(project_path)
from util.data_process import DataProcess
from util.visualization import draw_lines
from util.evaluate_process import car_price_evaluate

In [5]:
from sklearn.ensemble import RandomForestRegressor

In [21]:
def car_price_evaluate(d_test, pred, silent=0):
    new_d_test = d_test.copy()
    new_d_test['pred'] = pred
    new_d_test['err'] = new_d_test.apply(lambda row: row.price - row.pred, axis=1)
    new_d_test['abserr'] = new_d_test.err.map(lambda x: abs(x))
    new_d_test['ape'] = new_d_test.apply(lambda row: row.abserr / row.price, axis=1)
    mape = np.mean(new_d_test.ape)
    accuracy5p = len(list(filter(lambda x: x <= 0.05, new_d_test.ape))) / len(new_d_test)
    msev = float(np.mean(new_d_test.err * new_d_test.err))
    rmse = np.sqrt(msev)
    if not silent:
        print(f'rmse: {round(rmse, 4)}')
        print(f'mse: {round(msev, 4)}')
        print(f'MAPE: {round(mape * 100, 2)}%')
        print(f'5%: {round(accuracy5p, 4) * 100}%')
    return {
        'rmse': round(rmse, 4),
        'mse': round(msev, 4),
        'mape': round(mape*100, 4),
        '5%': round(accuracy5p, 4) * 100
    }
def draw_lines(xs, ys, names):
    fig = go.Figure()
    if isinstance(ys, pd.DataFrame):
        ys = [ys[k] for k in ys.columns]
    # Add traces
    for y, name in zip(ys, names):
        fig.add_trace(go.Scatter(x=xs, y=y,
                    mode='lines+markers',
                    name=name))
    fig.show()

In [6]:
data_train = pd.read_csv(f'{project_path}/data/small_car_price_train.201908.csv')
data_test = pd.read_csv(f'{project_path}/data/small_car_price_test.201908.csv')
###

In [7]:
with open('car_price_feat.txt') as f:
    feat_list = list(filter(lambda x: x[0] != '#', f.read().split('\n')))

label_encode_map, f_map = DataProcess.gencode(pd.concat([data_train, data_test]), feat_list)
en_train, en_test = DataProcess.encode_process(data_train[feat_list], feat_list, label_encode_map), DataProcess.encode_process(data_test[feat_list], feat_list, label_encode_map)

In [14]:
# min_samples_leaf
# 5
rmse_arr = {}
for i in range(2, 10):
    v = i
    regr = RandomForestRegressor(max_depth=7, 
                             n_estimators=305,
                                min_samples_split=7,
                                min_samples_leaf= v)
    regr.fit(en_train, data_train.price)  

    pred = regr.predict(en_test)
    ret = car_price_evaluate(data_test, pred, 1)
    rmse_arr[v] = ret['rmse']
print(ret)
print(f'best_rmse: {min(list(rmse_arr.values()))}')
draw_lines(list(rmse_arr.keys()), [list(rmse_arr.values())], ['a'])



{'rmse': 2.3385, 'mse': 5.4688, 'mape': 6.3444, '5%': 48.449999999999996}
best_rmse: 2.2676


In [15]:
# min_samples_split
# 7
rmse_arr = {}
for i in range(2, 20):
    v = i
    regr = RandomForestRegressor(max_depth=7, 
                             n_estimators=305,
                                min_samples_split=v)
    regr.fit(en_train, data_train.price)  

    pred = regr.predict(en_test)
    ret = car_price_evaluate(data_test, pred, 1)
    rmse_arr[v] = ret['rmse']
print(ret)
print(f'best_rmse: {min(list(rmse_arr.values()))}')
draw_lines(list(rmse_arr.keys()), [list(rmse_arr.values())], ['a'])



{'rmse': 2.2644, 'mse': 5.1275, 'mape': 6.2543, '5%': 49.18}
best_rmse: 2.2363


In [16]:
# n_estimators
rmse_arr = {}
for i in range(300, 330):
    v = i
    regr = RandomForestRegressor(max_depth=7, 
                             n_estimators=v)
    regr.fit(en_train, data_train.price)  

    pred = regr.predict(en_test)
    ret = car_price_evaluate(data_test, pred, 1)
    rmse_arr[v] = ret['rmse']
print(ret)
print(f'best_rmse: {min(list(rmse_arr.values()))}')
draw_lines(list(rmse_arr.keys()), [list(rmse_arr.values())], ['a'])



{'rmse': 2.2335, 'mse': 4.9884, 'mape': 6.1582, '5%': 50.82}
best_rmse: 2.2315


In [17]:
# max_depth
rmse_arr = {}
for i in range(6, 14):
    v = i
    regr = RandomForestRegressor(max_depth=i, 
                             n_estimators=200)
    regr.fit(en_train, data_train.price)  

    pred = regr.predict(en_test)
    ret = car_price_evaluate(data_test, pred, 1)
    rmse_arr[v] = ret['rmse']
print(ret)
print(f'best_rmse: {min(list(rmse_arr.values()))}')
draw_lines(list(rmse_arr.keys()), [list(rmse_arr.values())], ['a'])



{'rmse': 2.2092, 'mse': 4.8806, 'mape': 6.1191, '5%': 51.370000000000005}
best_rmse: 2.2017


In [18]:
feat_importance = pd.DataFrame({'feat':feat_list, 'imp': list(regr.feature_importances_)})

In [19]:
feat_importance.sort_values(by='imp', ascending=False)

Unnamed: 0,feat,imp
61,used_months,5.695652e-01
64,used_ages,7.493898e-02
39,model_age,5.139220e-02
42,model_year,4.853518e-02
7,car_length,4.043051e-02
62,transmission_case,3.835661e-02
8,car_width,3.633432e-02
63,mileage,3.178266e-02
37,max_horse_power,1.492801e-02
23,edition,1.229652e-02


In [23]:
vdf = pd.DataFrame({'pred': pred, 'price': data_test.price, 'id':data_test.id})
vdf = vdf.sort_values(by='price')

draw_lines([f'_{i}' for i in vdf.id], vdf[['pred', 'price']], ['pred', 'price'])