In [17]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
import os
project_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath('.')), '../'))
import sys
sys.path.append(project_path)
from util.visualization import draw_lines
from util.visualization import draw_scatters
from util.evaluate_process import classifier_evaluate, car_price_evaluate
from collections import Counter
from sklearn.metrics import auc, roc_curve


In [2]:
data_train = pd.read_csv(f'{project_path}/data/small_car_price_train.201908.csv')
data_test = pd.read_csv(f'{project_path}/data/small_car_price_test.201908.csv')

In [20]:
data_test.columns

Index(['c_price', 'max_price', 'min_price', 'dealer_price', 'car_model',
       'fuel_type', 'register_city', 'model_generation',
       'is_electric_skylight', 'is_panoramic_sunroof',
       ...
       'bottom_bigside_chromaticnum', 'bottom_bigside_weldnum',
       'bottom_bigside_injurynum', 'bottom_bigside_injury_sagnum',
       'bottom_bigside_injury_scratchnum', 'bottom_bigside_injury_brokennum',
       'bottom_bigside_injury_scarsnum', 'bottom_bigside_injury_cracknum',
       'bottom_bigside_repairnum', 'id'],
      dtype='object', length=131)

In [3]:
from util.evaluate_process import car_price_evaluate
from util.visualization import draw_lines
from util.data_process import DataProcess

In [4]:
with open(f'{project_path}/data/car_price_feat.txt') as f:
    feat_list = list(filter(lambda x: x[0] != '#', f.read().split('\n')))

label_encode_map, f_map = DataProcess.gencode(pd.concat([data_train, data_test]), feat_list)
X_train, X_test = DataProcess.encode_process(data_train[feat_list], feat_list, label_encode_map), DataProcess.encode_process(data_test[feat_list], feat_list, label_encode_map)
y_train = data_train.price
y_test = data_test.price

In [18]:
def car_price_evaluate(target, pred, silent=0):
    if isinstance(target, pd.DataFrame):
        new_d_test = target.copy()
        new_d_test['pred'] = pred
    else:
        new_d_test = pd.DataFrame({'price': target, 'pred':pred})

    new_d_test['err'] = new_d_test.apply(lambda row: row.price - row.pred, axis=1)
    new_d_test['abserr'] = new_d_test.err.map(lambda x: abs(x))
    new_d_test['ape'] = new_d_test.apply(lambda row: row.abserr / row.price, axis=1)
    mape = np.mean(new_d_test.ape)
    accuracy5p = len(list(filter(lambda x: x <= 0.05, new_d_test.ape))) / len(new_d_test)
    msev = float(np.mean(new_d_test.err * new_d_test.err))
    rmse = np.sqrt(msev)
    if not silent:
        print(f'rmse: {round(rmse, 4)}')
        print(f'mse: {round(msev, 4)}')
        print(f'MAPE: {round(mape * 100, 2)}%')
        print(f'5%: {round(accuracy5p, 4) * 100}%')
    return {
        'rmse': round(rmse, 4),
        'mse': round(msev, 4),
        'mape': round(mape*100, 4),
        '5%': round(accuracy5p, 4) * 100
    }


rmse: 2.4969
mse: 6.2344
MAPE: 7.06%
5%: 45.61%


{'rmse': 2.4969, 'mse': 6.2344, 'mape': 7.0601, '5%': 45.61}

In [9]:
def tree_to_dot(tree_dict, keys=None):
    # node: 0 [label="abc"];
    # line: 0 -> 1;
    if keys is None:
        keys = ['key', 'val', 'size', 'reason']
    nodes = [];
    lines = [];
    index = [0];
    def _process(struct, parent_id=None, path=''):
        cnt = '\\n'.join([f'{key}:{struct.get(key, "-")}' for key in keys])
        node = f'{index[0]} [label="{cnt}"];'
        nodes.append(node)
        if parent_id is not None:
            if path:
                lines.append(f'{parent_id} -> {index[0]} [headlabel="{path}"];')
            else:
                lines.append(f'{parent_id} -> {index[0]};')
        ###
        selfindex = index[0]
        index[0] += 1
        if 'trees' in struct:
            for v in struct['trees']:
                _process(struct['trees'][v], selfindex, path=v)
    ###
    _process(tree_dict)
    nodesstr = '\n'.join(nodes)
    linesstr = '\n'.join(lines)
    outputstr = 'digraph Tree {\nnode [shape=box] ;\n %s\n%s\n}' % (nodesstr, linesstr)
    return outputstr

## 这里dot 是一个绘图命令
## open 是mac 上的打开文件命令
def draw_tree(tree, save_path, keys=None):
    s = tree_to_dot(tree, keys)
    with open(f'{save_path}.dot', 'w') as f:
        f.write(s)
    os.system(f'dot -Tpng {save_path}.dot -o {save_path}.png && open {save_path}.png')

In [5]:
def lsd(arr):
    m = np.mean(arr)
    return sum([(x-m)**2 for x in arr])

def lad(arr):
    m = np.mean(arr)
    return sum([abs(x-m) for x in arr])
    

In [15]:
### 
def gen_tree_cart_reg(df, depth=0, min_samples_split=4, min_samples_leaf=2, max_depth=8, min_ms=4):
    ori_quota = lsd(df.target)
    size = len(df)
    ret = {
            'ori_quota': ori_quota,
            'size': size,
            'regval': np.mean(df.target),
            'depth': depth
    }
    if size < min_samples_split:
        # 这个节点数量太少, 不应该分裂
        ret['reason'] = 'less than min_samples_split'
        return ret
    if depth >= max_depth:
        # 太深了, 停止分裂
        ret['reason'] = 'to deep'
        return ret
    ###
    best_quota = 1
    best_key = None
    best_val = None
    ###
    for k in df.columns:
        if k == 'target':
            continue
        ## 其实可以在这里区分离散值和连续值, 来给出最好分裂点
        uniq_val = set(df[k])
        for v in uniq_val:
            ### 缺省值划分到 <=
            df1 = df[~(df[k] > v)]
            df2 = df[  df[k] > v ]
            if len(df1) == size or len(df1) == 0:
                continue
            quota1 = lsd(df1.target)
            quota2 = lsd(df2.target)
            new_quota = quota1 + quota2
            if best_key is None or new_quota < best_quota:
                best_key = k
                best_val = v
                best_quota = new_quota
    ###
    ret['best_quota'] = best_quota
    ret['key'] = best_key
    ret['val'] = best_val
    df1 = df[~(df[best_key] > best_val)]
    df2 = df[  df[best_key] > best_val ]
    ret['leaf_num'] = [len(df1), len(df2)]
    if len(df1) < min_samples_leaf or len(df2) < min_samples_leaf:
        # 子节点数量过少, 不应该分裂
        ret['reason'] = 'leaf samples less than min_samples_leaf'
        return ret
    tree1 = gen_tree_cart_reg(df1, depth=depth+1)
    tree2 = gen_tree_cart_reg(df2, depth=depth+1)
    ret['trees'] = {}
    ret['trees']['<='] = tree1
    ret['trees']['>'] = tree2
    return ret

def predict_process_cart_reg(cart_tree, data):
    prob = 0
    label = cart_tree['regval']
    ####
    if 'trees' in cart_tree:
        key = cart_tree['key']
        val = cart_tree['val']
        condition = '>' if data[key] > val else '<='
        tree = cart_tree['trees'][condition]
        ret = predict_process_cart_reg(tree, data)
        label = ret[0]
        prob = ret[1]
    return label, prob

    
def predict_cart_reg(cart_tree, df):
    d11 = df.to_dict()
    size = len(df)
    d12 = [{k: d11[k][i] for k in d11} for i in df.index]
    preds = []
    probs = []
    for data in d12:
        ret = predict_process_cart_reg(cart_tree, data)
        preds.append(ret[0])
        probs.append(ret[1])
    return preds, probs

In [25]:
rdf = X_train.copy()
rdf['target'] = y_train
adt = gen_tree_cart_reg(rdf, max_depth=10)
adt

{'ori_quota': 62415.53526871712,
 'size': 1509,
 'regval': 27.012158714380387,
 'depth': 0,
 'best_quota': 28711.611049271465,
 'key': 'used_months',
 'val': 56,
 'leaf_num': [879, 630],
 'trees': {'<=': {'ori_quota': 14440.467480773605,
   'size': 879,
   'regval': 31.013185437997723,
   'depth': 1,
   'best_quota': 9369.693195228836,
   'key': 'used_months',
   'val': 18,
   'leaf_num': [81, 798],
   'trees': {'<=': {'ori_quota': 1374.045283950617,
     'size': 81,
     'regval': 38.55197530864198,
     'depth': 2,
     'best_quota': 805.9157334149329,
     'key': 'car_height',
     'val': 1.49,
     'leaf_num': [38, 43],
     'trees': {'<=': {'ori_quota': 228.47974736842122,
       'size': 38,
       'regval': 35.73473684210527,
       'depth': 3,
       'best_quota': 133.3252190476191,
       'key': 'edition',
       'val': 4,
       'leaf_num': [14, 24],
       'trees': {'<=': {'ori_quota': 26.963085714285725,
         'size': 14,
         'regval': 33.66285714285714,
         'de

In [30]:
draw_tree(adt, 'cart_reg_price', keys=['key', 'val', 'ori_quota', 'size', 'regval', 'reason'])

In [28]:

## 测试集表现
pred, prob = predict_cart_reg(adt, X_test)
car_price_evaluate(target=y_test, pred=pred)

dldf = pd.DataFrame({'car_id':[f'_{x}' for x in data_test.id], 'price': y_test, 'pred': pred})
dldf.sort_values(by='price', inplace=True)
draw_lines(dldf.car_id, [dldf.price, dldf.pred], ['price', 'pred'])

rmse: 2.4969
mse: 6.2344
MAPE: 7.06%
5%: 45.61%


In [29]:
# 训练集
pred, prob = predict_cart_reg(adt, X_train)
car_price_evaluate(target=y_train, pred=pred)

dldf = pd.DataFrame({'car_id':[f'_{x}' for x in data_train.id], 'price': y_train, 'pred': pred})
dldf.sort_values(by='price', inplace=True)
draw_lines(dldf.car_id, [dldf.price, dldf.pred], ['price', 'pred'])

rmse: 1.6369
mse: 2.6795
MAPE: 4.62%
5%: 64.48%
