In [7]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
import os
project_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath('.')), '../'))
import sys
sys.path.append(project_path)
from util.visualization import draw_lines
from util.visualization import draw_scatters
from util.evaluate_process import classifier_evaluate
from collections import Counter
from sklearn.metrics import auc, roc_curve

In [8]:
# Load the preprocessed CSV into `df`; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_csv('../../data/preprocessed.samecar.csv')

In [9]:
# Every candidate feature column: the raw difference features together with
# their `*_sparse` counterparts.
feats = [
    'colorp1',
    'colorp2',
    'fuel_typep1',
    'fuel_typep2',
    'displacement_standard1',
    'displacement_standard2',
    'gearboxp1',
    'gearboxp2',
    'displacement_diff',
    'displacement_diff_sparse',
    'mile_diff',
    'mile_diff_sparse',
    'mile_diff_rate',
    'mile_diff_rate_sparse',
    'year_diff',
    'year_diff_sparse',
    'licensed_city_diff_sparse',
    'title_diff',
    'title_diff_sparse',
    'register_time_diff',
    'register_time_diff_sparse',
    'is_import_diff_sparse',
    'transfer_times_diff',
    'transfer_times_diff_sparse',
]
# Keep only the discrete features: a column is dropped exactly when `feats`
# also contains its `<name>_sparse` counterpart. Order follows `feats`.
sparse_feats = [name for name in feats if f'{name}_sparse' not in feats]

In [10]:
# Restrict the frame to the discrete features, then hold out 25% of the rows
# as a test split; the fixed random_state keeps the split reproducible.
rdf = df[sparse_feats]
X_train, X_test, y_train, y_test = train_test_split(
    rdf,
    df['is_same'],
    test_size=0.25,
    random_state=10,
)
# Show the resulting split sizes as the cell output.
len(X_train), len(X_test)

(8038, 2680)

In [51]:
def tree_to_dot(tree_dict):
    """Serialise a nested tree dict into Graphviz DOT source.

    Each node is emitted as ``<id> [label="..."];`` and each parent/child
    link as ``<parent> -> <child> [headlabel="<branch>"];`` (the headlabel is
    omitted when the branch name is falsy). Ids are assigned in pre-order,
    starting from 0 at the root.

    Args:
        tree_dict: dict describing the root node. The label shows the entries
            'key', 'val', 'gini', 'size', 'tags' and 'reason' ("-" when an
            entry is absent). Children, if any, live under 'trees' as a
            mapping of branch name -> child dict.

    Returns:
        The DOT document as a single string.
    """
    label_fields = ('key', 'val', 'gini', 'size', 'tags', 'reason')
    node_stmts = []
    edge_stmts = []
    # Explicit stack of (node dict, parent id, branch name) for the walk.
    pending = [(tree_dict, None, '')]
    while pending:
        struct, parent_id, path = pending.pop()
        # Exactly one statement is appended per visited node, so the number
        # of statements emitted so far is the next free node id.
        node_id = len(node_stmts)
        # '\\n' is a literal backslash-n: DOT's own line break inside a label.
        label = '\\n'.join(f'{name}:{struct.get(name, "-")}' for name in label_fields)
        node_stmts.append(f'{node_id} [label="{label}"];')
        if parent_id is not None:
            if path:
                edge_stmts.append(f'{parent_id} -> {node_id} [headlabel="{path}"];')
            else:
                edge_stmts.append(f'{parent_id} -> {node_id};')
        if 'trees' in struct:
            children = struct['trees']
            # Push in reverse so children are popped (visited) in their
            # original order, keeping the numbering pre-order.
            for branch in reversed(list(children)):
                pending.append((children[branch], node_id, branch))
    return 'digraph Tree {\nnode [shape=box] ;\n %s\n%s\n}' % (
        '\n'.join(node_stmts),
        '\n'.join(edge_stmts),
    )

## `dot` is the Graphviz command-line renderer.
## `open` is the macOS command that opens a file with its default application.
def draw_tree(tree, save_path):
    """Render a tree dict to ``<save_path>.dot`` / ``<save_path>.png`` and open the PNG.

    The DOT source produced by ``tree_to_dot`` is written to
    ``<save_path>.dot``; Graphviz ``dot`` then renders it to
    ``<save_path>.png`` and, if rendering succeeded, ``open`` displays it.

    Args:
        tree: nested tree dict accepted by ``tree_to_dot``.
        save_path: output path without extension.

    Returns:
        None. As before, the shell exit status is not inspected, so a missing
        ``dot`` or ``open`` executable does not raise.
    """
    import shlex  # local import: only needed here to quote paths for the shell

    dot_file = f'{save_path}.dot'
    png_file = f'{save_path}.png'
    # Written as UTF-8 so non-ASCII labels do not depend on the locale.
    with open(dot_file, 'w', encoding='utf-8') as f:
        f.write(tree_to_dot(tree))
    # Quote both paths: an unquoted save_path containing spaces or shell
    # metacharacters would split into several arguments or run extra commands.
    dot_arg = shlex.quote(dot_file)
    png_arg = shlex.quote(png_file)
    os.system(f'dot -Tpng {dot_arg} -o {png_arg} && open {png_arg}')

In [85]:
# Compute the Gini impurity
def cal_gini(data):
    """Return the Gini impurity of a collection of class labels.

    The impurity is ``sum(p_k * (1 - p_k))`` over the distinct labels, where
    ``p_k`` is the fraction of items carrying label ``k``. It is 0 when all
    labels are equal and 0.5 for a balanced two-class collection.

    Args:
        data: sized iterable of hashable labels (e.g. a list or a pandas
            Series).

    Returns:
        The impurity as a float, or 0 for empty input.
    """
    total = len(data)
    if total == 0:
        # No samples: treat the impurity as zero instead of dividing by zero.
        return 0
    label_counts = Counter(data)
    return sum((n / total) * (1 - n / total) for n in label_counts.values())

0.0

In [86]:
### Built along the lines of a CART tree: ID3-style greedy search + binary splits
def gen_tree_cart(df, depth=0, min_samples_split=4, min_samples_leaf=2, max_depth=8):
    """Recursively grow a binary decision tree using Gini impurity.

    Every candidate split has the form ``column == value`` versus
    ``column != value``; the split with the lowest size-weighted child
    impurity is chosen at each node.

    Args:
        df: DataFrame of feature columns plus a 'target' column with labels.
        depth: depth of the node being built (0 for the root).
        min_samples_split: nodes with fewer rows are not split.
        min_samples_leaf: a node is left unsplit when either child of its
            best split would hold fewer rows than this.
        max_depth: nodes at this depth or deeper are not split.
        All three limits are forwarded unchanged to every recursive call.

    Returns:
        dict describing the node, always with 'gini', 'size', 'tags'
        (label -> count) and 'depth'. A node that stopped carries 'reason'.
        A node for which a best split was found carries 'gini_gain', 'key',
        'val' and 'leaf num'; if it was actually split it also carries
        'trees' with the '==' and '!=' subtrees. Note that 'gini_gain'
        stores the weighted impurity of the children after the split, not
        the reduction relative to the parent.
    """
    target = df['target']
    ori_gini = cal_gini(target)
    size = len(df)
    ret = {
        'gini': ori_gini,
        'size': size,
        'tags': dict(Counter(target)),
        'depth': depth,
    }
    if size < min_samples_split:
        # Too few samples in this node to split it.
        ret['reason'] = 'less than min_samples_split'
        return ret
    if depth >= max_depth:
        # Too deep, stop splitting.
        ret['reason'] = 'to deep'
        return ret
    if len(ret['tags']) <= 1:
        # Only one label left: no split can lower the impurity, and every
        # descendant would predict the same label and probability anyway.
        ret['reason'] = 'pure node'
        return ret
    ###
    best_gini = 1
    best_key = None
    best_val = None
    ###
    for k in df.columns:
        if k == 'target':
            continue
        column = df[k]
        for v in set(column):
            # Build the boolean mask once per candidate instead of
            # re-filtering the frame for every count and impurity.
            mask = column == v
            n_eq = int(mask.sum())
            if n_eq == size or n_eq == 0:
                # Degenerate split: one side would be empty.
                continue
            gini_eq = cal_gini(target[mask])
            gini_ne = cal_gini(target[~mask])
            newgini = n_eq / size * gini_eq + (size - n_eq) / size * gini_ne
            if best_key is None or newgini < best_gini:
                best_key = k
                best_val = v
                best_gini = newgini
    ###
    if best_key is None:
        # Every feature is constant on this node, so there is nothing to
        # split on (previously this fell through to df[None] -> KeyError).
        ret['reason'] = 'no valid split'
        return ret
    ret['gini_gain'] = best_gini
    ret['key'] = best_key
    ret['val'] = best_val
    best_mask = df[best_key] == best_val
    df1 = df[best_mask]
    df2 = df[~best_mask]
    ret['leaf num'] = [len(df1), len(df2)]
    if len(df1) < min_samples_leaf or len(df2) < min_samples_leaf:
        # A child would be too small, so do not split.
        ret['reason'] = 'leaf samples less than min_samples_leaf'
        return ret
    # Forward the caller's limits; previously children silently fell back to
    # the default hyper-parameters.
    child_params = dict(
        depth=depth + 1,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_depth=max_depth,
    )
    tree1 = gen_tree_cart(df1, **child_params)
    tree2 = gen_tree_cart(df2, **child_params)
    ret['trees'] = {}
    ret['trees']['=='] = tree1
    ret['trees']['!='] = tree2
    return ret

def predict_process_cart(decision_tree, data):
    """Classify one sample by walking the tree from the root to a leaf.

    Args:
        decision_tree: node dict with a 'tags' mapping of label -> count;
            internal nodes additionally carry 'key', 'val' and 'trees'
            (with '==' and '!=' children).
        data: mapping of feature name -> value for a single sample.

    Returns:
        Tuple ``(label, prob)`` taken from the leaf that was reached:
        ``prob`` is the share of label-1 samples among the leaf's 0/1
        counts and ``label`` is 1 when label-1 samples are at least as
        numerous as label-0 samples, else 0.
    """
    node = decision_tree
    while True:
        counts = node['tags']
        positives = counts.get(1, 0)
        negatives = counts.get(0, 0)
        prob = positives / (positives + negatives)
        ## ties go to label 1: ">=" favours recall, ">" would favour precision
        label = 0 if positives < negatives else 1
        if 'trees' not in node:
            return label, prob
        # Follow the branch that matches this sample's value for the split feature.
        branch = '==' if data[node['key']] == node['val'] else '!='
        node = node['trees'][branch]

    
def predict_cart(decision_tree, df):
    """Predict a label and a probability for every row of a DataFrame.

    Args:
        decision_tree: root node dict as consumed by ``predict_process_cart``.
        df: DataFrame whose columns include every feature the tree splits on.

    Returns:
        Tuple ``(preds, probs)`` of two lists, one entry per row of ``df``
        in row order.
    """
    preds = []
    probs = []
    # One dict per row, taken positionally. The previous label-keyed
    # df.to_dict() collapsed rows sharing an index label, so such rows were
    # scored with the feature values of the last duplicate.
    for row in df.to_dict(orient='records'):
        label, prob = predict_process_cart(decision_tree, row)
        preds.append(label)
        probs.append(prob)
    return preds, probs

In [87]:
# Attach the labels as a `target` column on a copy of the training features
# (X_train itself is left untouched), then grow the tree on that frame.
rdf = X_train.assign(target=y_train)
adt = gen_tree_cart(rdf)
# Display the fitted tree dict as the cell output.
adt

{'gini': 0.4445825907432356,
 'size': 8038,
 'tags': {0: 5357, 1: 2681},
 'depth': 0,
 'gini_gain': 0.058167920243534066,
 'key': 'register_time_diff_sparse',
 'val': 2.0,
 'leaf num': [2770, 5268],
 'trees': {'==': {'gini': 0.11330813642820835,
   'size': 2770,
   'tags': {1: 2603, 0: 167},
   'depth': 1,
   'gini_gain': 0.06744574022738059,
   'key': 'mile_diff_rate_sparse',
   'val': 0.0,
   'leaf num': [126, 2644],
   'trees': {'==': {'gini': 0.37100025195263286,
     'size': 126,
     'tags': {1: 31, 0: 95},
     'depth': 2,
     'gini_gain': 0.28874458874458875,
     'key': 'fuel_typep1',
     'val': -1,
     'leaf num': [60, 66],
     'trees': {'==': {'gini': 0.06444444444444444,
       'size': 60,
       'tags': {0: 58, 1: 2},
       'depth': 3,
       'gini_gain': 0.057142857142857155,
       'key': 'title_diff_sparse',
       'val': 2.0,
       'leaf num': [14, 46],
       'trees': {'==': {'gini': 0.24489795918367352,
         'size': 14,
         'tags': {0: 12, 1: 2},
     

In [84]:
# Render the fitted tree: writes cart_same_car.dot and cart_same_car.png into
# the current working directory and opens the PNG.
draw_tree(adt, 'cart_same_car')

In [88]:
## Test-set performance
# Predicted labels and leaf probabilities for the held-out rows.
pred, prob = predict_cart(adt, X_test)
# classifier_evaluate is a project helper; the printed result below reports
# recall, precision, accuracy and the confusion-matrix counts.
eval_ret = classifier_evaluate(y_test, pred)
print(eval_ret)

# ROC curve and the area under it, computed from the leaf probabilities.
fpr, tpr, thresholds = roc_curve(y_test, prob)
roc_auc = auc(fpr, tpr)
print(f'auc{roc_auc}')
# NOTE(review): draw_lines is a project helper - presumably plots tpr and fpr
# (the diagonal) against fpr; confirm against util.visualization.
draw_lines(fpr, [tpr, fpr], ['tp', 'fp'])

{'recall': 0.9713302752293578, 'precision': 0.9657924743443558, 'accuracy': 0.9794776119402985, 'tp': 847, 'fp': 30, 'tn': 1778, 'fn': 25}
auc0.9844492114557117


In [83]:
# Training-set performance
# Predicted labels and leaf probabilities for the rows the tree was fitted on.
pred, prob = predict_cart(adt, X_train)
# classifier_evaluate is a project helper; the printed result below reports
# recall, precision, accuracy and the confusion-matrix counts.
eval_ret = classifier_evaluate(y_train, pred)
print(eval_ret)

# ROC curve and the area under it, computed from the leaf probabilities.
fpr, tpr, thresholds = roc_curve(y_train, prob)
roc_auc = auc(fpr, tpr)
print(f'auc{roc_auc}')
# NOTE(review): draw_lines is a project helper - presumably plots tpr and fpr
# (the diagonal) against fpr; confirm against util.visualization.
draw_lines(fpr, [tpr, fpr], ['tp', 'fp'])

{'recall': 0.9787392763894069, 'precision': 0.9718518518518519, 'accuracy': 0.9834535954217467, 'tp': 2624, 'fp': 76, 'tn': 5281, 'fn': 57}
auc0.9937290233744789
