In [9]:
import pandas as pd
import numpy as np

# 1.读取数据

In [10]:
# Load the sample data from the Excel workbook located next to the notebook.
df = pd.read_excel("./bin_data.xlsx")
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,年龄,收入,孩子数量,是否违约
0,46,0,0,0
1,34,3200,4,1
2,31,3300,3,1
3,39,1500,0,1
4,32,0,3,0


# 2.定义函数

In [3]:
def transform_data(data, x, y):
    """Split one feature column and the target column out of a DataFrame.

    Args:
        data: DataFrame holding both columns.
        x: Name of the feature column.
        y: Name of the target column.

    Returns:
        A ``(X, y)`` tuple. ``X`` is the feature column copied into a NumPy
        array and reshaped to a single column (shape ``(n, 1)``), which is the
        layout an estimator's ``fit`` expects; ``y`` is ``data[y]`` as-is.
    """
    feature_values = np.array(data[x])
    target = data[y]
    return feature_values.reshape(-1, 1), target

In [4]:
# Build the single-column feature matrix from 年龄 (age) and the target from 是否违约 (default flag).
X,y = transform_data(data = df,x ="年龄",y = "是否违约")

In [5]:
def grid_search(estimator, param_grid, X, y, scoring="roc_auc"):
    """Run a 3-fold cross-validated grid search and return the best parameters.

    Args:
        estimator: Estimator object handed to ``GridSearchCV``.
        param_grid: Parameter grid to search.
        X: Feature matrix to fit on.
        y: Target values to fit on.
        scoring: Scoring name passed to ``GridSearchCV``; defaults to
            ``"roc_auc"``.

    Returns:
        The ``best_params_`` dictionary of the fitted search. The best score
        is printed as a side effect.
    """
    from sklearn.model_selection import GridSearchCV

    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring=scoring,
        cv=3,
        n_jobs=-1,  # -1 asks scikit-learn to use all available processors
    )
    search.fit(X=X, y=y)
    best_score = search.best_score_
    print("网格搜索得到的最佳%s为%s" % (scoring, best_score))
    return search.best_params_

In [6]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default settings; the grid below supplies the values to tune.
estimator = DecisionTreeClassifier()
# Candidate values: max_depth 1..7 and min_samples_leaf 1..99.
param_grid = {"max_depth":range(1,8),"min_samples_leaf":range(1,100)}
scoring = "roc_auc"
# Search the grid on the current X / y and keep the best-parameter dictionary.
dic = grid_search(estimator,param_grid,scoring = scoring,X=X,y=y)
# Display the selected parameters as the cell output.
dic

网格搜索得到的最佳roc_auc为0.6582526666666667


{'max_depth': 3, 'min_samples_leaf': 97}

In [7]:
def bin_tree(data, dic, x, y):
    """Bin one numeric column using the split points of a fitted decision tree.

    A ``DecisionTreeClassifier`` configured from ``dic`` is fitted on the
    single feature ``data[x]`` against the target ``data[y]``. The thresholds
    of its split nodes become the bin edges, and every value of ``data[x]`` is
    replaced by the label of the bin it falls into:

    * ``"<=a"`` for values up to and including the smallest split ``a``,
    * ``"a-b"`` for values in ``(a, b]`` between neighbouring splits,
    * ``">z"`` for values above the largest split ``z``.

    Args:
        data: DataFrame containing both columns. It is not modified.
        dic: Parameter dictionary (for example the result of a grid search);
            the keys ``max_depth`` and ``min_samples_leaf`` are read.
        x: Name of the column to bin.
        y: Name of the target column.

    Returns:
        A copy of ``data`` whose ``x`` column holds the bin labels (object
        dtype). If the fitted tree has no split node, the copy is returned
        with ``x`` left unchanged.
    """
    from sklearn.tree import DecisionTreeClassifier

    # Forward only the parameters that are actually present so a missing key
    # falls back to the estimator default instead of being passed as None.
    tree_params = {key: dic[key]
                   for key in ("max_depth", "min_samples_leaf")
                   if dic.get(key) is not None}
    dtc = DecisionTreeClassifier(**tree_params)
    X = np.array(data[x]).reshape((-1, 1))
    dtc.fit(X, data[y])

    # Leaves store a placeholder threshold and have identical child ids, so
    # pick the real split nodes structurally rather than by the sign of the
    # threshold; this keeps split values that are zero or negative.
    tree = dtc.tree_
    is_split = tree.children_left != tree.children_right
    splits = np.unique(tree.threshold[is_split])  # ascending, de-duplicated

    # Work on a copy: binning the caller's frame in place would turn the
    # numeric column into strings and break any later cell that refits on it.
    result = data.copy()
    if splits.size == 0:
        print("变量%s未找到分割点，未进行分箱" % x)
        return result

    edges = [str(value) for value in splits]
    labels = (["<=" + edges[0]]
              + [low + "-" + high for low, high in zip(edges[:-1], edges[1:])]
              + [">" + edges[-1]])
    # Open-ended outer edges let one right-closed cut cover all three cases
    # (below the first split, between splits, above the last split).
    bins = np.concatenate(([-np.inf], splits, [np.inf]))
    binned = pd.cut(data[x], bins=bins, labels=labels)
    # Replace the whole column so the string labels do not have to be written
    # into an integer-typed column element by element.
    result[x] = binned.astype(object)
    print("变量%s利用决策树分箱结果为%s" % (x, result[x].unique()))
    return result

In [8]:
# Bin the 年龄 (age) column with the tuned tree parameters and keep the returned frame.
data1 = bin_tree(data = df,dic = dic,x = "年龄",y="是否违约")
# Preview the first rows of the binned result.
data1.head()

变量年龄利用决策树分箱结果为['45.5-57.5' '27.5-35.5' '35.5-45.5' '22.5-23.5' '23.5-27.5' '<=21.5'
 '21.5-22.5' '>57.5']


Unnamed: 0,年龄,收入,孩子数量,是否违约
0,45.5-57.5,0,0,0
1,27.5-35.5,3200,4,1
2,27.5-35.5,3300,3,1
3,35.5-45.5,1500,0,1
4,27.5-35.5,0,3,0


# 3. 测试

In [12]:
def transform_data(data, x, y):
    """Convert a single column into the layout an estimator expects.

    Args:
        data: DataFrame that contains both columns.
        x: Name of the variable (column) to be binned.
        y: Name of the target column.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a NumPy array of shape ``(n, 1)``
        built from ``data[x]`` and ``y`` is the ``data[y]`` column.
    """
    features = np.array(data[x]).reshape(-1, 1)
    return features, data[y]
# Rebuild X and y from df for the test run.
# NOTE(review): assumes the 年龄 column of df is still numeric at this point — confirm when running top to bottom.
X,y = transform_data(df,"年龄","是否违约")

In [13]:
def grid_search(estimator, param_grid, X, y, scoring="roc_auc", cv=3):
    """Grid-search the tree hyper-parameters used for decision-tree binning.

    Args:
        estimator: Estimator object to tune.
        param_grid: Parameter grid to search.
        X: Feature matrix, e.g. the ``(n, 1)`` array built by
            ``transform_data``.
        y: Target values.
        scoring: Scoring name passed to ``GridSearchCV``; defaults to
            ``"roc_auc"``.
        cv: Cross-validation setting forwarded to ``GridSearchCV``; defaults
            to 3 folds.

    Returns:
        dict: The best parameter combination found (``best_params_``). The
        best score is printed as a side effect.
    """
    from sklearn.model_selection import GridSearchCV

    # Forward the caller's cv value; it used to be accepted but replaced by a
    # hard-coded 3 in this call.
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid,
                        scoring=scoring, cv=cv, n_jobs=-1)
    grid.fit(X=X, y=y)
    print("网格搜索得到的最佳%s为%s" % (scoring, grid.best_score_))
    return grid.best_params_

In [14]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default settings; the grid below supplies the values to tune.
estimator = DecisionTreeClassifier()
# Candidate values: max_depth 1..7 and min_samples_leaf 1..99.
param_grid = {"max_depth":range(1,8),"min_samples_leaf":range(1,100)}
scoring = "roc_auc"
# cv is not passed here, so grid_search falls back to its default.
dic = grid_search(estimator,param_grid,scoring = scoring,X=X,y=y)
# Display the selected parameters as the cell output.
dic

网格搜索得到的最佳roc_auc为0.6582526666666667


{'max_depth': 3, 'min_samples_leaf': 97}

In [15]:
def bin_tree(df, dic, x, y):
    """Perform decision-tree binning of one column.

    A ``DecisionTreeClassifier`` built from ``dic`` is fitted on the single
    feature ``df[x]`` against ``df[y]``; the thresholds of its split nodes are
    used as bin edges. Each value of ``df[x]`` is replaced by a label:
    ``"<=a"`` up to the smallest split, ``"a-b"`` for the right-closed interval
    between two neighbouring splits, and ``">z"`` above the largest split.

    Args:
        df: Complete DataFrame. It is not modified.
        dic: Dictionary of best parameters from the grid search; the keys
            ``max_depth`` and ``min_samples_leaf`` are read.
        x: Name of the variable (column) to bin.
        y: Name of the target column.

    Returns:
        A copy of ``df`` with the ``x`` column replaced by bin labels (object
        dtype). When the fitted tree contains no split node the copy is
        returned with ``x`` unchanged.
    """
    from sklearn.tree import DecisionTreeClassifier

    # Pass only the parameters that are present so that a missing key uses the
    # estimator default rather than an explicit None.
    tree_params = {key: dic[key]
                   for key in ("max_depth", "min_samples_leaf")
                   if dic.get(key) is not None}
    dtc = DecisionTreeClassifier(**tree_params)
    X = np.array(df[x]).reshape((-1, 1))
    dtc.fit(X, df[y])

    # A node is a split node when its two child ids differ; filtering on
    # "threshold > 0" would also discard genuine splits at zero or below.
    tree = dtc.tree_
    split_nodes = tree.children_left != tree.children_right
    splits = np.unique(tree.threshold[split_nodes])  # ascending, no duplicates

    # Bin a copy so the caller's numeric column survives and the cell can be
    # re-run without reloading the data.
    binned_df = df.copy()
    if splits.size == 0:
        print("变量%s未找到分割点，未进行分箱" % x)
        return binned_df

    edge_text = [str(value) for value in splits]
    inner_labels = [edge_text[i] + "-" + edge_text[i + 1]
                    for i in range(len(edge_text) - 1)]
    labels = ["<=" + edge_text[0]] + inner_labels + [">" + edge_text[-1]]
    # Infinite outer edges make a single right-closed cut assign the lowest,
    # middle and highest bins in one pass.
    bins = np.concatenate(([-np.inf], splits, [np.inf]))
    binned_df[x] = pd.cut(df[x], bins=bins, labels=labels).astype(object)
    print("变量%s利用决策树分箱结果为%s" % (x, binned_df[x].unique()))
    return binned_df

In [16]:
# Bin the 年龄 (age) column and display the returned DataFrame as the cell output.
bin_tree(df,dic,x="年龄",y="是否违约")

变量年龄利用决策树分箱结果为['45.5-57.5' '27.5-35.5' '35.5-45.5' '22.5-23.5' '23.5-27.5' '<=21.5'
 '21.5-22.5' '>57.5']


Unnamed: 0,年龄,收入,孩子数量,是否违约
0,45.5-57.5,0,0,0
1,27.5-35.5,3200,4,1
2,27.5-35.5,3300,3,1
3,35.5-45.5,1500,0,1
4,27.5-35.5,0,3,0
5,22.5-23.5,0,0,1
6,35.5-45.5,1900,0,0
7,27.5-35.5,0,2,0
8,23.5-27.5,1700,1,1
9,23.5-27.5,3400,1,1
