In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import seaborn as sns


# Show up to 100 columns when a DataFrame is displayed.
# The fully qualified option name is required: pandas treats the key as a
# regular expression, and the short pattern 'max.columns' also matches
# 'styler.render.max_columns' on recent pandas versions, which raises
# "OptionError: Pattern matched multiple keys".
pd.set_option('display.max_columns', 100)

# Directory (with trailing slash) that holds the CSV files read by the notebook.
data_path = './data/'

In [2]:
# Read the three CSV files that live under `data_path`, in a fixed order, and
# bind them to `train`, `test` and `submit`.
train, test, submit = (
    pd.read_csv(data_path + file_name)
    for file_name in ('train.csv', 'test.csv', 'submit.csv')
)

# Last expression of the cell: display the (rows, columns) shape of each frame.
train.shape, test.shape, submit.shape

((40000, 21), (15000, 20), (15000, 2))

### WOE、IV

In [3]:
def WOE(data, feat, label):
    """Compute the Weight of Evidence (WOE) of every distinct value of ``feat``.

    Rows with ``data[label] == 1`` are counted as "good" and rows with
    ``data[label] == 0`` as "bad". For each distinct value ``v`` of ``feat``::

        WOE(v) = ln((good_v / good_total + eps) / (bad_v / bad_total + eps))

    with ``eps = 0.0001``. The smoothing term is applied to both the numerator
    and the denominator so that a bin without any good rows yields a finite
    value instead of ``log(0) = -inf``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns ``feat`` and ``label``.
    feat : str
        Name of the (already binned / categorical) feature column.
    label : str
        Name of the binary target column (values 0 and 1).

    Returns
    -------
    (dict, pandas.DataFrame)
        ``woe_dic`` maps each distinct feature value to its WOE; the frame has
        one row per distinct value with the columns ``feat`` and
        ``feat + '_woe'``, in order of first appearance.

    Raises
    ------
    ZeroDivisionError
        If ``data`` has at least one bin but no good rows or no bad rows.
    """
    smoothing = 0.0001

    bin_values = data[feat].unique()

    # Build the class masks once instead of re-filtering the frame per bin.
    good_rows = data[label] == 1
    bad_rows = data[label] == 0
    good_total_num = int(good_rows.sum())
    bad_total_num = int(bad_rows.sum())

    woe_dic = {}
    woe_values = []
    for val in bin_values:
        in_bin = data[feat] == val
        # int(...) keeps plain-Python division, so an empty class still raises
        # ZeroDivisionError rather than silently producing inf/nan.
        good_rate = int((in_bin & good_rows).sum()) / good_total_num
        bad_rate = int((in_bin & bad_rows).sum()) / bad_total_num

        woe = np.log((good_rate + smoothing) / (bad_rate + smoothing))
        woe_dic[val] = woe
        woe_values.append(woe)

    # Build the result frame in one go rather than growing it cell by cell.
    df = pd.DataFrame({feat: bin_values, feat + '_woe': woe_values})

    return woe_dic, df

In [4]:
def IV(data, woe_dic, feat, label):
    """Return the Information Value (IV) of ``feat`` with respect to ``label``.

    For every distinct value of ``feat`` the share of label-1 rows minus the
    share of label-0 rows falling in that value is multiplied by the value's
    WOE (looked up in ``woe_dic``); the products are summed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns ``feat`` and ``label``.
    woe_dic : dict
        Mapping from each distinct value of ``feat`` to its WOE.
    feat : str
        Feature column name.
    label : str
        Binary target column name (values 0 and 1).

    Returns
    -------
    number
        The summed IV; ``0`` when ``data`` has no rows.
    """
    positives = data[label] == 1
    negatives = data[label] == 0
    n_positive = int(positives.sum())
    n_negative = int(negatives.sum())

    def contribution(value):
        # Look the WOE up first so a missing key surfaces before any division.
        weight = woe_dic[value]
        members = data[feat] == value
        positive_share = int((members & positives).sum()) / n_positive
        negative_share = int((members & negatives).sum()) / n_negative
        return (positive_share - negative_share) * weight

    return sum(contribution(value) for value in data[feat].unique())

### 分箱示例
等频、等距分箱（`pd.qcut`、`pd.cut`）此处省略，下面依次演示决策树分箱、best-ks 分箱和卡方分箱。

#### 决策树分箱

In [5]:
from sklearn.tree import DecisionTreeClassifier
def optimal_binning_boundary(x: pd.Series, y: pd.Series, nan: float = -999.,
                             max_leaf_nodes: int = 6,
                             min_samples_leaf: float = 0.05) -> list:
    """Derive bin boundaries for one feature from a fitted decision tree.

    A ``DecisionTreeClassifier`` (entropy criterion) is fitted on the single
    feature ``x`` against ``y``; the split thresholds of its internal nodes
    become the inner bin edges.

    Parameters
    ----------
    x : pandas.Series
        Feature values. Missing values are replaced by ``nan`` before fitting.
    y : pandas.Series
        Target values aligned with ``x``.
    nan : float, default -999.
        Fill value used for missing entries of ``x``.
    max_leaf_nodes : int, default 6
        Passed to the tree; caps the number of leaves and therefore of bins.
    min_samples_leaf : float, default 0.05
        Passed to the tree; minimum share of samples required in each leaf.

    Returns
    -------
    list
        Sorted boundaries ``[min(x), t_1, ..., t_k, max(x) + 0.1]``.
    """
    x = x.fillna(nan).values  # fill missing values
    y = y.values

    clf = DecisionTreeClassifier(criterion='entropy',
                                 max_leaf_nodes=max_leaf_nodes,
                                 min_samples_leaf=min_samples_leaf)
    clf.fit(x.reshape(-1, 1), y)

    tree = clf.tree_
    # A node whose left and right child ids differ is an internal (split) node;
    # only those nodes carry a meaningful threshold.
    is_split_node = tree.children_left != tree.children_right
    boundary = sorted(tree.threshold[is_split_node])

    min_x = x.min()
    # +0.1 so that a later right-open grouping still contains the rows that
    # hold the feature's maximum value.
    max_x = x.max() + 0.1

    return [min_x] + boundary + [max_x]

In [6]:
# Example: decision-tree bin boundaries of the GRJCJS feature against the label column.
optimal_binning_boundary(train['GRJCJS'], train['label'])

[787.0, 3745.75, 4423.25, 5262.75, 6862.25, 8325.75, 13692.1]

#### best-ks分箱

In [7]:
def best_ks_box(data, var_name, target_col, box_num):
    """Bin a variable by repeatedly cutting at the point of maximum KS.

    Starting from the whole sample, the value with the largest KS statistic
    (absolute difference between the cumulative shares of label 1 and label 0)
    is taken as a cut point. The zone between existing cut points that holds
    the most rows is then split next, until ``box_num - 1`` cuts were made.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``var_name`` and ``target_col``.
    var_name : str
        Column to bin.
    target_col : str
        Binary target column; the values 0 and 1 are looked up explicitly, so
        every zone that gets split must contain both classes.
    box_num : int
        Desired number of bins.

    Returns
    -------
    list
        Sorted list holding the minimum, the cut points and the maximum of
        ``var_name``.
    """
    data = data[[var_name, target_col]]

    def ks_bin(data_, limit):
        """Return the value of column 0 in ``data_`` with the best usable KS.

        Candidates are tried from the largest KS downwards; the first one that
        leaves at least ``limit`` rows on both sides is returned. If none
        qualifies, the last candidate tried is returned.
        """
        values = data_.iloc[:, 0]
        class_counts = data_.iloc[:, 1].value_counts()
        g = class_counts[0]
        b = class_counts[1]
        data_cro = pd.crosstab(values, data_.iloc[:, 1])
        data_cro[0] = data_cro[0] / g
        data_cro[1] = data_cro[1] / b
        data_cro_cum = data_cro.cumsum()
        ks_list = abs(data_cro_cum[1] - data_cro_cum[0])
        ks_list_index = ks_list.nlargest(len(ks_list)).index.tolist()
        for i in ks_list_index:
            n_low = int((values <= i).sum())
            n_high = int((values > i).sum())
            if n_low >= limit and n_high >= limit:
                break
        return i

    def ks_zone(data_, list_):
        """Return ``[lower, upper]`` of the zone between cuts with most rows."""
        cuts = sorted(list_)
        values = data_.iloc[:, 0]

        # list_zone[k] = number of rows in (cuts[k-1], cuts[k]].
        list_zone = []
        covered = 0
        for val in cuts:
            upto = int((values <= val).sum())
            list_zone.append(upto - covered)
            covered = upto
        # Rows above the last cut. This is counted from the data itself; a
        # hard-coded total would make the last zone look largest whenever the
        # frame has a different number of rows.
        list_zone.append(int((values > cuts[-1]).sum()))

        # Pick the zone holding the most rows.
        max_index = list_zone.index(max(list_zone))
        if max_index == 0:
            rst = [values.min(), cuts[0]]
        elif max_index == len(cuts):
            rst = [cuts[-1], values.max()]
        else:
            rst = [cuts[max_index - 1], cuts[max_index]]
        return rst

    data_ = data.copy()
    limit_ = data.shape[0] / 20  # 5% of the whole sample

    zone = list()
    for i in range(box_num - 1):
        # Cut the current zone at its maximum-KS point ...
        ks_ = ks_bin(data_, limit_)
        zone.append(ks_)
        # ... then continue with the most populated zone.
        new_zone = ks_zone(data, zone)
        data_ = data[(data.iloc[:, 0] > new_zone[0]) & (data.iloc[:, 0] <= new_zone[1])]

    zone.append(data.iloc[:, 0].max())
    zone.append(data.iloc[:, 0].min())
    zone.sort()
    return zone

In [8]:
# Example: best-KS binning of the GRJCJS feature against the label column into 5 bins.
best_ks_box(train, 'GRJCJS', 'label', box_num=5)

[787.0, 3745.5, 4423.0, 5262.5, 6862.0, 13692.0]

#### 卡方分箱 1

In [9]:
# Chi-square statistic of a contingency table
def get_chi2_value(arr):
    """Return the Pearson chi-square statistic of the 2-D count table ``arr``.

    Expected counts are ``row_total * column_total / grand_total``. Cells whose
    expected count is 0 are left out of the sum, because they cannot be used
    as a divisor.
    """
    row_totals = arr.sum(axis=1)
    col_totals = arr.sum(axis=0)
    grand_total = arr.sum()

    # Expected count of every cell, flattened in row-major order.
    expected = (np.outer(row_totals, col_totals) / grand_total).reshape(-1)
    observed = arr.reshape(-1)

    usable = expected != 0
    observed = observed[usable]
    expected = expected[usable]

    return ((observed - expected) ** 2 / expected).sum()

# Chi-square critical value for a given upper-tail probability and degrees of freedom
def get_chi2_threshold(percents, nfree):
    """Return the chi-square critical value.

    Parameters
    ----------
    percents : float
        Upper-tail probability (e.g. 0.05 for a 95% confidence level).
    nfree : int
        Degrees of freedom of the chi-square distribution.

    Returns
    -------
    float
        The value ``c`` with ``P(X > c) = percents`` (inverse survival function).
    """
    # Imported here because no visible import cell brings `chi2` into scope;
    # without it the call fails with NameError.
    from scipy.stats import chi2

    return chi2.isf(percents, df=nfree)

# Compute the cut points of a ChiMerge binning
def get_chimerge_cutoff(ser, tag, max_groups=None, threshold=None):
    """Merge adjacent values of ``ser`` bottom-up by chi-square similarity.

    Every distinct value of ``ser`` starts as its own bin. The adjacent pair
    with the smallest chi-square statistic is merged for as long as either
    stopping rule asks for it:

    * ``max_groups`` is given and more than ``max_groups`` bins remain, or
    * ``threshold`` is given and the smallest statistic is below the
      chi-square critical value at upper-tail probability ``threshold``.

    Parameters
    ----------
    ser : pandas.Series
        Feature values.
    tag : pandas.Series
        Class labels aligned with ``ser``.
    max_groups : int, optional
        Maximum number of bins to keep.
    threshold : float, optional
        Upper-tail probability used to derive the chi-square critical value.

    Returns
    -------
    numpy.ndarray
        The smallest original value of every remaining bin. When neither
        ``max_groups`` nor ``threshold`` is given, no merging takes place and
        all distinct values are returned.
    """
    freq_tab = pd.crosstab(ser, tag)
    cutoffs = freq_tab.index.values  # first value of every bin
    # Work on a private, writable copy: rows are overwritten while merging and
    # the array backing a DataFrame may be read-only.
    freq = freq_tab.to_numpy().copy()  # shape (n_bins, n_classes)

    if max_groups is None and threshold is None:
        return cutoffs

    chi2_limit = None
    if threshold is not None:
        # Two adjacent bins form a 2 x n_classes table, so the test has
        # (2 - 1) * (n_classes - 1) degrees of freedom, independent of how
        # many bins are left.
        dof = max(freq.shape[1] - 1, 1)
        chi2_limit = get_chi2_threshold(threshold, dof)

    # With a single bin left there is no adjacent pair to evaluate or merge.
    while len(freq) > 1:
        chi_values = [get_chi2_value(freq[i:(i + 2)]) for i in range(len(freq) - 1)]
        min_idx = int(np.argmin(chi_values))  # first pair with the smallest statistic
        min_value = chi_values[min_idx]

        too_many_groups = max_groups is not None and max_groups < len(freq)
        too_similar = chi2_limit is not None and min_value < chi2_limit
        if not (too_many_groups or too_similar):
            break

        # Merge the pair into its first bin and drop the second one.
        freq[min_idx] = freq[min_idx] + freq[min_idx + 1]
        freq = np.delete(freq, min_idx + 1, 0)
        cutoffs = np.delete(cutoffs, min_idx + 1, 0)
    return cutoffs

In [10]:
# Example call. NOTE: neither max_groups nor threshold is passed, so both merge
# conditions in get_chimerge_cutoff are false and the loop exits on its first
# pass; the result is every distinct GRJCJS value, not a merged binning.
get_chimerge_cutoff(train['GRJCJS'], train['label'])

array([  787. ,   837. ,   847. , ..., 12811. , 12811.5, 13692. ])

#### 卡方分箱 2

In [11]:
# Chi-square (ChiMerge) binning with a chi-square threshold and a maximum number
# of bins; merging stops once every adjacent pair reaches the threshold and the
# number of bins does not exceed the maximum.
def _adjacent_chi2(first, second):
    """Chi-square statistic of the 2x2 table formed by two adjacent intervals.

    Each argument is a row ``[upper_bound, positive_count, negative_count]``.
    """
    # Compute in float64: products of large integer counts could overflow
    # int64, and numpy floats give nan/inf rather than raising on a zero divisor.
    pos_a, neg_a = np.float64(first[1]), np.float64(first[2])
    pos_b, neg_b = np.float64(second[1]), np.float64(second[2])
    return (pos_a * neg_b - neg_a * pos_b) ** 2 \
        * (pos_a + neg_a + pos_b + neg_b) / \
        ((pos_a + neg_a) * (pos_b + neg_b) * (pos_a + pos_b) * (neg_a + neg_b))


def ChiMerge(df, variable, flag, confidenceVal=3.841, bin=10, sample = None): 
    '''
    Bin ``variable`` with the ChiMerge algorithm.

    Every distinct value starts as its own interval; the adjacent pair with
    the smallest chi-square statistic is merged until there are at most
    ``bin`` intervals and every adjacent pair has a statistic of at least
    ``confidenceVal`` (or until a single interval is left).

    df: DataFrame holding the variable to bin and the sample flag
        (positive samples are 1, negative samples are 0)
    variable: name of the column to bin (str)
    flag: name of the positive/negative flag column (str)
    confidenceVal: chi-square threshold; the default 3.841 is the 95%
        critical value for one degree of freedom
    bin: maximum number of intervals
    sample: number of rows to sample before binning (default: no sampling);
        useful because many distinct values make the merge loop slow

    Returns a DataFrame with one row per interval and the columns
    'variable', 'interval', 'flag_0' (negative count) and 'flag_1'
    (positive count).
    '''
    # Optional down-sampling
    if sample is not None:
        df = df.sample(n=sample)

    # Positive / negative counts for every distinct value of the variable
    grouped = df.groupby(variable)[flag]
    positive_class = grouped.sum()
    negative_class = grouped.count() - positive_class
    regroup = pd.DataFrame({'positive_class': positive_class,
                            'negative_class': negative_class}).reset_index()
    # Columns: [value (upper bound of the interval), positives, negatives]
    np_regroup = np.array(regroup)
    print('已完成数据读入,正在计算数据初处理')

    # Merge consecutive intervals that both lack positives or both lack
    # negatives; otherwise their chi-square would have a zero divisor.
    i = 0
    while i <= np_regroup.shape[0] - 2:
        both_without_positive = np_regroup[i, 1] == 0 and np_regroup[i + 1, 1] == 0
        both_without_negative = np_regroup[i, 2] == 0 and np_regroup[i + 1, 2] == 0
        if both_without_positive or both_without_negative:
            np_regroup[i, 1] = np_regroup[i, 1] + np_regroup[i + 1, 1]  # positives
            np_regroup[i, 2] = np_regroup[i, 2] + np_regroup[i + 1, 2]  # negatives
            np_regroup[i, 0] = np_regroup[i + 1, 0]
            np_regroup = np.delete(np_regroup, i + 1, 0)
            # stay on the same index: the merged row may need merging again
        else:
            i = i + 1

    # chi_table[k] is the chi-square between interval k and interval k + 1
    chi_table = np.array(
        [_adjacent_chi2(np_regroup[k], np_regroup[k + 1])
         for k in range(np_regroup.shape[0] - 1)],
        dtype=float)
    print('已完成数据初处理，正在进行卡方分箱核心操作')

    # Core loop: keep merging the most similar adjacent pair.
    # An empty chi_table means a single interval is left: nothing to merge.
    while len(chi_table) > 0:
        if len(chi_table) <= (bin - 1) and chi_table.min() >= confidenceVal:
            break
        k = int(np.argmin(chi_table))  # first pair with the smallest chi-square
        np_regroup[k, 1] = np_regroup[k, 1] + np_regroup[k + 1, 1]
        np_regroup[k, 2] = np_regroup[k, 2] + np_regroup[k + 1, 2]
        np_regroup[k, 0] = np_regroup[k + 1, 0]
        np_regroup = np.delete(np_regroup, k + 1, 0)

        # The statistic of the merged pair no longer exists.
        chi_table = np.delete(chi_table, k, axis=0)
        # Refresh the neighbours of the merged interval, but only those that
        # exist: for k == 0 there is no previous interval (index k - 1 would
        # wrap around to the last entry), for the last interval no next one.
        if k > 0:
            chi_table[k - 1] = _adjacent_chi2(np_regroup[k - 1], np_regroup[k])
        if k < len(chi_table):
            chi_table[k] = _adjacent_chi2(np_regroup[k], np_regroup[k + 1])
    print('已完成卡方分箱核心操作，正在保存结果')

    # Collect the result in a DataFrame
    n_intervals = np_regroup.shape[0]
    result_data = pd.DataFrame()
    result_data['variable'] = [variable] * n_intervals  # column 1: variable name
    list_temp = []
    for i in np.arange(n_intervals):
        if i == 0:
            x = '0' + ',' + str(np_regroup[i, 0])
        elif i == n_intervals - 1:
            x = str(np_regroup[i - 1, 0]) + '+'
        else:
            x = str(np_regroup[i - 1, 0]) + ',' + str(np_regroup[i, 0])
        list_temp.append(x)
    result_data['interval'] = list_temp       # column 2: interval label
    result_data['flag_0'] = np_regroup[:, 2]  # column 3: negative count
    result_data['flag_1'] = np_regroup[:, 1]  # column 4: positive count

    return result_data

# #调用函数参数示例
# bins = ChiMerge(train, 'GRJCJS','label', confidenceVal=3.841, bin=10,sample=None)
# bins