# 1.特征分箱的作用
1. 优点
> 1. 稳定性：避免特征中无意义的波动对评分带来的波动；
> 2. 健壮性：避免了极端值的影响；
> 3. 缺失值带入分箱：可将缺失值作为独立的一个箱带入模型中； 
> 4. 消除量纲影响：将所有变量变换到相似的尺度上。
2. 缺点
> 1. 计算量大

In [38]:
# Load the sample binning dataset from an Excel workbook in the working directory.
import pandas as pd
import numpy as np
data = pd.read_excel("./bin_data.xlsx")
# Preview the first rows; per the cell output the columns are
# 年龄, 收入, 孩子数量 and 是否违约 (plus an unnamed index column).
data.head()

Unnamed: 0,年龄,收入,孩子数量,是否违约
0,46,0,0,0
1,34,3200,4,1
2,31,3300,3,1
3,39,1500,0,1
4,32,0,3,0


In [None]:
from sklearn.

# 2. 变量分箱封装

In [None]:
class AutoBin:
    """Helpers for grouping features by type and applying user-defined bins."""

    def bin_features(self, data, cat_features, value_threshold):
        """Split the columns of ``data`` into three groups.

        Args:
            data: DataFrame holding every feature.
            cat_features: Names of the discrete / categorical columns.
            value_threshold: A categorical column with strictly more distinct
                values than this is reported as needing to be re-binned.

        Returns:
            A tuple ``(more_value_features, less_value_features, num_features)``:
            categorical columns above the threshold, categorical columns at or
            below it, and every column of ``data`` not listed in
            ``cat_features``.
        """
        num_features = [col for col in data.columns if col not in cat_features]
        more_value_features = []
        less_value_features = []
        for var in cat_features:
            # nunique(dropna=False) treats all missing values as ONE level;
            # len(set(series)) counts every NaN separately because NaN != NaN.
            distinct_values = data[var].nunique(dropna=False)
            if distinct_values > value_threshold:
                more_value_features.append(var)
            else:
                less_value_features.append(var)
        return more_value_features, less_value_features, num_features

    def cat_mapping(self, data, var, map_dic, value_type):
        """Re-bin a discrete column by mapping its values through ``map_dic``.

        Args:
            data: DataFrame to modify in place.
            var: Name of the column to remap.
            map_dic: Mapping from original value to new bin value.
            value_type: dtype the mapped column is cast to.

        Returns:
            The same ``data`` object, with column ``var`` replaced.
        """
        data[var] = data[var].map(map_dic).astype(value_type)
        return data

    def customize_bin(self, df, variable, **kwargs):
        """Apply user-defined bins to a continuous or discrete column in place.

        Args:
            df: DataFrame containing the column to bin.
            variable: Name of the column to bin.
            **kwargs: Binning rules, given as one or two dictionaries:

                * ``dic1`` maps a new bin label to a filter expression that
                  is passed to ``df.query``.
                * ``dic2`` maps a new bin label to one original value of
                  the column (matched with ``==``).

                Passing only ``dic1`` is treated as the continuous case: the
                rows selected by all expressions must add up to ``len(df)``
                or nothing is changed. Passing both ``dic1`` and ``dic2`` is
                treated as the discrete case and is applied without that
                check. Example::

                    **{"dic1": {"多胎": "孩子数量 >=3 & 孩子数量 < 5",
                                "非人类": "孩子数量 >=5"},
                       "dic2": {"无子女": 0, "一胎": 1, "两胎": 2}}

        Returns:
            ``df`` with ``variable`` overwritten by the bin labels, or ``None``
            (after printing a message) when the keyword arguments are invalid
            or the continuous rules do not account for every row.
        """
        query_rules = kwargs.get("dic1")
        value_rules = kwargs.get("dic2")
        if len(kwargs) == 1 and query_rules is not None:
            # Continuous variable: only query expressions, coverage is checked.
            value_rules = {}
            must_cover_all_rows = True
        elif (len(kwargs) == 2 and query_rules is not None
                and value_rules is not None):
            # Discrete variable: query expressions plus exact-value matches.
            must_cover_all_rows = False
        else:
            print("请输入正确的关键字参数中key个数")
            # Stop here: there are no usable rules to apply.
            return None

        # Resolve every rule to row labels BEFORE writing anything, so that
        # later rules are evaluated against the original values.
        bins = {}
        for label, expression in query_rules.items():
            bins[label] = df.query(expr=expression).index
        for label, raw_value in value_rules.items():
            bins[label] = df.loc[df[variable] == raw_value].index

        if must_cover_all_rows:
            assigned_rows = sum(len(rows) for rows in bins.values())
            if assigned_rows != len(df):
                print("待分箱的数据与原始数据长度不符，请检查待分箱数据数量")
                return None

        if bins:
            # Bin labels are strings; cast up front instead of relying on
            # implicit upcasting when writing them into a numeric column.
            df[variable] = df[variable].astype(object)
        for label, rows in bins.items():
            df.loc[rows, variable] = label
        print("变量<%s>分箱完成,分箱结果为%s" % (variable, df[variable].unique()))
        return df

In [32]:
# Instantiate the binning helper and classify the columns of `data`.
autobin = AutoBin()
# With value_threshold=.5 any categorical column holding at least one distinct
# value exceeds the threshold, so both listed columns come back in the first
# ("needs re-binning") list, as the cell output shows.
autobin.bin_features(data=data,cat_features=["孩子数量","年龄"],value_threshold=.5)

(['孩子数量', '年龄'], [], ['收入', '是否违约'])

# 3. sklearn的KBinsDiscretizer分箱

In [33]:
from sklearn.preprocessing import KBinsDiscretizer

In [35]:
# Reproducible toy regression data: 100 points drawn uniformly from [-3, 3)
# and a noisy sine of those points as the target.
rnd = np.random.RandomState(seed=42)
X = rnd.uniform(low=-3, high=3, size=100)
# Gaussian noise scaled down by a factor of 3 is added to sin(X).
y = np.sin(X) + rnd.normal(size=X.shape[0]) / 3
# Turn the 1-D sample vector into a single-feature column matrix.
X = X[:, np.newaxis]

In [36]:
# Configure a discretizer that splits the feature into 10 bins and one-hot
# encodes the bin each sample falls into.
enc = KBinsDiscretizer(n_bins=10, encode='onehot')
# Learn the bin edges from X and encode X in one step; per the cell output
# below the result is a 100x10 sparse matrix with one stored entry per row.
X_binned = enc.fit_transform(X) 

In [37]:
# Display the encoded matrix (the notebook shows its sparse-matrix summary).
X_binned

<100x10 sparse matrix of type '<class 'numpy.float64'>'
	with 100 stored elements in Compressed Sparse Row format>