**Task2（特征选择）**  分别用IV值和随机森林挑选特征，再构建模型，进行模型评估

In [4]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split

# 导入数据
data = pd.read_csv("./data.csv",encoding='gbk')
data.drop_duplicates(inplace=True)

# 提取标签
y = data.status

# 载入特征
with open('feature.pkl', 'rb') as f:
    X = pickle.load(f)

# 划分训练集测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=2018)

In [5]:
# 性能评估
from sklearn.metrics import accuracy_score, roc_auc_score

def model_metrics(clf, X_train, X_test, y_train, y_test):
    # 预测
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    
    y_train_proba = clf.predict_proba(X_train)[:,1]
    y_test_proba = clf.predict_proba(X_test)[:,1]
    
    # 准确率
    print('[准确率]', end = ' ')
    print('训练集：', '%.4f'%accuracy_score(y_train, y_train_pred), end = ' ')
    print('测试集：', '%.4f'%accuracy_score(y_test, y_test_pred))
    
    # auc取值：用roc_auc_score或auc
    print('[auc值]', end = ' ')
    print('训练集：', '%.4f'%roc_auc_score(y_train, y_train_proba), end = ' ')
    print('测试集：', '%.4f'%roc_auc_score(y_test, y_test_proba))

## IV值进行特征选择

stats.scoreatpercentile(x, 50)    # 得到x在50%处的数值

np.in1d(B,A)    # 在序列B中寻找与序列A相同的值，并返回一逻辑值（True,False）

In [6]:
import math
import numpy as np
from scipy import stats
from sklearn.utils.multiclass import type_of_target

def woe(X, y, event=1):  
    res_woe = []
    iv_dict = {}
    for feature in X.columns:
        x = X[feature].values
        # 1) 连续特征离散化
        if type_of_target(x) == 'continuous':
            x = discrete(x)
        # 2) 计算该特征的woe和iv
        # woe_dict, iv = woe_single_x(x, y, feature, event)
        woe_dict, iv = woe_single_x(x, y, feature, event)
        iv_dict[feature] = iv
        res_woe.append(woe_dict) 
        
    return iv_dict
        
def discrete(x):
    # 使用5等分离散化特征
    res = np.zeros(x.shape)
    for i in range(5):
        point1 = stats.scoreatpercentile(x, i * 20)
        point2 = stats.scoreatpercentile(x, (i + 1) * 20)
        x1 = x[np.where((x >= point1) & (x <= point2))]
        mask = np.in1d(x, x1)
        res[mask] = i + 1    # 将[i, i+1]块内的值标记成i+1
    return res

def woe_single_x(x, y, feature,event = 1):
    # event代表预测正例的标签
    event_total = sum(y == event)
    non_event_total = y.shape[-1] - event_total
    
    iv = 0
    woe_dict = {}
    for x1 in set(x):    # 遍历各个块
        y1 = y.reindex(np.where(x == x1)[0])
        event_count = sum(y1 == event)
        non_event_count = y1.shape[-1] - event_count
        rate_event = event_count / event_total    
        rate_non_event = non_event_count / non_event_total
        
        if rate_event == 0:
            rate_event = 0.0001
            # woei = -20
        elif rate_non_event == 0:
            rate_non_event = 0.0001
            # woei = 20
        woei = math.log(rate_event / rate_non_event)
        woe_dict[x1] = woei
        iv += (rate_event - rate_non_event) * woei
    return woe_dict, iv

处理上述特征时, 遇到了IV的极端情况, 响应数为0或未响应数为0。

为简单起见, 我们在代码中对极端值进行平滑处理。

In [7]:
import warnings
warnings.filterwarnings("ignore")

iv_dict = woe(X_train, y_train)

In [8]:
iv = sorted(iv_dict.items(), key = lambda x:x[1],reverse = True)
iv

[('historical_trans_amount', 2.6609646134512865),
 ('trans_amount_3_month', 2.5546436077538357),
 ('repayment_capability', 2.327229251967252),
 ('pawns_auctions_trusts_consume_last_6_month', 2.220777389641486),
 ('first_transaction_day', 2.1651873210712678),
 ('abs', 1.966985825643712),
 ('consfin_avg_limit', 1.6927490494187993),
 ('loans_avg_limit', 1.4616638505356894),
 ('max_cumulative_consume_later_1_month', 1.4598660465564153),
 ('consume_mini_time_last_1_month', 1.3790560008629353),
 ('historical_trans_day', 1.117648762849395),
 ('consfin_credit_limit', 0.869609276640697),
 ('pawns_auctions_trusts_consume_last_1_month', 0.8530625616084101),
 ('avg_price_last_12_month', 0.7281431950917352),
 ('loans_score', 0.6611588082229917),
 ('loans_latest_day', 0.6295469820926429),
 ('apply_score', 0.6273343581887715),
 ('history_suc_fee', 0.5116062502603338),
 ('latest_query_day', 0.4932403479679425),
 ('trans_days_interval_filter', 0.4880692929650191),
 ('loans_count', 0.4848115414454659),


## 随机森林挑选特征

In [9]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [10]:
# 观察默认参数的性能
rf0 = RandomForestClassifier(oob_score=True, random_state=2018)
rf0.fit(X_train, y_train)
print('袋外分数：', rf0.oob_score_)
model_metrics(rf0, X_train, X_test, y_train, y_test)

袋外分数： 0.740907724676886
[准确率] 训练集： 0.9823 测试集： 0.7638
[auc值] 训练集： 0.9994 测试集： 0.7028


In [11]:
# 网格法调参
param_test = {'n_estimators':range(20,200,20)}
gsearch = GridSearchCV(estimator = RandomForestClassifier(n_estimators=120, max_depth=9, min_samples_split=50, 
                                                          min_samples_leaf=20, max_features = 9,random_state=2018), 
                       param_grid = param_test, scoring='roc_auc', cv=5)

gsearch.fit(X_train, y_train)
# gsearch.grid_scores_, 
gsearch.best_params_, gsearch.best_score_

({'n_estimators': 180}, 0.7915865123613097)

In [12]:
rf = RandomForestClassifier(n_estimators=160, max_depth=9, min_samples_split=50,
                            min_samples_leaf=20, max_features = 9,oob_score=True, random_state=2018)
rf.fit(X_train, y_train)
print('袋外分数：', rf.oob_score_)
model_metrics(rf, X_train, X_test, y_train, y_test)

袋外分数： 0.7911030958821761
[准确率] 训练集： 0.8197 测试集： 0.7842
[auc值] 训练集： 0.8981 测试集： 0.7721
