In [None]:
import pandas as pd
import numpy as np
import warnings
import os
# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings raised by the libraries used in the cells below.
warnings.filterwarnings('ignore')


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Apply the seaborn theme in a single call.
# Styles: darkgrid (default), whitegrid, dark, white, ticks.
# Contexts, smallest to largest: paper, notebook (default), talk, poster.
sns.set(style="whitegrid", context="talk")
# CJK-capable fonts, tried in order, so the Chinese titles and labels render on
# Windows (Microsoft YaHei, SimHei), macOS (Arial Unicode MS) and Linux
# (WenQuanYi Micro Hei) rather than on Windows only. This must come after
# sns.set(), which resets 'font.sans-serif'.
plt.rcParams['font.sans-serif'] = [
    'Microsoft YaHei',
    'SimHei',
    'Arial Unicode MS',
    'WenQuanYi Micro Hei',
    'DejaVu Sans',
]
# Draw the minus sign as ASCII '-' so it does not show up as an empty box with these fonts.
plt.rcParams['axes.unicode_minus'] = False

In [None]:
# Load the raw train / test sets (missing values and categorical features are examined below).
# The directory defaults to the original location but can be overridden with the
# LOAN_DATA_DIR environment variable, so the notebook is not tied to one machine.
DATA_DIR = os.environ.get('LOAN_DATA_DIR', r'D:\数据挖掘\数据集\贷款数据')
df_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
df_testA = pd.read_csv(os.path.join(DATA_DIR, 'testA.csv'))

In [None]:
# Dataset dimensions, then the column index, one per line.
print(df_train.shape, df_train.columns, sep='\n')

## 查看数据分布

In [None]:
# Distribution of the label column
df_train['isDefault'].value_counts()

In [None]:
# Overview of the frame: column dtypes and non-null counts
df_train.info()

In [None]:
# Is the primary key free of duplicates?
df_train['id'].is_unique

In [None]:
# Share of missing values per column, highest first
missing_info = (
    df_train.isnull().sum()
    .div(len(df_train))
    .rename_axis('col_name')
    .reset_index(name='missing_pct')
    .sort_values(by='missing_pct', ascending=False)
    .reset_index(drop=True)
)
missing_info

In [None]:
# Count the features whose missing share reaches the 80% threshold
threshold = 0.8
missing_heavy_num = int((missing_info['missing_pct'] >= threshold).sum())
print('缺失超过{}的特征有{}个'.format(threshold, missing_heavy_num))

In [None]:
# Bar chart of the features that have at least one missing value
missing_data = missing_info.loc[missing_info['missing_pct'] > 0]
plt.figure(figsize=(20, 10))
ax = sns.barplot(data=missing_data, x='col_name', y='missing_pct')
ax.set_title('缺失特征比例情况')
ax.set_xlabel('特征名')
ax.set_ylabel('缺失比例')
plt.show()

In [None]:
# Number of missing fields per sample, plotted in ascending order
missing_series = df_train.isnull().sum(axis=1)
list_missing_num = sorted(missing_series.to_numpy())

plt.figure(figsize=(23, 5))
plt.plot(range(len(df_train)), list_missing_num)
plt.title('缺失变量的分布图')
plt.xlabel('samples')
plt.ylabel('缺失变量个数')
plt.show()

In [None]:
# Split the columns into numerical and categorical features by dtype.
# Fix: the original referenced the misspelled name `numrical_fea`, which raised
# a NameError on a fresh kernel.
numerical_fea = list(df_train.select_dtypes(exclude='object').columns)
category_fea = [col for col in df_train.columns if col not in numerical_fea]
print(numerical_fea)
print(category_fea)

按 dtype 划分会把取值为数字编码的类别型特征误判为数值特征。这里采用的方法是：再次检查每个数值型特征的不同取值个数，如果小于10，则进行二次处理（视为类别特征）。

建议采用如下划分方式：

In [None]:
def dis_numerical_feature(data, numerical_fea, category_threshold=10):
    """Split numeric columns into genuinely numeric and category-like ones.

    A column counts as category-like when its number of distinct non-null
    values is strictly below ``category_threshold``.

    Args:
        data: DataFrame holding the columns named in ``numerical_fea``.
        numerical_fea: iterable of column names to classify.
        category_threshold: exclusive upper bound on the distinct-value count
            for a column to be treated as category-like. Defaults to 10.

    Returns:
        A tuple ``(numerical_fea_true, numerical_fea_fake)`` of two lists of
        column names, each keeping the order of ``numerical_fea``.
    """
    numerical_fea_true = []
    numerical_fea_fake = []
    for fea in numerical_fea:
        if data[fea].nunique() < category_threshold:
            numerical_fea_fake.append(fea)
        else:
            numerical_fea_true.append(fea)
    return numerical_fea_true, numerical_fea_fake

# Apply the refined split to the training set and print both groups
numerical_fea_true, numerical_fea_fake = dis_numerical_feature(df_train, numerical_fea)
print(numerical_fea_true, numerical_fea_fake, sep='\n')

In [None]:
# Inspect the value distribution of individual features, watching for heavily imbalanced ones
df_train['verificationStatus'].value_counts()

In [None]:
# Value distribution of n11
df_train['n11'].value_counts()

分布相差悬殊，可以考虑分箱或者剔除该特征

In [None]:
# Value distribution of policyCode
df_train['policyCode'].value_counts()

此类特征无用，全部是一个值，可以剔除

In [None]:
# Detect near-constant features: columns whose most frequent value covers at
# least `threshold_const` of the non-null samples.
threshold_const = 0.95

# Every column except the label (built without `.remove`, which raises when the label is absent).
feature_list = [col for col in df_train.columns if col != 'isDefault']
feature_dis = {}
for fea in feature_list:
    # Relative frequency of each value among the non-null samples, most frequent first.
    value_share = df_train[fea].value_counts(normalize=True)
    # An entirely-null column has no value to rank; the original `.iloc[0]`
    # raised IndexError in that case, so skip it.
    if value_share.empty:
        continue
    fea_rate = value_share.iloc[0]
    # Keep the features dominated by a single value.
    if fea_rate >= threshold_const:
        feature_dis[fea] = fea_rate

feature_dis

In [None]:
# Features with fewer than 500 distinct values, sorted by their number of levels
unique_counts = pd.Series({fea: df_train[fea].nunique() for fea in feature_list})
fea_category = (
    unique_counts[unique_counts < 500]
    .rename_axis('feature')
    .reset_index(name='type_num')
    .sort_values(by='type_num')
)
fea_category

In [None]:
# Joint distribution of each low-cardinality feature with the label.
fea_list = list(fea_category[fea_category['type_num'] <= 14].feature)
# Three plots per row. The original 4-row layout is kept while it is enough and
# rows are added beyond that: with a fixed 4x3 grid a 13th feature made
# add_subplot raise ValueError.
n_cols = 3
n_rows = max(4, (len(fea_list) + n_cols - 1) // n_cols)
fig = plt.figure(figsize=(30, 25))
for position, fea in enumerate(fea_list, start=1):
    panel_ax = fig.add_subplot(n_rows, n_cols, position)
    sns.countplot(x=fea, data=df_train, hue='isDefault', ax=panel_ax)


看图总结如下：

- term贷款期限为5的违约率高于为3的

- homeOwnership、pubRecBankruptcies、purpose多数类别对应的记录占比较少，可以进行分箱合并

- grade、verificationStatus、initialListStatus、employmentLength不同类别对应的违约率不同，可以进一步分析

- applicationType同值化较严重，需要确认后可以删除

- policyCode只有一个类别，可以直接删除

针对n类别暂不做分析，另外还有如下特征需要继续探索

- subGrade 类型数： 35

- issueDate 类型数： 139

- regionCode 类型数： 51

- delinquency_2years 类型数： 30
- ficoRangeLow 类型数： 39
- ficoRangeHigh 类型数： 39
- openAcc 类型数： 75
- pubRec 类型数： 32
- totalAcc 类型数： 134

##  单特征探索

In [None]:
# regionCode: sample count, default count and default rate (in %) per region
df_bucket = df_train.groupby('regionCode')
bad_trend = (
    df_bucket['isDefault']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total', 'sum': 'bad'})
)
bad_trend['bad_rate'] = (bad_trend['bad'] / bad_trend['total']).round(4) * 100
bad_trend = bad_trend.reset_index()

# The ten regions with the highest default rate
bad_trend.sort_values(by='bad_rate', ascending=False).head(10)

In [None]:
# Top panel: counts per region split by label; bottom panel: default rate per region
fig, (count_ax, rate_ax) = plt.subplots(2, 1, figsize=(20, 10))
sns.countplot(x='regionCode', hue='isDefault', data=df_train, ax=count_ax)
sns.lineplot(x='regionCode', y='bad_rate', data=bad_trend, ax=rate_ax)

可以看到存在部分地区的违约率高于平均值，可以单独拿出Top地区做特征衍生

In [None]:
# subGrade: sample count, default count and default rate (in %) per sub-grade
df_bucket = df_train.groupby('subGrade')
bad_trend = (
    df_bucket['isDefault']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total', 'sum': 'bad'})
)
bad_trend['bad_rate'] = (bad_trend['bad'] / bad_trend['total']).round(4) * 100
bad_trend = bad_trend.reset_index()

In [None]:
# Top panel: counts per sub-grade (sorted) split by label; bottom panel: default rate
fig, (count_ax, rate_ax) = plt.subplots(2, 1, figsize=(20, 10))
sns.countplot(x='subGrade', hue='isDefault', data=df_train.sort_values(by='subGrade'), ax=count_ax)
sns.lineplot(x='subGrade', y='bad_rate', data=bad_trend, ax=rate_ax)

这个指标本身和违约情况呈高度线性相关，考虑内生性

In [None]:
"""计算grade的违约率情况"""
# grade: sample count, default count and default rate (in %) per grade
df_bucket = df_train.groupby('grade')
bad_trend = (
    df_bucket['isDefault']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total', 'sum': 'bad'})
)
bad_trend['bad_rate'] = (bad_trend['bad'] / bad_trend['total']).round(4) * 100
bad_trend = bad_trend.reset_index()

In [None]:
# Top panel: counts per grade (sorted) split by label; bottom panel: default rate
fig, (count_ax, rate_ax) = plt.subplots(2, 1, figsize=(20, 10))
sns.countplot(x='grade', hue='isDefault', data=df_train.sort_values(by='grade'), ax=count_ax)
sns.lineplot(x='grade', y='bad_rate', data=bad_trend, ax=rate_ax)

和subGrade类似

In [None]:
"""计算delinquency_2years的违约率情况"""
# delinquency_2years: sample count, default count and default rate (in %) per value
df_bucket = df_train.groupby('delinquency_2years')
bad_trend = (
    df_bucket['isDefault']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total', 'sum': 'bad'})
)
bad_trend['bad_rate'] = (bad_trend['bad'] / bad_trend['total']).round(4) * 100
bad_trend = bad_trend.reset_index()

In [None]:
# Two stacked panels: label counts per value, then the default rate per value
fig, ax = plt.subplots(2, 1, figsize=(25, 15))
count_ax, rate_ax = ax
# NOTE(review): plt.title acts on the current axes, presumably the bottom panel - confirm.
plt.title('违约率的delinquency_2years趋势图')
train_by_delinquency = df_train.sort_values(['delinquency_2years'])
sns.countplot(data=train_by_delinquency, x='delinquency_2years', hue='isDefault', ax=count_ax)
trend_by_rate = bad_trend.sort_values(by='bad_rate', ascending=False)
sns.pointplot(data=trend_by_rate, x='delinquency_2years', y='bad_rate', ax=rate_ax)

plt.show()

看图说话：

存在部分类别对应的违约率为0，也存在部分类别对应的违约率为100%，一般可以考虑：为0的为一类，为100%的为一类，其余的为一类。

但是如果你结合字段含义和对应类别的样本数量去看的话，delinquency_2years表示借款人过去2年信用档案中逾期30天以上的违约事件数，且 delinquency_2years为0、1、2的样本占比超过95%，其余样本根本不足以考虑，所以也就不存在0%和100%的情况。

In [None]:
"""计算ficoRangeLow的违约率情况"""
# ficoRangeLow: sample count, default count and default rate (in %) per value
df_bucket = df_train.groupby('ficoRangeLow')
bad_trend = (
    df_bucket['isDefault']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total', 'sum': 'bad'})
)
bad_trend['bad_rate'] = (bad_trend['bad'] / bad_trend['total']).round(4) * 100
bad_trend = bad_trend.reset_index()

In [None]:
# Top panel: counts per ficoRangeLow value split by label; bottom panel: default rate
fig, (count_ax, rate_ax) = plt.subplots(2, 1, figsize=(20, 10))
sns.countplot(x='ficoRangeLow', hue='isDefault', data=df_train, ax=count_ax)
sns.lineplot(x='ficoRangeLow', y='bad_rate', data=bad_trend, ax=rate_ax)

可以将违约率较高的Top类别进行分箱，或者进行独热编码处理

In [None]:
"""计算ficoRangeHigh的违约率情况"""
# ficoRangeHigh: sample count, default count and default rate (in %) per value
df_bucket = df_train.groupby('ficoRangeHigh')
bad_trend = (
    df_bucket['isDefault']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total', 'sum': 'bad'})
)
bad_trend['bad_rate'] = (bad_trend['bad'] / bad_trend['total']).round(4) * 100
bad_trend = bad_trend.reset_index()

In [None]:
# Top panel: counts per ficoRangeHigh value split by label; bottom panel: default rate
fig, (count_ax, rate_ax) = plt.subplots(2, 1, figsize=(20, 10))
sns.countplot(x='ficoRangeHigh', hue='isDefault', data=df_train, ax=count_ax)
sns.lineplot(x='ficoRangeHigh', y='bad_rate', data=bad_trend, ax=rate_ax)

In [None]:
"""计算openAcc的违约率情况"""
# openAcc: sample count, default count and default rate (in %) per value
df_bucket = df_train.groupby('openAcc')
bad_trend = (
    df_bucket['isDefault']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total', 'sum': 'bad'})
)
bad_trend['bad_rate'] = (bad_trend['bad'] / bad_trend['total']).round(4) * 100
bad_trend = bad_trend.reset_index()

# Top panel: label counts per value; bottom panel: default rate
fig, (count_ax, rate_ax) = plt.subplots(2, 1, figsize=(20, 10))
sns.countplot(x='openAcc', hue='isDefault', data=df_train, ax=count_ax)
sns.lineplot(x='openAcc', y='bad_rate', data=bad_trend, ax=rate_ax)

In [None]:
"""计算totalAcc的违约率情况"""
# totalAcc: sample count, default count and default rate (in %) per value
df_bucket = df_train.groupby('totalAcc')
bad_trend = (
    df_bucket['isDefault']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total', 'sum': 'bad'})
)
bad_trend['bad_rate'] = (bad_trend['bad'] / bad_trend['total']).round(4) * 100
bad_trend = bad_trend.reset_index()

# Top panel: label counts per value; bottom panel: default rate
fig, (count_ax, rate_ax) = plt.subplots(2, 1, figsize=(20, 10))
sns.countplot(x='totalAcc', hue='isDefault', data=df_train, ax=count_ax)
sns.lineplot(x='totalAcc', y='bad_rate', data=bad_trend, ax=rate_ax)

In [None]:
"""计算issueDate的违约率情况"""
# issueDate: sample count, default count and default rate (in %) per issue date
df_bucket = df_train.groupby('issueDate')
bad_trend = (
    df_bucket['isDefault']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total', 'sum': 'bad'})
)
bad_trend['bad_rate'] = (bad_trend['bad'] / bad_trend['total']).round(4) * 100
bad_trend = bad_trend.reset_index()

# Top panel: label counts per issue date; bottom panel: default rate
fig, (count_ax, rate_ax) = plt.subplots(2, 1, figsize=(20, 10))
sns.countplot(x='issueDate', hue='isDefault', data=df_train, ax=count_ax)
sns.lineplot(x='issueDate', y='bad_rate', data=bad_trend, ax=rate_ax)

In [None]:
"""计算interestRate的违约率情况"""
# interestRate: sample count, default count and default rate (in %) per distinct rate
df_bucket = df_train.groupby('interestRate')
bad_trend = (
    df_bucket['isDefault']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total', 'sum': 'bad'})
)
bad_trend['bad_rate'] = (bad_trend['bad'] / bad_trend['total']).round(4) * 100
bad_trend = bad_trend.reset_index()

# Single panel: default rate against the raw interest rate
fig = plt.figure(figsize=(20, 10))
sns.lineplot(data=bad_trend, x='interestRate', y='bad_rate')

In [None]:
"""对interestRate进行分箱操作"""
# Bin interestRate into 10 equal-width intervals, stored as integer bin codes
df_train['interestRate_box'] = pd.cut(df_train['interestRate'], 10, labels=False)

# Sample count, default count and default rate (in %) per bin
df_bucket = df_train.groupby('interestRate_box')
bad_trend = (
    df_bucket['isDefault']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total', 'sum': 'bad'})
)
bad_trend['bad_rate'] = (bad_trend['bad'] / bad_trend['total']).round(4) * 100
bad_trend = bad_trend.reset_index()

# Top panel: label counts per bin; bottom panel: default rate per bin
fig, (count_ax, rate_ax) = plt.subplots(2, 1, figsize=(20, 10))
sns.countplot(x='interestRate_box', hue='isDefault', data=df_train, ax=count_ax)
sns.lineplot(x='interestRate_box', y='bad_rate', data=bad_trend, ax=rate_ax)

总结：

subGrade、regionCode都存在个别类别的违约率高于均值；

delinquency_2years、ficoRangeLow、ficoRangeHigh、openAcc、pubRec样本都是右偏（长尾在右侧）分布，可以对右尾数据进行处理；

贷款利率可以进行分箱处理

In [None]:
"""查看几个数值特征的偏度和峰度"""
# Skewness and kurtosis of every numerical feature, one line per column
for col in numerical_fea:
    col_values = df_train[col]
    print(
        '{:15}'.format(col),
        'Skewness: {:05.2f}'.format(col_values.skew()),
        '   ',
        'Kurtosis: {:06.2f}'.format(col_values.kurt()),
    )

# Distribution of each genuinely numerical feature, one facet per feature.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept
# here so the cell keeps working on the seaborn version the notebook targets.
f = df_train.melt(value_vars=numerical_fea_true)
g = sns.FacetGrid(f, col="variable", col_wrap=5, sharex=False, sharey=False).map(sns.distplot, "value")

In [None]:
"""类别特征的每个类别频数可视化(count_plot)"""
def count_plot(x, **kwargs):
    """Draw a count plot of ``x`` on the current axes with vertical tick labels.

    Used as the plotting callback of ``FacetGrid.map`` below; any extra keyword
    arguments supplied by the caller are accepted and ignored.
    """
    sns.countplot(x=x)
    plt.xticks(rotation=90)


# Frequency of every level of the categorical and category-like numeric features.
f = pd.melt(df_train, value_vars=category_fea + numerical_fea_fake)
# `height` replaces the former `size` argument, which was renamed in seaborn 0.9
# and is rejected by current releases.
g = sns.FacetGrid(f, col="variable", col_wrap=3, sharex=False, sharey=False, height=5)
g = g.map(count_plot, "value")

## 特征衍生
### 单特征衍生

In [None]:
# Work on copies so the feature derivation below leaves the loaded frames untouched
df_train_2, df_testA_2 = df_train.copy(), df_testA.copy()

In [None]:
"""放款时间特征下的相关尝试"""
# Features derived from the loan issue date, built identically for train and test.
for frame in (df_train_2, df_testA_2):
    frame['issueDate'] = pd.to_datetime(frame['issueDate'])
    # Year-month label in '%Y-%m' format.
    frame['issueDate_month'] = frame['issueDate'].dt.strftime('%Y-%m')
    # Year plus calendar quarter (1-4), joined by '_'.
    # Fix: the original used month % 3 + 1, which is not the quarter
    # (January -> 2, March -> 1, and the value 4 never occurred).
    frame['issueDate_quarter'] = (
        frame['issueDate'].dt.strftime('%Y') + '_' + frame['issueDate'].dt.quarter.astype(str)
    )

# Optional working-day feature (1 = working day, 0 = holiday); left disabled
# because it needs the third-party chinese_calendar package:
# from chinese_calendar import is_workday
# df_train_2['issueDate_workday'] = df_train_2['issueDate'].apply(lambda x: 1 if is_workday(x) else 0)

In [None]:
# Loan counts per month and per quarter, train above test in each pair
fig, (train_month_ax, test_month_ax, train_quarter_ax, test_quarter_ax) = plt.subplots(4, 1, figsize=(30, 25))
sns.countplot(data=df_train_2.sort_values(by='issueDate_month'), x='issueDate_month', ax=train_month_ax)
sns.countplot(data=df_testA_2.sort_values(by='issueDate_month'), x='issueDate_month', ax=test_month_ax)
sns.countplot(data=df_train_2.sort_values(by='issueDate_quarter'), x='issueDate_quarter', ax=train_quarter_ax)
sns.countplot(data=df_testA_2.sort_values(by='issueDate_quarter'), x='issueDate_quarter', ax=test_quarter_ax)

In [None]:
"""基于时间维度计算违约率情况"""
# issueDate_month: sample count, default count and default rate (in %) per month
df_bucket = df_train_2.groupby('issueDate_month')
bad_trend = (
    df_bucket['isDefault']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total', 'sum': 'bad'})
)
bad_trend['bad_rate'] = (bad_trend['bad'] / bad_trend['total']).round(4) * 100
bad_trend = bad_trend.reset_index()

# Top panel: loan counts per month (sorted); bottom panel: default rate per month
fig, (count_ax, rate_ax) = plt.subplots(2, 1, figsize=(20, 10))
sns.countplot(data=df_train_2.sort_values(by='issueDate_month'), x='issueDate_month', ax=count_ax)
sns.lineplot(data=bad_trend, x='issueDate_month', y='bad_rate', ax=rate_ax)

In [None]:
"""基于时间维度计算违约率情况"""
# issueDate_quarter: sample count, default count and default rate (in %) per quarter label
df_bucket = df_train_2.groupby('issueDate_quarter')
bad_trend = (
    df_bucket['isDefault']
    .agg(['count', 'sum'])
    .rename(columns={'count': 'total', 'sum': 'bad'})
)
bad_trend['bad_rate'] = (bad_trend['bad'] / bad_trend['total']).round(4) * 100
bad_trend = bad_trend.reset_index()

# Top panel: loan counts per quarter label (sorted); bottom panel: default rate
fig, (count_ax, rate_ax) = plt.subplots(2, 1, figsize=(20, 10))
sns.countplot(data=df_train_2.sort_values(by='issueDate_quarter'), x='issueDate_quarter', ax=count_ax)
sns.lineplot(data=bad_trend, x='issueDate_quarter', y='bad_rate', ax=rate_ax)

### 特征组合衍生

In [None]:
# Target-mean statistics: mean of isDefault for each level of a categorical feature.
# NOTE(review): the means are computed from the same rows they are mapped back onto
# and are only added to df_train_2 - check for target leakage and test-set parity.
for col in ['grade', 'subGrade', 'pubRec']:
    target_col = col + '_target_mean'
    # level -> mean default rate
    temp_dict = df_train_2.groupby(col)['isDefault'].mean().to_dict()
    df_train_2[target_col] = df_train_2[col].map(temp_dict)

df_train_2[['grade', 'grade_target_mean', 'subGrade_target_mean', 'pubRec_target_mean']]

In [None]:
# 离散特征组合
df_train_2['ficoRangeLow_ficoRangeHigh'] = df_train_2['ficoRangeLow'].astype(str) + '_' + df_train_2['ficoRangeHigh'].astype(str)
df_train_2[['ficoRangeLow', 'ficoRangeHigh', 'ficoRangeLow_ficoRangeHigh']]