In [2]:
import datetime
import warnings
warnings.filterwarnings('ignore')

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno  # 用于可视化缺失值分布
import scipy.stats as st

#调用matplotlib.pyplot的绘图函数plot()进行绘图的时候，或者生成一个figure画布的时候，可以直接在python console里面生成图像
%matplotlib inline

# 数据加载
data_train = pd.read_csv('data/train.csv')
data_test = pd.read_csv('data/testA.csv')

In [3]:
# 区分数值型特征和对象特征
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x:x not in numerical_fea,list(data_train.columns)))
label = 'isDefault' 
numerical_fea.remove(label)

# 缺失值处理

+ 缺失值过多，可以删除该列；数据集较多，有很少的缺失值，可以删除掉缺失值的行
+ 不处理，有些模型（如xgboost）有处理缺失值机制
+ 如果属性对学习不是特别重要，可以进行插值补全（均值，中位数，众数，建模预测，多重插补等）

In [4]:
#查看缺失值情况 
data_train.isnull().sum()

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           1
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  1
regionCode                0
dti                     239
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies      405
revolBal                  0
revolUtil               531
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     1
policyCode                0
n0                    40270
n1                    40270
n2                    40270
n3                  

In [5]:
# data_train.fillna(data_train.mean(),inplace=True) # 填充均值
# data_train.fillna(data_train.median(),inplace=True) # 填充中位数
# data_train.fillna(data_train.mode(),inplace=True) # 填充众数

In [6]:
# 按照平均数填充数值型特征 
data_train[numerical_fea] = data_train[numerical_fea].fillna(data_train[numerical_fea].median())

#按照众数填充类别型特征 
data_train[category_fea] = data_train[category_fea].fillna(data_train[category_fea].mode())

In [7]:
data_train.isnull().sum()

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           0
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  0
regionCode                0
dti                       0
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies        0
revolBal                  0
revolUtil                 0
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     0
policyCode                0
n0                        0
n1                        0
n2                        0
n3                  

In [8]:
"""KNN建模"""
"""填充KNN数据：先利用knn计算临近的k个数据，然后填充他们的均值"""
#from fancyimpute import KNN
#data_train = pd.DataFrame(KNN(k=6).fit_transform(data_train), columns=features)

'填充KNN数据：先利用knn计算临近的k个数据，然后填充他们的均值'

In [9]:
"""插值法：用插值法拟合出缺失的数据，然后进行填充。"""
# features = data_train.columns
# # print(features)
# for f in features: 
#     data_train[f] = data_train[f].interpolate()

'插值法：用插值法拟合出缺失的数据，然后进行填充。'

+ 建模预测 随机森林

一般情况下，会使用数据完整的条目作为模型的训练集，以此来预测缺失值。对于当前的这个数据，可以使用随机森林来预测也可以使用线性回归预测。这里使用随机森林预测模型，选取数据集中的数值属性作为特征（因为sklearn的模型只能处理数值属性，所以这里先仅选取数值特征，但在实际的应用中需要将非数值特征转换为数值特征）

In [10]:
# from sklearn.ensemble import RandomForestRegressor

# #choose training data to predict age
# age_df = data_train[['Age','Survived','Fare', 'Parch', 'SibSp', 'Pclass']]
# age_df_notnull = age_df.loc[(data_train['Age'].notnull())]
# age_df_isnull = age_df.loc[(data_train['Age'].isnull())]
# X = age_df_notnull.values[:,1:]
# Y = age_df_notnull.values[:,0]

# # use RandomForestRegression to train data
# RFR = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
# RFR.fit(X,Y)
# predictAges = RFR.predict(age_df_isnull.values[:,1:])
# data_train.loc[train_data['Age'].isnull(), ['Age']]= predictAges

# 异常值处理

### 均方差

如果一个数据分布近似正态，那么大约 68% 的数据值会在均值的一个标准差范围内，大约 95% 会在 两个标准差范围内，大约 99.7% 会在三个标准差范围内。

In [11]:
def find_outliers_by_3segama(data,fea):    
    data_std = np.std(data[fea])    
    data_mean = np.mean(data[fea])   
    outliers_cut_off = data_std * 3    
    lower_rule = data_mean - outliers_cut_off    
    upper_rule = data_mean + outliers_cut_off    
    data[fea+'_outliers'] = data[fea].apply(lambda x:str('异常值') if x > upper_rule or x < lower_rule else '正常值')    
    return data

In [13]:
data_train = data_train.copy() 
for fea in numerical_fea:    
    data_train = find_outliers_by_3segama(data_train,fea)    
    print(data_train[fea+'_outliers'].value_counts())    
    print(data_train.groupby(fea+'_outliers')['isDefault'].sum())    
    print('*'*10)


正常值    800000
Name: id_outliers, dtype: int64
id_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    800000
Name: loanAmnt_outliers, dtype: int64
loanAmnt_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    800000
Name: term_outliers, dtype: int64
term_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    794259
异常值      5741
Name: interestRate_outliers, dtype: int64
interestRate_outliers
异常值      2916
正常值    156694
Name: isDefault, dtype: int64
**********
正常值    792046
异常值      7954
Name: installment_outliers, dtype: int64
installment_outliers
异常值      2152
正常值    157458
Name: isDefault, dtype: int64
**********
正常值    800000
Name: employmentTitle_outliers, dtype: int64
employmentTitle_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    799701
异常值       299
Name: homeOwnership_outliers, dtype: int64
homeOwnership_outliers
异常值        62
正常值    159548
Name: isDefault, dtype: int64
**********
正常值    793973
异常值      