In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import math
import datetime
import logging
import warnings
from pathlib import Path 
from datetime import datetime

import pandas as pd 
import numpy as np 
from tqdm import tqdm

from utils import *

pd.options.display.max_columns = None
pd.set_option('display.float_format', lambda x: '%.4f' % x)
warnings.filterwarnings('ignore')

In [2]:
logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')
# 数据集目录
data_path = Path(r'data/tc')

In [3]:
train = pd.read_csv(f"{data_path}/train.csv")
testA = pd.read_csv(f"{data_path}/testA.csv")
logging.info(f"train shape: {train.shape}")
logging.info(f"testA shape: {testA.shape}")
train.head()

2021-01-26 08:56:30,099 INFO: train shape: (800000, 47)
2021-01-26 08:56:30,108 INFO: testA shape: (200000, 46)


Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14
0,0,35000.0,5,19.52,917.97,E,E2,320.0,2 years,2,110000.0,2,2014-07-01,1,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,Aug-2001,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0
1,1,18000.0,5,18.49,461.9,D,D2,219843.0,5 years,0,46000.0,2,2012-08-01,0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,May-2002,1723.0,1.0,,,,,10.0,,,,,,13.0,,,,
2,2,12000.0,5,16.99,298.17,D,D3,31698.0,8 years,0,74000.0,2,2015-10-01,0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,May-2006,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0
3,3,11000.0,3,7.26,340.96,A,A4,46854.0,10+ years,1,118000.0,1,2015-08-01,0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,May-1999,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0
4,4,3000.0,3,12.99,101.07,C,C2,54.0,,1,29000.0,2,2016-03-01,0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,Aug-1977,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0


### 特征预处理

In [4]:
# grade 特征处理

grade_dic = {"A":0, "B":1, "C":2, "D":3, "E":4, "F":5, "G":6}
for data in [train, testA]:
    data.grade = data.grade.map(grade_dic, na_action='ignore')  # 将grade改为数字，不出来NA值
    data.grade.astype('category')

In [5]:
# subGrade 特征处理

subgrade_dic = {"A1":0,"A2":1,"A3":2,"A4":3,"A5":4,"B1":5,"B2":6,"B3":7,"B4":8,"B5":9,"C1":10,"C2":11,"C3":12,"C4":13,"C5":14,"D1":15,"D2":16,"D3":17,"D4":18,"D5":19,"E1":20,"E2":21,"E3":22,"E4":23,"E5":24,"F1":25,"F2":26,"F3":27,"F4":28,"F5":29,"G1":30,"G2":31,"G3":32,"G4":33,"G5":34}

for data in [train, testA]:
    data.subGrade = data.subGrade.map(subgrade_dic, na_action='ignore')
    data.subGrade.astype('category')

In [6]:
# employmentLength 特征处理

emp_dic = {"2 years":2, "5 years":5, "8 years":8, "10+ years":10, "7 years":7,"9 years":9, "1 year":1, "3 years":3, "< 1 year":0, "4 years":4, "6 years":6}

for data in [train, testA]:
    data.employmentLength = data.employmentLength.map(emp_dic, na_action='ignore')
    data.employmentLength.astype('category')

In [7]:
# 时间格式 issueDate + earliesCreditLine
# issueDate 贷款发放的月份
# earliesCreditLine 借款人最早报告的信用额度开立的月份

for data in [train, testA]:
    t1 = pd.to_datetime(data.issueDate)  # 2014-07-01
    t2 = pd.to_datetime(data.earliesCreditLine)  # 2001-08-01
    years_between_issueDate_and_earliesCreditLine = round(abs((t1-t2).apply(lambda x: x.days))/365, 0)
    issueDate_to_now_days = t1.apply(lambda x: (datetime.now() - x).days)
    earliesCreditLine_to_now_days = t2.apply(lambda x: (datetime.now() - x).days)
    data['years_between_issueDate_and_earliesCreditLine'] = years_between_issueDate_and_earliesCreditLine
    data['issueDate_to_now_days'] = issueDate_to_now_days
    data['earliesCreditLine_to_now_days'] = earliesCreditLine_to_now_days

2021-01-26 08:56:36,954 INFO: NumExpr defaulting to 2 threads.


In [8]:
# 特征构造 ficoRange = ficoRangeHigh - ficoRangeLow
# ficoRangeLow 借款人在贷款发放时的fico所属的下限范围
# ficoRangeHigh 借款人在贷款发放时的fico所属的上限范围

for data in [train, testA]:
    ficoRange = data.ficoRangeHigh - data.ficoRangeLow
    data['ficoRange'] = ficoRange
    data.ficoRange.astype('category')

In [9]:
# 删除唯一值特征 policyCode

for data in [train, testA]:
    data = data.drop('policyCode', axis=1)

In [10]:
# 债务收入比 dti 不可能为负
for data in [train, testA]:
    data.dti[data.dti<0] = 0

In [11]:
# 业务变量构造

for data in [train, testA]:
    # 余额 = 信贷周转余额合计 + 借款人信用档案中当前的信用额度总数  = revolBal+totalAcc
    revolBal_p_totalAcc = data.revolBal + data.totalAcc

    # 信用已用额度 = 借款人信用档案中当前的信用额度总数 - 借款人信用档案中未结信用额度的数量 = totalAcc-openAcc
    totalAcc_m_openAcc = data.totalAcc - data.openAcc

    # 年还款 = 贷款金额 / 贷款期限（year） = loanAmnt/term 
    loanAmnt_term = data.loanAmnt / data.term

    # 年收入/年还款
    annualIncome_loanAmnt_term = data.annualIncome / loanAmnt_term

    # 债务 = 债务收入比 * 年收入 = dti*annualIncome
    debt = data.dti * data.annualIncome

    # 利息 = 贷款利率 * 贷款期限（year） =  interestRate*term
    pro = data.interestRate * data.term

    # 分期付款金额 / 年收入
    installment_annualIncome = data.installment / data.annualIncome

    # 人均贷款金额 = 贷款总额 / 表明贷款是个人申请还是与两个共同借款人的联合申请
    loanAmnt_applicationType = data.loanAmnt/(1+data.applicationType)

    # --------- 赋值
    data["revolBal_p_totalAcc"] = revolBal_p_totalAcc
    data["totalAcc_m_openAcc"] = totalAcc_m_openAcc
    data["loanAmnt_term"] = loanAmnt_term
    data["annualIncome_loanAmnt_term"] = annualIncome_loanAmnt_term
    data["debt"] = debt
    data["pro"] = pro
    data["installment_annualIncome"] = installment_annualIncome
    data["loanAmnt_applicationType"] = loanAmnt_applicationType

    eny_num = data.earliesCreditLine_to_now_days/(data.years_between_issueDate_and_earliesCreditLine)
    int_sub = data.interestRate/data.subGrade
    pro_dti = data.pro/data.dti
    top1 = data.issueDate_to_now_days*data.ficoRangeHigh

    data["eny_num"] = eny_num
    data["int_sub"] = int_sub
    data["pro_dti"] = pro_dti
    data["top1"] = top1

    # pubRec	贬损公共记录的数量
    # pubRecBankruptcies	公开记录清除的数量

    rec = data.pubRec - data.pubRecBankruptcies
    rec_rate = data.pubRecBankruptcies/(1+data.pubRec)

    data['rec'] = rec 
    data['rec_rate'] = rec_rate

In [12]:
# 处理 N group
n_feat = ['n0', 'n1', 'n2', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14', ]
nameList = ['min', 'max', 'sum', 'mean', 'median', 'skew', 'std', 'mode', 'range', 'Q25','Q75']
statList = ['min', 'max', 'sum', 'mean', 'median', 'skew', 'std', myMode, myRange, myQ25, myQ75]

for data in [train, testA]:
    for i in range(len(nameList)):
	    data['n_feat_{}'.format(nameList[i])] = data[n_feat].agg(statList[i], axis=1)

### Log化处理数值特征

In [1]:
# 拆分数字特征和离散特征列表
def num_cat_cols(x, THRESH=50):
    a = x.apply(lambda x: x.nunique())
    num_cols = a[a >THRESH].index.tolist()
    cat_cols = a[a <=THRESH].index.tolist()
    return num_cols, cat_cols

In [2]:
num_cols, cat_cols = num_cat_cols(train))

NameError: name 'total_data' is not defined

In [None]:
num_cols.remove('id')
num_cols.remove('issueDate')
num_cols.remove('earliesCreditLine')

In [None]:
# 对连续变量log化处理
for data in [train, testA]:
    for col in num_cols:
        data[col] = data[col].apply(lambda x: math.log(np.float(x)+1)

In [None]:
train['sample'] = 'train'
testA['sample'] = 'test'

# 关联train + test 一起做特征处理
data = pd.concat([train, testA], axis=0, ignore_index=True)
data.to_csv(f"data/data_for_model_v2.csv", index=False)
logging.info(f"combine data shape: {data.shape}")

### More try...

In [14]:
total_data = pd.concat([train.copy(), testA.copy()])
total_data.head()