In [1]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Read the training CSV (decoded as latin-1) and print a column/dtype summary.
data = pd.read_csv(filepath_or_buffer='./train.csv', encoding='latin-1')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87020 entries, 0 to 87019
Data columns (total 26 columns):
ID                       87020 non-null object
Gender                   87020 non-null object
City                     86017 non-null object
Monthly_Income           87020 non-null int64
DOB                      87020 non-null object
Lead_Creation_Date       87020 non-null object
Loan_Amount_Applied      86949 non-null float64
Loan_Tenure_Applied      86949 non-null float64
Existing_EMI             86949 non-null float64
Employer_Name            86949 non-null object
Salary_Account           75256 non-null object
Mobile_Verified          87020 non-null object
Var5                     87020 non-null object
Var1                     87019 non-null object
Loan_Amount_Submitted    52407 non-null float64
Loan_Tenure_Submitted    52407 non-null float64
Interest_Rate            27726 non-null float64
Processing_Fee           27420 non-null float64
EMI_Loan_Submitted       27727 non-null

In [3]:
# Number of missing values per column, smallest first.
data.isna().sum().sort_values()

ID                           0
Var4                         0
Source                       0
Var2                         0
Device_Type                  0
Filled_Form                  0
LoggedIn                     0
Mobile_Verified              0
Var5                         0
Gender                       0
Monthly_Income               0
DOB                          0
Lead_Creation_Date           0
Disbursed                    1
Var1                         1
Existing_EMI                71
Employer_Name               71
Loan_Amount_Applied         71
Loan_Tenure_Applied         71
City                      1003
Salary_Account           11764
Loan_Tenure_Submitted    34613
Loan_Amount_Submitted    34613
EMI_Loan_Submitted       59293
Interest_Rate            59294
Processing_Fee           59600
dtype: int64

### 特征工程

+ City variable dropped because of too many categories
+ DOB converted to Age | DOB dropped
+ EMI_Loan_Submitted_Missing created which is 1 if EMI_Loan_Submitted was missing else 0 | Original variable EMI_Loan_Submitted dropped
+ Employer_Name dropped because of too many categories
+ Existing_EMI imputed with 0 (median) since only 71 values were missing
+ Interest_Rate_Missing created which is 1 if Interest_Rate was missing else 0 | Original variable Interest_Rate dropped
+ Lead_Creation_Date dropped because made little intuitive impact on outcome
+ Loan_Amount_Applied, Loan_Tenure_Applied imputed with median values
+ Loan_Amount_Submitted_Missing created which is 1 if Loan_Amount_Submitted was missing else 0 | Original variable Loan_Amount_Submitted dropped
+ Loan_Tenure_Submitted_Missing created which is 1 if Loan_Tenure_Submitted was missing else 0 | Original variable Loan_Tenure_Submitted dropped
+ LoggedIn, Salary_Account dropped
+ Processing_Fee_Missing created which is 1 if Processing_Fee was missing else 0 | Original variable Processing_Fee dropped
+ Source – top 2 kept as is and all others combined into different category
+ Label (numerical) encoding and one-hot encoding performed

In [4]:
# Remove the City column in place.
data.drop(columns=['City'], inplace=True)

In [5]:
# Convert date of birth to age: take the last two characters of DOB as a
# two-digit year and subtract it from 120, then drop the DOB column.
birth_yy = data['DOB'].str[-2:].astype('int64')
data['Age'] = 120 - birth_yy
data.drop(columns='DOB', inplace=True)
data['Age'].head(5)

0    42
1    35
2    39
3    33
4    36
Name: Age, dtype: int64

In [6]:
# Replace EMI_Loan_Submitted with a 0/1 indicator of whether it was missing.
data['EMI_Loan_Submitted_Missing'] = data['EMI_Loan_Submitted'].isna().astype('int64')
data.drop(columns=['EMI_Loan_Submitted'], inplace=True)
data['EMI_Loan_Submitted_Missing'].head()

0    1
1    0
2    1
3    1
4    1
Name: EMI_Loan_Submitted_Missing, dtype: int64

In [7]:
# Remove the Employer_Name column in place.
data.drop(columns=['Employer_Name'], inplace=True)

In [8]:
# Impute missing Existing_EMI values with 0.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on data['Existing_EMI']: that chained in-place form
# operates on a temporary object under pandas Copy-on-Write and leaves the
# frame unchanged, and the warning about it is hidden by the notebook's
# warnings.filterwarnings('ignore').
data['Existing_EMI'] = data['Existing_EMI'].fillna(0)
data['Existing_EMI'].head()

0        0.0
1        0.0
2        0.0
3        0.0
4    25000.0
Name: Existing_EMI, dtype: float64

In [9]:
# Replace Interest_Rate with a 0/1 indicator of whether it was missing.
data['Interest_Rate_Missing'] = data['Interest_Rate'].isna().astype('int64')
data.drop(columns=['Interest_Rate'], inplace=True)
data['Interest_Rate_Missing'].head()

0    1
1    0
2    1
3    1
4    1
Name: Interest_Rate_Missing, dtype: int64

In [10]:
# Remove the Lead_Creation_Date column in place.
data.drop(columns=['Lead_Creation_Date'], inplace=True)

In [11]:
# Impute the applied loan amount and tenure with their column medians.
# The result is assigned back to the column instead of using the chained
# data[col].fillna(..., inplace=True) form, which does not modify the frame
# under pandas Copy-on-Write (and whose warning is suppressed in this notebook).
data['Loan_Amount_Applied'] = data['Loan_Amount_Applied'].fillna(data['Loan_Amount_Applied'].median())
data['Loan_Tenure_Applied'] = data['Loan_Tenure_Applied'].fillna(data['Loan_Tenure_Applied'].median())

In [12]:
# Replace the submitted loan amount/tenure with 0/1 missing-value indicators.
data['Loan_Amount_Submitted_Missing'] = data['Loan_Amount_Submitted'].isna().astype('int64')
data['Loan_Tenure_Submitted_Missing'] = data['Loan_Tenure_Submitted'].isna().astype('int64')
data.drop(columns=['Loan_Amount_Submitted', 'Loan_Tenure_Submitted'], inplace=True)

In [13]:
# Remove the LoggedIn and Salary_Account columns in place.
data.drop(columns=['LoggedIn', 'Salary_Account'], inplace=True)

In [14]:
# Replace Processing_Fee with a 0/1 indicator of whether it was missing.
data['Processing_Fee_Missing'] = data['Processing_Fee'].isna().astype('int64')
data.drop(columns='Processing_Fee', inplace=True)

In [15]:
# Keep the sources S122 and S133 as-is and collapse every other value into 'others'.
kept_sources = ['S122', 'S133']
data['Source'] = data['Source'].where(data['Source'].isin(kept_sources), 'others')
data['Source'].value_counts()

S122      38566
S133      29885
others    18569
Name: Source, dtype: int64

In [16]:
# Drop the ID column, then remove every row that still contains a missing value.
data.drop(columns=['ID'], inplace=True)
data.dropna(axis='index', inplace=True)

In [17]:
# Print a summary (columns, non-null counts, dtypes) of the current frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87019 entries, 0 to 87019
Data columns (total 20 columns):
Gender                           87019 non-null object
Monthly_Income                   87019 non-null int64
Loan_Amount_Applied              87019 non-null float64
Loan_Tenure_Applied              87019 non-null float64
Existing_EMI                     87019 non-null float64
Mobile_Verified                  87019 non-null object
Var5                             87019 non-null object
Var1                             87019 non-null object
Filled_Form                      87019 non-null object
Device_Type                      87019 non-null object
Var2                             87019 non-null object
Source                           87019 non-null object
Var4                             87019 non-null int64
Disbursed                        87019 non-null float64
Age                              87019 non-null int64
EMI_Loan_Submitted_Missing       87019 non-null int64
Interest_Ra

In [18]:
# Show the first five rows, transposed so each column becomes a row.
data.head(5).transpose()

Unnamed: 0,0,1,2,3,4
Gender,Female,Male,Male,Male,Male
Monthly_Income,20000,35000,22500,35000,100000
Loan_Amount_Applied,300000,200000,600000,1e+06,500000
Loan_Tenure_Applied,5,2,4,5,2
Existing_EMI,0,0,0,0,25000
Mobile_Verified,N,Y,Y,Y,Y
Var5,0,13,0,10,17
Var1,HBXX,HBXA,HBXX,HBXX,HBXX
Filled_Form,N,N,N,N,N
Device_Type,Web-browser,Web-browser,Web-browser,Web-browser,Web-browser


In [19]:
# Numerical (label) encoding of the categorical columns.
from sklearn.preprocessing import LabelEncoder

var_to_encode = [
    'Device_Type',
    'Filled_Form',
    'Gender',
    'Var1',
    'Var2',
    'Mobile_Verified',
    'Source',
]
le = LabelEncoder()
for col in var_to_encode:
    # The integer codes are stored with object dtype.
    codes = le.fit_transform(data[col])
    data[col] = codes.astype('object')

In [20]:
# One-hot encode the object-typed columns.
# The dummy dtype is pinned to uint8: pandas >= 2.0 emits bool dummies by
# default, and a frame mixing bool with int/float columns yields an
# object-dtype array from `.values`, which is not a valid numeric matrix.
data = pd.get_dummies(data, dtype=np.uint8)

In [21]:
# Show the (rows, columns) shape of the frame.
data.shape

(87019, 86)

### 模型训练

In [22]:
# Feature matrix: every column except the target; target vector: Disbursed.
feature_mask = data.columns != 'Disbursed'
x = data.loc[:, feature_mask].values
y = data['Disbursed'].values

In [23]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

（1）设置默认的模型常数，根据一定的学习率得到最优迭代次数

In [26]:
# Baseline hyper-parameters used to search for the number of boosting rounds.
base_params = dict(
    learning_rate=0.1,             # typical range: 0.01-0.2
    n_estimators=200,              # upper bound on boosting rounds; start generously
    max_depth=5,                   # typical range: 3-10
    min_child_weight=1,            # minimum sum of instance weight in a child; 1 lets trees grow freely
    gamma=0,                       # no minimum loss reduction required to split
    subsample=0.8,                 # row sampling ratio per tree, typical range: 0.5-1
    colsample_bytree=0.8,          # feature sampling ratio per tree, typical range: 0.5-1
    objective='binary:logistic',   # logistic loss for binary classification
    nthread=1,                     # run single-threaded
    scale_pos_weight=10,           # weight positives 10x because the classes are highly imbalanced
    seed=27,
)
xgb_model = XGBClassifier(**base_params)

In [27]:
def search_best_rounds(model:XGBClassifier,dtrain,nfold=5,metrics='auc',
                       early_stopping_rounds=10,verbose_eval=True):
    """Estimate the number of boosting rounds with cross-validation.

    Runs ``xgb.cv`` with the booster parameters of ``model`` for at most
    ``model``'s ``n_estimators`` rounds and returns the number of rows in the
    resulting evaluation history.

    Parameters
    ----------
    model : XGBClassifier
        Estimator whose ``get_xgb_params()`` and ``n_estimators`` are used.
    dtrain : xgb.DMatrix
        Training data passed to ``xgb.cv``.
    nfold : int, default 5
        Number of cross-validation folds.
    metrics : str or list of str, default 'auc'
        Evaluation metric(s) forwarded to ``xgb.cv``.
    early_stopping_rounds : int, default 10
        Early-stopping patience forwarded to ``xgb.cv``.
    verbose_eval : bool or int, default True
        Progress printing option forwarded to ``xgb.cv``.

    Returns
    -------
    int
        ``cv_results.shape[0]``, i.e. the number of rounds kept in the
        cross-validation history.
    """
    params=model.get_xgb_params()
    max_rounds=model.get_params()['n_estimators']
    cv_results=xgb.cv(params,dtrain,num_boost_round=max_rounds,
           nfold=nfold,metrics=metrics,
           early_stopping_rounds=early_stopping_rounds,
           verbose_eval=verbose_eval)
    return cv_results.shape[0]

# Wrap the features/labels in a DMatrix and look up the round count by CV.
dtrain = xgb.DMatrix(data=x, label=y)
best_rounds = search_best_rounds(model=xgb_model, dtrain=dtrain)
best_rounds

[0]	train-auc:0.76433+0.00429	test-auc:0.74521+0.01333
[1]	train-auc:0.82244+0.01435	test-auc:0.79760+0.01971
[2]	train-auc:0.83127+0.01378	test-auc:0.80740+0.01727
[3]	train-auc:0.83583+0.01277	test-auc:0.81103+0.01654
[4]	train-auc:0.84379+0.00390	test-auc:0.81765+0.01056
[5]	train-auc:0.84734+0.00363	test-auc:0.82108+0.00994
[6]	train-auc:0.84973+0.00305	test-auc:0.82318+0.01033
[7]	train-auc:0.85160+0.00289	test-auc:0.82435+0.00999
[8]	train-auc:0.85249+0.00315	test-auc:0.82488+0.01003
[9]	train-auc:0.85492+0.00297	test-auc:0.82574+0.01032
[10]	train-auc:0.85585+0.00301	test-auc:0.82581+0.01076
[11]	train-auc:0.85745+0.00267	test-auc:0.82569+0.01066
[12]	train-auc:0.85886+0.00278	test-auc:0.82616+0.01092
[13]	train-auc:0.86014+0.00269	test-auc:0.82636+0.01119
[14]	train-auc:0.86106+0.00258	test-auc:0.82734+0.01101
[15]	train-auc:0.86199+0.00247	test-auc:0.82769+0.01134
[16]	train-auc:0.86295+0.00224	test-auc:0.82786+0.01152
[17]	train-auc:0.86394+0.00219	test-auc:0.82801+0.01139
[1

118

In [None]:
from sklearn.metrics import accuracy_score,roc_auc_score

# Refit on the full data set with the round count chosen by cross-validation.
xgb_model.set_params(n_estimators=best_rounds)
# `eval_metric` is no longer passed to fit(): without an eval_set it has no
# effect on training, and the fit-time argument is deprecated/removed in
# newer xgboost releases.
xgb_model.fit(x,y)

# NOTE: these scores are computed on the same rows the model was fitted on,
# so they are in-sample (optimistic) figures, not a generalization estimate.
pred=xgb_model.predict(x)
prob=xgb_model.predict_proba(x)[:,1]
print('accuracy：',accuracy_score(y,pred))
print('auc score:',roc_auc_score(y,prob))

（2）max_depth和min_child_weight参数调优

In [None]:
# Base estimator for tuning max_depth and min_child_weight.
# NOTE(review): n_estimators=167 and scale_pos_weight=1 differ from the
# step (1) model (best_rounds output / scale_pos_weight=10) — confirm which
# configuration is intended before relying on these results.
tuning_params = dict(
    learning_rate=0.1,             # typical range: 0.01-0.2
    n_estimators=167,              # fixed number of boosting rounds for this search
    max_depth=5,                   # typical range: 3-10 (overridden by the grid)
    min_child_weight=1,            # overridden by the grid
    gamma=0,                       # no minimum loss reduction required to split
    subsample=0.8,                 # row sampling ratio per tree, typical range: 0.5-1
    colsample_bytree=0.8,          # feature sampling ratio per tree, typical range: 0.5-1
    objective='binary:logistic',   # logistic loss for binary classification
    nthread=1,                     # run single-threaded
    scale_pos_weight=1,            # no re-weighting of the positive class
    seed=27,
)
xgb_model = XGBClassifier(**tuning_params)

# Candidate values: max_depth in {3, 5, 7, 9}, min_child_weight in {1, 3, 5}.
param_test1 = {
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
}

# 5-fold grid search scored by ROC AUC.
gs = GridSearchCV(estimator=xgb_model, param_grid=param_test1, scoring='roc_auc', cv=5)
gs.fit(x, y)
print(gs.best_params_)
print(gs.best_score_)

接下来依次进行gamma参数调优、subsample和colsample_bytree参数调优、
正则化参数调优。然后降低学习率，增大迭代次数，重复进行
