In [208]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
 
# Load the training and test sets.
train = pd.read_csv("./train.csv")
test = pd.read_csv("./testA.csv")

# Columns with exactly one distinct (non-null) value in train carry no signal;
# collect them first, then drop them from both frames in one call each.
features = train.columns.tolist()
constant_cols = [col for col in features if train[col].nunique() == 1]
train.drop(columns=constant_cols, inplace=True)
test.drop(columns=constant_cols, inplace=True)
print('删除全部唯一值为1字段完成')

# Fill missing values in every non-object column (except the target) with the
# column median.
numerical_fea = list(train.select_dtypes(exclude=['object']).columns)
numerical_fea.remove('isDefault')
# The medians are computed once, on train, and reused for test so that a
# missing value is replaced by the same number in both frames (imputing test
# with its own medians would encode "missing" differently at predict time).
train_medians = train[numerical_fea].median()
train[numerical_fea] = train[numerical_fea].fillna(train_medians)
test[numerical_fea] = test[numerical_fea].fillna(train_medians)
print('数值类型缺失值,中位数填充完成')

from scipy import stats 
# Fill missing values in object (categorical) columns with the most frequent
# value of that column in train.
#
# pandas' Series.mode is used instead of scipy.stats.mode: recent SciPy
# releases reject non-numeric input and no longer return a result that can be
# indexed with [0][0]. Series.mode ignores NaN and returns the modes sorted,
# so position 0 is a deterministic choice when several values tie.
cat_fea = list(train.select_dtypes(include=['object']).columns)
for cf in cat_fea:
    if train[cf].isnull().sum():
        train_mode = train[cf].mode()
        # An empty result means the column has no non-null value to fill with.
        if not train_mode.empty:
            train[cf] = train[cf].fillna(train_mode.iloc[0])
cat_fea = list(test.select_dtypes(include=['object']).columns)
for cf in cat_fea:
    if test[cf].isnull().sum():
        # Reuse the train mode so both frames are imputed with the same value.
        train_mode = train[cf].mode()
        if not train_mode.empty:
            test[cf] = test[cf].fillna(train_mode.iloc[0])
print('类别类型缺失值,众数填充完成')

import datetime
def create_days_diff(selected_cols):
    """Add a ``<col>_diff`` day-offset column to ``train`` and ``test``.

    For every name in ``selected_cols`` the column is converted to datetime
    in both module-level frames (in place), and a new ``<col>_diff`` column
    is written holding the number of whole days between each value and a
    reference date.

    The reference date is midnight of the earliest date found in ``train``
    and is shared by both frames, so the same calendar date produces the
    same offset in train and test. (Using a separate origin per frame would
    shift the test feature whenever the two minimum dates differ.) Test
    dates earlier than the train minimum yield negative offsets.

    Args:
        selected_cols: iterable of column names present in both ``train``
            and ``test`` whose values ``pd.to_datetime`` can parse.
    """
    for selected in selected_cols:
        train[selected] = pd.to_datetime(train[selected])
        test[selected] = pd.to_datetime(test[selected])

        # Midnight of the earliest train date; normalising drops any
        # time-of-day component before taking the minimum.
        startdate = train[selected].dt.normalize().min()

        diff_col = selected + '_diff'
        train[diff_col] = (train[selected] - startdate).dt.days
        test[diff_col] = (test[selected] - startdate).dt.days
        print(selected+'_diff'+' 时间差字段 已经创建')
def create_ym_features(selected_cols):
    """Add ``<col>_year`` and ``<col>_month`` columns to ``train`` and ``test``.

    For every name in ``selected_cols`` the test column is converted to
    datetime in place; the train column itself is left as it is and is only
    parsed through a ``DatetimeIndex`` to read its year and month. A message
    is printed after each new pair of columns is written.

    Args:
        selected_cols: iterable of date column names present in both
            module-level frames.
    """
    for col in selected_cols:
        test[col] = pd.to_datetime(test[col])
        train_idx = pd.DatetimeIndex(train[col])
        test_idx = pd.DatetimeIndex(test[col])

        # Year first, then month, so the new columns keep that order.
        for part in ('year', 'month'):
            new_col = col + '_' + part
            train[new_col] = getattr(train_idx, part)
            test[new_col] = getattr(test_idx, part)
            print(new_col + '字段 已经创建')
        
# Derive year/month and day-offset features from the two date columns.
selected_cols = ['issueDate','earliesCreditLine']
create_ym_features(selected_cols)
create_days_diff(selected_cols)

# Lookup tables shared by train and test; values absent from a table map to NaN.
grade_codes = {'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7}
employment_years = {
    '1 year':1,'2 years':2,'3 years':3,'4 years':4,'5 years':5,
    '6 years':6,'7 years':7,'8 years':8,'9 years':9,'10+ years':10,
    '< 1 year':0,
}
subgrade_codes = {
    'E2':1,'D2':2,'D3':3,'A4':4,'C2':5,'A5':6,'C3':7,'B4':8,'B5':9,'E5':10,
    'D4':11,'B3':12,'B2':13,'D1':14,'E1':15,'C5':16,'C1':17,'A2':18,'A3':19,'B1':20,
    'E3':21,'F1':22,'C4':23,'A1':24,'D5':25,'F2':26,'E4':27,'F3':28,'G2':29,'F5':30,
    'G3':31,'G1':32,'F4':33,'G4':34,'G5':35,
}

# Apply the same encodings to train first, then to test.
for data in [train, test]:
    # Loan grade
    data['grade'] = data['grade'].map(grade_codes)
    # Employment length (years)
    data['employmentLength'] = data['employmentLength'].map(employment_years)
    # Loan sub-grade
    data['subGrade'] = data['subGrade'].map(subgrade_codes)
    # Total number of credit lines in the borrower's credit file divided by
    # the loan amount
    data['rato'] = data['totalAcc'] / data['loanAmnt']
# Keep the test ids for the submission file before the id column is removed.
result = test[['id']].copy()
# Columns to drop by hand: the raw date columns and the id.
features = ['issueDate', 'earliesCreditLine','id']
train.drop(columns=features, inplace=True)
test.drop(columns=features, inplace=True)
print('批量删除字段已完成')

# CatBoost model
model = CatBoostClassifier(
    loss_function="Logloss",    # objective for binary classification
    # Metric used by the overfitting detector and for best-model selection.
    # Set to AUC so that early stopping optimises the same quantity that is
    # reported for every fold with roc_auc_score (it was "Accuracy").
    eval_metric="AUC",
    learning_rate=0.08,         # learning rate
    iterations=10000,
    random_seed=42,             # fixed seed for reproducibility
    od_type="Iter",
    metric_period=20,           # compute metric values every 20 iterations
    max_depth=8,                # maximum tree depth
    early_stopping_rounds=500,  # early-stopping patience, in iterations
    use_best_model=True,
    # task_type="GPU",          # data set is small, GPU speed-up is negligible
    bagging_temperature=0.9,
    leaf_estimation_method="Newton",
)

 
# Stratified 10-fold cross-validation: fit on each training split, report the
# validation AUC, and average the test-set probabilities over all folds.
n_folds = 10
answers = []      # one array of test-set probabilities per fold
mean_score = 0    # running mean of the per-fold validation AUC
data_x = train.drop(['isDefault'], axis=1)
data_y = train[['isDefault']].copy()
sk = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2021)
all_test = test.copy()
# The split indices get their own names: unpacking them into `train` / `test`
# would overwrite the module-level DataFrames with index arrays.
for train_idx, valid_idx in sk.split(data_x, data_y):
    x_train, y_train = data_x.iloc[train_idx], data_y.iloc[train_idx]
    x_valid, y_valid = data_x.iloc[valid_idx], data_y.iloc[valid_idx]
    # verbose=500: print one log line every 500 iterations
    clf = model.fit(x_train, y_train, eval_set=(x_valid, y_valid), verbose=500)

    # The last column of the probability output is taken as the score.
    valid_pred = clf.predict(x_valid, prediction_type='Probability')[:, -1]
    fold_auc = roc_auc_score(y_valid, valid_pred)
    print('cat验证的auc:{}'.format(fold_auc))
    mean_score += fold_auc / n_folds

    test_pred = clf.predict(all_test, prediction_type='Probability')[:, -1]
    answers.append(test_pred)
print('mean valAuc:{}'.format(mean_score))
cat_pre = sum(answers) / n_folds
result['isDefault'] = cat_pre
result.to_csv('./baselinev1.csv', index=False)

0:	test: 0.6983260	best: 0.6983260 (0)	total: 97.4ms	remaining: 1m 37s
500:	test: 0.7355001	best: 0.7355001 (500)	total: 44.2s	remaining: 44.1s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7359020243
bestIteration = 564

Shrink model to first 565 iterations.
cat验证的auc:0.7359020243129812
0:	test: 0.6953806	best: 0.6953806 (0)	total: 141ms	remaining: 2m 20s
500:	test: 0.7388230	best: 0.7388230 (500)	total: 44.6s	remaining: 44.5s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7405412741
bestIteration = 912

Shrink model to first 913 iterations.
cat验证的auc:0.7405412740510757
0:	test: 0.6924044	best: 0.6924044 (0)	total: 96.3ms	remaining: 1m 36s
500:	test: 0.7331256	best: 0.7331256 (500)	total: 44.2s	remaining: 44.1s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.7332054353
bestIteration = 531

Shrink model to first 532 iterations.
cat验证的auc:0.7332054353324311
0:	test: 0.6948469	best: 0.6948469 (0)	total: 114ms	remaining: 1m 53

KeyboardInterrupt: 