In [None]:
import datetime
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn import metrics
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

warnings.filterwarnings('ignore')

In [None]:
# Load the training set and the test set ("testA") from CSV files,
# resolved relative to the current working directory.
data_train =pd.read_csv('train.csv')    
data_test_a = pd.read_csv('testA.csv')

In [None]:
# Columns treated as categorical throughout the notebook.
cate_features = ['employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode',
                 'applicationType', 'initialListStatus', 'title', 'policyCode']
label = 'isDefault'

# Everything that is not declared categorical, minus the target and the row id.
# Note: this is a complement of cate_features, not a dtype check, so any
# object-dtype column outside cate_features is still listed here.
numerical_fea = [i for i in data_train.columns if i not in cate_features]
numerical_fea.remove(label)
numerical_fea.remove('id')

# Categorical columns actually present in the training frame, in frame order.
# Later cells display this list and use it for mode imputation, so it has to
# be defined here.
category_fea = [i for i in data_train.columns if i in cate_features]

In [None]:
# Display the list of categorical feature names for inspection.
category_fea

In [None]:
# Display the list of feature names treated as numerical for inspection.
numerical_fea

In [None]:
# Fill numeric features with the TRAINING-set median (the same statistic is
# reused for the test set so both frames are imputed consistently).
# numeric_only=True skips any non-numeric column that is listed in
# numerical_fea; those columns are simply left untouched by fillna.
train_medians = data_train[numerical_fea].median(numeric_only=True)
data_train[numerical_fea] = data_train[numerical_fea].fillna(train_medians)
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(train_medians)

# Fill categorical features with the TRAINING-set mode.
# DataFrame.mode() returns a DataFrame (one row per tied mode); passing it to
# fillna would align on the row index and only patch row 0. Taking the first
# row yields a per-column Series, which fillna applies to every row.
train_modes = data_train[category_fea].mode().iloc[0]
data_train[category_fea] = data_train[category_fea].fillna(train_modes)
data_test_a[category_fea] = data_test_a[category_fea].fillna(train_modes)

In [None]:
# Count the remaining missing values per column of the training frame.
data_train.isnull().sum()

In [None]:
def employmentLength_to_int(s):
    """Convert an employment-length string such as ``'10 years'`` to ``np.int8``.

    The leading whitespace-separated token is parsed as the number of years.
    Missing values (anything ``pd.isnull`` reports as null) are returned
    unchanged so they can be imputed later.
    """
    if not pd.isnull(s):
        years_token = s.split()[0]
        return np.int8(years_token)
    return s
# Normalise the two irregular labels, then parse every entry to an integer
# number of years. The result is assigned back to the column explicitly:
# calling replace(..., inplace=True) on a selected column is chained mutation
# and is not guaranteed to write through to the parent frame.
for data in [data_train, data_test_a]:
    data['employmentLength'] = (
        data['employmentLength']
        .replace({'10+ years': '10 years', '< 1 year': '0 years'})
        .apply(employmentLength_to_int)
    )

In [None]:
#用前五后五的拉格朗日函数值去fillna
#定义插值函数,取值时数据范围不能超过data的边界
from scipy.interpolate import lagrange
def insert_value(s, n, k=5):
    """Estimate the value at position ``n`` of ``s`` by Lagrange interpolation.

    Up to ``k`` neighbours on each side of ``n`` are used; the window is
    clipped to the bounds of the series, and null neighbours are ignored.

    Parameters
    ----------
    s : pandas.Series
        Series containing the gap; accessed by position.
    n : int
        Position of the value to estimate.
    k : int, default 5
        Number of neighbours taken on each side of ``n``.

    Returns
    -------
    float
        The interpolated value rounded to one decimal place.
    """
    # Clip the window so that positions near either end (or a series shorter
    # than the window) never index outside the data.
    lower = max(0, n - k)
    upper = min(len(s), n + k + 1)
    positions = list(range(lower, n)) + list(range(n + 1, upper))

    window = s.iloc[positions]
    known = window.notnull().to_numpy()

    # Shift the abscissae so the target sits at 0: the polynomial is the same,
    # but small offsets keep the fit well-conditioned for large row numbers.
    xs = [p - n for p, ok in zip(positions, known) if ok]
    ys = window.to_numpy()[known].tolist()

    estimate = lagrange(xs, ys)(0)
    return round(estimate, 1)
    
def fillna_method(data, columns):
    """Fill missing values of ``columns`` in ``data`` in place using ``insert_value``.

    For every missing cell the interpolated estimate is mapped as follows:
    below 0 -> 0; below 10 -> the estimate itself; below 15 -> 10.
    Any other estimate (15 or more, or NaN) is printed together with the
    column name and row position, and processing of that column stops.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place; rows are addressed by position.
    columns : iterable of str
        Names of the columns to fill.
    """
    for col in columns:
        col_pos = data.columns.get_loc(col)
        # Positions that are missing before filling starts. Filling only ever
        # touches positions already visited, so this snapshot stays valid.
        missing_rows = np.flatnonzero(data[col].isnull().to_numpy())
        for row in missing_rows:
            row = int(row)
            # Re-read the column each time so earlier fills serve as neighbours.
            estimate = insert_value(data[col], row)
            if estimate < 0:
                filled = 0
            elif estimate < 10:
                filled = estimate
            elif estimate < 15:
                filled = 10
            else:
                print(col, row, estimate)
                break
            # Single positional write instead of chained data[col][row] = ...
            data.iloc[row, col_pos] = filled
                    
# Fill the remaining gaps in employmentLength by linear interpolation and
# round the result to whole years.
for data in [data_train, data_test_a]:
    interpolated = data['employmentLength'].interpolate(method='linear')
    data['employmentLength'] = interpolated.round()

In [None]:
# Summarise column dtypes and non-null counts of the training frame.
data_train.info()

In [None]:
# Standardise the numeric features (zero mean, unit variance).
import sklearn.preprocessing as preproc
cate_features = ['employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode',
                 'applicationType', 'initialListStatus', 'title', 'policyCode']
# Numeric here means: non-object dtype, not categorical, not the target, not the id.
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
numerical_fea = [i for i in numerical_fea if i not in cate_features]
numerical_fea.remove(label)
numerical_fea.remove('id')

# Fit the scaler on the training data only and reuse the same statistics for
# the test data; re-fitting on the test set would put the two frames on
# different scales.
zscore_scaler = preproc.StandardScaler()
data_train[numerical_fea] = zscore_scaler.fit_transform(data_train[numerical_fea])
data_test_a[numerical_fea] = zscore_scaler.transform(data_test_a[numerical_fea])

In [None]:
# Object-dtype features: ['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine'].
# 'employmentLength' has already been handled above.
# Date feature 'issueDate': replace each date by the number of days elapsed
# since the reference date 2007-06-01.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
for data in [data_train, data_test_a]:
    issued_on = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    data['issueDate'] = (issued_on - startdate).dt.days

#'earliesCreditLine'
from dateutil.relativedelta import relativedelta

def calculate_months_diff(x, reference='2023-12'):
    """Return the number of whole months from ``x`` to ``reference``.

    Parameters
    ----------
    x : str
        Month in ``'%b-%Y'`` form, e.g. ``'Sep-1999'``.
    reference : str, default '2023-12'
        Month in ``'%Y-%m'`` form that the difference is measured to.

    Returns
    -------
    int
        ``reference - x`` in months (negative if ``x`` is after ``reference``).
    """
    ref_month = datetime.datetime.strptime(reference, '%Y-%m')
    # Parse the argument itself rather than a fixed sample date.
    start_month = datetime.datetime.strptime(x, '%b-%Y')
    return (ref_month.year - start_month.year) * 12 + (ref_month.month - start_month.month)

# 'earliesCreditLine': convert each entry with calculate_months_diff.
for data in [data_train, data_test_a]:
    data['earliesCreditLine'] = data['earliesCreditLine'].apply(calculate_months_diff)

# Remaining object features.
# 'grade' is ordinal, so map the letters A..G to the integers 1..7.
grade_codes = dict(zip('ABCDEFG', range(1, 8)))
for data in [data_train, data_test_a]:
    data['grade'] = data['grade'].map(grade_codes)

# 'subGrade' is treated the same way: rank the sorted distinct training
# values starting from 1 and apply that mapping to both frames.
d = {value: rank for rank, value in enumerate(sorted(data_train['subGrade'].unique()), start=1)}
for data in [data_train, data_test_a]:
    data['subGrade'] = data['subGrade'].map(d)
    

In [None]:
# Categorical features: print the number of distinct values per column.
# The training frame is named explicitly instead of relying on `data`, which
# is only whatever frame the previous cell's loop happened to end on.
cate_features = ['employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode',
                 'applicationType', 'initialListStatus', 'title', 'policyCode']
for f in cate_features:
    print(f, '类型数：', data_train[f].nunique())

In [None]:
# One-hot encode the purely nominal features that have more than two levels
# but are not high-cardinality.
onehot_columns = ['homeOwnership', 'verificationStatus', 'purpose', 'regionCode']

# Pin each column to the category set observed in the TRAINING data before
# encoding. Both frames then produce exactly the same dummy columns and
# drop_first removes the same baseline level in both; a test value unseen in
# training becomes NaN and is encoded as all zeros.
for column in onehot_columns:
    train_levels = pd.CategoricalDtype(sorted(data_train[column].dropna().unique()))
    data_train[column] = data_train[column].astype(train_levels)
    data_test_a[column] = data_test_a[column].astype(train_levels)

data_train = pd.get_dummies(data_train, columns=onehot_columns, drop_first=True)
data_test_a = pd.get_dummies(data_test_a, columns=onehot_columns, drop_first=True)

In [None]:
# Show the training columns after one-hot encoding.
data_train.columns

In [None]:
# Show the test columns after one-hot encoding (compare with the training columns).
data_test_a.columns

In [None]:
#内存管理
import sys

def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable string with binary prefixes.

    Adapted from Fred Cirera, https://stackoverflow.com/a/1094933/1870254.

    The value is divided by 1024 until its magnitude drops below 1024 and is
    printed with one decimal, the matching prefix ('' .. 'Zi') and ``suffix``;
    anything larger than the 'Zi' range is reported in 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    step = 0
    while step < len(prefixes):
        if abs(value) < 1024.0:
            return "%3.1f %s%s" % (value, prefixes[step], suffix)
        value = value / 1024.0
        step += 1
    return "%.1f %s%s" % (value, 'Yi', suffix)

# Print the ten largest objects in the current namespace, biggest first,
# as reported by sys.getsizeof.
for name, size in sorted(
        ((var_name, sys.getsizeof(obj)) for var_name, obj in locals().items()),
        key=lambda entry: entry[1],
        reverse=True,
)[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

In [None]:
# Trigger a full garbage-collection pass to release unreferenced objects
# before the memory-heavy modelling cells.
import gc
gc.collect()

In [None]:
# Model inputs are all training columns except the row id and the target.
features = list(data_train.columns)
features.remove('id')
features.remove('isDefault')

# Feature matrices for training and for the test set, plus the target vector.
train_x = data_train.loc[:, features]
valid_x = data_test_a.loc[:, features]
train_y = data_train.loc[:, 'isDefault']

In [None]:
# Automated feature engineering with OpenFE.
from openfe import OpenFE, transform
ofe = OpenFE()
# Number of parallel jobs passed to both the search and the transform step.
n_jobs = 5
# Search for candidate features on the training data.
# NOTE: this rebinds `features`, which previously held the list of input column names.
features = ofe.fit(data=train_x, label=train_y, n_jobs=n_jobs)  # generate new features
# Apply the generated features to both frames; train_x and valid_x are
# replaced by their transformed versions.
train_x, valid_x = transform(train_x, valid_x, features, n_jobs=n_jobs) # transform the train and test data according to generated features.

In [None]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Train a gradient-boosting model with 5-fold cross-validation.

    Parameters
    ----------
    clf : module or class
        The ``lightgbm`` module for ``"lgb"``, the ``xgboost`` module for
        ``"xgb"``, or a CatBoost estimator class for ``"cat"``.
    train_x : pandas.DataFrame
        Training features (rows are selected by position).
    train_y : array-like
        Training labels, aligned with ``train_x`` by position.
    test_x : pandas.DataFrame
        Features to predict once per fold.
    clf_name : {"lgb", "xgb", "cat"}
        Selects the training branch.

    Returns
    -------
    train : numpy.ndarray
        Out-of-fold predictions for every training row.
    test : numpy.ndarray
        Test predictions averaged over all folds.

    Raises
    ------
    ValueError
        If ``clf_name`` is not one of the supported names.
    """
    if clf_name not in ("lgb", "xgb", "cat"):
        raise ValueError("unsupported clf_name: %r (expected 'lgb', 'xgb' or 'cat')" % (clf_name,))

    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # KFold yields positions, so take the labels as a plain array; indexing a
    # Series with them would be label-based and break on a non-default index.
    y_all = np.asarray(train_y)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = y_all[train_index], y_all[valid_index]

        # NOTE(review): the training/prediction keyword arguments below are
        # kept exactly as written; confirm they match the installed
        # lightgbm/xgboost versions.
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'num_leaves': 33,
                'max_depth': 6,
                'min_data_in_leaf':45,
                'min_child_weight':0.001,
                'bagging_fraction': 0.9,
                'feature_fraction': 0.9,
                'bagging_freq': 10,
                'min_split_gain': 0.1,
                'reg_lambda':0.01,
                'reg_alpha':0.08,
                'learning_rate': 0.01,
                'seed': 2020,
                'nthread': 24,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        elif clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)

            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }

            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]

            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            # A Booster predicts on a DMatrix, so wrap the raw test frame.
            test_pred = model.predict(clf.DMatrix(test_x), ntree_limit=model.best_ntree_limit)

        else:  # "cat"
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}

            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)

            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)

        train[valid_index] = val_pred
        # Accumulate so the result is the mean over all folds, not just the
        # last fold's prediction divided by the number of folds.
        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test

In [None]:
def lgb_model(x_train, y_train, x_test):
    """Run ``cv_model`` with LightGBM and return its ``(train, test)`` prediction pair."""
    return cv_model(lgb, x_train, y_train, x_test, "lgb")

def xgb_model(x_train, y_train, x_test):
    """Run ``cv_model`` with XGBoost and return its ``(train, test)`` prediction pair."""
    return cv_model(xgb, x_train, y_train, x_test, "xgb")

def cat_model(x_train, y_train, x_test):
    """Run ``cv_model`` with CatBoost and return its ``(train, test)`` prediction pair.

    Returns the predictions like ``lgb_model`` and ``xgb_model`` do, instead
    of discarding them.
    """
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_train, cat_test

In [None]:
# Run the LightGBM cross-validation: lgb_train receives the first returned
# array and lgb_test the second (predictions for valid_x).
lgb_train, lgb_test = lgb_model(train_x, train_y, valid_x)

In [None]:
# Build the submission table: one row per test prediction, pairing the test
# id with the predicted value, then write it to CSV without the index.
n_predictions = len(lgb_test)
lgb_submit = pd.DataFrame({
    'id': data_test_a['id'].iloc[:n_predictions].to_numpy(),
    'isDefault': lgb_test,
})
lgb_submit.to_csv('lgb_test2.csv', index=False)

In [None]:
#网格调参，基础模型为lightgbm
from sklearn.model_selection import GridSearchCV    

def get_best_cv_params(learning_rate=0.1, n_estimators=581, num_leaves=31, max_depth=-1, bagging_fraction=1.0,
                       feature_fraction=1.0, bagging_freq=0, min_data_in_leaf=20, min_child_weight=0.001,
                       min_split_gain=0, reg_lambda=0, reg_alpha=0, param_grid=None,
                      X_train=train_x, y_train=train_y):
    """Grid-search LightGBM hyper-parameters and print the best combination.

    An ``LGBMClassifier`` is built from the keyword arguments (with
    ``n_jobs=8``), every combination in ``param_grid`` is evaluated with
    stratified, shuffled 5-fold cross-validation (``random_state=42``) scored
    by ROC AUC, and the best parameters and best score are printed.

    ``X_train`` / ``y_train`` default to the ``train_x`` / ``train_y`` objects
    that exist when this function is defined. Nothing is returned.
    """
    estimator_kwargs = dict(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        num_leaves=num_leaves,
        max_depth=max_depth,
        bagging_fraction=bagging_fraction,
        feature_fraction=feature_fraction,
        bagging_freq=bagging_freq,
        min_data_in_leaf=min_data_in_leaf,
        min_child_weight=min_child_weight,
        min_split_gain=min_split_gain,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        n_jobs=8,
    )
    # Stratified 5-fold split shared by every grid point.
    splitter = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

    search = GridSearchCV(
        estimator=lgb.LGBMClassifier(**estimator_kwargs),
        cv=splitter,
        param_grid=param_grid,
        scoring='roc_auc',
    )
    search.fit(X_train, y_train)

    report = (
        ('模型当前最优参数为:{}', search.best_params_),
        ('模型当前最优得分为:{}', search.best_score_),
    )
    for template, value in report:
        print(template.format(value))

In [None]:
"""    
需要注意一下的是，除了获取下面的获取num_boost_round时候用的是原生的lightgbm（因为要用自带的cv）    
下面配合GridSearchCV时必须使用sklearn接口的lightgbm。    
"""    
"""设置n_estimators 为581，调整num_leaves和max_depth，这里选择先粗调再细调"""    
# Coarse grid over the tree-shape parameters. num_leaves and max_depth are
# passed as None because the grid supplies their values.
lgb_params = {
    'num_leaves': range(10, 80, 5),
    'max_depth': range(3, 10, 2),
}
get_best_cv_params(
    param_grid=lgb_params,
    learning_rate=0.1,
    n_estimators=581,
    num_leaves=None,
    max_depth=None,
    min_data_in_leaf=20,
    min_child_weight=0.001,
    bagging_fraction=1.0,
    feature_fraction=1.0,
    bagging_freq=0,
    min_split_gain=0,
    reg_lambda=0,
    reg_alpha=0,
)



In [None]:
"""num_leaves为30，max_depth为5，进一步细调num_leaves和max_depth"""    
# Fine grid over num_leaves and max_depth around the coarse optimum; both are
# passed as None because the grid supplies their values.
lgb_params = {
    'num_leaves': range(25, 35, 1),
    'max_depth': range(3, 7, 1),
}
get_best_cv_params(
    param_grid=lgb_params,
    learning_rate=0.1,
    n_estimators=85,
    num_leaves=None,
    max_depth=None,
    min_data_in_leaf=20,
    min_child_weight=0.001,
    bagging_fraction=1.0,
    feature_fraction=1.0,
    bagging_freq=0,
    min_split_gain=0,
    reg_lambda=0,
    reg_alpha=0,
)



In [None]:
"""    
确定num_leaves为33，max_depth为6 ，下面进行bagging_fraction、feature_fraction和bagging_freq的调参    
"""    
# Grid over the row/column sampling parameters with the tree shape fixed at
# num_leaves=33, max_depth=6. The three sampled parameters are passed as None
# because the grid supplies their values.
sampling_fractions = [i/10 for i in range(5,10,1)]
lgb_params = {
    'bagging_fraction': list(sampling_fractions),
    'feature_fraction': list(sampling_fractions),
    'bagging_freq': range(0, 81, 10),
}
get_best_cv_params(
    param_grid=lgb_params,
    learning_rate=0.1,
    n_estimators=85,
    num_leaves=33,
    max_depth=6,
    min_data_in_leaf=45,
    min_child_weight=0.001,
    bagging_fraction=None,
    feature_fraction=None,
    bagging_freq=None,
    min_split_gain=0,
    reg_lambda=0,
    reg_alpha=0,
)



In [None]:
"""    
确定bagging_fraction为0.9、feature_fraction为0.9、bagging_freq为10 ，下面进行reg_lambda、reg_alpha的调参    
"""    
# Grid over the L2 (reg_lambda) and L1 (reg_alpha) penalties with the sampling
# parameters fixed at 0.9 / 0.9 / 10. Both penalties are passed as None
# because the grid supplies their values.
penalty_values = [0,0.001,0.01,0.03,0.08,0.3,0.5]
lgb_params = {
    'reg_lambda': list(penalty_values),
    'reg_alpha': list(penalty_values),
}
get_best_cv_params(
    param_grid=lgb_params,
    learning_rate=0.1,
    n_estimators=85,
    num_leaves=33,
    max_depth=6,
    min_data_in_leaf=45,
    min_child_weight=0.001,
    bagging_fraction=0.9,
    feature_fraction=0.9,
    bagging_freq=10,
    min_split_gain=0,
    reg_lambda=None,
    reg_alpha=None,
)



In [None]:
"""    
确定reg_lambda为0.01，reg_alpha为0.08，下面进行min_split_gain的调参    
"""    
# Grid over min_split_gain (0.0 to 1.0 in steps of 0.1) with the penalties
# fixed at reg_lambda=0.01, reg_alpha=0.08.
lgb_params = {
    'min_split_gain': [i/10 for i in range(0,11,1)],
}
get_best_cv_params(
    param_grid=lgb_params,
    learning_rate=0.1,
    n_estimators=85,
    num_leaves=33,
    max_depth=6,
    min_data_in_leaf=45,
    min_child_weight=0.001,
    bagging_fraction=0.9,
    feature_fraction=0.9,
    bagging_freq=10,
    min_split_gain=0,
    reg_lambda=0.01,
    reg_alpha=0.08,
)


In [None]:
# Display lgb_train (the first value returned by lgb_model in the earlier cell).
lgb_train

In [None]:
"""    
参数确定好了以后，我们设置一个比较小的learning_rate 0.01，来确定最终的num_boost_round    
"""    
# Tuned parameters with a small learning rate, evaluated with 5-fold
# stratified cross-validation to find the number of boosting rounds.
final_params = {
                'boosting_type': 'gbdt',
                # The target is binary; state the objective explicitly so
                # LightGBM does not fall back to its default regression
                # objective (this matches the params used in cv_model).
                'objective': 'binary',
                'learning_rate': 0.01,
                'num_leaves': 33,
                'max_depth': 6,
                'min_data_in_leaf':45,
                'min_child_weight':0.001,
                'bagging_fraction': 0.9,
                'feature_fraction': 0.9,
                'bagging_freq': 10,
                'min_split_gain': 0.1,
                'reg_lambda':0.01,
                'reg_alpha':0.08,
                'nthread': 6
               }
# NOTE: this rebinds lgb_train, which until now held the first array returned
# by lgb_model.
lgb_train = lgb.Dataset(train_x, train_y)

cv_result = lgb.cv(train_set=lgb_train,
                   early_stopping_rounds=20,
                   num_boost_round=5000,
                   nfold=5,
                   stratified=True,
                   shuffle=True,
                   params=final_params,
                   metrics='auc',
                   seed=0,
                  )

# The length of the recorded metric history is the number of rounds kept.
print('迭代次数{}'.format(len(cv_result['auc-mean'])))
print('交叉验证的AUC为{}'.format(max(cv_result['auc-mean'])))

In [None]:

# Train the final model with the tuned parameters.
final_params = {
                'boosting_type': 'gbdt',
                # Binary target: set the objective explicitly (LightGBM's
                # default objective is regression) and stop early on AUC, the
                # metric used everywhere else in this notebook.
                'objective': 'binary',
                'metric': 'auc',
                'learning_rate': 0.01,
                'num_leaves': 33,
                'max_depth': 6,
                'min_data_in_leaf':45,
                'min_child_weight':0.001,
                'bagging_fraction': 0.9,
                'feature_fraction': 0.9,
                'bagging_freq': 10,
                'min_split_gain': 0.1,
                'reg_lambda':0.01,
                'reg_alpha':0.08,
                'nthread': 6
               }

# Hold out 20% of the training data for validation. The split is stored under
# new names so train_x / train_y keep the full data and re-running this cell
# does not shrink the training set each time.
fit_x, test_x, fit_y, test_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

train_matrix = lgb.Dataset(fit_x, label=fit_y)
valid_matrix = lgb.Dataset(test_x, label=test_y)

final_model_lgb = lgb.train(final_params, train_set = train_matrix, valid_sets = valid_matrix, num_boost_round=3753, verbose_eval=1000, early_stopping_rounds=200)

In [None]:
"""预测"""    
# Predicted scores of the final model on the hold-out split.
pred_y = final_model_lgb.predict(test_x)
# np.asarray works for both a Series and an array, so the cell can be re-run.
test_y = np.asarray(test_y)

# Accuracy needs class labels, not scores: threshold the predictions at 0.5.
pred_label = (pred_y >= 0.5).astype(int)
accuracy = accuracy_score(test_y, pred_label)
print('accuracy:%3.f%%'%(accuracy*100))

# ROC curve and AUC are computed from the continuous scores.
fpr, tpr, threshold = metrics.roc_curve(test_y, pred_y)
roc_auc = metrics.auc(fpr, tpr)
print('调参后lightgbm单模型在验证集上的AUC：{}'.format(roc_auc))

# Plot the ROC curve together with the chance diagonal.
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, 'b', label = 'Val AUC = %0.4f' % roc_auc)
plt.ylim(0,1)
plt.xlim(0,1)
plt.legend(loc='best')
plt.title('ROC')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.plot([0,1],[0,1],'r--')
plt.show()

In [None]:
# Predict the test set with the final model.
lgb_result = final_model_lgb.predict(valid_x)

# Build the submission from the final model's predictions (lgb_result), not
# from lgb_test, which holds the earlier cross-validation output.
# NOTE: the file name is the same as in the earlier submission cell, so that
# file is overwritten.
lgb_submit = pd.DataFrame({
    'id': data_test_a['id'].to_numpy(),
    'isDefault': lgb_result,
})
lgb_submit.to_csv('lgb_test2.csv', index=False)