In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from scipy.stats import norm, skew
from scipy.special import boxcox1p

pd.set_option('max.columns', 100)

data_path = './data/'
save_path = './result/'

In [2]:
data_train = pd.read_csv(data_path + 'train.csv')
data_test = pd.read_csv(data_path + 'test.csv')
submit = pd.read_csv(data_path + 'submit.csv')

print('data_train shape', data_train.shape)
print('data_test shape', data_test.shape)

data_train shape (40000, 21)
data_test shape (15000, 20)


### 辅助函数

In [3]:
# 目标编码
def kfold_mean(df_train, df_test, target, target_mean_list):
    folds = StratifiedKFold(n_splits=5)

    mean_of_target = df_train[target].mean()

    for fold_, (trn_idx, val_idx) in tqdm(enumerate(folds.split(df_train, y=df_train['label']))):
        tr_x = df_train.iloc[trn_idx, :]
        vl_x = df_train.iloc[val_idx, :]

        for col in target_mean_list:
            df_train.loc[vl_x.index, f'{col}_target_enc'] = vl_x[col].map(
                tr_x.groupby(col)[target].mean())

    for col in target_mean_list:
        df_train[f'{col}_target_enc'].fillna(mean_of_target, inplace=True)

        df_test[f'{col}_target_enc'] = df_test[col].map(
            df_train.groupby(col)[f'{col}_target_enc'].mean())

        df_test[f'{col}_target_enc'].fillna(mean_of_target, inplace=True)
        
    return pd.concat([df_train, df_test], ignore_index=True)

### 特征工程

In [4]:
data = pd.concat([data_train, data_test], axis = 0).reset_index(drop = True)
data.head()

Unnamed: 0,id,XINGBIE,CSNY,HYZK,ZHIYE,ZHICHEN,ZHIWU,XUELI,DWJJLX,DWSSHY,GRJCJS,GRZHZT,GRZHYE,GRZHSNJZYE,GRZHDNGJYE,GRYJCE,DWYJCE,DKFFE,DKYE,DKLL,label
0,train_0,1,1038672000,90,90,999,0,99,150,12,1737.0,1,3223.515,801.31,837.0,312.0,312.0,175237,154112.935,2.708,0.0
1,train_1,2,504892800,90,90,999,0,99,110,0,4894.0,1,18055.195,53213.22,1065.2,795.84,795.84,300237,298252.945,2.979,0.0
2,train_2,1,736185600,90,90,999,0,99,150,9,10297.0,1,27426.6,13963.14,7230.02,1444.2,1444.2,150237,147339.13,2.708,0.0
3,train_3,1,428515200,90,90,999,0,99,150,7,10071.5,1,111871.13,99701.265,2271.295,1417.14,1417.14,350237,300653.78,2.708,0.0
4,train_4,2,544204800,90,90,999,0,99,900,14,2007.0,1,237.0,11028.875,35.78,325.5,325.5,150237,145185.01,2.708,0.0


In [5]:
cate_2_cols = ['XINGBIE', 'ZHIWU', 'XUELI']
cate_cols = ['HYZK', 'ZHIYE', 'ZHICHEN', 'DWJJLX', 'DWSSHY', 'GRZHZT']
num_cols = ['GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE', 'DWYJCE', 'DKFFE', 'DKYE', 'DKLL']

In [6]:
# ---------------------------- age --------------------------
use_base = True
if use_base:
    data['age'] = ((1609430399 - data['CSNY']) / (365 * 24 * 3600)).astype(int)
else:
    long_time_mask = data['CSNY'].astype(str).str.len() == 12
    data['time'] = 0
    data.loc[long_time_mask, 'time'] = pd.to_datetime(data.loc[long_time_mask, 'CSNY'], unit='ms')
    data.loc[~long_time_mask, 'time'] = pd.to_datetime(data.loc[~long_time_mask, 'CSNY'], unit='s')
    data['time'] = pd.to_datetime(data.loc[:, 'time'].copy())
    data['age'] = 2020 - data['time'].dt.year
    
    data['time_tonow'] = (datetime.datetime.now() - data['time']).dt.days
    
    data.drop(['time', 'time_tonow'], axis=1, inplace=True)
    
data.drop(['CSNY'], axis=1, inplace=True)

In [7]:
# -------------------------- 偏差修正 ------------------------
correct_error = False
if correct_error:
    for col in ['GRJCJS', 'GRZHYE', 'GRZHSNJZYE', 'GRZHDNGJYE', 'GRYJCE', 'DWYJCE', 'DKFFE', 'DKYE']:
        data[col] = data[col] - 237

In [8]:
not_drop = []

In [9]:
# --------------------------------- 业务特征 ---------------------------------

data['JC_ratio'] = data['GRYJCE'] / data['GRJCJS']                # 缴存比例 = 个人月缴存额 / 个人缴存基数


data['GRJCJS_lowest'] = data['GRYJCE'] / 0.12              # 根据月缴存额，计算正常的个人缴存基数范围
data['GRJCJS_highest'] = data['GRYJCE'] / 0.05
data['GRJCJS_1'] = data['GRJCJS_lowest'] - data['GRJCJS']
data['GRJCJS_2'] = data['GRJCJS_highest'] - data['GRJCJS']


# 当年缴存（猜测是4个月） - 当年归集
data['DNTQ'] = (data['GRYJCE'] + data['DWYJCE']) * 4 - data['GRZHDNGJYE']


# 【日照】贷款额度 = 申请人及配偶的个人月缴存额之和 / 实际缴存比例 * 12(月) * 0.45(还款能力系数) * 最长贷款年限
data['until_retire'] = 65 - data['age']
data['DK_years'] = data['until_retire'].apply(lambda x: x if x < 30 else 30)
data['DKED_1'] = (data['GRYJCE'] + data['DWYJCE']) / data['JC_ratio'] * 12 * 0.45 * data['DK_years']
data.drop(['until_retire', 'DK_years'], axis=1, inplace=True)


data['DKFFE_DKYE'] = data['DKFFE'] - data['DKYE']          # 贷款发放额 - 贷款余额


# data['trick'] = data.apply(lambda x: 1 if (x['GRZHZT']!=1 and x['GRZHDNGJYE']==237) else 0, axis=1)

# data['GRZHYE_YJCE_ratio'] = data['GRZHYE'] / (data['GRYJCE'] + data['DWYJCE'])
# data['JZYE_JCJS'] = data['GRZHSNJZYE'] / (data['GRJCJS'] * 12)

# # 利息
# data['DKFFE_DKLL'] = (data['DKFFE'] * data['DKLL']) / 100
# data['DKYE_DKLL'] = (data['DKYE'] * data['DKLL']) / 100

In [10]:
# # -------------------------------- 分箱 ------------------------------------
# data['age'] = pd.cut(data['age'], [-10000, 18, 25, 30, 35, 40, 45, 50], labels=False)
# data['DKLL'] = pd.cut(data['DKLL'], [-np.inf, 2.4, 2.6, 2.8, 3.2, 3.4, np.inf], labels=False)

In [11]:
# ----------------------------------- 类别 -----------------------------------

# count encoding
cat_col = ['HYZK', 'ZHIYE', 'ZHICHEN', 'ZHIWU', 'XUELI', 'DWJJLX', 'DWSSHY', 'GRZHZT']
for col in cat_col:
    data[col + '_COUNT'] = data[col].map(data[col].value_counts())
    col_idx = data[col].value_counts()
    for idx in col_idx[col_idx < 10].index:
        data[col] = data[col].replace(idx, -1)


# label encoding
label_enc_cols = ['XINGBIE','HYZK','ZHIYE','ZHICHEN','ZHIWU','XUELI']
for col in label_enc_cols:
    lbl = LabelEncoder()
    data[col] = lbl.fit_transform(data[col].astype(str))


# target encoding
target_enc_fea =  ['DWJJLX', 'DWSSHY', 'GRZHZT']
data = kfold_mean(data[~data['label'].isna()], data[data['label'].isna()], 'label', target_enc_fea)


# 类别组合共现、类别偏好
cate_cols_combine = [[cate_cols[i], cate_cols[j]] for i in range(len(cate_cols)) for j in range(i + 1, len(cate_cols))]
for f1, f2 in tqdm(cate_cols_combine):
    data['{}_{}_count'.format(f1, f2)] = data.groupby([f1, f2])['id'].transform('count')
    data['{}_in_{}_prop'.format(f1, f2)] = data['{}_{}_count'.format(f1, f2)] / data[f2 + '_COUNT']
    data['{}_in_{}_prop'.format(f2, f1)] = data['{}_{}_count'.format(f1, f2)] / data[f1 + '_COUNT']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
5it [00:00, 62.67it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documen

### 训练集和测试集

In [12]:
train = data[data['label'].isna() == False].reset_index(drop=True)
test = data[data['label'].isna() == True].reset_index(drop=True)
display(train.shape, test.shape)

(40000, 85)

(15000, 85)

In [13]:
drop_feats = [f for f in train.columns if (train[f].nunique() <= 3) and (f not in (cate_2_cols + cate_cols + not_drop + ['label']))]
len(drop_feats), drop_feats

(4,
 ['ZHIWU_COUNT', 'XUELI_COUNT', 'HYZK_in_ZHICHEN_prop', 'HYZK_in_GRZHZT_prop'])

In [14]:
print('Nan col nums of train: ', train.isnull().any().sum())
print('Nan col nums of test: ', test.isnull().any().sum()-1)
train.head()

Nan col nums of train:  0
Nan col nums of test:  0


Unnamed: 0,id,XINGBIE,HYZK,ZHIYE,ZHICHEN,ZHIWU,XUELI,DWJJLX,DWSSHY,GRJCJS,GRZHZT,GRZHYE,GRZHSNJZYE,GRZHDNGJYE,GRYJCE,DWYJCE,DKFFE,DKYE,DKLL,label,age,JC_ratio,GRJCJS_lowest,GRJCJS_highest,GRJCJS_1,GRJCJS_2,DNTQ,DKED_1,DKFFE_DKYE,HYZK_COUNT,ZHIYE_COUNT,ZHICHEN_COUNT,ZHIWU_COUNT,XUELI_COUNT,DWJJLX_COUNT,DWSSHY_COUNT,GRZHZT_COUNT,DWJJLX_target_enc,DWSSHY_target_enc,GRZHZT_target_enc,HYZK_ZHIYE_count,HYZK_in_ZHIYE_prop,ZHIYE_in_HYZK_prop,HYZK_ZHICHEN_count,HYZK_in_ZHICHEN_prop,ZHICHEN_in_HYZK_prop,HYZK_DWJJLX_count,HYZK_in_DWJJLX_prop,DWJJLX_in_HYZK_prop,HYZK_DWSSHY_count,HYZK_in_DWSSHY_prop,DWSSHY_in_HYZK_prop,HYZK_GRZHZT_count,HYZK_in_GRZHZT_prop,GRZHZT_in_HYZK_prop,ZHIYE_ZHICHEN_count,ZHIYE_in_ZHICHEN_prop,ZHICHEN_in_ZHIYE_prop,ZHIYE_DWJJLX_count,ZHIYE_in_DWJJLX_prop,DWJJLX_in_ZHIYE_prop,ZHIYE_DWSSHY_count,ZHIYE_in_DWSSHY_prop,DWSSHY_in_ZHIYE_prop,ZHIYE_GRZHZT_count,ZHIYE_in_GRZHZT_prop,GRZHZT_in_ZHIYE_prop,ZHICHEN_DWJJLX_count,ZHICHEN_in_DWJJLX_prop,DWJJLX_in_ZHICHEN_prop,ZHICHEN_DWSSHY_count,ZHICHEN_in_DWSSHY_prop,DWSSHY_in_ZHICHEN_prop,ZHICHEN_GRZHZT_count,ZHICHEN_in_GRZHZT_prop,GRZHZT_in_ZHICHEN_prop,DWJJLX_DWSSHY_count,DWJJLX_in_DWSSHY_prop,DWSSHY_in_DWJJLX_prop,DWJJLX_GRZHZT_count,DWJJLX_in_GRZHZT_prop,GRZHZT_in_DWJJLX_prop,DWSSHY_GRZHZT_count,DWSSHY_in_GRZHZT_prop,GRZHZT_in_DWSSHY_prop
0,train_0,1,1,3,2,0,1,150,12,1737.0,1,3223.515,801.31,837.0,312.0,312.0,175237,154112.935,2.708,0.0,18,0.17962,2600.0,6240.0,863.0,4503.0,1659.0,562788.0,21124.065,54988,54930,54912,49994,54994,20176,2065,54773,0.024447,0.061192,0.064913,54928,0.999964,0.998909,54912,1.0,0.998618,20176,1.0,0.366916,2065,1.0,0.037554,54761,0.999781,0.995872,54912,1.0,0.999672,20172,0.999802,0.367231,2064,0.999516,0.037575,54705,0.998759,0.995904,20172,0.999802,0.367351,2064,0.999516,0.037587,54689,0.998466,0.995939,920,0.445521,0.045599,20096,0.366896,0.996035,2061,0.037628,0.998063
1,train_1,2,1,3,2,0,1,110,0,4894.0,1,18055.195,53213.22,1065.2,795.84,795.84,300237,298252.945,2.979,0.0,35,0.162615,6632.0,15916.8,1738.0,11022.8,5301.52,1585656.0,1984.055,54988,54930,54912,49994,54994,14530,3551,54773,0.093762,0.016744,0.064913,54928,0.999964,0.998909,54912,1.0,0.998618,14525,0.999656,0.264149,3551,1.0,0.064578,54761,0.999781,0.995872,54912,1.0,0.999672,14524,0.999587,0.264409,3550,0.999718,0.064628,54705,0.998759,0.995904,14520,0.999312,0.264423,3550,0.999718,0.064649,54689,0.998466,0.995939,995,0.280203,0.068479,14487,0.264492,0.997041,3540,0.06463,0.996902
2,train_2,1,1,3,2,0,1,150,9,10297.0,1,27426.6,13963.14,7230.02,1444.2,1444.2,150237,147339.13,2.708,0.0,27,0.140254,12035.0,28884.0,1738.0,18587.0,4323.58,3336228.0,2897.87,54988,54930,54912,49994,54994,20176,3540,54773,0.024447,0.04409,0.064913,54928,0.999964,0.998909,54912,1.0,0.998618,20176,1.0,0.366916,3533,0.998023,0.06425,54761,0.999781,0.995872,54912,1.0,0.999672,20172,0.999802,0.367231,3527,0.996328,0.064209,54705,0.998759,0.995904,20172,0.999802,0.367351,3522,0.994915,0.064139,54689,0.998466,0.995939,810,0.228814,0.040147,20096,0.366896,0.996035,3514,0.064156,0.992655
3,train_3,1,1,3,2,0,1,150,7,10071.5,1,111871.13,99701.265,2271.295,1417.14,1417.14,350237,300653.78,2.708,0.0,37,0.140708,11809.5,28342.8,1738.0,18271.3,9065.825,3045621.6,49583.22,54988,54930,54912,49994,54994,20176,5226,54773,0.024447,0.012766,0.064913,54928,0.999964,0.998909,54912,1.0,0.998618,20176,1.0,0.366916,5225,0.999809,0.095021,54761,0.999781,0.995872,54912,1.0,0.999672,20172,0.999802,0.367231,5217,0.998278,0.094975,54705,0.998759,0.995904,20172,0.999802,0.367351,5217,0.998278,0.095007,54689,0.998466,0.995939,2920,0.558745,0.144726,20096,0.366896,0.996035,5214,0.095193,0.997704
4,train_4,2,1,3,2,0,1,900,14,2007.0,1,237.0,11028.875,35.78,325.5,325.5,150237,145185.01,2.708,0.0,33,0.162182,2712.5,6510.0,705.5,4503.0,2568.22,650268.0,5051.99,54988,54930,54912,49994,54994,7584,6559,54773,0.112285,0.077085,0.064913,54928,0.999964,0.998909,54912,1.0,0.998618,7578,0.999209,0.137812,6559,1.0,0.119281,54761,0.999781,0.995872,54912,1.0,0.999672,7526,0.992352,0.137011,6550,0.998628,0.119243,54705,0.998759,0.995904,7514,0.99077,0.136837,6544,0.997713,0.119172,54689,0.998466,0.995939,1479,0.225492,0.195016,7534,0.13755,0.993407,6518,0.119,0.993749


### Model

In [15]:
import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score

In [16]:
def tpr_weight_funtion(y_true,y_predict):
    """线上评分函数"""
    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true)
    d = d.sort_values(['prob'], ascending=[0])
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]
    NegAll = pd.Series(y).value_counts()[0]
    pCumsum = d['y'].cumsum()
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll
    nCumsumPer = nCumsum / NegAll
    TR1 = pCumsumPer[abs(nCumsumPer-0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer-0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer-0.01).idxmin()]
    
    return 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3

In [17]:
feature = [col for col in train.columns if col not in ['id', 'label'] + drop_feats]
print('num features: ', len(feature))

num features:  79


In [18]:
from lightgbm.sklearn import LGBMClassifier
from datetime import datetime
start = datetime.now()
print(start.strftime('%Y-%m-%d %H:%M:%S'))

final_output = np.zeros(test.shape[0])
score = []     
auc = []
seeds = [1023, 2048, 2098]
for seed in seeds:
    print('seed :', seed)
    num_folds=5
    kfold = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True).split(train.drop(['label'], axis=1), train['label'])
    
    output_probs = np.zeros((test.shape[0], num_folds))        # 记录每折中对测试集的预测结果，最终取平均值作为最终预测
    valid_probs = np.zeros((train.shape[0], num_folds))
    for fold, (train_idx, valid_idx) in enumerate(kfold):
        X_train, y_train = train[feature].iloc[train_idx], train['label'].iloc[train_idx]
        X_valid, y_valid = train[feature].iloc[valid_idx], train['label'].iloc[valid_idx]
        
        clf = lgb.LGBMClassifier(
                    learning_rate=0.05,
                    n_estimators=10230,
                    num_leaves=31,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    random_state=1023,
                    metric=None
        ) 


        clf.fit(X_train, y_train, 
                eval_set = [(X_valid,y_valid)], 
                eval_metric = 'auc',     # lambda y_true, y_pred: tpr_score(y_true, y_pred), 
                # categorical_feature = cate_2_cols + cate_cols,
                early_stopping_rounds=200,    
                verbose=False)

        y_pred_valid = clf.predict_proba(X_valid)[:, 1]                     # 验证集预测概率
        # y_pred_valid_label = [1 if p > 0.5 else 0 for p in y_pred_valid]    # 概率转类别（0、1）
        score.append(tpr_weight_funtion(y_valid, y_pred_valid))
        auc.append(roc_auc_score(y_valid, y_pred_valid))
        valid_probs[:, fold] = clf.predict_proba(train[feature])[:, 1]
        output_probs[:, fold] = clf.predict_proba(test[feature])[:, 1]           # 对测试集预测
        
    final_output = final_output + np.mean(output_probs,axis=1) / len(seeds)

print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print('time costed is: %d s' % (int((datetime.now() - start).seconds)))     
print('MEAN-AUC:%.6f, STD-AUC:%.6f' % (np.mean(auc), np.std(auc)))
print('MEAN-Score:%.6f, STD-Score:%.6f' % (np.mean(score), np.std(score)))

"""

MEAN-AUC:0.947733, STD-AUC:0.005229
MEAN-Score:0.561384, STD-Score:0.019119

"""

2021-01-22 17:39:42
seed : 1023
seed : 2048
seed : 2098
2021-01-22 17:40:14
time costed is: 32 s
MEAN-AUC:0.947733, STD-AUC:0.005229
MEAN-Score:0.561384, STD-Score:0.019119


'\n\nMEAN-AUC:0.947733, STD-AUC:0.005229\nMEAN-Score:0.561384, STD-Score:0.019119\n\n'

In [19]:
# 预测结果
final_output

array([0.00150549, 0.00276196, 0.00597583, ..., 0.0019541 , 0.08096486,
       0.00205074])

In [20]:
sum(final_output>0.5)

1382

In [21]:
# submit['id'] = test['id']
# # submit['label'] = np.mean(output_probs,axis=1)
# submit['label'] = final_output

# submit.to_csv(save_path + 'advance_baseline_34.csv', index = False)
# submit.head()

Unnamed: 0,id,label
0,test_0,0.001505
1,test_1,0.002762
2,test_2,0.005976
3,test_3,0.000671
4,test_4,0.012204
