In [1]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
import pandas as pd
import gc
import os

# Suppress every Python warning for the rest of the session. Note that this
# also hides deprecation and pandas chained-assignment warnings.
warnings.simplefilter('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Display every column and every row when a DataFrame is rendered.
# Fully-qualified option keys are used on purpose: pandas resolves option
# names by pattern matching, and the bare 'max_columns' / 'max_rows' patterns
# match more than one option in newer pandas releases (the styler.render.*
# options), which raises OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Run configuration: `seed` is the random state passed to the model and the
# CV splitter; `v` is the version tag embedded in feature/output file names.
seed, v = 2020, 1

In [3]:
# Load the pre-computed feature table for version `v`.
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles produced by a trusted pipeline.
feature_path = 'feature{}.pkl'.format(v)
df_feature = pd.read_pickle(feature_path)

In [5]:
# Rows with a known target form the training set; rows whose target is
# missing form the test set to be predicted.
has_target = df_feature.target.notna()
df_train = df_feature[has_target].copy()
df_test = df_feature[~has_target].copy()

df_train.shape, df_test.shape

((15280, 9779), (5767, 9779))

In [6]:
%%time
# Label column, and the model inputs: every training column except the label
# itself and the 'id' key.
ycol = 'target'
excluded_columns = (ycol, 'id')
feature_names = [col for col in df_train.columns if col not in excluded_columns]


Wall time: 7.02 ms


In [None]:
# Optional pruning step: drop the features that FeatureSelector reports as
# low-importance (those outside the top 97% of cumulative importance).
from feature_selector import FeatureSelector
fs = FeatureSelector(data=df_train[feature_names], labels=df_train[ycol])
fs.identify_zero_importance(task='classification', eval_metric='auc',
                            n_iterations=10, early_stopping=True)
fs.identify_low_importance(cumulative_importance=0.97)
low_importance_features = fs.ops['low_importance']
print('====low_importance_features=====')
print(low_importance_features)
# Filter rather than calling list.remove() per feature: remove() raises
# ValueError when a name is absent, which happens if this cell is re-run, or
# if the selector reports a name that is not in feature_names (e.g. a derived
# column -- TODO confirm against the feature_selector version in use).
# The set lookup also avoids the quadratic scan over ~10k feature names.
low_importance_set = set(low_importance_features)
feature_names = [name for name in feature_names
                 if name not in low_importance_set]

In [7]:

# 5-fold stratified cross-validation of an XGBoost classifier.
#
# Produces:
#   df_oof             -- out-of-fold probability for every training row
#   prediction         -- test-set probability averaged over the folds
#   df_importance_list -- one feature-importance frame per fold
model = xgb.XGBClassifier(
    max_depth=6,
    learning_rate=0.01,
    n_estimators=10000,  # upper bound only; early stopping ends training sooner
    subsample=0.8,
    reg_alpha=10,
    reg_lambda=12,
    # tree_method='gpu_hist',
    random_state=seed)

# 'prob' is initialised as float so that writing probabilities into it never
# has to change the column dtype.
df_oof = df_train[['id', ycol]].copy()
df_oof['prob'] = 0.0
# Explicit copy: adding a column to a bare slice of df_test is chained
# assignment and may not behave as an independent frame.
prediction = df_test[['id']].copy()
prediction['prob'] = 0.0
df_importance_list = []

# kfold.split() yields POSITIONAL row numbers. df_train keeps whatever index
# df_feature had, so the out-of-fold writes below must go through iloc;
# using .loc with these numbers would treat them as index labels.
prob_col = df_oof.columns.get_loc('prob')

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
for fold_id, (trn_idx, val_idx) in enumerate(
        kfold.split(df_train[feature_names], df_train[ycol])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(
        fold_id + 1))

    # The last eval_set entry (the validation fold) drives early stopping.
    # NOTE(review): newer xgboost releases expect eval_metric and
    # early_stopping_rounds on the constructor instead of fit(); they are kept
    # here to match the xgboost version this notebook was run with.
    fitted_model = model.fit(X_train,
                             Y_train,
                             eval_set=[(X_train, Y_train), (X_val, Y_val)],
                             verbose=100,
                             eval_metric='auc',
                             early_stopping_rounds=50)

    # Column 1 of predict_proba is the probability of the positive class.
    pred_val = fitted_model.predict_proba(X_val)[:, 1]
    df_oof.iloc[val_idx, prob_col] = pred_val

    # Each fold contributes an equal share of the final test prediction.
    pred_test = fitted_model.predict_proba(df_test[feature_names])[:, 1]
    prediction['prob'] += pred_test / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': fitted_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    # Drop the per-fold frames before the next fold to limit peak memory.
    del fitted_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



[0]	validation_0-auc:0.74879	validation_1-auc:0.71626
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 50 rounds.
[100]	validation_0-auc:0.83837	validation_1-auc:0.77865
[200]	validation_0-auc:0.87184	validation_1-auc:0.79924
[300]	validation_0-auc:0.89077	validation_1-auc:0.80627
[400]	validation_0-auc:0.90458	validation_1-auc:0.81087
[500]	validation_0-auc:0.91655	validation_1-auc:0.81500
[600]	validation_0-auc:0.92681	validation_1-auc:0.81789
[700]	validation_0-auc:0.93600	validation_1-auc:0.82002
[800]	validation_0-auc:0.94409	validation_1-auc:0.82086
[900]	validation_0-auc:0.95127	validation_1-auc:0.82205
[1000]	validation_0-auc:0.95776	validation_1-auc:0.82304
[1100]	validation_0-auc:0.96322	validation_1-auc:0.82369
[1200]	validation_0-auc:0.96809	validation_1-auc:0.82426
[1300]	validation_0-auc:0.97229	validation_1-auc:0.82453
Stopping. Best iteration:
[1310]	validation_0-auc:0.972

In [8]:
# Average each feature's importance over the CV folds and rank the features
# from most to least important.
df_importance = (
    pd.concat(df_importance_list)
    .groupby('column')['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
df_importance

Unnamed: 0,column,importance
0,x_num_5_sum+x_num_32_sum,0.01285557
1,x_num_5_sum+x_num_41_sum,0.01210566
2,x_num_5_sum+x_num_49_sum,0.00869567
3,x_num_5_sum+x_num_60_sum,0.008240659
4,x_num_5_sum+x_num_18_sum,0.007610389
5,x_num_5_sum+x_num_2,0.007237684
6,x_num_21_sum+x_num_51_sum,0.006645732
7,x_num_5_sum+x_num_48_sum,0.006109915
8,x_num_51_sum+x_num_58_sum,0.005783288
9,x_num_5_sum+x_num_28_sum,0.005577626


In [9]:
# ROC AUC of the out-of-fold probabilities against the true labels, computed
# over the whole training set at once.
auc = roc_auc_score(y_true=df_oof[ycol], y_score=df_oof['prob'])
print('auc:', auc)

auc: 0.8415462566710658


In [10]:
# Save the averaged test predictions under sub/, with the out-of-fold AUC
# embedded in the file name.
os.makedirs('sub', exist_ok=True)
submission_path = 'sub/xinwangyinhang_{}.csv'.format(auc)
prediction.to_csv(submission_path, index=False)

In [11]:
# Persist both probability tables under prob/, tagged with the version `v`:
# first the test predictions, then the out-of-fold predictions.
os.makedirs('prob', exist_ok=True)

for frame, name_template in ((prediction, 'prob/sub_xgb{}.csv'),
                             (df_oof, 'prob/oof_xgb{}.csv')):
    frame.to_csv(name_template.format(v), index=False)

In [13]:
# Class balance of the training labels.
df_oof.target.value_counts()

0.0    13657
1.0     1623
Name: target, dtype: int64

In [15]:
# Mean out-of-fold probability over the rows whose true target is 0.
df_oof.loc[df_oof['target'] == 0, 'prob'].mean()

0.08162198124440537

In [16]:
# Mean out-of-fold probability over the rows whose true target is 1.
df_oof.loc[df_oof['target'] == 1, 'prob'].mean()

0.27666966803515364