In [19]:
import gc
import os
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
# Silence library warnings for the rest of the notebook.
warnings.simplefilter('ignore')

# Use the fully-qualified option keys: the short patterns 'max_columns' and
# 'max_rows' match more than one option in recent pandas releases and raise
# OptionError. None lifts the display limit on columns / rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [20]:
# Run configuration.
seed = 2020  # random_state passed to the models and to the CV splitter
v = 2        # version tag used in the feature pickle name and the output file names

In [21]:
# Read the feature table for version `v`, then print its dtype / memory summary.
df_feature = pd.read_pickle(
    'feature{}.pkl'.format(v)
)
df_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21047 entries, 0 to 21046
Columns: 266 entries, id to target
dtypes: float32(252), float64(6), int64(8)
memory usage: 22.5 MB


In [22]:
# Rows that carry a target value form the training set; the rest are the test set.
is_labelled = df_feature['target'].notna()
df_train = df_feature.loc[is_labelled].copy()
df_test = df_feature.loc[~is_labelled].copy()
df_train.shape, df_test.shape

((15280, 266), (5767, 266))

In [23]:
# Name of the label column; every other column except the row id is a model input.
ycol = 'target'
feature_names = [col for col in df_train.columns if col not in (ycol, 'id')]

In [24]:
from feature_selector import FeatureSelector
# Let FeatureSelector rank the candidate features, then drop the ones it
# reports under ops['low_importance'] from `feature_names`.
fs = FeatureSelector(data=df_train[feature_names], labels=df_train[ycol])
fs.identify_zero_importance(
    task='classification',
    eval_metric='auc',
    n_iterations=10,
    early_stopping=True,
)
fs.identify_low_importance(cumulative_importance=0.97)
low_importance_features = fs.ops['low_importance']
print('====low_importance_features=====')
print(low_importance_features)
# Filter in place (slice assignment) so `feature_names` remains the same list object.
_dropped = set(low_importance_features)
feature_names[:] = [name for name in feature_names if name not in _dropped]

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[93]	valid_0's auc: 0.847781	valid_0's binary_logloss: 0.256598
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[145]	valid_0's auc: 0.836879	valid_0's binary_logloss: 0.260394
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[144]	valid_0's auc: 0.839056	valid_0's binary_logloss: 0.258275
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[181]	valid_0's auc: 0.846509	valid_0's binary_logloss: 0.256794
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[92]	valid_0's auc: 0.831352	valid_0's binary_logloss: 0.261737
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[128]	valid_0's auc: 0.841547	valid_0's binary_logloss: 0.259

In [25]:
# Sanity check: (rows, columns) of the unlabelled test split.
df_test.shape

(5767, 266)

In [36]:
# Sanity check: (rows, columns) of the labelled training split.
df_train.shape

(15280, 266)

In [73]:
from sklearn.metrics import precision_score, recall_score
def custom_f1_eval(y_true, y_pred):
    """Weighted F-score used as a custom evaluation metric during training.

    Scores at or above 0.5 count as positive predictions. The value returned
    is ``P * R / (0.4 * P + 0.6 * R)`` where P is precision and R is recall;
    it is reported under the name "f1" even though the weights are not the
    balanced 0.5 / 0.5 of the usual F1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; entries equal to 1 are the positive class.
    y_pred : array-like
        Predicted scores. The input is left untouched.

    Returns
    -------
    tuple
        ``("f1", score, True)`` -- metric name, metric value and a flag
        saying that larger values are better.
    """
    actual = np.asarray(y_true) == 1
    # Build a new boolean array instead of aliasing `y_pred`: the previous
    # implementation overwrote the caller's prediction buffer with 0/1 labels.
    flagged = np.asarray(y_pred) >= 0.5

    hits = int(np.count_nonzero(actual & flagged))
    n_flagged = int(np.count_nonzero(flagged))
    n_actual = int(np.count_nonzero(actual))

    # Define precision / recall as 0 when their denominator is empty.
    precision = hits / n_flagged if n_flagged else 0.0
    recall = hits / n_actual if n_actual else 0.0

    if precision == 0.0 and recall == 0.0:
        f1 = 0.0
    else:
        f1 = (precision * recall) / (0.4 * precision + 0.6 * recall)
    return "f1", f1, True
# LightGBM binary classifier; the same estimator object is re-fitted on every fold.
# NOTE(review): with `metric=None` the training log still reports
# binary_logloss next to the custom "f1", so early stopping is not driven by
# "f1" alone. If only the custom metric is wanted, LightGBM expects the
# string 'None' -- confirm the intent before changing it.
model = lgb.LGBMClassifier(objective='binary',
                           boosting_type='gbdt',
                           num_leaves=32,
                           max_depth=5,
                           learning_rate=0.01,
                           n_estimators=10000,
                           subsample=0.8,
                           feature_fraction=0.6,
                           reg_alpha=10,
                           reg_lambda=12,
                           random_state=seed,
                           metric = None,
                           is_unbalance=True)

# Out-of-fold predictions for the training rows.
df_oof = df_train[['id', ycol]].copy()
df_oof['prob'] = 0.0  # float column from the start: it receives probabilities
# Fold-averaged predictions for the test rows. Copy explicitly so the new
# column is written to an independent frame rather than a slice of df_test.
prediction = df_test[['id']].copy()
prediction['prob'] = 0.0

# `kfold.split` yields positional indices, so the OOF column is addressed by
# position; label-based `.loc` is only equivalent when the index is 0..n-1.
prob_col = df_oof.columns.get_loc('prob')

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
for fold_id, (trn_idx, val_idx) in enumerate(
        kfold.split(df_train[feature_names], df_train[ycol])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]
    print('\nFold_{} Training ================================\n'.format(
        fold_id + 1))

    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          eval_metric=lambda y_true, y_pred: [custom_f1_eval(y_true, y_pred)],
                          verbose=100,
                          early_stopping_rounds=5000)

    # Positive-class probability at the best iteration found by early stopping.
    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)[:, 1]

    df_oof.iloc[val_idx, prob_col] = pred_val

    pred_test = lgb_model.predict_proba(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]

    # Each fold contributes 1 / n_splits of the final test prediction.
    prediction['prob'] += pred_test / kfold.n_splits

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



Training until validation scores don't improve for 5000 rounds
[100]	train's binary_logloss: 0.360504	train's f1: 0.415108	valid's binary_logloss: 0.37724	valid's f1: 0.355505
[200]	train's binary_logloss: 0.4013	train's f1: 0.44836	valid's binary_logloss: 0.427974	valid's f1: 0.37617
[300]	train's binary_logloss: 0.408838	train's f1: 0.44771	valid's binary_logloss: 0.444871	valid's f1: 0.355346
[400]	train's binary_logloss: 0.400678	train's f1: 0.457917	valid's binary_logloss: 0.443614	valid's f1: 0.351957
[500]	train's binary_logloss: 0.386569	train's f1: 0.473765	valid's binary_logloss: 0.436142	valid's f1: 0.361328
[600]	train's binary_logloss: 0.372114	train's f1: 0.491803	valid's binary_logloss: 0.42815	valid's f1: 0.365526
[700]	train's binary_logloss: 0.356619	train's f1: 0.511308	valid's binary_logloss: 0.419096	valid's f1: 0.373057
[800]	train's binary_logloss: 0.341924	train's f1: 0.530131	valid's binary_logloss: 0.410418	valid's f1: 0.371945
[900]	train's binary_logloss: 

[7100]	train's binary_logloss: 0.0744525	train's f1: 0.96868	valid's binary_logloss: 0.295416	valid's f1: 0.394057
[7200]	train's binary_logloss: 0.0744525	train's f1: 0.96868	valid's binary_logloss: 0.295416	valid's f1: 0.394057
[7300]	train's binary_logloss: 0.0744525	train's f1: 0.96868	valid's binary_logloss: 0.295416	valid's f1: 0.394057
[7400]	train's binary_logloss: 0.0744525	train's f1: 0.96868	valid's binary_logloss: 0.295416	valid's f1: 0.394057
[7500]	train's binary_logloss: 0.0744525	train's f1: 0.96868	valid's binary_logloss: 0.295416	valid's f1: 0.394057
[7600]	train's binary_logloss: 0.0744525	train's f1: 0.96868	valid's binary_logloss: 0.295416	valid's f1: 0.394057
[7700]	train's binary_logloss: 0.0744525	train's f1: 0.96868	valid's binary_logloss: 0.295416	valid's f1: 0.394057
[7800]	train's binary_logloss: 0.0744525	train's f1: 0.96868	valid's binary_logloss: 0.295416	valid's f1: 0.394057
Early stopping, best iteration is:
[2870]	train's binary_logloss: 0.161581	train

[6200]	train's binary_logloss: 0.0769626	train's f1: 0.966518	valid's binary_logloss: 0.303937	valid's f1: 0.397112
[6300]	train's binary_logloss: 0.0759573	train's f1: 0.96695	valid's binary_logloss: 0.303763	valid's f1: 0.395531
[6400]	train's binary_logloss: 0.0756273	train's f1: 0.96695	valid's binary_logloss: 0.30375	valid's f1: 0.394816
[6500]	train's binary_logloss: 0.0756273	train's f1: 0.96695	valid's binary_logloss: 0.30375	valid's f1: 0.394816
[6600]	train's binary_logloss: 0.0756273	train's f1: 0.96695	valid's binary_logloss: 0.30375	valid's f1: 0.394816
[6700]	train's binary_logloss: 0.0756273	train's f1: 0.96695	valid's binary_logloss: 0.30375	valid's f1: 0.394816
[6800]	train's binary_logloss: 0.0756273	train's f1: 0.96695	valid's binary_logloss: 0.30375	valid's f1: 0.394816
[6900]	train's binary_logloss: 0.0756273	train's f1: 0.96695	valid's binary_logloss: 0.30375	valid's f1: 0.394816
[7000]	train's binary_logloss: 0.0756273	train's f1: 0.96695	valid's binary_logloss: 

[4900]	train's binary_logloss: 0.0982608	train's f1: 0.931534	valid's binary_logloss: 0.287034	valid's f1: 0.432024
[5000]	train's binary_logloss: 0.0961883	train's f1: 0.935159	valid's binary_logloss: 0.286594	valid's f1: 0.434517
[5100]	train's binary_logloss: 0.0943134	train's f1: 0.93759	valid's binary_logloss: 0.286338	valid's f1: 0.44
[5200]	train's binary_logloss: 0.0924758	train's f1: 0.941672	valid's binary_logloss: 0.285812	valid's f1: 0.441631
[5300]	train's binary_logloss: 0.0908119	train's f1: 0.944962	valid's binary_logloss: 0.285535	valid's f1: 0.437731
[5400]	train's binary_logloss: 0.0891731	train's f1: 0.947445	valid's binary_logloss: 0.285332	valid's f1: 0.437888
[5500]	train's binary_logloss: 0.0875777	train's f1: 0.949525	valid's binary_logloss: 0.285075	valid's f1: 0.435737
[5600]	train's binary_logloss: 0.0859663	train's f1: 0.951194	valid's binary_logloss: 0.284767	valid's f1: 0.434919
[5700]	train's binary_logloss: 0.084527	train's f1: 0.95287	valid's binary_lo

[1700]	train's binary_logloss: 0.242062	train's f1: 0.681028	valid's binary_logloss: 0.340573	valid's f1: 0.426009
[1800]	train's binary_logloss: 0.233044	train's f1: 0.695489	valid's binary_logloss: 0.336152	valid's f1: 0.428809
[1900]	train's binary_logloss: 0.224547	train's f1: 0.706498	valid's binary_logloss: 0.332051	valid's f1: 0.429604
[2000]	train's binary_logloss: 0.21647	train's f1: 0.71729	valid's binary_logloss: 0.328132	valid's f1: 0.42654
[2100]	train's binary_logloss: 0.209081	train's f1: 0.732866	valid's binary_logloss: 0.324343	valid's f1: 0.427257
[2200]	train's binary_logloss: 0.201905	train's f1: 0.74506	valid's binary_logloss: 0.320773	valid's f1: 0.431381
[2300]	train's binary_logloss: 0.195179	train's f1: 0.755211	valid's binary_logloss: 0.317554	valid's f1: 0.427662
[2400]	train's binary_logloss: 0.188681	train's f1: 0.766234	valid's binary_logloss: 0.314709	valid's f1: 0.427845
[2500]	train's binary_logloss: 0.182397	train's f1: 0.781739	valid's binary_logloss:

[3500]	train's binary_logloss: 0.133497	train's f1: 0.871492	valid's binary_logloss: 0.310461	valid's f1: 0.414894
[3600]	train's binary_logloss: 0.130148	train's f1: 0.878214	valid's binary_logloss: 0.309449	valid's f1: 0.416889
[3700]	train's binary_logloss: 0.126689	train's f1: 0.883955	valid's binary_logloss: 0.3085	valid's f1: 0.411478
[3800]	train's binary_logloss: 0.123348	train's f1: 0.894555	valid's binary_logloss: 0.307358	valid's f1: 0.410773
[3900]	train's binary_logloss: 0.12018	train's f1: 0.899016	valid's binary_logloss: 0.306364	valid's f1: 0.415512
[4000]	train's binary_logloss: 0.11724	train's f1: 0.906171	valid's binary_logloss: 0.305536	valid's f1: 0.415505
[4100]	train's binary_logloss: 0.114438	train's f1: 0.912286	valid's binary_logloss: 0.304736	valid's f1: 0.414102
[4200]	train's binary_logloss: 0.111664	train's f1: 0.915761	valid's binary_logloss: 0.303908	valid's f1: 0.411964
[4300]	train's binary_logloss: 0.10905	train's f1: 0.919654	valid's binary_logloss: 

In [78]:
    # Weighted F-score of the out-of-fold predictions at a 0.6 decision threshold.
    # The labels are derived in one vectorised step; the earlier row loop
    # reused a copy of the `id` column as storage and looked rows up by label
    # with a positional counter, which only works on a 0..n-1 index.
    yr = (df_oof['prob'] >= 0.6).astype(int)
    precision = precision_score(df_oof[ycol], yr)
    recall = recall_score(df_oof[ycol], yr)
    if precision == 0.0 and recall == 0.0:
        f1 = 0.0
    else:
        f1 = (precision * recall) / (0.4 * precision + 0.6 * recall)
    print('f1:', f1)


f1: 0.38563243719700313


In [28]:
# `auc` is not defined by any other cell, so compute it here from the
# out-of-fold predictions; it is embedded in the submission file name.
auc = roc_auc_score(df_oof[ycol], df_oof['prob'])
os.makedirs('sub', exist_ok=True)
prediction.to_csv('sub/xinwangyinhang_{}.csv'.format(auc), index=False)

In [71]:
# Peek at the first 10 out-of-fold rows (id, target, prob).
df_oof.head(10)

Unnamed: 0,id,target,prob
0,0,0.0,0.0
1,1,0.0,0.0
2,2,1.0,0.0
3,3,0.0,0.0
4,4,1.0,0.0
5,5,1.0,0.0
6,6,1.0,0.0
7,7,0.0,0.0
8,8,0.0,0.0
9,9,0.0,0.0


In [56]:
# Write the fold-averaged test predictions and the out-of-fold frame to ./prob.
os.makedirs('prob', exist_ok=True)

sub_path = 'prob/sub_lgb{}.csv'.format(v)
oof_path = 'prob/oof_lgb{}.csv'.format(v)
prediction.to_csv(sub_path, index=False)
df_oof[['id', 'prob', ycol]].to_csv(oof_path, index=False)

In [30]:
# Inspect the first 100 out-of-fold rows (id, target, prob).
df_oof.head(100)

Unnamed: 0,id,target,prob
0,0,0.0,0.194634
1,1,0.0,0.003817
2,2,1.0,0.388696
3,3,0.0,0.006898
4,4,1.0,0.889966
5,5,1.0,0.698051
6,6,1.0,0.709167
7,7,0.0,0.267751
8,8,0.0,0.427973
9,9,0.0,0.644526


In [31]:
import xgboost as xgb
# XGBoost classifier trained with 5-fold stratified cross-validation; the
# same estimator object is re-fitted on every fold.
model = xgb.XGBClassifier(
                           max_depth=6,
                           learning_rate=0.01,
                           n_estimators=10000,
                           subsample=0.8,
                           reg_alpha=10,
                           reg_lambda=12,
#                             tree_method='gpu_hist',
                           random_state=seed)

# Out-of-fold predictions for the training rows.
df_oof = df_train[['id', ycol]].copy()
df_oof['prob'] = 0.0  # float column from the start: it receives probabilities
# Fold-averaged predictions for the test rows. Copy explicitly so the new
# column is written to an independent frame rather than a slice of df_test.
prediction = df_test[['id']].copy()
prediction['prob'] = 0.0

# `kfold.split` yields positional indices, so the OOF column is addressed by
# position; label-based `.loc` is only equivalent when the index is 0..n-1.
prob_col = df_oof.columns.get_loc('prob')

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
for fold_id, (trn_idx, val_idx) in enumerate(
        kfold.split(df_train[feature_names], df_train[ycol])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(
        fold_id + 1))

    # Early stopping watches AUC on the last eval_set entry (the validation fold),
    # as the training log states.
    xgb_model = model.fit(X_train,
                          Y_train,
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=100,
                          eval_metric='auc',
                          early_stopping_rounds=50)

    pred_val = xgb_model.predict_proba(X_val,)[:, 1]
    df_oof.iloc[val_idx, prob_col] = pred_val

    pred_test = xgb_model.predict_proba(df_test[feature_names])[:, 1]
    # Each fold contributes 1 / n_splits of the final test prediction.
    prediction['prob'] += pred_test / kfold.n_splits

    del xgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



[0]	validation_0-auc:0.70505	validation_1-auc:0.67701
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 50 rounds.
[100]	validation_0-auc:0.82503	validation_1-auc:0.77034
[200]	validation_0-auc:0.85697	validation_1-auc:0.79114
[300]	validation_0-auc:0.87672	validation_1-auc:0.79920
[400]	validation_0-auc:0.89124	validation_1-auc:0.80453
[500]	validation_0-auc:0.90293	validation_1-auc:0.80885
[600]	validation_0-auc:0.91332	validation_1-auc:0.81121
[700]	validation_0-auc:0.92305	validation_1-auc:0.81369
[800]	validation_0-auc:0.93143	validation_1-auc:0.81527
[900]	validation_0-auc:0.93933	validation_1-auc:0.81655
[1000]	validation_0-auc:0.94638	validation_1-auc:0.81817
[1100]	validation_0-auc:0.95249	validation_1-auc:0.81946
[1200]	validation_0-auc:0.95786	validation_1-auc:0.82009
[1300]	validation_0-auc:0.96254	validation_1-auc:0.82054
[1400]	validation_0-auc:0.96675	validation_1-auc:0.8206

In [33]:
# Write the fold-averaged test predictions and the out-of-fold frame to ./prob.
os.makedirs('prob', exist_ok=True)

sub_path = 'prob/sub_xgb{}.csv'.format(v)
oof_path = 'prob/oof_xgb{}.csv'.format(v)
prediction.to_csv(sub_path, index=False)
df_oof[['id', 'prob', ycol]].to_csv(oof_path, index=False)