Reference: https://blog.amedama.jp/entry/imbalanced-data

In [1]:
# Dataset
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit

# LightGBM
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# Imbalanced-learn
from imblearn.under_sampling import RandomUnderSampler


# Dataset

In [2]:
# Keyword arguments for a synthetic, heavily imbalanced binary problem:
# 7M samples, 80 features of which only 3 are informative, one cluster per class.
# NOTE(review): `weights` requests a 99:1 class ratio, but make_classification
# also applies its default label noise (flip_y=0.01); that is presumably why
# the positive rate printed further down is closer to 1.5% than to 1%.
args = dict(
    n_samples=7000000,
    n_features=80,
    n_informative=3,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.99, 0.01],
    random_state=42,
)

X, y = make_classification(**args)

In [3]:
# Number of samples labelled 0 and number of samples labelled 1
len(y[y == 0]), len(y[y == 1])

(6895786, 104214)

In [4]:
# Fraction of all samples that are labelled 1
len(y[y == 1])/len(y)

0.014887714285714285

In [5]:
def imbalanced_data_split(X, y, test_size=0.2, random_state=0):
    """Split X and y into two parts while preserving the class ratio of y.

    A single stratified shuffle split is drawn, so both returned parts keep
    (approximately) the same class proportions as `y`.

    Parameters
    ----------
    X : array supporting integer-array indexing (e.g. a NumPy array)
        Feature matrix, one row per sample.
    y : array supporting integer-array indexing
        Class labels aligned with the rows of `X`.
    test_size : float or int, default 0.2
        Forwarded to StratifiedShuffleSplit; size of the second ("test") part.
    random_state : int, default 0
        Seed forwarded to StratifiedShuffleSplit so the split is reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    # Honor the caller's test_size instead of a hard-coded 0.2.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                 random_state=random_state)
    # n_splits=1, so exactly one (train, test) index pair is produced.
    train_index, test_index = next(sss.split(X, y))
    return X[train_index], X[test_index], y[train_index], y[test_index]

In [6]:
# First split: hold out a test set from the full data.
# Second split: hold out a validation set from the remaining training data.
X_train, X_test, y_train, y_test = imbalanced_data_split(X, y, test_size=0.2)
X_train2, X_valid, y_train2, y_valid = imbalanced_data_split(X_train, y_train, test_size=0.2)

# LightGBM (baseline on the imbalanced data)

In [7]:
# LightGBM hyper-parameters passed to every lgbm_train call in this notebook.
lgbm_params = dict(
    learning_rate=0.1,
    num_leaves=8,
    boosting_type='gbdt',
    reg_alpha=1,    # L1 regularization
    reg_lambda=1,   # L2 regularization
    objective='binary',
    metric='auc',
)

In [8]:
def lgbm_train(X_train_df, X_valid_df, y_train_df, y_valid_df, lgbm_params,
               num_boost_round=1000, early_stopping_rounds=10):
    """Train a LightGBM model with early stopping on a validation set.

    Parameters
    ----------
    X_train_df, y_train_df
        Training features and labels.
    X_valid_df, y_valid_df
        Validation features and labels, used only to evaluate the metric
        that drives early stopping.
    lgbm_params : dict
        Parameters passed through to ``lgb.train``.
    num_boost_round : int, default 1000
        Upper bound on the number of boosting rounds.
    early_stopping_rounds : int, default 10
        Stop when the validation score has not improved for this many rounds.

    Returns
    -------
    The object returned by ``lgb.train``.
    """
    lgb_train = lgb.Dataset(X_train_df, y_train_df)
    # Reference the training Dataset for the validation Dataset.
    lgb_eval = lgb.Dataset(X_valid_df, y_valid_df, reference=lgb_train)

    # Early stopping goes through the callback API: the `early_stopping_rounds`
    # keyword of lgb.train was removed in LightGBM 4.0, while the callback is
    # accepted by older releases as well.
    # NOTE: on LightGBM >= 4.0 per-iteration scores are no longer printed by
    # default; append lgb.log_evaluation() to `callbacks` if they are wanted.
    model = lgb.train(
        lgbm_params,
        lgb_train,
        num_boost_round=num_boost_round,
        valid_sets=[lgb_eval],
        callbacks=[lgb.early_stopping(early_stopping_rounds)],
    )

    return model

In [9]:
%%time
# Baseline: train on the training split as-is (no resampling), timing the cell.
model_normal = lgbm_train(X_train2, X_valid, y_train2, y_valid, lgbm_params)

[1]	valid_0's auc: 0.780562
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.815015
[3]	valid_0's auc: 0.817213
[4]	valid_0's auc: 0.817371
[5]	valid_0's auc: 0.818282
[6]	valid_0's auc: 0.823774
[7]	valid_0's auc: 0.824164
[8]	valid_0's auc: 0.824837
[9]	valid_0's auc: 0.827372
[10]	valid_0's auc: 0.827457
[11]	valid_0's auc: 0.82807
[12]	valid_0's auc: 0.828295
[13]	valid_0's auc: 0.828386
[14]	valid_0's auc: 0.828773
[15]	valid_0's auc: 0.828684
[16]	valid_0's auc: 0.828497
[17]	valid_0's auc: 0.82846
[18]	valid_0's auc: 0.829166
[19]	valid_0's auc: 0.829275
[20]	valid_0's auc: 0.8288
[21]	valid_0's auc: 0.828842
[22]	valid_0's auc: 0.82934
[23]	valid_0's auc: 0.829358
[24]	valid_0's auc: 0.829446
[25]	valid_0's auc: 0.830283
[26]	valid_0's auc: 0.830493
[27]	valid_0's auc: 0.830511
[28]	valid_0's auc: 0.830365
[29]	valid_0's auc: 0.830364
[30]	valid_0's auc: 0.830128
[31]	valid_0's auc: 0.830623
[32]	valid_0's auc: 0.830666
[33]	valid_0's auc: 0.83

In [10]:
# Predict on the test data, using the best iteration recorded on the model
y_pred_normal = model_normal.predict(X_test, num_iteration=model_normal.best_iteration)

# Compute the AUC
auc = roc_auc_score(y_test, y_pred_normal)
print(auc)

0.829287295077


# Under-sampling with imbalanced-learn

In [11]:
# Randomly under-sample the training data (test data is left untouched), then
# split the resampled data into training and validation parts.
# NOTE: this rebinds X_train2 / X_valid / y_train2 / y_valid, replacing the
# splits created earlier for the baseline model.
sampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
X_train2, X_valid, y_train2, y_valid = imbalanced_data_split(X_resampled, y_resampled, test_size=0.2)

In [12]:
%%time
# Train on the under-sampled training/validation splits, timing the cell.
model_under_sample = lgbm_train(X_train2, X_valid, y_train2, y_valid, lgbm_params)

[1]	valid_0's auc: 0.821812
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.824402
[3]	valid_0's auc: 0.826128
[4]	valid_0's auc: 0.827987
[5]	valid_0's auc: 0.828225
[6]	valid_0's auc: 0.829101
[7]	valid_0's auc: 0.831881
[8]	valid_0's auc: 0.831842
[9]	valid_0's auc: 0.831957
[10]	valid_0's auc: 0.831985
[11]	valid_0's auc: 0.83201
[12]	valid_0's auc: 0.832192
[13]	valid_0's auc: 0.832251
[14]	valid_0's auc: 0.832362
[15]	valid_0's auc: 0.832498
[16]	valid_0's auc: 0.83243
[17]	valid_0's auc: 0.832529
[18]	valid_0's auc: 0.83291
[19]	valid_0's auc: 0.832604
[20]	valid_0's auc: 0.83235
[21]	valid_0's auc: 0.831969
[22]	valid_0's auc: 0.832304
[23]	valid_0's auc: 0.832472
[24]	valid_0's auc: 0.832804
[25]	valid_0's auc: 0.833382
[26]	valid_0's auc: 0.83311
[27]	valid_0's auc: 0.833346
[28]	valid_0's auc: 0.833389
[29]	valid_0's auc: 0.833329
[30]	valid_0's auc: 0.833022
[31]	valid_0's auc: 0.832999
[32]	valid_0's auc: 0.833307
[33]	valid_0's auc: 0.83

In [13]:
# Predict on the test data, using the best iteration recorded on the model
y_pred_under_sample = model_under_sample.predict(X_test, num_iteration=model_under_sample.best_iteration)

# Compute the AUC
auc = roc_auc_score(y_test, y_pred_under_sample)
print(auc)

0.828820480993


# Bagging (averaging models trained on different under-samples)

In [14]:
def bagging(seed):
    """Train one model on an under-sample of the training data drawn with `seed`.

    `seed` becomes the random_state of the RandomUnderSampler. The function
    reads X_train, y_train and lgbm_params from the notebook's global
    namespace, splits the resampled data with imbalanced_data_split and
    returns whatever lgbm_train returns.
    """
    X_sub, y_sub = RandomUnderSampler(random_state=seed).fit_resample(X_train, y_train)
    parts = imbalanced_data_split(X_sub, y_sub, test_size=0.2)
    fit_X, val_X, fit_y, val_y = parts
    return lgbm_train(fit_X, val_X, fit_y, val_y, lgbm_params)

In [15]:
%%time
# Train 50 models; the loop index is passed to bagging() as the sampling seed.
models = []

for i in range(50):
    models.append(bagging(i))

[1]	valid_0's auc: 0.820043
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.822227
[3]	valid_0's auc: 0.825069
[4]	valid_0's auc: 0.825866
[5]	valid_0's auc: 0.827135
[6]	valid_0's auc: 0.827373
[7]	valid_0's auc: 0.828237
[8]	valid_0's auc: 0.828278
[9]	valid_0's auc: 0.828605
[10]	valid_0's auc: 0.828902
[11]	valid_0's auc: 0.830688
[12]	valid_0's auc: 0.830678
[13]	valid_0's auc: 0.831031
[14]	valid_0's auc: 0.831086
[15]	valid_0's auc: 0.831703
[16]	valid_0's auc: 0.831767
[17]	valid_0's auc: 0.831808
[18]	valid_0's auc: 0.831894
[19]	valid_0's auc: 0.831955
[20]	valid_0's auc: 0.832242
[21]	valid_0's auc: 0.832401
[22]	valid_0's auc: 0.832821
[23]	valid_0's auc: 0.832791
[24]	valid_0's auc: 0.832784
[25]	valid_0's auc: 0.83274
[26]	valid_0's auc: 0.83282
[27]	valid_0's auc: 0.833274
[28]	valid_0's auc: 0.83327
[29]	valid_0's auc: 0.833239
[30]	valid_0's auc: 0.833265
[31]	valid_0's auc: 0.83307
[32]	valid_0's auc: 0.833067
[33]	valid_0's auc: 0.8

[1]	valid_0's auc: 0.821395
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.823561
[3]	valid_0's auc: 0.828055
[4]	valid_0's auc: 0.828225
[5]	valid_0's auc: 0.828225
[6]	valid_0's auc: 0.828375
[7]	valid_0's auc: 0.828526
[8]	valid_0's auc: 0.828764
[9]	valid_0's auc: 0.828816
[10]	valid_0's auc: 0.828885
[11]	valid_0's auc: 0.829214
[12]	valid_0's auc: 0.829804
[13]	valid_0's auc: 0.831121
[14]	valid_0's auc: 0.831064
[15]	valid_0's auc: 0.831389
[16]	valid_0's auc: 0.831491
[17]	valid_0's auc: 0.831692
[18]	valid_0's auc: 0.831074
[19]	valid_0's auc: 0.83111
[20]	valid_0's auc: 0.831138
[21]	valid_0's auc: 0.832479
[22]	valid_0's auc: 0.832493
[23]	valid_0's auc: 0.833235
[24]	valid_0's auc: 0.83299
[25]	valid_0's auc: 0.832657
[26]	valid_0's auc: 0.832567
[27]	valid_0's auc: 0.832736
[28]	valid_0's auc: 0.832706
[29]	valid_0's auc: 0.832498
[30]	valid_0's auc: 0.832551
[31]	valid_0's auc: 0.832561
[32]	valid_0's auc: 0.832714
[33]	valid_0's auc: 0

[9]	valid_0's auc: 0.829289
[10]	valid_0's auc: 0.82944
[11]	valid_0's auc: 0.831398
[12]	valid_0's auc: 0.83144
[13]	valid_0's auc: 0.831795
[14]	valid_0's auc: 0.831933
[15]	valid_0's auc: 0.832028
[16]	valid_0's auc: 0.832088
[17]	valid_0's auc: 0.832167
[18]	valid_0's auc: 0.832351
[19]	valid_0's auc: 0.832235
[20]	valid_0's auc: 0.832265
[21]	valid_0's auc: 0.832296
[22]	valid_0's auc: 0.831886
[23]	valid_0's auc: 0.831796
[24]	valid_0's auc: 0.83221
[25]	valid_0's auc: 0.831839
[26]	valid_0's auc: 0.832405
[27]	valid_0's auc: 0.832343
[28]	valid_0's auc: 0.832354
[29]	valid_0's auc: 0.832321
[30]	valid_0's auc: 0.833201
[31]	valid_0's auc: 0.833324
[32]	valid_0's auc: 0.834084
[33]	valid_0's auc: 0.834077
[34]	valid_0's auc: 0.834305
[35]	valid_0's auc: 0.83427
[36]	valid_0's auc: 0.834183
[37]	valid_0's auc: 0.834223
[38]	valid_0's auc: 0.834685
[39]	valid_0's auc: 0.834575
[40]	valid_0's auc: 0.834565
[41]	valid_0's auc: 0.834589
[42]	valid_0's auc: 0.83425
[43]	valid_0's auc: 

[29]	valid_0's auc: 0.832577
Early stopping, best iteration is:
[19]	valid_0's auc: 0.833098
[1]	valid_0's auc: 0.822033
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.824626
[3]	valid_0's auc: 0.825297
[4]	valid_0's auc: 0.826224
[5]	valid_0's auc: 0.82907
[6]	valid_0's auc: 0.829429
[7]	valid_0's auc: 0.829477
[8]	valid_0's auc: 0.82957
[9]	valid_0's auc: 0.8297
[10]	valid_0's auc: 0.829794
[11]	valid_0's auc: 0.830529
[12]	valid_0's auc: 0.830439
[13]	valid_0's auc: 0.831244
[14]	valid_0's auc: 0.831105
[15]	valid_0's auc: 0.831188
[16]	valid_0's auc: 0.831253
[17]	valid_0's auc: 0.831604
[18]	valid_0's auc: 0.831659
[19]	valid_0's auc: 0.831913
[20]	valid_0's auc: 0.831966
[21]	valid_0's auc: 0.831997
[22]	valid_0's auc: 0.831971
[23]	valid_0's auc: 0.832693
[24]	valid_0's auc: 0.833119
[25]	valid_0's auc: 0.833136
[26]	valid_0's auc: 0.833106
[27]	valid_0's auc: 0.833378
[28]	valid_0's auc: 0.833385
[29]	valid_0's auc: 0.833403
[30]	valid_0's au

[33]	valid_0's auc: 0.834445
[34]	valid_0's auc: 0.834479
[35]	valid_0's auc: 0.834447
[36]	valid_0's auc: 0.834792
[37]	valid_0's auc: 0.834616
[38]	valid_0's auc: 0.834667
[39]	valid_0's auc: 0.834771
[40]	valid_0's auc: 0.834815
[41]	valid_0's auc: 0.834736
[42]	valid_0's auc: 0.834651
[43]	valid_0's auc: 0.834654
[44]	valid_0's auc: 0.834737
[45]	valid_0's auc: 0.834191
[46]	valid_0's auc: 0.834289
[47]	valid_0's auc: 0.834205
[48]	valid_0's auc: 0.833914
[49]	valid_0's auc: 0.834033
[50]	valid_0's auc: 0.834258
Early stopping, best iteration is:
[40]	valid_0's auc: 0.834815
[1]	valid_0's auc: 0.821062
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.82375
[3]	valid_0's auc: 0.823965
[4]	valid_0's auc: 0.825498
[5]	valid_0's auc: 0.827308
[6]	valid_0's auc: 0.827345
[7]	valid_0's auc: 0.82847
[8]	valid_0's auc: 0.828579
[9]	valid_0's auc: 0.830042
[10]	valid_0's auc: 0.830012
[11]	valid_0's auc: 0.830095
[12]	valid_0's auc: 0.83103
[13]	valid_0's a

[4]	valid_0's auc: 0.82885
[5]	valid_0's auc: 0.82927
[6]	valid_0's auc: 0.829368
[7]	valid_0's auc: 0.830758
[8]	valid_0's auc: 0.830887
[9]	valid_0's auc: 0.830955
[10]	valid_0's auc: 0.831023
[11]	valid_0's auc: 0.831103
[12]	valid_0's auc: 0.831798
[13]	valid_0's auc: 0.832085
[14]	valid_0's auc: 0.832289
[15]	valid_0's auc: 0.832353
[16]	valid_0's auc: 0.832433
[17]	valid_0's auc: 0.832448
[18]	valid_0's auc: 0.83248
[19]	valid_0's auc: 0.832174
[20]	valid_0's auc: 0.832201
[21]	valid_0's auc: 0.83254
[22]	valid_0's auc: 0.83267
[23]	valid_0's auc: 0.832605
[24]	valid_0's auc: 0.833334
[25]	valid_0's auc: 0.833754
[26]	valid_0's auc: 0.834431
[27]	valid_0's auc: 0.834533
[28]	valid_0's auc: 0.834357
[29]	valid_0's auc: 0.834566
[30]	valid_0's auc: 0.834073
[31]	valid_0's auc: 0.833996
[32]	valid_0's auc: 0.834311
[33]	valid_0's auc: 0.834168
[34]	valid_0's auc: 0.834244
[35]	valid_0's auc: 0.834076
[36]	valid_0's auc: 0.833876
[37]	valid_0's auc: 0.834194
[38]	valid_0's auc: 0.834

[24]	valid_0's auc: 0.831566
[25]	valid_0's auc: 0.83157
[26]	valid_0's auc: 0.831377
[27]	valid_0's auc: 0.831234
[28]	valid_0's auc: 0.831279
[29]	valid_0's auc: 0.83143
Early stopping, best iteration is:
[19]	valid_0's auc: 0.831849
[1]	valid_0's auc: 0.82068
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.822454
[3]	valid_0's auc: 0.823168
[4]	valid_0's auc: 0.825941
[5]	valid_0's auc: 0.825983
[6]	valid_0's auc: 0.826055
[7]	valid_0's auc: 0.826593
[8]	valid_0's auc: 0.826765
[9]	valid_0's auc: 0.829048
[10]	valid_0's auc: 0.829284
[11]	valid_0's auc: 0.829352
[12]	valid_0's auc: 0.829726
[13]	valid_0's auc: 0.829983
[14]	valid_0's auc: 0.830816
[15]	valid_0's auc: 0.830715
[16]	valid_0's auc: 0.830766
[17]	valid_0's auc: 0.831346
[18]	valid_0's auc: 0.831404
[19]	valid_0's auc: 0.831471
[20]	valid_0's auc: 0.831537
[21]	valid_0's auc: 0.83131
[22]	valid_0's auc: 0.831327
[23]	valid_0's auc: 0.831585
[24]	valid_0's auc: 0.831628
[25]	valid_0's au

[20]	valid_0's auc: 0.832279
[21]	valid_0's auc: 0.831354
[22]	valid_0's auc: 0.83057
[23]	valid_0's auc: 0.830509
[24]	valid_0's auc: 0.830734
[25]	valid_0's auc: 0.830879
[26]	valid_0's auc: 0.831752
[27]	valid_0's auc: 0.832428
[28]	valid_0's auc: 0.83233
[29]	valid_0's auc: 0.832386
[30]	valid_0's auc: 0.832485
[31]	valid_0's auc: 0.832331
[32]	valid_0's auc: 0.832585
[33]	valid_0's auc: 0.832531
[34]	valid_0's auc: 0.832569
[35]	valid_0's auc: 0.832722
[36]	valid_0's auc: 0.83241
[37]	valid_0's auc: 0.832504
[38]	valid_0's auc: 0.832303
[39]	valid_0's auc: 0.83209
[40]	valid_0's auc: 0.831988
[41]	valid_0's auc: 0.832026
[42]	valid_0's auc: 0.832283
[43]	valid_0's auc: 0.832347
[44]	valid_0's auc: 0.831558
[45]	valid_0's auc: 0.831748
Early stopping, best iteration is:
[35]	valid_0's auc: 0.832722
[1]	valid_0's auc: 0.821334
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.824504
[3]	valid_0's auc: 0.827819
[4]	valid_0's auc: 0.828311
[5]	valid_0'

[9]	valid_0's auc: 0.83093
[10]	valid_0's auc: 0.830985
[11]	valid_0's auc: 0.831152
[12]	valid_0's auc: 0.831253
[13]	valid_0's auc: 0.831405
[14]	valid_0's auc: 0.831437
[15]	valid_0's auc: 0.831512
[16]	valid_0's auc: 0.83156
[17]	valid_0's auc: 0.831708
[18]	valid_0's auc: 0.832204
[19]	valid_0's auc: 0.832626
[20]	valid_0's auc: 0.832675
[21]	valid_0's auc: 0.832737
[22]	valid_0's auc: 0.832572
[23]	valid_0's auc: 0.832605
[24]	valid_0's auc: 0.832585
[25]	valid_0's auc: 0.832552
[26]	valid_0's auc: 0.832595
[27]	valid_0's auc: 0.832709
[28]	valid_0's auc: 0.832734
[29]	valid_0's auc: 0.832742
[30]	valid_0's auc: 0.832852
[31]	valid_0's auc: 0.832859
[32]	valid_0's auc: 0.832892
[33]	valid_0's auc: 0.833013
[34]	valid_0's auc: 0.833126
[35]	valid_0's auc: 0.833946
[36]	valid_0's auc: 0.833967
[37]	valid_0's auc: 0.834162
[38]	valid_0's auc: 0.833818
[39]	valid_0's auc: 0.833935
[40]	valid_0's auc: 0.833724
[41]	valid_0's auc: 0.833641
[42]	valid_0's auc: 0.833569
[43]	valid_0's au

In [16]:
# Collect the test-set predictions of every model in `models`
y_preds = []

for m in models:
    y_preds.append(m.predict(X_test, num_iteration=m.best_iteration))

# Ensemble prediction: element-wise mean of the per-model predictions
y_preds_bagging = sum(y_preds)/len(y_preds)
# Compute the AUC
auc = roc_auc_score(y_test, y_preds_bagging)
print(auc)

0.828739612397


In [17]:
# Test-set AUC of each individual model, one line per model
for y_pred in y_preds:
    print(roc_auc_score(y_test, y_pred))

0.829038548525
0.829005810317
0.829574381079
0.829005501593
0.829115756896
0.830008476018
0.829315744203
0.829196368427
0.829094047475
0.828922108487
0.828819176749
0.829146259767
0.828730333484
0.829157742743
0.829440527278
0.829411796024
0.828753528532
0.829732695976
0.829394593301
0.829310393433
0.829718726059
0.828475118328
0.828296835398
0.82913646723
0.828786975088
0.830079145565
0.829946457101
0.829045216689
0.829156334154
0.829062805417
0.828657061258
0.828707546426
0.82859396359
0.829137443374
0.82902774504
0.829355395603
0.827830536067
0.829232347583
0.828523399381
0.828940018931
0.829384017101
0.829218131943
0.828820480993
0.829374438422
0.828130994959
0.828321586448
0.829026760199
0.829344883743
0.82897927516
0.828491538031
