https://blog.amedama.jp/entry/imbalanced-data

In [1]:
# Dataset
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit

# LightGBM
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# Imbalanced-learn
from imblearn.under_sampling import RandomUnderSampler


# Dataset

In [2]:
# Keyword arguments for the synthetic binary-classification dataset.
# The class weights request a heavily imbalanced 99% / 1% split.
args = dict(
    n_samples=7000000,
    n_features=80,
    n_informative=3,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.99, 0.01],
    random_state=42,
)

# Generate the feature matrix X and the label vector y.
X, y = make_classification(**args)

In [3]:
# Number of samples labelled 0 and labelled 1, shown as a tuple.
len(y[y == 0]), len(y[y == 1])

(6895786, 104214)

In [4]:
# Fraction of samples labelled 1.
# NOTE(review): the recorded value (~0.0149) is higher than the requested 0.01 weight;
# presumably this comes from make_classification's default label noise (flip_y) — verify.
len(y[y == 1])/len(y)

0.014887714285714285

In [5]:
def imbalanced_data_split(X, y, test_size=0.2):
    """Split X and y once into train and test parts with a stratified shuffle.

    A single ``StratifiedShuffleSplit`` (``n_splits=1``, ``random_state=0``)
    is run on ``(X, y)`` and the resulting index arrays are used to index
    both inputs, so ``X`` and ``y`` must support indexing by an index array.

    Args:
        X: Feature data, indexable by an array of row indices.
        y: Labels used for stratification, indexable the same way as ``X``.
        test_size: Forwarded to ``StratifiedShuffleSplit`` as ``test_size``.
            Defaults to 0.2.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # Forward the caller's test_size; it used to be hard-coded to 0.2 here,
    # which silently ignored the argument.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=0)
    # n_splits=1 yields exactly one (train, test) index pair.
    train_index, test_index = next(sss.split(X, y))
    return X[train_index], X[test_index], y[train_index], y[test_index]

In [6]:
# Hold out a stratified test set from the full dataset.
X_train, X_test, y_train, y_test = imbalanced_data_split(X, y, test_size=0.2)
# Split the remaining training data again to obtain a validation set.
X_train2, X_valid, y_train2, y_valid = imbalanced_data_split(X_train, y_train, test_size=0.2)

# LightGBM

In [7]:
# LightGBM training parameters shared by every model in this notebook.
lgbm_params = dict(
    learning_rate=0.1,
    num_leaves=8,
    boosting_type='gbdt',
    reg_alpha=1,
    reg_lambda=1,
    objective='binary',
    metric='auc',
)

In [8]:
def lgbm_train(X_train_df, X_valid_df, y_train_df, y_valid_df, lgbm_params):
    """Train a LightGBM model with early stopping on a validation set.

    Args:
        X_train_df: Training features, passed to ``lgb.Dataset``.
        X_valid_df: Validation features, passed to ``lgb.Dataset``.
        y_train_df: Training labels.
        y_valid_df: Validation labels.
        lgbm_params: Parameter dict forwarded to ``lgb.train``.

    Returns:
        The model object returned by ``lgb.train``.
    """
    lgb_train = lgb.Dataset(X_train_df, y_train_df)
    lgb_eval = lgb.Dataset(X_valid_df, y_valid_df, reference=lgb_train)

    # Stop training when the validation score has not improved for 10 rounds.
    # This is passed as a callback because the `early_stopping_rounds` keyword
    # of `lgb.train` was removed in LightGBM 4.0.
    callbacks = [lgb.early_stopping(stopping_rounds=10)]
    # Newer LightGBM only prints per-round scores when asked to via this
    # callback; older releases lack it and log every round by default.
    if hasattr(lgb, 'log_evaluation'):
        callbacks.append(lgb.log_evaluation(period=1))

    # Train the model with the given parameters.
    model = lgb.train(lgbm_params, lgb_train,
                      # Data used to evaluate the model during training
                      valid_sets=[lgb_eval],
                      # Train for at most 1000 rounds
                      num_boost_round=1000,
                      callbacks=callbacks)

    return model

In [9]:
%%time
# Baseline: train on the training split as-is, without any resampling.
model_normal = lgbm_train(X_train2, X_valid, y_train2, y_valid, lgbm_params)

[1]	valid_0's auc: 0.780562
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.815015
[3]	valid_0's auc: 0.817213
[4]	valid_0's auc: 0.817371
[5]	valid_0's auc: 0.818282
[6]	valid_0's auc: 0.823774
[7]	valid_0's auc: 0.824164
[8]	valid_0's auc: 0.824837
[9]	valid_0's auc: 0.827372
[10]	valid_0's auc: 0.827457
[11]	valid_0's auc: 0.82807
[12]	valid_0's auc: 0.828295
[13]	valid_0's auc: 0.828386
[14]	valid_0's auc: 0.828773
[15]	valid_0's auc: 0.828684
[16]	valid_0's auc: 0.828497
[17]	valid_0's auc: 0.82846
[18]	valid_0's auc: 0.829166
[19]	valid_0's auc: 0.829275
[20]	valid_0's auc: 0.8288
[21]	valid_0's auc: 0.828842
[22]	valid_0's auc: 0.82934
[23]	valid_0's auc: 0.829358
[24]	valid_0's auc: 0.829446
[25]	valid_0's auc: 0.830283
[26]	valid_0's auc: 0.830493
[27]	valid_0's auc: 0.830511
[28]	valid_0's auc: 0.830365
[29]	valid_0's auc: 0.830364
[30]	valid_0's auc: 0.830128
[31]	valid_0's auc: 0.830623
[32]	valid_0's auc: 0.830666
[33]	valid_0's auc: 0.83

In [10]:
# Predict on the test data using the best iteration found during training
y_pred_normal = model_normal.predict(X_test, num_iteration=model_normal.best_iteration)

# Compute the AUC on the test data
auc = roc_auc_score(y_test, y_pred_normal)
print(auc)

0.829287295077


# Imbalanced-learn

In [11]:
# Randomly under-sample the training data (the test set is left untouched).
sampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
# NOTE(review): this rebinds X_train2 / X_valid / y_train2 / y_valid from the earlier
# split, so re-running an earlier training cell afterwards would use the resampled data.
X_train2, X_valid, y_train2, y_valid = imbalanced_data_split(X_resampled, y_resampled, test_size=0.2)

In [12]:
%%time
# Train on the under-sampled training / validation split.
model_under_sample = lgbm_train(X_train2, X_valid, y_train2, y_valid, lgbm_params)

[1]	valid_0's auc: 0.821812
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.824402
[3]	valid_0's auc: 0.826128
[4]	valid_0's auc: 0.827987
[5]	valid_0's auc: 0.828225
[6]	valid_0's auc: 0.829101
[7]	valid_0's auc: 0.831881
[8]	valid_0's auc: 0.831842
[9]	valid_0's auc: 0.831957
[10]	valid_0's auc: 0.831985
[11]	valid_0's auc: 0.83201
[12]	valid_0's auc: 0.832192
[13]	valid_0's auc: 0.832251
[14]	valid_0's auc: 0.832362
[15]	valid_0's auc: 0.832498
[16]	valid_0's auc: 0.83243
[17]	valid_0's auc: 0.832529
[18]	valid_0's auc: 0.83291
[19]	valid_0's auc: 0.832604
[20]	valid_0's auc: 0.83235
[21]	valid_0's auc: 0.831969
[22]	valid_0's auc: 0.832304
[23]	valid_0's auc: 0.832472
[24]	valid_0's auc: 0.832804
[25]	valid_0's auc: 0.833382
[26]	valid_0's auc: 0.83311
[27]	valid_0's auc: 0.833346
[28]	valid_0's auc: 0.833389
[29]	valid_0's auc: 0.833329
[30]	valid_0's auc: 0.833022
[31]	valid_0's auc: 0.832999
[32]	valid_0's auc: 0.833307
[33]	valid_0's auc: 0.83

In [13]:
# Predict on the test data using the best iteration found during training
y_pred_under_sample = model_under_sample.predict(X_test, num_iteration=model_under_sample.best_iteration)

# Compute the AUC on the test data
auc = roc_auc_score(y_test, y_pred_under_sample)
print(auc)

0.828820480993


# Bagging

In [14]:
def bagging(seed):
    """Train one model on a freshly under-sampled copy of the training data.

    Reads the notebook-level ``X_train``, ``y_train`` and ``lgbm_params``.

    Args:
        seed: Passed to ``RandomUnderSampler`` as ``random_state`` so each
            call can draw a different sample.

    Returns:
        The model returned by ``lgbm_train`` for this sample.
    """
    under_sampler = RandomUnderSampler(random_state=seed, replacement=True)
    X_bag, y_bag = under_sampler.fit_resample(X_train, y_train)
    # Carve a validation part out of the resampled data for early stopping.
    X_fit, X_val, y_fit, y_val = imbalanced_data_split(X_bag, y_bag, test_size=0.2)
    return lgbm_train(X_fit, X_val, y_fit, y_val, lgbm_params)

In [15]:
%%time
# Train ten bagged models, using seeds 0..9 for the under-sampling.
models = [bagging(seed) for seed in range(10)]

[1]	valid_0's auc: 0.817564
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.822281
[3]	valid_0's auc: 0.824859
[4]	valid_0's auc: 0.82861
[5]	valid_0's auc: 0.829031
[6]	valid_0's auc: 0.82915
[7]	valid_0's auc: 0.829288
[8]	valid_0's auc: 0.82936
[9]	valid_0's auc: 0.829444
[10]	valid_0's auc: 0.829576
[11]	valid_0's auc: 0.830907
[12]	valid_0's auc: 0.83112
[13]	valid_0's auc: 0.831391
[14]	valid_0's auc: 0.832079
[15]	valid_0's auc: 0.832151
[16]	valid_0's auc: 0.831951
[17]	valid_0's auc: 0.83202
[18]	valid_0's auc: 0.832392
[19]	valid_0's auc: 0.832087
[20]	valid_0's auc: 0.832154
[21]	valid_0's auc: 0.832117
[22]	valid_0's auc: 0.832372
[23]	valid_0's auc: 0.832215
[24]	valid_0's auc: 0.832251
[25]	valid_0's auc: 0.832233
[26]	valid_0's auc: 0.83213
[27]	valid_0's auc: 0.832268
[28]	valid_0's auc: 0.832471
[29]	valid_0's auc: 0.832262
[30]	valid_0's auc: 0.831917
[31]	valid_0's auc: 0.831939
[32]	valid_0's auc: 0.832086
[33]	valid_0's auc: 0.832

[37]	valid_0's auc: 0.834273
[38]	valid_0's auc: 0.834381
[39]	valid_0's auc: 0.834559
[40]	valid_0's auc: 0.834412
[41]	valid_0's auc: 0.834744
[42]	valid_0's auc: 0.834932
[43]	valid_0's auc: 0.834859
[44]	valid_0's auc: 0.834765
[45]	valid_0's auc: 0.834788
[46]	valid_0's auc: 0.834769
[47]	valid_0's auc: 0.834788
[48]	valid_0's auc: 0.834793
[49]	valid_0's auc: 0.834385
[50]	valid_0's auc: 0.834397
[51]	valid_0's auc: 0.834465
[52]	valid_0's auc: 0.834282
Early stopping, best iteration is:
[42]	valid_0's auc: 0.834932
[1]	valid_0's auc: 0.821372
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.82221
[3]	valid_0's auc: 0.823102
[4]	valid_0's auc: 0.825032
[5]	valid_0's auc: 0.825447
[6]	valid_0's auc: 0.827421
[7]	valid_0's auc: 0.828489
[8]	valid_0's auc: 0.828566
[9]	valid_0's auc: 0.829133
[10]	valid_0's auc: 0.829409
[11]	valid_0's auc: 0.830246
[12]	valid_0's auc: 0.832265
[13]	valid_0's auc: 0.832311
[14]	valid_0's auc: 0.832411
[15]	valid_0's

In [16]:
# Predict on the test data with every bagged model at its best iteration.
y_preds = [
    model.predict(X_test, num_iteration=model.best_iteration)
    for model in models
]

# Average the predictions of all models.
y_preds_bagging = sum(y_preds) / len(y_preds)

# Compute the AUC of the averaged prediction
auc = roc_auc_score(y_test, y_preds_bagging)
print(auc)

0.829094611662


In [17]:
# Print the test AUC of each individual bagged model for comparison
# with the AUC of the averaged prediction.
for y_pred in y_preds:
    print(roc_auc_score(y_test, y_pred))

0.83000084039
0.829658314856
0.828868413698
0.82950217428
0.8287143428
0.82995420438
0.829735390303
0.828799552069
0.830693700138
0.829254375259
