In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder

In [2]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# サンプルデータの生成
X, y = make_classification(n_samples=10000, n_features=10, n_informative=4, n_redundant=0, n_repeated=0,
                           n_clusters_per_class=5, random_state=0)

X = pd.DataFrame(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid= train_test_split(X_train, y_train, test_size=0.2, random_state = 0)

X_train = X_train.reset_index(drop=True)
X_valid = X_valid.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [3]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.220501,1.793172,-1.704846,-0.031902,1.483161,0.538368,0.930566,-1.152792,0.668279,2.758012
1,-0.787205,-2.941857,-1.477566,0.444549,-0.717255,1.892983,2.187604,0.474345,2.573713,1.575652
2,-0.6771,-2.28223,0.552653,0.633478,0.155302,-1.413506,-2.925425,0.537465,0.886338,-2.620048
3,-0.56803,0.134996,-0.408422,0.821043,-0.576422,-0.605421,2.187018,0.798433,0.280466,-1.365117
4,1.187592,0.062733,-2.93817,0.513271,-0.35699,-1.429823,-0.759187,0.063446,0.209486,-1.386646


In [4]:
y_train

array([1, 1, 0, ..., 1, 0, 1])

# Supervised learning

In [5]:
import lightgbm as lgb

lgbm_params = {
    'learning_rate': 0.1,
    'num_leaves': 8,
    'boosting_type' : 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
}

def lgbm_train(X_train_df, X_valid_df, y_train_df, y_valid_df, lgbm_params):
    lgb_train = lgb.Dataset(X_train_df, y_train_df)
    lgb_eval = lgb.Dataset(X_valid_df, y_valid_df, reference=lgb_train)

    # 上記のパラメータでモデルを学習する
    model = lgb.train(lgbm_params, lgb_train,
                      # モデルの評価用データを渡す
                      valid_sets=lgb_eval,
                      # 最大で 1000 ラウンドまで学習する
                      num_boost_round=1000,
                      # 10 ラウンド経過しても性能が向上しないときは学習を打ち切る
                      early_stopping_rounds=10)
    
    return model

In [6]:
model = lgbm_train(X_train, X_valid, y_train, y_valid, lgbm_params)

[1]	valid_0's auc: 0.784972
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.811981
[3]	valid_0's auc: 0.825018
[4]	valid_0's auc: 0.83381
[5]	valid_0's auc: 0.838375
[6]	valid_0's auc: 0.842531
[7]	valid_0's auc: 0.847927
[8]	valid_0's auc: 0.853236
[9]	valid_0's auc: 0.857528
[10]	valid_0's auc: 0.858777
[11]	valid_0's auc: 0.859462
[12]	valid_0's auc: 0.860591
[13]	valid_0's auc: 0.860569
[14]	valid_0's auc: 0.86276
[15]	valid_0's auc: 0.864546
[16]	valid_0's auc: 0.868154
[17]	valid_0's auc: 0.869084
[18]	valid_0's auc: 0.870236
[19]	valid_0's auc: 0.870287
[20]	valid_0's auc: 0.871248
[21]	valid_0's auc: 0.872214
[22]	valid_0's auc: 0.873254
[23]	valid_0's auc: 0.874679
[24]	valid_0's auc: 0.875691
[25]	valid_0's auc: 0.875748
[26]	valid_0's auc: 0.876562
[27]	valid_0's auc: 0.877482
[28]	valid_0's auc: 0.879182
[29]	valid_0's auc: 0.880305
[30]	valid_0's auc: 0.881332
[31]	valid_0's auc: 0.881858
[32]	valid_0's auc: 0.88291
[33]	valid_0's auc: 0.

In [7]:
from sklearn.metrics import roc_auc_score
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
roc_auc_score(y_test, y_pred)  # auc を計算する

0.88625523688366292

# Pseudo Labeling

In [8]:
y_pred

array([ 0.22544793,  0.61853244,  0.84060021, ...,  0.49554101,
        0.15531975,  0.66054069])

In [9]:
def pseudoLabeling(X_train, y_train, X_valid, y_valid, X_test, y_pred):
    
    X_train = pd.concat([X_train, X_test])
    y_train =  np.r_[y_train, y_pred]

    return X_train, y_train, X_valid, y_valid

In [10]:
X_train_pl, y_train_pl, X_valid, y_valid = pseudoLabeling(X_train, y_train, X_valid, y_valid, X_test, y_pred)

In [11]:
model = lgbm_train(X_train_pl, X_valid, y_train_pl, y_valid, lgbm_params)

[1]	valid_0's auc: 0.789201
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.816046
[3]	valid_0's auc: 0.827757
[4]	valid_0's auc: 0.839326
[5]	valid_0's auc: 0.84473
[6]	valid_0's auc: 0.846442
[7]	valid_0's auc: 0.846366
[8]	valid_0's auc: 0.85124
[9]	valid_0's auc: 0.851943
[10]	valid_0's auc: 0.855838
[11]	valid_0's auc: 0.859136
[12]	valid_0's auc: 0.861154
[13]	valid_0's auc: 0.861952
[14]	valid_0's auc: 0.862718
[15]	valid_0's auc: 0.862685
[16]	valid_0's auc: 0.864664
[17]	valid_0's auc: 0.866041
[18]	valid_0's auc: 0.867491
[19]	valid_0's auc: 0.869047
[20]	valid_0's auc: 0.869747
[21]	valid_0's auc: 0.870398
[22]	valid_0's auc: 0.872369
[23]	valid_0's auc: 0.872859
[24]	valid_0's auc: 0.873776
[25]	valid_0's auc: 0.876175
[26]	valid_0's auc: 0.877092
[27]	valid_0's auc: 0.877907
[28]	valid_0's auc: 0.87842
[29]	valid_0's auc: 0.879529
[30]	valid_0's auc: 0.879367
[31]	valid_0's auc: 0.879128
[32]	valid_0's auc: 0.879622
[33]	valid_0's auc: 0.

In [12]:
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
roc_auc_score(y_test, y_pred)  # auc を計算する

0.88365992285066497

# Cross Pseudo Labeling

In [13]:
from sklearn.cluster import KMeans

def crossPseudoLabeling(X_train, y_train, X_valid, y_valid, X_test, y_pred, num_class=5):

    X_test_set = []
    y_pred_set = []
    
    X_test_class = KMeans(n_clusters=num_class).fit_predict(X_test)
    
    for i in range(num_class):

        X_train_tmp = pd.concat([X_train, X_test[X_test_class != i]])
        y_train_tmp =  np.r_[y_train, y_pred[X_test_class != i]]
        
        model = lgbm_train(X_train_tmp, X_valid, y_train_tmp, y_valid, lgbm_params)
        y_pred_tmp = model.predict(X_test[X_test_class == i], num_iteration=model.best_iteration)
        
        X_test_tmp = X_test[X_test_class == i].copy()
        X_test_tmp['target'] = y_pred_tmp
        X_test_set.append(X_test_tmp)
        
        len(X_test_tmp)

    y_pred_cpl = pd.concat(X_test_set).sort_index()['target']
    
    return y_pred_cpl

In [14]:
y_pred_cpl = crossPseudoLabeling(X_train, y_train, X_valid, y_valid, X_test, y_pred, num_class=5)

[1]	valid_0's auc: 0.788109
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's auc: 0.812134
[3]	valid_0's auc: 0.829993
[4]	valid_0's auc: 0.835359
[5]	valid_0's auc: 0.834577
[6]	valid_0's auc: 0.838846
[7]	valid_0's auc: 0.844872
[8]	valid_0's auc: 0.8485
[9]	valid_0's auc: 0.849881
[10]	valid_0's auc: 0.852016
[11]	valid_0's auc: 0.853549
[12]	valid_0's auc: 0.854505
[13]	valid_0's auc: 0.853971
[14]	valid_0's auc: 0.85709
[15]	valid_0's auc: 0.859275
[16]	valid_0's auc: 0.862193
[17]	valid_0's auc: 0.863495
[18]	valid_0's auc: 0.865283
[19]	valid_0's auc: 0.866083
[20]	valid_0's auc: 0.867775
[21]	valid_0's auc: 0.869335
[22]	valid_0's auc: 0.869127
[23]	valid_0's auc: 0.869354
[24]	valid_0's auc: 0.870873
[25]	valid_0's auc: 0.872562
[26]	valid_0's auc: 0.873211
[27]	valid_0's auc: 0.874632
[28]	valid_0's auc: 0.875426
[29]	valid_0's auc: 0.877179
[30]	valid_0's auc: 0.878072
[31]	valid_0's auc: 0.878727
[32]	valid_0's auc: 0.87961
[33]	valid_0's auc: 0.8

[30]	valid_0's auc: 0.869389
[31]	valid_0's auc: 0.869207
[32]	valid_0's auc: 0.869078
[33]	valid_0's auc: 0.869555
[34]	valid_0's auc: 0.868932
[35]	valid_0's auc: 0.868667
[36]	valid_0's auc: 0.871148
[37]	valid_0's auc: 0.871925
[38]	valid_0's auc: 0.872588
[39]	valid_0's auc: 0.872647
[40]	valid_0's auc: 0.873024
[41]	valid_0's auc: 0.873585
[42]	valid_0's auc: 0.874255
[43]	valid_0's auc: 0.87433
[44]	valid_0's auc: 0.874041
[45]	valid_0's auc: 0.874331
[46]	valid_0's auc: 0.875344
[47]	valid_0's auc: 0.875721
[48]	valid_0's auc: 0.875565
[49]	valid_0's auc: 0.876629
[50]	valid_0's auc: 0.877509
[51]	valid_0's auc: 0.878449
[52]	valid_0's auc: 0.87893
[53]	valid_0's auc: 0.879566
[54]	valid_0's auc: 0.879246
[55]	valid_0's auc: 0.879287
[56]	valid_0's auc: 0.87889
[57]	valid_0's auc: 0.878989
[58]	valid_0's auc: 0.879566
[59]	valid_0's auc: 0.879508
[60]	valid_0's auc: 0.879829
[61]	valid_0's auc: 0.880307
[62]	valid_0's auc: 0.881102
[63]	valid_0's auc: 0.880871
[64]	valid_0's au

In [15]:
roc_auc_score(y_test, y_pred_cpl)  # auc を計算する

0.88491657490556352