In [1]:
import os
from datetime import timedelta
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import preprocessing
import lightgbm as lgbm
# import optuna.integration.lightgbm as lgbm
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import average_precision_score
from imblearn.under_sampling import RandomUnderSampler
# Raise the pandas display limit so up to 100 columns are shown when a DataFrame is displayed.
pd.set_option('display.max_Columns', 100)

In [2]:
# Run-mode switches read by the cells below.
# is_time_series: use TimeSeriesSplit instead of StratifiedKFold and append '_ts' to the output file name.
is_time_series = False
# is_subsample: train through resampling_train_pred() and append '_sub' to the output file name.
is_subsample = True
# is_ensumble (sic, "ensemble"): average submission.csv and submission_ts.csv into ensumble.csv.
is_ensumble = False

In [18]:
# Relative directory that holds the input data files.
INPUT_DIR = '../input/'
# Relative directory into which the submission CSV is written.
OUTPUT_DIR = '../output/'

In [3]:
# Load the raw train/test tables and the pre-computed feature tables.
# Paths are built from INPUT_DIR so the data location is configured in one place.
train_df = pd.read_feather(os.path.join(INPUT_DIR, 'train.f'))
test_df = pd.read_feather(os.path.join(INPUT_DIR, 'test.f'))
print('train_df', train_df.shape)
print('test_df', test_df.shape)

train_feat_df = pd.read_feather(os.path.join(INPUT_DIR, 'train_feat_df.f'))
test_feat_df = pd.read_feather(os.path.join(INPUT_DIR, 'test_feat_df.f'))
print('train_feat_df', train_feat_df.shape)
print('test_feat_df', test_feat_df.shape)

# The feature tables are used row-by-row against the raw tables (target, submission),
# so a stale feature file with a different row count must fail loudly here.
assert len(train_feat_df) == len(train_df), 'train features are not aligned with train_df'
assert len(test_feat_df) == len(test_df), 'test features are not aligned with test_df'

# Binary target used for training and for the class counts in the resampling step.
y = train_df['target']

train_df (1997595, 35)
test_df (390095, 30)
train_feat_df (1997595, 78)
test_feat_df (390095, 78)


In [4]:
# Number of feature columns; reported again in the run summary at the end.
feature_count = train_feat_df.shape[1]
print(feature_count)

78


#### LightGBM による学習

In [5]:
def pr_auc(y_pred, y_true):
    """Custom evaluation function that reports PR-AUC at every LightGBM round.

    Passed as ``feval`` to ``lgbm.train``. ``y_pred`` holds the predicted scores
    and ``y_true`` is an object exposing ``get_label()`` for the true labels.

    Returns a ``(name, value, flag)`` tuple; the trailing ``True`` is the
    "higher is better" flag of LightGBM's feval convention.
    """
    labels = y_true.get_label()
    return "pr_auc", average_precision_score(labels, y_pred), True

In [6]:
# LightGBM training parameters; kfold_lgbm passes this dict to train_lgbm for every fold.
# The values match the "Best Params" block recorded in the tuning notes at the end of
# this notebook.
# NOTE(review): no 'metric' is set, and the training log shows binary_logloss next to the
# custom pr_auc — early stopping may therefore react to either metric; confirm this is intended.
lgbm_param = {
    'objective' : 'binary',
    'boosting_type': 'gbdt',
    # Fixed seed so repeated runs are comparable.
    'seed' : 0,
    'learning_rate':  0.1,
    'feature_pre_filter': False, 
    # Regularisation and tree-shape values as recorded in the tuning notes.
    'lambda_l1': 6.419345380049121e-07, 
    'lambda_l2': 8.432801302426078, 
    'num_leaves': 212, 
    'feature_fraction': 0.4, 
    'bagging_fraction': 0.9907178796872467, 
    'bagging_freq': 2, 
    'min_child_samples': 100
}

In [7]:
def train_lgbm(X, y, cv, params: dict, verbose=100, num_boost_round=1000,
               early_stopping_rounds=None):
    """Train one LightGBM model per CV fold and collect out-of-fold predictions.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target aligned row-by-row with ``X``.
    cv : iterable of (idx_train, idx_valid)
        Positional index arrays for each fold.
    params : dict
        Parameters forwarded to ``lgbm.train``.
    verbose : int, default 100
        Logging period passed as ``verbose_eval``.
    num_boost_round : int, default 1000
        Maximum number of boosting rounds per fold.
    early_stopping_rounds : int or None, default None
        Early-stopping patience. ``None`` keeps the historical behaviour of
        reusing ``verbose`` as the patience.

    Returns
    -------
    oof_pred : numpy.ndarray
        Out-of-fold predictions; rows never used for validation stay 0.
    models : list
        One trained booster per fold.
    score : float
        PR-AUC over the rows that received an out-of-fold prediction.
    """
    if early_stopping_rounds is None:
        early_stopping_rounds = verbose

    models = []
    # Must be a float array: inheriting an integer dtype from the target would
    # truncate the predicted probabilities. The builtin `float` replaces the
    # `np.float` alias, which was removed in NumPy 1.24.
    oof_pred = np.zeros_like(y, dtype=float)
    # Marks rows that were part of some validation fold. Splitters such as
    # TimeSeriesSplit never validate the earliest rows.
    is_predicted = np.zeros(len(oof_pred), dtype=bool)

    for i, (idx_train, idx_valid) in enumerate(cv):
        # Cross-validation step: split the training data into train / valid parts.
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        lgbm_train = lgbm.Dataset(x_train, y_train)
        lgbm_eval = lgbm.Dataset(x_valid, y_valid, reference=lgbm_train)

        # NOTE(review): `early_stopping_rounds` / `verbose_eval` keyword arguments were
        # removed from lightgbm.train in LightGBM 4.0 (callbacks replace them) — confirm
        # the installed version before upgrading.
        lgbm_model = lgbm.train(params,
                                lgbm_train,
                                valid_sets=lgbm_eval,
                                num_boost_round=num_boost_round,
                                early_stopping_rounds=early_stopping_rounds,
                                feval=pr_auc,
                                verbose_eval=verbose)
        y_pred = lgbm_model.predict(x_valid, num_iteration=lgbm_model.best_iteration)

        oof_pred[idx_valid] = y_pred
        is_predicted[idx_valid] = True
        models.append(lgbm_model)

        print(f'Fold {i} PR-AUC: {average_precision_score(y_valid, y_pred):.4f}')

    # Score only rows that actually have an out-of-fold prediction. When every row
    # is validated exactly once (KFold / StratifiedKFold) this equals scoring all rows;
    # otherwise the untouched zeros would silently drag the score down.
    score = average_precision_score(np.asarray(y)[is_predicted], oof_pred[is_predicted])
    # Backslash escaped explicitly; the printed text is unchanged.
    print('FINISHED \\ whole score: {:.4f}'.format(score))
    return oof_pred, models, score

In [8]:
def tuning_lgbm(X, y, cv, params, verbose=100):
    """Train a single model on the first CV fold and print its parameters and score.

    NOTE(review): with plain ``lightgbm`` this performs no parameter search; it is
    presumably meant to be run with the optuna LightGBM integration (see the
    commented-out import at the top of the notebook) — confirm.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target aligned row-by-row with ``X``.
    cv : sequence of (idx_train, idx_valid)
        Only the first fold, ``cv[0]``, is used.
    params : dict
        Parameters forwarded to ``lgbm.train``.
    verbose : int, default 100
        Used as the early-stopping patience; per-round logging is disabled.

    Returns
    -------
    The trained booster, so its ``params`` / ``best_iteration`` can be reused.
    (Earlier versions returned ``None``; callers ignoring the result are unaffected.)
    """
    idx_train, idx_valid = cv[0]
    x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]
    lgbm_train = lgbm.Dataset(x_train, y_train)
    lgbm_eval = lgbm.Dataset(x_valid, y_valid, reference=lgbm_train)

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` keyword arguments were
    # removed from lightgbm.train in LightGBM 4.0 — confirm the installed version.
    best = lgbm.train(params,
                      lgbm_train,
                      valid_sets=lgbm_eval,
                      num_boost_round=1000,
                      early_stopping_rounds=verbose,
                      feval=pr_auc,
                      verbose_eval=0)
    print('Best Params:', best.params)
    print('Best Iteration:', best.best_iteration)
    print('Best Score:', best.best_score)
    return best

In [9]:
# %%time
# if is_time_series:
#     fold = TimeSeriesSplit(n_splits=5)
# else:
#     fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# cv = list(fold.split(train_feat_df, y)) 

# tuning_lgbm(train_feat_df, y, cv, params=lgbm_param)

In [10]:
def kfold_lgbm(X, y):
    """Run 5-fold training of LightGBM on ``X`` / ``y`` with ``lgbm_param``.

    The splitter is chosen by the module-level ``is_time_series`` flag:
    ``TimeSeriesSplit`` when set, otherwise a shuffled ``StratifiedKFold``
    with a fixed random state.

    Returns the ``(oof, models, score)`` triple produced by ``train_lgbm``.
    """
    splitter = (
        TimeSeriesSplit(n_splits=5)
        if is_time_series
        else StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    )
    # Materialise the folds so they can be iterated as a plain list.
    folds = list(splitter.split(X, y))

    oof, models, score = train_lgbm(X, y, folds, params=lgbm_param)
    return oof, models, score

In [11]:
def train_pred():
    """Train on the full training features and predict the test features.

    Uses the module-level ``train_feat_df``, ``y`` and ``test_feat_df``.
    Every fold model predicts the test set at its best iteration and the
    predictions are averaged element-wise.

    Returns ``(pred, score)``: the averaged test prediction and the score
    reported by ``kfold_lgbm``.
    """
    _, fold_models, score = kfold_lgbm(train_feat_df, y)
    fold_preds = [
        booster.predict(test_feat_df, num_iteration=booster.best_iteration)
        for booster in fold_models
    ]
    return np.mean(fold_preds, axis=0), score

In [12]:
def resampling_train_pred(n_rounds=3, negative_divisor=5, seed_step=9):
    """Train on several random under-samples of the negatives and average the test predictions.

    Uses the module-level ``train_feat_df``, ``y`` and ``test_feat_df``. In each
    round the class labelled 0 is under-sampled to ``count / negative_divisor`` rows
    while every row labelled 1 is kept, the k-fold training is run on the resampled
    data, and every fold model predicts the test set.

    NOTE(review): the folds are built from the resampled data, so the returned score
    is measured at the resampled class ratio, not the original one — presumably why
    it differs so much from the leaderboard value in the notes; confirm.

    Parameters
    ----------
    n_rounds : int, default 3
        Number of independent under-sampling rounds.
    negative_divisor : int or float, default 5
        The negative class is reduced to ``int(negative / negative_divisor)`` rows.
    seed_step : int, default 9
        Round ``i`` uses ``random_state = i * seed_step`` for the sampler.

    Returns
    -------
    pred : numpy.ndarray
        Element-wise mean of all test predictions (rounds x folds).
    score_ave : float
        Mean of the per-round scores.

    Raises
    ------
    ValueError
        If ``n_rounds`` is smaller than 1 (there would be nothing to average).
    """
    if n_rounds < 1:
        raise ValueError('n_rounds must be at least 1')

    # Count the classes once instead of recomputing value_counts() per lookup.
    class_counts = y.value_counts()
    print(class_counts)
    negative = class_counts[0]
    positive = class_counts[1]
    strategy = {0: int(negative / negative_divisor), 1: positive}

    pred_list = []
    score_list = []

    for i in range(n_rounds):
        # A different seed per round so each round sees a different negative subset.
        rus = RandomUnderSampler(random_state=i * seed_step, sampling_strategy=strategy)
        X_resampled, y_resampled = rus.fit_resample(train_feat_df, y)

        oof, models, score = kfold_lgbm(X_resampled, y_resampled)
        score_list.append(score)

        for model in models:
            pred = model.predict(test_feat_df, num_iteration=model.best_iteration)
            pred_list.append(pred)

        print('----------------[{}] {}----------------'.format(i, score))

    pred = np.mean(pred_list, axis=0)
    score_ave = np.mean(score_list, axis=0)
    return pred, score_ave

In [13]:
def feat_imp(model):
    """Return one model's feature importances as a DataFrame sorted ascending by 'imp'.

    ``model`` must provide ``feature_name()`` and ``feature_importance()``;
    the result has the columns ``name`` and ``imp``.
    """
    df_feature_importance = pd.DataFrame({
        'name': model.feature_name(),
        'imp': model.feature_importance(),
    })
    # Return a sorted copy rather than mutating in place.
    return df_feature_importance.sort_values('imp')


def feature_importance(models):
    """Combine the feature importances of several models into one table.

    The result has one row per feature name, one ``imp_<i>`` column per model
    (in the order of ``models``) and a ``sum`` column with the row-wise total,
    sorted by ``sum`` in descending order.
    """
    fi = pd.DataFrame(columns=['name'])
    imp_columns = []
    for i, model in enumerate(models):
        colname = 'imp_{}'.format(i)
        fi_tmp = feat_imp(model).rename(columns={'imp': colname})
        # Outer join keeps features that appear in only some of the models.
        fi = pd.merge(fi, fi_tmp, on=['name'], how='outer')
        imp_columns.append(colname)
    # Sum only the importance columns: summing the whole frame would include the
    # string 'name' column, which raises TypeError on pandas >= 2.0.
    fi['sum'] = fi[imp_columns].sum(axis=1)
    return fi.sort_values(['sum'], ascending=False)

In [14]:
# feature_importance(models).head(50)

In [15]:
%%time
# Train with repeated under-sampling when is_subsample is set, otherwise on the full data.
pred, score = resampling_train_pred() if is_subsample else train_pred()

0    1932105
1      65490
Name: target, dtype: int64
[LightGBM] [Info] Number of positive: 52392, number of negative: 309136
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6550
[LightGBM] [Info] Number of data points in the train set: 361528, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.144918 -> initscore=-1.775027
[LightGBM] [Info] Start training from score -1.775027
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.244174	valid_0's pr_auc: 0.646025
[200]	valid_0's binary_logloss: 0.24261	valid_0's pr_auc: 0.648503
Early stopping, best iteration is:
[187]	valid_0's binary_logloss: 0.242609	valid_0's pr_auc: 0.64888
Fold 0 PR-AUC: 0.6489
[LightGBM] [Info] Number of positive: 52392, number of negative: 309137
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set

Fold 3 PR-AUC: 0.6543
[LightGBM] [Info] Number of positive: 52392, number of negative: 309137
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6578
[LightGBM] [Info] Number of data points in the train set: 361529, number of used features: 77
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.144918 -> initscore=-1.775031
[LightGBM] [Info] Start training from score -1.775031
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.242155	valid_0's pr_auc: 0.651096
[200]	valid_0's binary_logloss: 0.241046	valid_0's pr_auc: 0.652171
Early stopping, best iteration is:
[134]	valid_0's binary_logloss: 0.241244	valid_0's pr_auc: 0.652872
Fold 4 PR-AUC: 0.6529
FINISHED \ whole score: 0.6521
----------------[1] 0.6520891689634492----------------
[LightGBM] [Info] Number of positive: 52392, number of negative: 309136
You can set `force_row_wise=true` to remo

In [16]:
# Sanity check before writing the submission: exactly one prediction per test row.
assert len(pred) == len(test_df)

In [19]:
# Build the submission file name from the active run-mode flags:
# 'submission' + '_ts' (time-series split) + '_sub' (under-sampling).
out_filename = 'submission'
if is_time_series:
    out_filename += '_ts'

if is_subsample:
    out_filename += '_sub'

# Create the output directory if needed so a fresh checkout does not fail on to_csv
# after the long training run.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The submission holds a single 'target' column in test-row order, without an index.
sub_df = pd.DataFrame({'target': pred})
sub_df.to_csv(os.path.join(OUTPUT_DIR, out_filename + '.csv'), index=False)

In [20]:
# Run summary in the same bullet format as the notes at the end of the notebook.
print(f'- feature={feature_count}')
print(f'- score={score:.4f}')

- feature=78
- score=0.6516


In [21]:
if is_ensumble:
    # Average the k-fold and time-series submissions row by row.
    # Paths are built from OUTPUT_DIR, matching the cell that writes the submissions.
    # NOTE(review): the inputs are 'submission.csv' / 'submission_ts.csv', i.e. the
    # files written with is_subsample disabled — confirm those are the intended runs.
    sub = pd.read_csv(os.path.join(OUTPUT_DIR, 'submission.csv'))
    sub_ts = pd.read_csv(os.path.join(OUTPUT_DIR, 'submission_ts.csv'))
    # Both files must describe the same rows for the element-wise mean to make sense.
    assert len(sub) == len(sub_ts)
    sub['target'] = (sub['target'] + sub_ts['target']) / 2
    sub.to_csv(os.path.join(OUTPUT_DIR, 'ensumble.csv'), index=False)

#### subsampling_29: 
- feature=78
- score=0.6501
- publicLB= 0.2398

#### tuning
```
Best Params: {
    'objective': 'binary', 
    'boosting_type': 'gbdt', 
    'seed': 0, 
    'learning_rate': 0.1, 
    'feature_pre_filter': False, 
    'lambda_l1': 6.419345380049121e-07, 
    'lambda_l2': 8.432801302426078, 
    'num_leaves': 212, 
    'feature_fraction': 0.4, 
    'bagging_fraction': 0.9907178796872467, 
    'bagging_freq': 2, 
    'min_child_samples': 100, 
    'num_iterations': 1000, 
    'early_stopping_round': 100
}
Best Iteration: 245
Best Score: 'pr_auc', 0.22382995580267329
```