In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Directory the feather inputs are read from (joined with a filename in read_feather).
INPUT_DIR = '../input/'
# Directory the submission CSV is written to.
OUTPUT_DIR = '../output/'

In [3]:
def read_feather(filename):
    """Load a feather file from ``INPUT_DIR`` and report its shape.

    Parameters
    ----------
    filename : str
        File name relative to ``INPUT_DIR``.

    Returns
    -------
    pandas.DataFrame
        The loaded table. The file name and ``df.shape`` are printed as a
        side effect so the notebook records what was loaded.
    """
    path = os.path.join(INPUT_DIR, filename)
    loaded = pd.read_feather(path)
    print(filename, loaded.shape)
    return loaded

In [4]:
# Load the train and test tables; read_feather prints each file's name and shape.
train_df = read_feather('train.f')
test_df = read_feather('test.f')

train.f (1997595, 35)
test.f (390095, 30)


#### 連続変数の特徴量

In [5]:
def create_continuous_features(input_df):
    """Return a copy of the continuous-valued columns of ``input_df``.

    The columns are returned unmodified, in the order listed below. Because
    the result is a copy, later edits to it do not touch ``input_df``.
    """
    # Continuous variables used as-is.
    continuous_columns = (
        'max_login_interval',
        'frequency',
        'login_frequency',
        'last_login_interval',
        'from_click',
    )
    selected = input_df[list(continuous_columns)]
    return selected.copy()

#### Category系の特徴量

In [6]:
def create_category_features(input_df):
    """Return a copy of the categorical id columns of ``input_df``.

    The id values are passed through unchanged (they are used directly as
    label-encoded features), in the order listed below.
    """
    # Category-type ids; used as label-encoded values.
    category_columns = (
        'adnw_id',
        'adspot_id',
        'adspot_video_format_id',
        'game_feed_asset_type_id',
    )
    selected = input_df[list(category_columns)]
    return selected.copy()

In [7]:
# # https://github.com/nyk510/vivid/blob/master/vivid/utils.py
# from contextlib import contextmanager
# from time import time

# @contextmanager
# def timer(logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None):
#     if prefix: format_str = str(prefix) + format_str
#     if suffix: format_str = format_str + str(suffix)
#     start = time()
#     yield
#     d = time() - start
#     out_str = format_str.format(d)
#     if logger:
#         logger.info(out_str)
#     else:
#         print(out_str)

In [8]:
# from tqdm import tqdm

def to_feature(input_df, processors=None):
    """Build the feature matrix by running feature processors on ``input_df``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Raw table to derive features from.
    processors : list of callables, optional
        Each callable takes ``input_df`` and returns a DataFrame with the
        same number of rows. Defaults to ``create_continuous_features`` and
        ``create_category_features``.

    Returns
    -------
    pandas.DataFrame
        The processor outputs concatenated column-wise, in processor order.
        An empty DataFrame is returned when there are no processors.

    Raises
    ------
    AssertionError
        If a processor returns a frame whose length differs from
        ``input_df``; the message is the processor's ``__name__``.
    """
    if processors is None:
        # Resolved at call time so the default pipeline stays the notebook's own.
        processors = [
            create_continuous_features,
            create_category_features,
        ]

    feature_blocks = []
    for func in processors:
        _df = func(input_df)
        assert len(_df) == len(input_df), func.__name__
        feature_blocks.append(_df)

    if not feature_blocks:
        # pd.concat raises on an empty list; keep the original empty-frame result.
        return pd.DataFrame()

    # Concatenate once instead of re-copying a growing frame on every iteration.
    return pd.concat(feature_blocks, axis=1)

In [9]:
# Run the same feature pipeline on train and test so both share identical columns.
train_feat_df = to_feature(train_df)
test_feat_df = to_feature(test_df)

In [10]:
# Sanity check: feature generation must neither add nor drop rows.
assert len(train_feat_df) == len(train_df)
assert len(test_feat_df) == len(test_df)

#### LightGBM による学習

In [11]:
from sklearn.metrics import average_precision_score
import lightgbm as lgbm

def pr_auc(y_pred, y_true):
    """Custom LightGBM eval function computing PR-AUC on every boosting round.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores for the evaluation set.
    y_true : object exposing ``get_label()``
        The evaluation dataset passed by LightGBM as ``feval``'s second
        argument; despite the name it is not a label array.

    Returns
    -------
    tuple
        ``("pr_auc", score, True)`` — metric name, average-precision score,
        and the flag telling LightGBM that higher is better.
    """
    labels = y_true.get_label()
    ap_score = average_precision_score(labels, y_pred)
    return "pr_auc", ap_score, True

In [12]:
# LightGBM parameters shared by every CV fold (passed as `params` to train_lgbm):
# binary objective, gradient-boosted trees, fixed seed, learning rate 0.1,
# and trees capped at depth 6.
lgbm_param = {
    'objective' : 'binary',
    'boosting_type': 'gbdt',
    'seed' : 0,
    'learning_rate':  0.1,
    'max_depth': 6,
}

In [18]:
def train_lgbm(X, y, cv, params: dict, verbose=100, num_boost_round=1000,
               early_stopping_rounds=None):
    """Train one LightGBM model per CV fold and collect out-of-fold predictions.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target, positionally aligned with ``X``.
    cv : iterable of (idx_train, idx_valid)
        Positional index arrays for each fold.
    params : dict
        LightGBM parameters forwarded to ``lgbm.train``.
    verbose : int
        Logging period forwarded as ``verbose_eval``.
    num_boost_round : int
        Maximum number of boosting rounds per fold.
    early_stopping_rounds : int, optional
        Early-stopping patience. When omitted it falls back to ``verbose``,
        which is what this function always did before the parameter existed.

    Returns
    -------
    oof_pred : numpy.ndarray
        Out-of-fold prediction for every row of ``X`` (zero for rows that
        never appear in a validation fold).
    models : list
        The trained booster of each fold, in fold order.
    score : float
        Average precision of ``oof_pred`` against ``y``.
    """
    if early_stopping_rounds is None:
        early_stopping_rounds = verbose

    models = []
    # Zero array with one slot per target row. It must be float: an integer
    # buffer would silently truncate the predicted probabilities. The builtin
    # `float` is used because the `np.float` alias was removed in NumPy 1.24.
    oof_pred = np.zeros(len(y), dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv):
        # Cross-validation step: split the training data into train/valid
        # parts using the fold indices.
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        lgbm_train = lgbm.Dataset(x_train, y_train)
        lgbm_eval = lgbm.Dataset(x_valid, y_valid, reference=lgbm_train)

        # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keywords
        # were removed from lgbm.train in LightGBM 4.0 (replaced by the
        # early_stopping / log_evaluation callbacks) — confirm the installed
        # version before upgrading.
        lgbm_model = lgbm.train(
            params,
            lgbm_train,
            valid_sets=lgbm_eval,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            feval=pr_auc,
            verbose_eval=verbose,
        )
        y_pred = lgbm_model.predict(x_valid, num_iteration=lgbm_model.best_iteration)

        oof_pred[idx_valid] = y_pred
        models.append(lgbm_model)

        print(f'Fold {i} PR-AUC: {average_precision_score(y_valid, y_pred):.4f}')

    score = average_precision_score(y, oof_pred)
    # Backslash escaped explicitly: '\ ' is an invalid escape sequence.
    print('FINISHED \\ whole score: {:.4f}'.format(score))
    return oof_pred, models, score

In [19]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified split, shuffled with a fixed random_state.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=71)
y = train_df['target']
cv = list(fold.split(train_feat_df, y)) # split() returns a generator, so convert it to a list explicitly

In [20]:
%%time
# Cross-validated training: out-of-fold predictions, one model per fold, and the overall score.
oof, models, score = train_lgbm(train_feat_df, y, cv, params=lgbm_param)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.135712	valid_0's pr_auc: 0.0799047
[200]	valid_0's binary_logloss: 0.135411	valid_0's pr_auc: 0.0812736
[300]	valid_0's binary_logloss: 0.135266	valid_0's pr_auc: 0.0818274
[400]	valid_0's binary_logloss: 0.135211	valid_0's pr_auc: 0.0819085
[500]	valid_0's binary_logloss: 0.135187	valid_0's pr_auc: 0.0818971
Early stopping, best iteration is:
[442]	valid_0's binary_logloss: 0.135186	valid_0's pr_auc: 0.0821181
Fold 0 PR-AUC: 0.0821
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.135908	valid_0's pr_auc: 0.078695
[200]	valid_0's binary_logloss: 0.135579	valid_0's pr_auc: 0.0805152
[300]	valid_0's binary_logloss: 0.135458	valid_0's pr_auc: 0.081161
[400]	valid_0's binary_logloss: 0.135438	valid_0's pr_auc: 0.0814514
Early stopping, best iteration is:
[345]	valid_0's binary_logloss: 0.135432	valid_0's pr_auc: 0.0813332
Fold 1 PR-AUC: 0.0813
Train

In [21]:
def feat_imp(model):
    """Return one model's feature importances as a DataFrame.

    ``model`` must expose ``feature_importance()`` and ``feature_name()``.
    The result has columns ``name`` and ``imp`` and is sorted by ``imp``
    in ascending order.
    """
    fi = model.feature_importance()
    fn = model.feature_name()
    df_feature_importance = pd.DataFrame({'name': fn, 'imp': fi})
    return df_feature_importance.sort_values('imp')

def feature_importance(models):
    """Tabulate the feature importances of several models side by side.

    Parameters
    ----------
    models : iterable
        Models accepted by ``feat_imp``.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns ``name``, ``imp_0`` … ``imp_{n-1}``
        (one per model, in input order) and ``sum`` (row total of the
        ``imp_*`` columns), sorted by ``sum`` in descending order.
    """
    fi = pd.DataFrame(columns=['name'])
    imp_columns = []
    for i, model in enumerate(models):
        colname = 'imp_{}'.format(i)
        fi_tmp = feat_imp(model).rename(columns={'imp': colname})
        # Outer join so a feature missing from one model still gets a row.
        fi = pd.merge(fi, fi_tmp, on=['name'], how='outer')
        imp_columns.append(colname)
    # Sum only the importance columns: summing the whole frame would include
    # the string `name` column, which raises TypeError on pandas >= 2.0.
    fi['sum'] = fi[imp_columns].sum(axis=1)
    return fi.sort_values(['sum'], ascending=False)

In [22]:
# Show per-fold importances and their total, largest first.
feature_importance(models)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,imp_4,sum
7,max_login_interval,3710,2747,2999,2897,2404,14757
8,last_login_interval,3719,2841,2995,2743,2180,14478
6,adspot_id,1438,1116,1229,1150,902,5835
5,login_frequency,1179,973,1032,1050,791,5025
4,frequency,1064,800,885,751,647,4147
3,adnw_id,845,687,671,755,587,3545
2,adspot_video_format_id,562,491,498,547,420,2518
1,game_feed_asset_type_id,455,432,394,420,345,2046
0,from_click,230,208,201,230,195,1064


In [23]:
%%time
# Predict the test set with every fold model, using each model's best iteration.
pred_list = []
for model in models:
    print('best_iteration', model.best_iteration)
    pred = model.predict(test_feat_df, num_iteration = model.best_iteration)
    pred_list.append(pred)
    
# Average the per-fold predictions element-wise to get the final prediction.
pred = np.mean(pred_list, axis=0)

best_iteration 442
best_iteration 345
best_iteration 368
best_iteration 353
best_iteration 288
CPU times: user 1min 16s, sys: 479 ms, total: 1min 16s
Wall time: 22.1 s


In [24]:
# Sanity check: exactly one prediction per test row.
assert len(pred) == len(test_df)

In [25]:
# Write the averaged predictions as a single-column `target` CSV, without the index.
sub_df = pd.DataFrame({ 'target': pred })
sub_df.to_csv(os.path.join(OUTPUT_DIR, 'simple_submission.csv'), index=False)

In [None]:
# Report the overall out-of-fold score returned by train_lgbm.
print('score', score)