In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import preprocessing
import lightgbm as lgbm
# import optuna.integration.lightgbm as lgbm
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import average_precision_score
pd.set_option('display.max_Columns', 100)

In [2]:
# Run switches.
# is_time_series: True -> TimeSeriesSplit CV and 'submission_ts.csv';
#                 False -> StratifiedKFold CV and 'submission.csv'.
is_time_series = False
# is_ensumble: when True, the last cell averages submission.csv and submission_ts.csv.
is_ensumble = False

In [3]:
# Directory the feather inputs are read from (used by read_feather).
INPUT_DIR = '../input/'
# Directory the submission CSV is written to.
OUTPUT_DIR = '../output/'

In [4]:
def read_feather(filename):
    """Read `filename` from INPUT_DIR as a DataFrame, print its name and shape, and return it."""
    path = os.path.join(INPUT_DIR, filename)
    frame = pd.read_feather(path)
    print(filename, frame.shape)
    return frame

In [5]:
# Load the raw train / test tables (shapes are printed by read_feather).
train_df = read_feather('train.f')
test_df = read_feather('test.f')

train.f (1997595, 35)
test.f (390095, 30)


In [6]:
# Load the auxiliary tables that are later left-joined onto train/test.
campaign_df = read_feather('campaign.f')
map_gv_df = read_feather('map_game_feed_native_video_assets.f')
ad_video_df = read_feather('advertiser_video.f')
ad_cvideo_df = read_feather('advertiser_converted_video.f')

campaign.f (14627, 4)
map_game_feed_native_video_assets.f (2796, 3)
advertiser_video.f (11707, 6)
advertiser_converted_video.f (198622, 8)


In [7]:
# Keep one row (the last) per (video, game feed, template) key so the later
# left join on these three columns cannot multiply rows.
ad_cvideo_df = ad_cvideo_df.drop_duplicates(
    subset=['mst_advertiser_video_id', 
                   'mst_game_feed_id', 
                    'mst_video_template_id'], keep='last')

# Encode rectangle_type as an integer id with an encoder fit on the two labels
# 'vertical' and 'horizontal', then drop the original string column.
# NOTE: this cell mutates ad_cvideo_df in place, so it cannot be re-run
# without re-loading the table first (rectangle_type is gone after one run).
le = preprocessing.LabelEncoder()
le.fit(['vertical', 'horizontal'])
ad_cvideo_df['rectangle_type_id'] = le.transform(ad_cvideo_df['rectangle_type'])
ad_cvideo_df.drop(columns=['rectangle_type'], inplace=True)
print('ad_cvideo_df', ad_cvideo_df.shape)

ad_cvideo_df (107493, 8)


### map_game_feed_native_video_assetsをマージ

In [8]:
def _left_join_prefixed(left_df, right_df, prefix, left_keys, right_keys):
    """Left-join a prefixed copy of right_df onto left_df and drop the right-hand keys.

    Every column of right_df that is not one of `right_keys` is renamed to
    `prefix + column` before the join.
    """
    renamed = right_df.copy()
    renamed.columns = [c if c in right_keys else f'{prefix}{c}' for c in renamed.columns]
    joined = pd.merge(left_df, renamed, left_on=left_keys, right_on=right_keys, how='left')
    return joined.drop(columns=right_keys)


def merge_gamefeed_features(input_df):
    """Attach game-feed video attributes to `input_df`.

    Steps (all left joins, so the row count of `input_df` is kept as long as
    the right-hand keys are unique):
      1. map_gv_df on game_feed_id.
      2. ad_video_df twice, once for the horizontal and once for the vertical
         advertiser video id, with 'horizontal_' / 'vertical_' column prefixes.
      3. ad_cvideo_df twice in the same way, with 'horizontal_converted_' /
         'vertical_converted_' prefixes.
      4. For duration, file_size, converted_file_size and converted_bitrate the
         vertical and horizontal values are summed (NaN counted as 0) into one
         un-prefixed column and the two prefixed columns are dropped.

    Reads the module-level frames map_gv_df, ad_video_df and ad_cvideo_df.
    """
    merged = pd.merge(
        input_df, map_gv_df,
        left_on='game_feed_id', right_on='mst_game_feed_id', how='left',
    ).drop(columns=['mst_game_feed_id'])

    # Horizontal first, then vertical: this fixes the resulting column order.
    orientations = ['horizontal', 'vertical']

    video_keys = ['id', 'mst_advertiser_id']
    for orientation in orientations:
        merged = _left_join_prefixed(
            merged, ad_video_df, f'{orientation}_',
            left_keys=[f'{orientation}_mst_advertiser_video_id', 'advertiser_id'],
            right_keys=video_keys,
        )

    converted_keys = [
        "mst_advertiser_video_id",
        "mst_game_feed_id",
        "mst_video_template_id",
    ]
    for orientation in orientations:
        merged = _left_join_prefixed(
            merged, ad_cvideo_df, f"{orientation}_converted_",
            left_keys=[
                f"{orientation}_mst_advertiser_video_id",
                "game_feed_id",
                "video_template_id",
            ],
            right_keys=converted_keys,
        )

    # Collapse the per-orientation columns into a single summed column each.
    summed_cols = ['duration', 'file_size', 'converted_file_size', 'converted_bitrate']
    orientation_prefixes = ['vertical_', 'horizontal_']
    for m_col in summed_cols:
        merged[m_col] = 0
        for prefix in orientation_prefixes:
            part = prefix + m_col
            merged[m_col] = merged[m_col] + merged[part].fillna(0)
            merged = merged.drop(columns=[part])

    return merged

### campaignをマージ

In [9]:
def merge_campaign_features(input_df):
    """Left-join the module-level campaign_df onto `input_df` (campaign_id -> id) and drop `id`."""
    return pd.merge(
        input_df, campaign_df, left_on='campaign_id', right_on='id', how='left'
    ).drop(columns=['id'])

In [10]:
def merge_all(input_df):
    """Apply the game-feed merge and then the campaign merge; print and return the result."""
    merged = merge_campaign_features(merge_gamefeed_features(input_df))
    print('merge', merged.shape)
    return merged

In [11]:
# Join the auxiliary tables onto train and test. The printed shapes before and
# after let the reader confirm that the left joins did not change the row count.
# NOTE: train_df / test_df are overwritten, so re-running this cell merges twice.
print('train_df', train_df.shape)
train_df = merge_all(train_df)
print('test_df', test_df.shape)
test_df = merge_all(test_df)

train_df (1997595, 35)
merge (1997595, 54)
test_df (390095, 30)
merge (390095, 49)


In [12]:
# Train and test stacked row-wise; used by the count, one-hot and aggregate
# encoders so that their statistics/categories cover both sets.
whole_df = pd.concat([train_df, test_df], ignore_index=True)
print('whole_df', whole_df.shape)

whole_df (2387690, 54)


In [13]:
# Columns that are label-encoded (see category2num and the LabelEncoder cell).
# Commented-out entries are handled by another encoder, as marked:
# OHE = one-hot encoding, TGE = target encoding.
label_encoding_col = [
    'adspot_id',
    'adspot_video_format_id', 
    'country_code', 
    'game_feed_asset_type_id',
    'item_id', 
    'os', 
    'video_template_id',
    'uid',
#     'auction_type_id', ->OHE
#     'user_type_id',  ->OHE
    
#     'advertiser_id',  ->TGE
#     'app_id',   ->TGE
#     'media_app_id',   ->TGE
#     'campaign_id',  ->TGE
    'category_id', 
    'game_feed_id', 
    'game_template_id',
    
    'horizontal_mst_advertiser_video_id',
    'vertical_mst_advertiser_video_id', 
    'horizontal_converted_rectangle_type_id', 
    'vertical_converted_rectangle_type_id',
    'mst_advertiser_id', 
#     'mst_advertiser_order_id', 
#     'mst_user_type_id',
]

In [14]:
# Columns that are target-encoded (mean of `target` per category);
# each produces one 'TGE_<column>' feature.
target_enc_col = [
#     'advertiser_id', 
    'app_id', 
    'media_app_id', 
    'campaign_id',
#     'category_id',
#     'game_feed_id',
#     'game_template_id',
    
    'mst_advertiser_order_id', 
    'mst_user_type_id',
]

In [15]:
# Columns that are count-encoded (frequency of each value in whole_df);
# each produces one 'CE_<column>' feature.
count_enc_col = [
    'advertiser_id', 
    'adnw_id', 
    'adspot_id',
    'category_id', 
    'game_feed_id',
    'uid', 
    'game_template_id',
]

In [16]:
# Columns that are one-hot encoded; each category becomes an 'OH_<column>=<value>' feature.
onehot_enc_col = [
    'auction_type_id',
    'header_bidding',
    'is_interstitial',
    'user_type_id'
]

In [17]:
# Columns passed to the model unchanged (see create_continuous_features).
# The last four are the summed columns built in merge_gamefeed_features.
continuous_col = [
    'first_login_interval',
    'max_login_interval', 
    'frequency', 
    'login_frequency', 
    'last_login_interval',
    'from_click',
    'pos',
#     'adnw_id',
#     'header_bidding', 
#     'is_interstitial'
    
    'horizontal_width',
    'horizontal_height', 
    'vertical_width', 
    'vertical_height',
    'horizontal_converted_width', 
    'horizontal_converted_height',
    'vertical_converted_width',
    'vertical_converted_height', 
    'duration', 
    'file_size', 
    'converted_file_size', 
    'converted_bitrate',
]

### Label Encoding

In [18]:
def get_non_overlapping(column: str):
    """Return the set of values of `column` that occur in only one of train_df / test_df."""
    train_values = set(train_df[column].unique())
    test_values = set(test_df[column].unique())
    train_only = train_values - test_values
    test_only = test_values - train_values
    return train_only.union(test_only)

def category2num(input_df, columns: list):
    """Prepare categorical columns for label encoding.

    For every column in `columns`:
      * missing values become the class "missing" (object dtype) or -1
        (numeric dtype);
      * values that appear in only one of train/test (per get_non_overlapping)
        are collapsed into the class "other" (object dtype) or -2 (numeric).

    Parameters
    ----------
    input_df : DataFrame containing `columns`.
    columns : list of column names to process.

    Returns
    -------
    DataFrame with only `columns`, same index as `input_df`.
    """
    encoded = input_df[columns].copy()
    for column in columns:
        non_overlapping = get_non_overlapping(column)
        if input_df[column].dtype == np.dtype("O"):
            # object dtype: missing -> 'missing', train/test-only values -> 'other'
            missing_value, other_value = "missing", "other"
        else:
            # int/float dtype: missing -> -1, train/test-only values -> -2
            missing_value, other_value = -1, -2

        filled = input_df[column].fillna(missing_value)
        # Vectorized membership test instead of a Python-level lambda per row.
        is_rare = filled.isin(list(non_overlapping))
        encoded[column] = filled.where(~is_rare, other_value)

    return encoded

In [19]:
# Clean the label-encoding columns of train and test (missing / train-only /
# test-only values are mapped to dedicated classes by category2num).
train_LE = category2num(train_df, label_encoding_col)
print('train_LE', train_LE.shape)
test_LE = category2num(test_df, label_encoding_col)
print('test_LE', test_LE.shape)
# Fit each encoder on train+test together so both share the same integer codes.
concatenated = pd.concat([train_LE, test_LE], axis=0).reset_index(drop=True)

for column in label_encoding_col:
    le = preprocessing.LabelEncoder()
    le.fit(concatenated[column])
    train_LE[column] = le.transform(train_LE[column])
    test_LE[column] = le.transform(test_LE[column])

train_LE (1997595, 16)
test_LE (390095, 16)


In [20]:
def create_label_encoding_features(input_df, is_test=False):
    """Return the pre-computed label-encoded frame: test_LE if `is_test`, else train_LE.

    `input_df` is not used; the argument exists so the function matches the
    common (input_df, is_test) processor signature.
    """
    return test_LE if is_test else train_LE

### target encoding

In [21]:
def target_encoding(train, test, enc_col):
    """Target-encode `enc_col` of `test` using statistics from `train`.

    Each value of `test[enc_col]` is replaced by the mean of `train['target']`
    over the train rows with the same value. Values not seen in `train`
    (including NaN) get the overall mean of `train['target']`.

    Parameters
    ----------
    train : DataFrame with columns `enc_col` and 'target'.
    test : DataFrame with column `enc_col`.
    enc_col : name of the categorical column to encode.

    Returns
    -------
    DataFrame with the single column 'TGE_<enc_col>' and the index of `test`.
    """
    global_mean = train['target'].mean()
    # Aggregate only the target column: averaging every column is wasted work
    # and fails on non-numeric columns with recent pandas versions.
    category_mean = train.groupby(enc_col)['target'].mean()
    # map() keeps test's index, so no merge / set_index round trip is needed.
    encoded = test[enc_col].map(category_mean).fillna(global_mean)
    return encoded.to_frame('TGE_' + enc_col)

In [22]:
def create_targetencoding_features(input_df, is_test=False):
    """Build the 'TGE_<col>' target-encoding features for every column in target_enc_col.

    `input_df` is not used; the function reads the module-level train_df /
    test_df and only follows the common (input_df, is_test) processor signature.

    * is_test=True: test_df is encoded with statistics from the whole train_df.
    * is_test=False: train_df is encoded out-of-fold with
      TimeSeriesSplit(n_splits=10), i.e. each evaluation fold is encoded using
      only the rows that precede it. The earliest rows never belong to an
      evaluation fold, so they have no encoding; the result is reindexed to
      train_df.index so those rows are present with NaN and the returned frame
      always has one row per train row.
    """
    if is_test:
        # For test: use all training data.
        print('TGE for test')
        encoded_cols = [target_encoding(train_df, test_df, enc_col)
                        for enc_col in target_enc_col]
        return pd.concat(encoded_cols, axis=1) if encoded_cols else pd.DataFrame()

    # For train: out-of-fold encoding.
    print('TGE for train (TimeSeriesSplit)')
    kf = TimeSeriesSplit(n_splits=10)
    # The splits depend only on the number of rows, so compute them once.
    folds = list(kf.split(train_df))

    encoded_cols = []
    for enc_col in target_enc_col:
        fold_parts = [
            target_encoding(train_df.iloc[train_index], train_df.iloc[eval_index], enc_col)
            for train_index, eval_index in folds
        ]
        encoded_cols.append(pd.concat(fold_parts))
        print(enc_col)

    tgt_all = pd.concat(encoded_cols, axis=1) if encoded_cols else pd.DataFrame()
    # Rows that were never in an evaluation fold become explicit NaN rows.
    return tgt_all.reindex(train_df.index)

### Count Encoding

In [23]:
def create_count_encoding_features(input_df, is_test=False):
    """Count-encode every column in count_enc_col.

    Each value is replaced by how often it occurs in whole_df (train + test,
    NaN counted as its own value). Output columns are named 'CE_<column>'.
    `is_test` is accepted for the common processor signature and not used.
    """
    # Seed with an empty frame so an empty count_enc_col still yields a DataFrame.
    frames = [pd.DataFrame()]
    for c in count_enc_col:
        value_counts = whole_df[c].value_counts(dropna=False)
        frames.append(pd.DataFrame(input_df[c].map(value_counts)))

    return pd.concat(frames, axis=1).add_prefix('CE_')

### One-hot encoding

In [24]:
def create_onehot_encoding_features(input_df, is_test=False):
    """One-hot encode every column in onehot_enc_col.

    The category set of each column is taken from whole_df (train + test,
    NaN excluded), so train and test get identical dummy columns. Output
    columns are named 'OH_<column>=<value>'. The result carries the index of
    `input_df`, so it aligns row-by-row when concatenated with other features.
    `is_test` is accepted for the common processor signature and not used.
    """
    # Seed with an empty frame so an empty onehot_enc_col still yields a DataFrame.
    frames = [pd.DataFrame()]
    for c in onehot_enc_col:
        categories = whole_df[c].dropna().unique()
        # Wrap the Categorical in a Series to keep input_df's index; a bare
        # Categorical would make get_dummies fall back to a fresh 0..n-1 index.
        cat = pd.Series(pd.Categorical(input_df[c], categories=categories),
                        index=input_df.index)
        dummies = pd.get_dummies(cat)
        # Plain labels instead of a CategoricalIndex so add_prefix works on them.
        dummies.columns = dummies.columns.tolist()
        frames.append(dummies.add_prefix(c + '='))

    return pd.concat(frames, axis=1).add_prefix('OH_')

### 集約系

In [25]:
def create_aggregate_features(input_df, is_test=False):
    """Return the per-'uid' aggregate features for `input_df` (see _run_aggregation).

    `is_test` is accepted for the common processor signature and not used.
    """
    user_key = 'uid'
    return _run_aggregation(input_df, user_key)

In [26]:
def _run_aggregation(input_df, agg_column):
    """Count distinct advertisers/apps/media apps/campaigns per `agg_column` value.

    The counts are computed on whole_df (train + test) and left-joined onto
    `input_df[agg_column]`. Output columns are named '<column>_by_<agg_column>'.
    """
    counted_cols = ['advertiser_id', 'app_id', 'media_app_id', 'campaign_id']
    grouped = whole_df.groupby(agg_column)
    _agg_df = pd.concat([grouped[col].nunique() for col in counted_cols], axis=1)

    joined = pd.merge(input_df[agg_column], _agg_df, on=agg_column, how='left')
    suffix = '_by_{}'.format(agg_column)
    return joined.drop(columns=agg_column).add_suffix(suffix)

### 四則演算

In [27]:
def feature_operation(input_df):
    """Placeholder for arithmetic features; currently returns `input_df` unchanged.

    The commented-out lines are previously tried difference features.
    """
#     input_df['first_last_login_interval'] = input_df['first_login_interval'] - input_df['last_login_interval']
#     input_df['max_last_login_interval'] = input_df['max_login_interval'] - input_df['last_login_interval']
#     input_df['max_first_login_interval'] = input_df['max_login_interval'] - input_df['first_login_interval']
#     input_df['login_freq_frequency'] = input_df['login_frequency'] - input_df['frequency']
    return input_df

### 連続変数

In [28]:
def create_continuous_features(input_df, is_test=False):
    """Return a copy of the continuous_col columns of `input_df`, unchanged.

    `is_test` is accepted for the common processor signature and not used.
    """
    selected = input_df[continuous_col]
    return selected.copy()

### date系

In [29]:
def create_date_features(input_df, is_test=False):
    """Derive time-of-impression features from `input_df['imp_at']`.

    'imp_at' is parsed as UTC and converted to Asia/Tokyo. Returned columns:
      hour       - hour of day (0-23)
      hour_zone  - bucket code of `hour` for the bins (-inf, 6], (6, 12],
                   (12, 18], (18, inf)
      dayofweek  - day of week, Monday=0
    The result has the same index as `input_df`. `is_test` is not used.
    """
    imp_at_jst = pd.to_datetime(input_df['imp_at'], utc=True).dt.tz_convert('Asia/Tokyo')
    hour = imp_at_jst.dt.hour
    zone_edges = [-np.inf, 6, 12, 18, np.inf]
    return pd.DataFrame({
        'hour': hour,
        'hour_zone': pd.cut(hour.values, bins=zone_edges).codes,
        'dayofweek': imp_at_jst.dt.dayofweek,
    })

In [30]:
# Feature builders applied by to_feature, in this order (which is also the
# column order of the final feature matrix). Each takes (input_df, is_test).
processors = [
    create_continuous_features,
    create_label_encoding_features,
    create_date_features,
    create_targetencoding_features,
    create_count_encoding_features,
    create_onehot_encoding_features,
    create_aggregate_features,
]

In [31]:
def to_feature(input_df, is_test=False):
    """Run every function in `processors` on `input_df` and join the results column-wise.

    The per-processor frames are aligned on their index by pd.concat(axis=1).
    """
    # Seed with an empty frame so an empty `processors` list still yields a DataFrame.
    frames = [pd.DataFrame()]
    frames.extend(func(input_df, is_test) for func in processors)
    return pd.concat(frames, axis=1)

In [32]:
# Build the model inputs: feature matrices for train and test, plus the label vector.
train_feat_df = to_feature(train_df)
test_feat_df = to_feature(test_df, True)
train_feat_df = feature_operation(train_feat_df)
test_feat_df = feature_operation(test_feat_df)
y = train_df['target']

TGE for train (TimeSeriesSplit)
app_id
media_app_id
campaign_id
mst_advertiser_order_id
mst_user_type_id
TGE for test


In [33]:
# Sanity check: feature building must not add or lose rows.
assert len(train_feat_df) == len(train_df)
assert len(test_feat_df) == len(test_df)

In [34]:
# Number of feature columns; reported again in the run summary cell.
feature_count = len(train_feat_df.columns)
print(feature_count)

65


In [35]:
# Preview the last rows of the training feature matrix.
train_feat_df.tail()

Unnamed: 0,first_login_interval,max_login_interval,frequency,login_frequency,last_login_interval,from_click,pos,horizontal_width,horizontal_height,vertical_width,vertical_height,horizontal_converted_width,horizontal_converted_height,vertical_converted_width,vertical_converted_height,duration,file_size,converted_file_size,converted_bitrate,adspot_id,adspot_video_format_id,country_code,game_feed_asset_type_id,item_id,os,video_template_id,uid,category_id,game_feed_id,game_template_id,horizontal_mst_advertiser_video_id,vertical_mst_advertiser_video_id,horizontal_converted_rectangle_type_id,vertical_converted_rectangle_type_id,mst_advertiser_id,hour,hour_zone,dayofweek,TGE_app_id,TGE_media_app_id,TGE_campaign_id,TGE_mst_advertiser_order_id,TGE_mst_user_type_id,CE_advertiser_id,CE_adnw_id,CE_adspot_id,CE_category_id,CE_game_feed_id,CE_uid,CE_game_template_id,OH_auction_type_id=1.0,OH_auction_type_id=2.0,OH_auction_type_id=4.0,OH_header_bidding=0.0,OH_header_bidding=1.0,OH_is_interstitial=1.0,OH_is_interstitial=0.0,OH_user_type_id=1,OH_user_type_id=2,OH_user_type_id=4,OH_user_type_id=3,advertiser_id_by_uid,app_id_by_uid,media_app_id_by_uid,campaign_id_by_uid
1997590,32994.0,5487.0,1,6.0,651.0,1,1,,,,,,,,,0.0,0.0,0.0,0.0,37,2,0,1,1,1,6,108518,1,1347,49,626,2,0,0,43,8,1,4,0.017895,0.018655,0.129836,0.056076,0.048459,48160,941185,856650,2205632,362,9,1286.0,0,1,0,1,0,1,0,0,1,0,0,1,1,1,2
1997591,984500.0,116145.0,2,1.0,20034.0,1,0,1920.0,1080.0,,,960.0,540.0,,,6.0,57111321.0,876350.0,1000000.0,47,2,0,2,1,1,0,107523,1,1347,66,271,2,1,0,28,8,1,4,0.027778,0.027778,0.0,0.0,0.058172,101821,156196,137281,2205632,759,2,,1,0,0,1,0,1,0,0,1,0,0,1,2,2,1
1997592,38298.0,17147.0,5,1.0,21108.0,1,0,,,,,,,,,0.0,0.0,0.0,0.0,27,0,0,1,893,1,6,108518,1,1271,47,626,2,0,0,110,8,1,4,0.0,0.0,0.0,0.0,0.004843,20516,42014,42014,2205632,44,2,74251.0,0,1,0,0,0,0,1,0,1,0,0,2,1,1,2
1997593,651451.0,96344.0,0,1.0,19060.0,1,1,1280.0,720.0,,,,,,,14.0,17103589.0,0.0,0.0,28,3,0,2,1,0,6,108518,1,1347,66,627,2,0,0,117,8,1,4,0.022727,0.022727,0.009208,0.0377,0.004843,1679,128697,128697,2205632,14,1,,1,0,0,0,0,0,1,1,0,0,0,1,1,1,1
1997594,55743.0,42167.0,6,1.0,13575.0,1,1,,,,,,,,,0.0,0.0,0.0,0.0,24,0,0,1,1,1,6,108518,1,1274,10,626,2,0,0,110,8,1,4,0.032364,0.032364,0.0,0.0,0.004843,20516,13915,1828,2205632,30,2,2882.0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,1


In [36]:
# Cache the feature matrices next to the raw inputs. The paths are built from
# INPUT_DIR so they follow the directory constant instead of duplicating it.
train_feat_df.to_feather(os.path.join(INPUT_DIR, 'train_feat_df.f'))
test_feat_df.to_feather(os.path.join(INPUT_DIR, 'test_feat_df.f'))

In [37]:
#del train_df, test_df

#### LightGBM による学習

In [38]:
def pr_auc(y_pred, y_true):
    """Custom LightGBM eval function: PR-AUC (average precision), computed every round.

    `y_pred` holds the predictions; `y_true` is the evaluation dataset object
    whose labels are read with get_label(). Returns the triple
    (metric name, value, is_higher_better).
    """
    labels = y_true.get_label()
    return "pr_auc", average_precision_score(labels, y_pred), True

In [39]:
# LightGBM training parameters. The values are identical to the "Best Params"
# recorded in the tuning section at the end of this notebook.
lgbm_param = {
    'objective' : 'binary',
    'boosting_type': 'gbdt',
    'seed' : 0,
    'learning_rate':  0.1,
    'feature_pre_filter': False, 
    'lambda_l1': 6.419345380049121e-07, 
    'lambda_l2': 8.432801302426078, 
    'num_leaves': 212, 
    'feature_fraction': 0.4, 
    'bagging_fraction': 0.9907178796872467, 
    'bagging_freq': 2, 
    'min_child_samples': 100
}

In [40]:
def train_lgbm(X, y, cv, params: dict, verbose=100):
    """Train one LightGBM model per CV fold and collect out-of-fold predictions.

    Parameters
    ----------
    X : feature DataFrame.
    y : target Series, positionally aligned with X.
    cv : iterable of (train_indices, valid_indices) positional index pairs.
    params : LightGBM parameter dict.
    verbose : logging period in rounds; the same value is also used as the
        early-stopping patience.

    Returns
    -------
    oof_pred : float array of len(y) with the out-of-fold prediction of every
        row; rows that were never in a validation fold stay 0.0.
    models : list with the trained booster of each fold.
    score : PR-AUC (average precision) of the out-of-fold predictions, computed
        only on rows that were validated at least once. With a CV that
        validates every row (e.g. K-fold) this covers all rows; with a
        time-series split the never-validated leading rows are excluded
        instead of being scored with their 0.0 placeholder.
    """
    models = []
    # Must be float: an integer target dtype would truncate the probabilities.
    # (The builtin float is used because the np.float alias no longer exists
    # in current NumPy.)
    oof_pred = np.zeros_like(y, dtype=float)
    validated = np.zeros(len(y), dtype=bool)

    for i, (idx_train, idx_valid) in enumerate(cv):
        # Split the training data into this fold's train / validation parts.
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        lgbm_train = lgbm.Dataset(x_train, y_train)
        lgbm_eval = lgbm.Dataset(x_valid, y_valid, reference=lgbm_train)

        lgbm_model = lgbm.train(params,
                                lgbm_train,
                                valid_sets=lgbm_eval,
                                num_boost_round=1000,
                                early_stopping_rounds=verbose,
                                feval=pr_auc,
                                verbose_eval=verbose)
        y_pred = lgbm_model.predict(x_valid, num_iteration=lgbm_model.best_iteration)

        oof_pred[idx_valid] = y_pred
        validated[idx_valid] = True
        models.append(lgbm_model)

        print(f'Fold {i} PR-AUC: {average_precision_score(y_valid, y_pred):.4f}')

    if validated.any():
        score = average_precision_score(np.asarray(y)[validated], oof_pred[validated])
    else:
        # No folds at all: keep the previous behaviour of scoring the placeholders.
        score = average_precision_score(y, oof_pred)
    # Escaped backslash: prints exactly the same text without an invalid escape sequence.
    print('FINISHED \\ whole score: {:.4f}'.format(score))
    return oof_pred, models, score

In [41]:
# Choose the cross-validation scheme from the run switch.
if is_time_series:
    fold = TimeSeriesSplit(n_splits=5)
else:
    fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv = list(fold.split(train_feat_df, y)) # split() returns a generator, so convert it to a list explicitly

In [42]:
def tuning_lgbm(X, y, cv, params, verbose=100):
    """Train a single model on the first CV fold and report its parameters and score.

    Parameters
    ----------
    X : feature DataFrame.
    y : target Series, positionally aligned with X.
    cv : sequence of (train_indices, valid_indices) pairs; only cv[0] is used.
    params : LightGBM parameter dict.
    verbose : used as the early-stopping patience (per-round logging is off).

    Returns
    -------
    The trained booster, so the caller can inspect it beyond the printed summary.
    """
    idx_train, idx_valid = cv[0]
    x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]
    lgbm_train = lgbm.Dataset(x_train, y_train)
    lgbm_eval = lgbm.Dataset(x_valid, y_valid, reference=lgbm_train)

    best = lgbm.train(params,
                      lgbm_train,
                      valid_sets=lgbm_eval,
                      num_boost_round=1000,
                      early_stopping_rounds=verbose,
                      feval=pr_auc,
                      verbose_eval=0)
    print('Best Params:', best.params)
    print('Best Iteration:', best.best_iteration)
    print('Best Score:', best.best_score)
    return best

In [43]:
# %%time
# tuning_lgbm(train_feat_df, y, cv, params=lgbm_param)

In [44]:
%%time
# Cross-validated training: out-of-fold predictions, one model per fold, overall PR-AUC.
oof, models, score = train_lgbm(train_feat_df, y, cv, params=lgbm_param)

[LightGBM] [Info] Number of positive: 52392, number of negative: 1545684
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4515
[LightGBM] [Info] Number of data points in the train set: 1598076, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.032784 -> initscore=-3.384468
[LightGBM] [Info] Start training from score -3.384468
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.0982145	valid_0's pr_auc: 0.305069
[200]	valid_0's binary_logloss: 0.0977726	valid_0's pr_auc: 0.308552
[300]	valid_0's binary_logloss: 0.0977427	valid_0's pr_auc: 0.308696
Early stopping, best iteration is:
[244]	valid_0's binary_logloss: 0.0976922	valid_0's pr_auc: 0.309414
Fold 0 PR-AUC: 0.3094
[LightGBM] [Info] Number of positive: 52392, number of negative: 1545684
You can set `force_row_wise=true` to remove the overhead.
And if memory is

In [45]:
def feat_imp(model):
    """Return one model's feature importances as a DataFrame.

    The frame has the columns 'name' (from model.feature_name()) and 'imp'
    (from model.feature_importance()), sorted by 'imp' ascending.
    """
    df_feature_importance = pd.DataFrame({
        'name': model.feature_name(),
        'imp': model.feature_importance(),
    })
    return df_feature_importance.sort_values('imp')


def feature_importance(models):
    """Combine the feature importances of several models into one table.

    Returns a DataFrame with the column 'name', one 'imp_<i>' column per model
    (i = position in `models`) and 'sum' = row-wise total of the 'imp_<i>'
    columns, sorted by 'sum' descending.
    """
    fi = pd.DataFrame(columns=['name'])
    for i, model in enumerate(models):
        fi_tmp = feat_imp(model).rename(columns={'imp': 'imp_{}'.format(i)})
        fi = pd.merge(fi, fi_tmp, on=['name'], how='outer')
    # Sum only the importance columns: including the string 'name' column in
    # the row-wise sum fails on pandas versions that no longer drop it silently.
    imp_cols = [c for c in fi.columns if c != 'name']
    fi['sum'] = fi[imp_cols].sum(axis=1)
    return fi.sort_values(['sum'], ascending=False)

In [46]:
# Top 50 features by importance summed over the fold models.
feature_importance(models).head(50)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,imp_4,sum
64,max_login_interval,3813,3490,3204,3728,3505,17740
63,first_login_interval,3349,3361,2682,3561,2980,15933
62,TGE_mst_advertiser_order_id,3279,3265,2929,3261,3107,15841
61,TGE_media_app_id,2938,2722,2290,2802,2485,13237
60,TGE_campaign_id,2779,2586,2296,2647,2454,12762
59,last_login_interval,2680,2557,2218,2702,2340,12497
58,TGE_app_id,2622,2429,2157,2523,2050,11781
57,CE_game_feed_id,2545,2288,1992,2310,2075,11210
56,TGE_mst_user_type_id,1872,1696,1439,1828,1620,8455
55,hour,1827,1668,1431,1827,1508,8261


In [47]:
%%time
# Predict the test set with every fold model (at its best iteration) and
# average the predictions element-wise.
pred_list = []
for model in models:
    print('best_iteration', model.best_iteration)
    pred = model.predict(test_feat_df, num_iteration = model.best_iteration)
    pred_list.append(pred)
    
pred = np.mean(pred_list, axis=0)

best_iteration 244
best_iteration 231
best_iteration 200
best_iteration 240
best_iteration 213
CPU times: user 1min 35s, sys: 1.05 s, total: 1min 36s
Wall time: 4.8 s


In [48]:
# Sanity check: one prediction per test row.
assert len(pred) == len(test_df)

In [49]:
# The file name depends on the CV scheme so both variants can coexist
# (the ensemble cell reads both files).
if is_time_series:
    out_filename = 'submission_ts.csv'
else:
    out_filename = 'submission.csv'
    
# Single-column submission: predicted value per test row, in test order.
sub_df = pd.DataFrame({ 'target': pred })
sub_df.to_csv(os.path.join(OUTPUT_DIR, out_filename), index=False)

In [50]:
# Run summary in the same bullet format as the experiment log below.
print('- feature={}'.format(feature_count))
print('- score={:.4f}'.format(score))

- feature=65
- score=0.3088


In [51]:
# Optional ensemble: average the stratified-CV and time-series-CV submissions.
# Paths are built from OUTPUT_DIR, the same directory the submission cell writes to.
if is_ensumble:
    sub = pd.read_csv(os.path.join(OUTPUT_DIR, 'submission.csv'))
    sub_ts = pd.read_csv(os.path.join(OUTPUT_DIR, 'submission_ts.csv'))
    assert len(sub) == len(sub_ts)
    sub['target'] = (sub['target'] + sub_ts['target'])/2
    sub.to_csv(os.path.join(OUTPUT_DIR, 'ensumble.csv'), index=False)

#### simple_25: LE->TGE('mst_advertiser_order_id', 'mst_user_type_id',)
- feature=65
- score=0.3088
- publicLB= 0.237

#### simple_24: マージを先にしてIDをLE
- feature=65
- score=0.3160
- publicLB= 0.2409 ★best★

#### ensumble_23: simple19 + ts23
- publicLB= 0.2395 ★best★

#### ts_23: 19相当に戻す, TimeSeriesCV(5)
- feature=61
- score=0.2222
- publicLB= 0.2345

#### simple_22: CEをKFold(5), TGEをTimeSeriesSplit(5)
- feature=61
- score=0.3100
- publicLB= 0.2113

#### simple_21: CEをTimeSeriesSplit(10)に, NaNは平均
- feature=61
- score=0.2713
- publicLB= 0.2216

#### simple_20: TGEの欠損をNaNに
- feature=61
- score=0.3153
- publicLB= 0.2374

#### simple_19: TGEをTimeSeriesSplit(10)に
- feature=61
- score=0.3153
- publicLB= 0.2384

#### simple_18: agg(unique系)
- feature=61
- score=0.3127
- publicLB= 0.2337

#### simple_17: 
##### CE(+'advertiser_id'), TGE(-'advertiser_id')
- feature=57
- score=0.3015
- publicLB= 0.2322

#### simple_16: 
##### CE(+'uid')
- feature=57
- score=0.3024
- publicLB= 0.2346

##### cont(+'adnw_id',)
- feature=57
- score=0.2470

##### cont(+'adnw_id',), CE(+'uid')
- feature=58
- score=0.3021

##### cont(+'adnw_id',), CE(+'uid', -'game_template_id')
- feature=57
- score=0.3019

##### CE('adnw_id',)
- feature=57
- score=0.2476

#### simple_15: 
- LE ('adspot_id', 'uid')
- feature=56
- score=0.2468
- publicLB= 0.2195

#### simple_14: 
- TGE->LE ("category_id", "game_feed_id", "game_template_id")
- feature=54
- score=0.2419
- publicLB= 0.2171

#### simple_13:
##### 8-3 相当に戻す
- feature=45
- score=0.2484
##### CE追加  'adnw_id', 'adspot_id', 'category_id', 'game_feed_id'
- feature=46
- score=0.2437
##### OHE追加 'auction_type_id', 'header_bidding', 'is_interstitial', 'user_type_id'
- feature=53
- score=0.2431
##### CE追加 'game_template_id',
- feature=54
- score=0.2432
- publicLB= 0.2141

#### simple_12: 集約系・四則演算全部抜き
- Wall time: 6min 20s (SSD)
- feature=58
- score=0.3025
- publicLB= 0.2354 ★best★

#### simple_11: count_enc全部抜き
- Wall time: 13min 12s
- feature=76
- score=0.3060
- publicLB= 0.2306

#### simple_10: total_minute, day, CE_app_id, CE_advertiser_id抜く、
- Wall time: 11min 39s
- feature=81
- score=0.3185
- publicLB= 0.2335

#### simple_9: 8-1, count, one-hot, aggregate, operation
- Wall time: 10min 48s
- feature=86
- score=0.3284
- publicLB= 0.2313

#### simple_8-3: label->target : 'game_feed_id', 'game_template_id'
- feature=46
- score=0.2490
- publicLB= 0.2183

#### simple_8-2: label->target : category_id
- feature=46
- score=0.2492

#### simple_8-1: label->target : campaign_id
- feature=46
- score=0.2495

#### simple_7: 追加のtarget_encを抜いた
- Wall time: 7min 21s (vCPU x 24、メモリ 96 GB)
- feature=46
- score=0.2486
- publicLB= 0.2207

#### simple_6: label_enc + target_enc
- Wall time: 3min 56s (vCPU x 24、メモリ 96 GB)
- feature=55
- score=0.3255
- publicLB= 0.1798 (Overfit)

#### simple_5: target_enc (oof version)
- Wall time: 7min 20s
- feature=44
- score=0.2383
- publicLB= 0.2123

#### simple_4: target_enc
- Wall time: 8min
- feature=44
- score=0.2589
- publicLB= 0.1973 (leak)

#### simple_3: tuning
- Wall time: 7min 19s
- feature=41
- score=0.2229
- publicLB= 0.1970

#### simple_2
- Wall time: 47min 46s
- feature= 41
- score= 0.214588
- publicLB= 0.1907

#### tuning
```
Best Params: {
    'objective': 'binary', 
    'boosting_type': 'gbdt', 
    'seed': 0, 
    'learning_rate': 0.1, 
    'feature_pre_filter': False, 
    'lambda_l1': 6.419345380049121e-07, 
    'lambda_l2': 8.432801302426078, 
    'num_leaves': 212, 
    'feature_fraction': 0.4, 
    'bagging_fraction': 0.9907178796872467, 
    'bagging_freq': 2, 
    'min_child_samples': 100, 
    'num_iterations': 1000, 
    'early_stopping_round': 100
}
Best Iteration: 245
Best Score: 'pr_auc', 0.22382995580267329
```