In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import preprocessing
import lightgbm as lgbm
# import optuna.integration.lightgbm as lgbm

In [2]:
# Directory that read_feather() loads the .f (feather) files from.
INPUT_DIR = '../input/'
# Directory the submission CSV is written to at the end of the notebook.
OUTPUT_DIR = '../output/'

In [3]:
def read_feather(filename):
    """Load a feather file located under ``INPUT_DIR``.

    Prints the file name together with the loaded frame's shape so the
    notebook output records what was read, then returns the frame.
    """
    path = os.path.join(INPUT_DIR, filename)
    frame = pd.read_feather(path)
    print(filename, frame.shape)
    return frame

In [4]:
# Load the training rows (which carry the `target` column) and the test rows.
train_df = read_feather('train.f')
test_df = read_feather('test.f')

train.f (1997595, 35)
test.f (390095, 30)


In [5]:
# Load the lookup tables that the feature functions below join onto the
# train/test rows (campaign, game-feed -> video mapping, videos, converted videos).
campaign_df = read_feather('campaign.f')
map_gv_df = read_feather('map_game_feed_native_video_assets.f')
ad_video_df = read_feather('advertiser_video.f')
ad_cvideo_df = read_feather('advertiser_converted_video.f')

campaign.f (14627, 4)
map_game_feed_native_video_assets.f (2796, 3)
advertiser_video.f (11707, 6)
advertiser_converted_video.f (198622, 8)


In [6]:
# Encoder for the two orientation strings stored in `rectangle_type`.
le = preprocessing.LabelEncoder()
le.fit(['vertical', 'horizontal'])

# Columns that identify one converted video; where several rows share the
# same key only the last one is kept.
converted_key = ['mst_advertiser_video_id', 'mst_game_feed_id', 'mst_video_template_id']

ad_cvideo_df = (
    ad_cvideo_df
    .drop_duplicates(subset=converted_key, keep='last')
    # Swap the string orientation for its integer label.
    .assign(rectangle_type_id=lambda frame: le.transform(frame['rectangle_type']))
    .drop(columns=['rectangle_type'])
)
print('ad_cvideo_df', ad_cvideo_df.shape)

ad_cvideo_df (107493, 8)


In [7]:
# Preview the first training rows.
train_df.head()

Unnamed: 0,request_id,imp_at,target,adnw_id,adspot_id,adspot_video_format_id,advertiser_id,app_id,auction_type_id,campaign_id,...,last_paid_interval,login_frequency,max_login_interval,media_app_id,os,os_version,pos,uid,user_type_id,video_template_id
0,ad6a7b7894f142d81a1e,2020-04-28 00:04:14.639000,0,4,1111,2.0,1b29c3959fc76425d130,efef8f05c293786ae95d,1.0,9ce932d55f27ed28f0e5,...,29835.0,10.0,2607.0,,1,13.3.1,0,30cfce348022c3812b3a,1,a96538aec6d5d90fd8d9
1,ac8453e4a93391812b78,2020-04-28 00:09:13.759000,0,28,1081,2.0,ba2bb260d8eccd0057a4,85518c203c727a6fcd6e,2.0,23f7cf2bddc97c599ac4,...,159596.0,2.0,13540.0,c3292411366546db2194,2,9.0.0,1,4c28854cab6a85c37dab,2,297ade72d5f1e9ed6e2f
2,4959adca0d9284e642c9,2020-04-28 00:09:14.708000,0,42,1123,3.0,0aaa2635cef6c96c7932,d01b84105c697315f36d,1.0,2dde952bf6521fac0b71,...,156643.0,9.0,33725.0,07514730ff1e69100233,1,12.1.2,0,b35f0cd7210c4273489c,1,
3,622c44ebc7cf591d046e,2020-04-28 00:10:40.291000,0,8,1113,2.0,1ed4e5bb950da757976a,c72faf7a97213d7e4c8c,1.0,1cbbcb2f33a045e5a09c,...,,4.0,7377.0,84a2cff85228c12b0e6a,1,13.4.1,0,57dc1bf69be06dac416c,1,b5401899f2db8ce48d43
4,3093c6746ae6e35b9ad5,2020-04-28 00:11:45.895000,0,22,1061,,d352ebd3462849796754,0376415068bf3e5b9e95,4.0,84095859da9cffce1c60,...,1566749.0,5.0,164759.0,2fd1c330d5a47edc039b,1,12.4.5,0,7ca7b54b97093b084027,1,


### target encoding

#### advertiser_id	

In [8]:
# Mean `target` per advertiser_id, computed from the training rows.
# Only the `target` column is aggregated: averaging the whole frame is wasted
# work and raises TypeError on pandas >= 2.0 because of the string columns.
# NOTE(review): the statistic is computed over every training row, so during
# cross-validation each validation row's own label contributes to its
# feature -- confirm whether an out-of-fold encoding is wanted.
grp_ad = train_df.groupby('advertiser_id')['target'].mean().reset_index()
print('train', grp_ad.shape)
test_ad = pd.DataFrame(test_df['advertiser_id'].unique(), columns=['advertiser_id'])
print('test', test_ad.shape)
# The outer merge appends ids that occur only in the test set; they have no
# training statistic and fall back to the global training mean.
grp_ad = pd.merge(grp_ad, test_ad, on=['advertiser_id'], how='outer')
# Assign the result back: chained `fillna(..., inplace=True)` is deprecated
# and has no effect under pandas copy-on-write.
grp_ad['target'] = grp_ad['target'].fillna(train_df['target'].mean())
grp_ad = grp_ad.rename(columns={'target': 'advertiser_tgt'})
print('total', grp_ad.shape)

train (145, 2)
test (140, 1)
total (168, 2)


#### app_id

In [9]:
# Mean `target` per app_id, computed from the training rows.
# Only the `target` column is aggregated: averaging the whole frame is wasted
# work and raises TypeError on pandas >= 2.0 because of the string columns.
# NOTE(review): the statistic is computed over every training row, so during
# cross-validation each validation row's own label contributes to its
# feature -- confirm whether an out-of-fold encoding is wanted.
grp_app = train_df.groupby('app_id')['target'].mean().reset_index()
print('train', grp_app.shape)
test_app = pd.DataFrame(test_df['app_id'].unique(), columns=['app_id'])
print('test', test_app.shape)
# The outer merge appends ids that occur only in the test set; they have no
# training statistic and fall back to the global training mean.
grp_app = pd.merge(grp_app, test_app, on=['app_id'], how='outer')
# Assign the result back: chained `fillna(..., inplace=True)` is deprecated
# and has no effect under pandas copy-on-write.
grp_app['target'] = grp_app['target'].fillna(train_df['target'].mean())
grp_app = grp_app.rename(columns={'target': 'app_tgt'})
print('total', grp_app.shape)

train (14284, 2)
test (7674, 1)
total (15895, 2)


#### media_app_id

In [10]:
# Mean `target` per media_app_id, computed from the training rows.
# Only the `target` column is aggregated: averaging the whole frame is wasted
# work and raises TypeError on pandas >= 2.0 because of the string columns.
# NOTE(review): the statistic is computed over every training row, so during
# cross-validation each validation row's own label contributes to its
# feature -- confirm whether an out-of-fold encoding is wanted.
grp_media = train_df.groupby('media_app_id')['target'].mean().reset_index()
print('train', grp_media.shape)
test_media = pd.DataFrame(test_df['media_app_id'].unique(), columns=['media_app_id'])
print('test', test_media.shape)
# The outer merge appends ids that occur only in the test set; they have no
# training statistic and fall back to the global training mean.
grp_media = pd.merge(grp_media, test_media, on=['media_app_id'], how='outer')
# Assign the result back: chained `fillna(..., inplace=True)` is deprecated
# and has no effect under pandas copy-on-write.
grp_media['target'] = grp_media['target'].fillna(train_df['target'].mean())
grp_media = grp_media.rename(columns={'target': 'media_tgt'})
print('total', grp_media.shape)

train (10627, 2)
test (5908, 1)
total (11755, 2)


In [11]:
def create_targetencoding_features(input_df):
    """Look up the target-encoded value for each row's advertiser, app and media app.

    Each id column is left-joined against its per-id table (``grp_ad``,
    ``grp_app``, ``grp_media``); the id columns themselves are dropped, so
    only the ``*_tgt`` columns are returned, in the row order of ``input_df``.
    """
    lookups = [
        ('advertiser_id', grp_ad),
        ('app_id', grp_app),
        ('media_app_id', grp_media),
    ]
    key_columns = [key for key, _ in lookups]

    encoded = input_df[key_columns].copy()
    for key, table in lookups:
        encoded = pd.merge(encoded, table, on=[key], how='left')
    return encoded.drop(columns=key_columns)

#### 連続変数の特徴量

In [12]:
def create_continuous_features(input_df):
    """Return a copy of the continuous-valued columns, passed through unchanged."""
    continuous_columns = [
        'first_login_interval',
        'max_login_interval',
        'frequency',
        'login_frequency',
        'last_login_interval',
        'from_click',
    ]
    selected = input_df.loc[:, continuous_columns]
    return selected.copy()

#### Category系の特徴量

In [13]:
def create_category_features(input_df):
    """Return a copy of the categorical id columns, used directly as label-encoded features."""
    category_columns = [
        'adnw_id',
        'adspot_id',
        'adspot_video_format_id',
        'game_feed_asset_type_id',
        'auction_type_id',
        'category_id',
        'header_bidding',
        'is_interstitial',
        'os',
        # 'os_version' is currently disabled
        'pos',
        'user_type_id',
    ]
    selected = input_df.loc[:, category_columns]
    return selected.copy()

#### country_code

In [14]:
def create_countrycode(input_df):
    """Label-encode ``country_code`` into a one-column frame named ``country``.

    Missing codes are replaced by the string 'None' before encoding. The
    encoder is fitted on a fixed list of codes, so it only knows those four
    values.
    """
    known_codes = ['None', 'JP', 'US', 'KR']
    encoder = preprocessing.LabelEncoder()
    encoder.fit(known_codes)
    codes = input_df['country_code'].fillna('None')
    return pd.DataFrame(encoder.transform(codes), columns=['country'])

#### date系

In [15]:
def create_date_features(input_df):
    date_df = pd.DataFrame(pd.to_datetime(input_df['imp_at'], utc=True))
    date_df['imp_at'] = date_df['imp_at'].dt.tz_convert('Asia/Tokyo')
    date_df['day'] = date_df['imp_at'].dt.day
    date_df['hour'] = date_df['imp_at'].dt.hour
    date_df['total_minute'] = date_df['imp_at'].dt.hour*60+date_df['imp_at'].dt.minute
    date_df['dayofweek'] = date_df['imp_at'].dt.dayofweek
    date_df.drop(columns=['imp_at'], inplace=True)
    return date_df

#### campaign

In [16]:
def create_campaign_features(input_df):
    """Attach the campaign attributes for each row's ``campaign_id``.

    Left-joins ``campaign_df`` on its ``id`` column, then removes the join
    keys and the advertiser / order id columns so only the remaining
    campaign columns are returned.
    """
    joined = pd.merge(
        input_df[['campaign_id']],
        campaign_df,
        how='left',
        left_on='campaign_id',
        right_on='id',
    )
    not_features = ['campaign_id', 'id', 'mst_advertiser_id', 'mst_advertiser_order_id']
    return joined.drop(columns=not_features)

#### map_game_feed_native_video_assets

In [17]:
def create_gamefeed_features(input_df):
    """Join video and converted-video attributes for both orientations.

    Steps, in order:
      1. Left-join ``map_gv_df`` on the game feed id.
      2. For 'horizontal' then 'vertical', left-join ``ad_video_df`` on
         (``<side>_mst_advertiser_video_id``, ``advertiser_id``); the joined
         columns are prefixed with ``<side>_``.
      3. For 'horizontal' then 'vertical', left-join ``ad_cvideo_df`` on
         (``<side>_mst_advertiser_video_id``, ``game_feed_id``,
         ``video_template_id``); the joined columns are prefixed with
         ``<side>_converted_``.
    All join-key columns are dropped before returning.
    """
    request_keys = ['game_feed_id', 'advertiser_id', 'video_template_id']
    sides = ('horizontal', 'vertical')

    def _join_prefixed(base, lookup, prefix, left_keys, right_keys):
        # Prefix every non-key column of the lookup, left-join it, then
        # discard the lookup's key columns.
        renamed = lookup.rename(
            columns=lambda c: c if c in right_keys else f'{prefix}{c}'
        )
        joined = pd.merge(base, renamed, left_on=left_keys, right_on=right_keys, how='left')
        return joined.drop(columns=right_keys)

    merged = pd.merge(
        input_df[request_keys],
        map_gv_df,
        left_on='game_feed_id',
        right_on='mst_game_feed_id',
        how='left',
    ).drop(columns=['mst_game_feed_id'])

    video_keys = ['id', 'mst_advertiser_id']
    for side in sides:
        merged = _join_prefixed(
            merged,
            ad_video_df,
            f'{side}_',
            [f'{side}_mst_advertiser_video_id', 'advertiser_id'],
            video_keys,
        )

    converted_keys = [
        "mst_advertiser_video_id",
        "mst_game_feed_id",
        "mst_video_template_id",
    ]
    for side in sides:
        merged = _join_prefixed(
            merged,
            ad_cvideo_df,
            f'{side}_converted_',
            [f'{side}_mst_advertiser_video_id', "game_feed_id", "video_template_id"],
            converted_keys,
        )

    video_id_columns = [f'{side}_mst_advertiser_video_id' for side in sides]
    return merged.drop(columns=request_keys + video_id_columns)

In [18]:
# Feature builders applied by to_feature(), in this order; each takes the raw
# frame and returns a frame with one row per input row. The order fixes the
# column order of the final feature matrix.
processors = [
    create_continuous_features,
    create_category_features,
    create_countrycode,
    create_date_features,
    create_campaign_features,
    create_gamefeed_features,
    create_targetencoding_features
]

In [19]:
def to_feature(input_df):
    """Build the feature matrix by running every function in ``processors``.

    Each processor receives ``input_df`` and must return a frame with the
    same number of rows; an AssertionError carrying the processor's name is
    raised otherwise. The per-processor frames are concatenated column-wise
    in processor order.

    Returns:
        The combined feature DataFrame (an empty DataFrame when
        ``processors`` is empty).
    """
    frames = []
    for func in processors:
        _df = func(input_df)
        assert len(_df) == len(input_df), func.__name__
        frames.append(_df)

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the growing frame on every iteration.
    return pd.concat(frames, axis=1)

In [20]:
# Build the feature matrices for train and test with the same processors,
# and keep the training labels separately.
train_feat_df = to_feature(train_df)
test_feat_df = to_feature(test_df)
y = train_df['target']

In [21]:
# The joins inside the processors must not add or drop rows.
assert len(train_feat_df) == len(train_df)
assert len(test_feat_df) == len(test_df)

In [22]:
# Number of feature columns; printed in the run summary at the end.
feature_count = len(train_feat_df.columns)

In [23]:
#del train_df, test_df

#### LightGBM による学習

In [24]:
from sklearn.metrics import average_precision_score

def pr_auc(y_pred, y_true):
    """Custom LightGBM metric: PR-AUC (average precision) evaluated every round.

    ``y_pred`` holds the predicted scores and ``y_true`` is an object whose
    ``get_label()`` returns the true labels. Returns the tuple
    (metric name, score, higher-is-better flag).
    """
    labels = y_true.get_label()
    value = average_precision_score(labels, y_pred)
    return "pr_auc", value, True

In [25]:
# LightGBM parameters used for every fold. The values match the
# "Best Params" block recorded in the tuning notes at the end of the notebook.
lgbm_param = {
    'objective' : 'binary',
    'boosting_type': 'gbdt',
    'seed' : 0,
    'learning_rate':  0.1,
    'feature_pre_filter': False, 
    # regularisation
    'lambda_l1': 6.419345380049121e-07, 
    'lambda_l2': 8.432801302426078, 
    # tree shape
    'num_leaves': 212, 
    # column / row subsampling
    'feature_fraction': 0.4, 
    'bagging_fraction': 0.9907178796872467, 
    'bagging_freq': 2, 
    'min_child_samples': 100
}

In [26]:
def train_lgbm(X, y, cv, params: dict, verbose=100, early_stopping_rounds=None):
    """Train one LightGBM model per CV fold and collect out-of-fold predictions.

    Args:
        X: feature DataFrame, indexed positionally with ``.iloc``.
        y: target Series aligned row-for-row with ``X``.
        cv: iterable of ``(train_idx, valid_idx)`` positional index arrays.
        params: LightGBM parameter dict passed to ``lgbm.train``.
        verbose: evaluation logging period (``verbose_eval``).
        early_stopping_rounds: rounds without improvement before stopping.
            ``None`` (default) reuses ``verbose``, which is what this
            function always did before the parameter existed.

    Returns:
        ``(oof_pred, models, score)``: the out-of-fold prediction array, the
        list of fitted boosters (one per fold) and the PR-AUC of the
        out-of-fold predictions against ``y``.
    """
    if early_stopping_rounds is None:
        early_stopping_rounds = verbose

    models = []
    # Out-of-fold buffer, one slot per training row. It must be float:
    # an integer buffer would truncate the predicted probabilities.
    # (`np.float` was removed in NumPy 1.24; the builtin `float` is equivalent.)
    oof_pred = np.zeros(len(y), dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv):
        # Split the training data into this fold's train / validation parts.
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        lgbm_train = lgbm.Dataset(x_train, y_train)
        lgbm_eval = lgbm.Dataset(x_valid, y_valid, reference=lgbm_train)

        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments of the LightGBM 3.x `train()`; LightGBM 4.0 removed them
        # in favour of callbacks -- confirm the installed version.
        lgbm_model = lgbm.train(params,
                                lgbm_train,
                                valid_sets=lgbm_eval,
                                num_boost_round=1000,
                                early_stopping_rounds=early_stopping_rounds,
                                feval=pr_auc,
                                verbose_eval=verbose)
        # Predict with the early-stopped iteration, not the last one trained.
        y_pred = lgbm_model.predict(x_valid, num_iteration=lgbm_model.best_iteration)

        oof_pred[idx_valid] = y_pred
        models.append(lgbm_model)

        print(f'Fold {i} PR-AUC: {average_precision_score(y_valid, y_pred):.4f}')

    score = average_precision_score(y, oof_pred)
    # Escaped backslash: prints the same text without the invalid-escape warning.
    print('FINISHED \\ whole score: {:.4f}'.format(score))
    return oof_pred, models, score

In [27]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified split with a fixed seed so the folds are reproducible.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv = list(fold.split(train_feat_df, y)) # split() returns a generator, so convert it to a list explicitly

In [28]:
def tuning_lgbm(X, y, cv, params, verbose=100):
    """Fit one model on the first CV fold and print its params, best iteration and best score.

    Only ``cv[0]`` is used. ``verbose`` is passed as the early-stopping
    patience; per-round logging is disabled. Nothing is returned.

    NOTE(review): with the plain ``lightgbm`` import this is a single
    training run; the commented-out optuna import in the first cell suggests
    it was meant to run under optuna's LightGBM tuner -- confirm.
    """
    idx_train, idx_valid = cv[0]
    lgbm_train = lgbm.Dataset(X.iloc[idx_train], y.iloc[idx_train])
    lgbm_eval = lgbm.Dataset(X.iloc[idx_valid], y.iloc[idx_valid], reference=lgbm_train)

    best = lgbm.train(params,
                      lgbm_train,
                      valid_sets=lgbm_eval,
                      num_boost_round=1000,
                      early_stopping_rounds=verbose,
                      feval=pr_auc,
                      verbose_eval=0)
    print('Best Params:', best.params)
    print('Best Iteration:', best.best_iteration)
    print('Best Score:', best.best_score)

In [29]:
# %%time
# tuning_lgbm(train_feat_df, y, cv, params=lgbm_param)

In [30]:
%%time
# Train one model per fold; keeps the out-of-fold predictions, the fold
# models (reused for test prediction below) and the overall PR-AUC.
oof, models, score = train_lgbm(train_feat_df, y, cv, params=lgbm_param)

[LightGBM] [Info] Number of positive: 52392, number of negative: 1545684
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2487
[LightGBM] [Info] Number of data points in the train set: 1598076, number of used features: 44
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.032784 -> initscore=-3.384468
[LightGBM] [Info] Start training from score -3.384468
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.103364	valid_0's pr_auc: 0.257655
[200]	valid_0's binary_logloss: 0.103193	valid_0's pr_auc: 0.25864
Early stopping, best iteration is:
[167]	valid_0's binary_logloss: 0.103174	valid_0's pr_auc: 0.258858
Fold 0 PR-AUC: 0.2589
[LightGBM] [Info] Number of positive: 52392, number of negative: 1545684
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total B

In [31]:
def feat_imp(model):
    """Return one model's feature importances as a frame sorted ascending by ``imp``.

    The frame has the columns ``name`` (from ``model.feature_name()``) and
    ``imp`` (from ``model.feature_importance()``).
    """
    fi = model.feature_importance()
    fn = model.feature_name()
    df_feature_importance = pd.DataFrame({'name': fn, 'imp': fi})
    return df_feature_importance.sort_values('imp')


def feature_importance(models):
    """Combine the importances of several models into one table.

    Returns a frame with ``name``, one ``imp_<i>`` column per model and a
    ``sum`` column holding the per-feature total, sorted by ``sum`` in
    descending order.
    """
    fi = pd.DataFrame(columns=['name'])
    for i, model in enumerate(models):
        fi_tmp = feat_imp(model).rename(columns={'imp': 'imp_{}'.format(i)})
        fi = pd.merge(fi, fi_tmp, on=['name'], how='outer')

    # Sum only the importance columns: including the string `name` column in
    # a row-wise sum raises TypeError on pandas >= 2.0.
    imp_columns = [c for c in fi.columns if c != 'name']
    fi['sum'] = fi[imp_columns].sum(axis=1)
    return fi.sort_values(['sum'], ascending=False)

In [32]:
# Show the per-fold and summed importances, highest total first.
feature_importance(models)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,imp_4,sum
41,total_minute,2964,4362,3922,2845,5096,19189
43,max_login_interval,3167,4145,3647,3106,4840,18905
42,last_login_interval,2978,4353,3493,2945,4821,18590
40,horizontal_file_size,2690,3637,3062,2526,4298,16213
38,first_login_interval,2427,3898,3027,2334,4487,16173
39,app_tgt,2483,3577,2911,2362,4209,15542
37,media_tgt,2245,3148,2891,2180,3698,14162
36,horizontal_converted_file_size,2044,3088,2596,1954,3409,13091
35,advertiser_tgt,1980,2667,2282,1875,3106,11910
34,day,1721,2738,2057,1652,3148,11316


In [33]:
%%time
# Score the test set with every fold model, each at its own early-stopped
# iteration, then average the fold predictions element-wise.
pred_list = []
for model in models:
    best_iter = model.best_iteration
    print('best_iteration', best_iter)
    pred_list.append(model.predict(test_feat_df, num_iteration=best_iter))

pred = np.asarray(pred_list).mean(axis=0)

best_iteration 167
best_iteration 241
best_iteration 200
best_iteration 162
best_iteration 275
CPU times: user 1min 34s, sys: 257 ms, total: 1min 35s
Wall time: 5.52 s


In [34]:
# One averaged prediction per test row.
assert len(pred) == len(test_df)

In [35]:
# Write the averaged predictions as a single `target` column, without the index.
sub_df = pd.DataFrame({ 'target': pred })
sub_df.to_csv(os.path.join(OUTPUT_DIR, 'submission.csv'), index=False)

In [36]:
# Print the run summary in the same bullet format as the notes kept below.
print('- feature={}'.format(feature_count))
print('- score={:.4f}'.format(score))

- feature=44
- score=0.2589


#### simple_4: target_enc
- Wall time: 8min
- feature=44
- score=0.2589
- publicLB= 0.1973

#### simple_3: tuning
- Wall time: 7min 19s
- feature=41
- score=0.2229
- publicLB= 0.1970

#### simple_2
- Wall time: 47min 46s
- feature= 41
- score= 0.214588
- publicLB= 0.1907

#### tuning
```
Best Params: {
    'objective': 'binary', 
    'boosting_type': 'gbdt', 
    'seed': 0, 
    'learning_rate': 0.1, 
    'feature_pre_filter': False, 
    'lambda_l1': 6.419345380049121e-07, 
    'lambda_l2': 8.432801302426078, 
    'num_leaves': 212, 
    'feature_fraction': 0.4, 
    'bagging_fraction': 0.9907178796872467, 
    'bagging_freq': 2, 
    'min_child_samples': 100, 
    'num_iterations': 1000, 
    'early_stopping_round': 100
}
Best Iteration: 245
Best Score: 'pr_auc', 0.22382995580267329
```