In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import preprocessing

In [2]:
# Relative directories used for reading input tables and writing the submission.
INPUT_DIR = '../input/'
OUTPUT_DIR = '../output/'

In [3]:
def read_feather(filename):
    """Load a feather file from ``INPUT_DIR`` and report its shape.

    Parameters
    ----------
    filename : str
        File name, joined onto ``INPUT_DIR``.

    Returns
    -------
    pandas.DataFrame
        The loaded table. The file name and the table's shape are printed
        as a side effect.
    """
    path = os.path.join(INPUT_DIR, filename)
    frame = pd.read_feather(path)
    print(filename, frame.shape)
    return frame

In [4]:
# Load the train and test tables (read_feather prints each file's shape).
train_df = read_feather('train.f')
test_df = read_feather('test.f')

train.f (1997595, 35)
test.f (390095, 30)


In [5]:
# Load the auxiliary lookup tables that the feature functions below join against.
campaign_df = read_feather('campaign.f')
map_gv_df = read_feather('map_game_feed_native_video_assets.f')
ad_video_df = read_feather('advertiser_video.f')
ad_cvideo_df = read_feather('advertiser_converted_video.f')

campaign.f (14627, 4)
map_game_feed_native_video_assets.f (2796, 3)
advertiser_video.f (11707, 6)
advertiser_converted_video.f (198622, 8)


In [6]:
# Encoder for the two known rectangle orientations.
le = preprocessing.LabelEncoder()
le.fit(['vertical', 'horizontal'])

# One row per (advertiser video, game feed, video template) key, keeping the
# last occurrence; then swap the string 'rectangle_type' column for its
# integer encoding 'rectangle_type_id'.
ad_cvideo_df = (
    ad_cvideo_df
    .drop_duplicates(
        subset=['mst_advertiser_video_id', 'mst_game_feed_id', 'mst_video_template_id'],
        keep='last',
    )
    .assign(rectangle_type_id=lambda frame: le.transform(frame['rectangle_type']))
    .drop(columns=['rectangle_type'])
)
print('ad_cvideo_df', ad_cvideo_df.shape)

ad_cvideo_df (107493, 8)


#### 連続変数の特徴量

In [7]:
def create_continuous_features(input_df):
    """Return a copy of the continuous-valued columns of ``input_df``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain every column listed below; a missing column raises
        ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Independent copy holding only the selected columns, in the order
        listed, with the input's index.
    """
    continuous_columns = (
        'first_login_interval',
        'max_login_interval',
        'frequency',
        'login_frequency',
        'last_login_interval',
        'from_click',
    )
    selected = input_df.loc[:, list(continuous_columns)]
    return selected.copy()

#### Category系の特徴量

In [8]:
def create_category_features(input_df):
    """Return a copy of the categorical id columns of ``input_df``.

    The values are passed through unchanged and used downstream as
    label-encoded categories.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain every column listed below; a missing column raises
        ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Independent copy holding only the selected columns, in the order
        listed, with the input's index.
    """
    # 'os_version' is deliberately left out of the selection.
    category_columns = (
        'adnw_id',
        'adspot_id',
        'adspot_video_format_id',
        'game_feed_asset_type_id',
        'auction_type_id',
        'category_id',
        'header_bidding',
        'is_interstitial',
        'os',
        'pos',
        'user_type_id',
    )
    selected = input_df.loc[:, list(category_columns)]
    return selected.copy()

#### country_code

In [9]:
def create_countrycode(input_df):
    """Label-encode ``country_code`` into a single integer ``country`` column.

    Missing country codes are replaced by the string ``'None'`` before
    encoding. The encoder is fitted on the fixed vocabulary
    ``['None', 'JP', 'US', 'KR']``; a value outside that vocabulary makes
    ``LabelEncoder.transform`` raise ``ValueError``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain a ``country_code`` column.

    Returns
    -------
    pandas.DataFrame
        One column, ``country``, carrying the same index as ``input_df``.
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(['None', 'JP', 'US', 'KR'])
    codes = encoder.transform(input_df['country_code'].fillna('None'))
    # Keep the caller's index: the result is concatenated column-wise with
    # other feature frames, and a fresh RangeIndex would misalign the rows
    # whenever input_df is not indexed 0..n-1.
    return pd.DataFrame({'country': codes}, index=input_df.index)

#### date系

In [10]:
def create_date_features(input_df):
    """Derive calendar features from the ``imp_at`` timestamp column.

    ``imp_at`` is parsed as UTC and converted to the ``Asia/Tokyo`` time
    zone before the features are extracted.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain an ``imp_at`` column parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        Columns ``day``, ``hour``, ``total_minute`` (minutes since local
        midnight) and ``dayofweek``, indexed like ``input_df``.
    """
    local_ts = pd.to_datetime(input_df['imp_at'], utc=True).dt.tz_convert('Asia/Tokyo')
    parts = local_ts.dt
    return pd.DataFrame({
        'day': parts.day,
        'hour': parts.hour,
        'total_minute': parts.hour * 60 + parts.minute,
        'dayofweek': parts.dayofweek,
    })

#### campaign

In [11]:
def create_campaign_features(input_df):
    """Attach campaign attributes to each row via ``campaign_id``.

    Left-joins ``input_df['campaign_id']`` onto ``campaign_df['id']`` and
    then removes the join keys and the advertiser / advertiser-order id
    columns, leaving only the remaining ``campaign_df`` columns.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain a ``campaign_id`` column.

    Returns
    -------
    pandas.DataFrame
        The merge result (freshly indexed by the merge) without the dropped
        columns.
    """
    unwanted = ['campaign_id', 'id', 'mst_advertiser_id', 'mst_advertiser_order_id']
    joined = input_df[['campaign_id']].merge(
        campaign_df, how='left', left_on='campaign_id', right_on='id'
    )
    return joined.drop(columns=unwanted)

#### map_game_feed_native_video_assets

In [12]:
def create_gamefeed_features(input_df):
    """Attach video-asset attributes for the horizontal and vertical creatives.

    Steps, all left joins in this order:

    1. ``game_feed_id`` -> ``map_gv_df`` (on ``mst_game_feed_id``).
    2. ``ad_video_df`` joined twice, on ``<side>_mst_advertiser_video_id``
       and ``advertiser_id``; its non-key columns are prefixed
       ``horizontal_`` / ``vertical_``.
    3. ``ad_cvideo_df`` joined twice, on ``<side>_mst_advertiser_video_id``,
       ``game_feed_id`` and ``video_template_id``; its non-key columns are
       prefixed ``horizontal_converted_`` / ``vertical_converted_``.

    The id columns used only for joining are removed from the result.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain ``game_feed_id``, ``advertiser_id`` and
        ``video_template_id``.

    Returns
    -------
    pandas.DataFrame
        The joined attribute columns (freshly indexed by the merges).
    """

    def _join_prefixed(base, lookup, prefix, left_keys, right_keys):
        # Prefix every non-key lookup column, left-join, then drop the lookup keys.
        renamed = lookup.copy()
        renamed.columns = [c if c in right_keys else f'{prefix}{c}' for c in renamed.columns]
        joined = pd.merge(base, renamed, left_on=left_keys, right_on=right_keys, how='left')
        return joined.drop(columns=right_keys)

    sides = ('horizontal', 'vertical')

    merged = pd.merge(
        input_df[['game_feed_id', 'advertiser_id', 'video_template_id']],
        map_gv_df,
        left_on='game_feed_id',
        right_on='mst_game_feed_id',
        how='left',
    )
    merged = merged.drop(columns=['mst_game_feed_id'])

    # Source video attributes: horizontal first, then vertical.
    for side in sides:
        merged = _join_prefixed(
            merged,
            ad_video_df,
            f'{side}_',
            [f'{side}_mst_advertiser_video_id', 'advertiser_id'],
            ['id', 'mst_advertiser_id'],
        )

    # Converted video attributes: horizontal first, then vertical.
    for side in sides:
        merged = _join_prefixed(
            merged,
            ad_cvideo_df,
            f'{side}_converted_',
            [f'{side}_mst_advertiser_video_id', 'game_feed_id', 'video_template_id'],
            ['mst_advertiser_video_id', 'mst_game_feed_id', 'mst_video_template_id'],
        )

    join_only_columns = [
        'game_feed_id',
        'advertiser_id',
        'video_template_id',
        'horizontal_mst_advertiser_video_id',
        'vertical_mst_advertiser_video_id',
    ]
    return merged.drop(columns=join_only_columns)

In [13]:
# Feature builders applied in order by to_feature(); each takes the raw frame
# and returns a frame with one row per input row.
processors = [
    create_continuous_features,
    create_category_features,
    create_countrycode,
    create_date_features,
    create_campaign_features,
    create_gamefeed_features
]

In [14]:
def to_feature(input_df, funcs=None):
    """Build the model feature matrix by running every feature processor.

    Each processor is called with ``input_df`` and must return a frame with
    exactly ``len(input_df)`` rows; the results are joined column-wise.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Raw rows to featurize.
    funcs : iterable of callables, optional
        Processors to apply, in order. Defaults to the module-level
        ``processors`` list.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of all processor outputs (an empty frame
        when there are no processors).

    Raises
    ------
    AssertionError
        If a processor returns a different number of rows than the input;
        the message is the processor's ``__name__``.
    """
    if funcs is None:
        funcs = processors

    parts = []
    for func in funcs:
        part = func(input_df)
        assert len(part) == len(input_df), func.__name__
        parts.append(part)

    if not parts:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop, which
    # copies the accumulated columns on every iteration.
    return pd.concat(parts, axis=1)

In [15]:
# Build the feature matrices for train and test with the same processor pipeline.
train_feat_df = to_feature(train_df)
test_feat_df = to_feature(test_df)

In [16]:
# Sanity check: featurization must neither drop nor duplicate rows.
assert len(train_feat_df) == len(train_df)
assert len(test_feat_df) == len(test_df)

In [17]:
# Number of feature columns; printed in the run summary at the end.
feature_count = len(train_feat_df.columns)

#### LightGBM による学習

In [18]:
from sklearn.metrics import average_precision_score
import lightgbm as lgbm

def pr_auc(y_pred, y_true):
    """Custom evaluation function reporting PR-AUC on each LightGBM round.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores for the evaluation rows.
    y_true : object exposing ``get_label()``
        Evaluation data; its labels are read through ``get_label()``.

    Returns
    -------
    tuple
        ``("pr_auc", score, True)``: the metric name, the average-precision
        score, and a flag marking higher values as better.
    """
    labels = y_true.get_label()
    value = average_precision_score(labels, y_pred)
    return "pr_auc", value, True

In [19]:
# LightGBM parameters for the binary classifier: gradient-boosted trees with a
# fixed seed, learning rate 0.1 and trees limited to depth 6.
lgbm_param = {
    'objective' : 'binary',
    'boosting_type': 'gbdt',
    'seed' : 0,
    'learning_rate':  0.1,
    'max_depth': 6,
}

In [20]:
def train_lgbm(X, y, cv, params: dict, verbose=100):
    """Train one LightGBM model per CV fold and collect out-of-fold predictions.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target values aligned row-for-row with ``X``.
    cv : iterable of (train_idx, valid_idx)
        Positional index arrays for each fold.
    params : dict
        LightGBM parameters forwarded to ``lgbm.train``.
    verbose : int, default 100
        Used both as the evaluation logging period and as the number of
        rounds without improvement before early stopping. A falsy value
        disables both.

    Returns
    -------
    oof_pred : numpy.ndarray
        Out-of-fold prediction for every training row.
    models : list
        The trained booster of each fold, in fold order.
    score : float
        Average-precision score of ``oof_pred`` against ``y``.
    """
    models = []
    # Zero array shaped like the target. It must be floating point, otherwise
    # the predicted probabilities would be truncated on assignment. The builtin
    # ``float`` is used because the ``np.float`` alias was removed in NumPy 1.24.
    oof_pred = np.zeros_like(y, dtype=float)

    # LightGBM 4.0 removed the ``early_stopping_rounds`` / ``verbose_eval``
    # keyword arguments of ``train`` in favour of callbacks; use callbacks when
    # this LightGBM provides them and fall back to the keywords otherwise.
    if hasattr(lgbm, 'log_evaluation'):
        callbacks = []
        if verbose:
            callbacks.append(lgbm.early_stopping(stopping_rounds=verbose))
            callbacks.append(lgbm.log_evaluation(period=verbose))
        control_kwargs = {'callbacks': callbacks}
    else:
        control_kwargs = {'early_stopping_rounds': verbose, 'verbose_eval': verbose}

    for i, (idx_train, idx_valid) in enumerate(cv):
        # Cross-validation step: split the training data into this fold's
        # train / validation parts.
        x_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        x_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

        lgbm_train = lgbm.Dataset(x_train, y_train)
        lgbm_eval = lgbm.Dataset(x_valid, y_valid, reference=lgbm_train)

        lgbm_model = lgbm.train(
            params,
            lgbm_train,
            valid_sets=[lgbm_eval],
            num_boost_round=1000,
            feval=pr_auc,
            **control_kwargs,
        )
        y_pred = lgbm_model.predict(x_valid, num_iteration=lgbm_model.best_iteration)

        oof_pred[idx_valid] = y_pred
        models.append(lgbm_model)

        print(f'Fold {i} PR-AUC: {average_precision_score(y_valid, y_pred):.4f}')

    score = average_precision_score(y, oof_pred)
    # The backslash is escaped so the printed text is unchanged while the
    # invalid '\ ' escape sequence (a warning on recent Pythons) is avoided.
    print('FINISHED \\ whole score: {:.4f}'.format(score))
    return oof_pred, models, score

In [21]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified split on the target, shuffled with a fixed seed.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
y = train_df['target']
cv = list(fold.split(train_feat_df, y)) # split() yields a generator, so materialize it as a list explicitly

In [22]:
%%time
# Train one model per fold; keeps the out-of-fold predictions, the fold models and the overall PR-AUC.
oof, models, score = train_lgbm(train_feat_df, y, cv, params=lgbm_param)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.111254	valid_0's pr_auc: 0.200362
[200]	valid_0's binary_logloss: 0.110052	valid_0's pr_auc: 0.206499
[300]	valid_0's binary_logloss: 0.109378	valid_0's pr_auc: 0.210591
[400]	valid_0's binary_logloss: 0.108965	valid_0's pr_auc: 0.212557
[500]	valid_0's binary_logloss: 0.108658	valid_0's pr_auc: 0.214089
[600]	valid_0's binary_logloss: 0.108476	valid_0's pr_auc: 0.214924
[700]	valid_0's binary_logloss: 0.108404	valid_0's pr_auc: 0.214901
Early stopping, best iteration is:
[628]	valid_0's binary_logloss: 0.108425	valid_0's pr_auc: 0.215074
Fold 0 PR-AUC: 0.2151
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.1105	valid_0's pr_auc: 0.20174
[200]	valid_0's binary_logloss: 0.109048	valid_0's pr_auc: 0.20881
[300]	valid_0's binary_logloss: 0.108398	valid_0's pr_auc: 0.211808
[400]	valid_0's binary_logloss: 0.108035	valid_0's pr_auc: 0.213381
[500]	v

In [23]:
def feat_imp(model):
    """Return one model's feature importances as a frame sorted ascending.

    Parameters
    ----------
    model : object exposing ``feature_importance()`` and ``feature_name()``
        Typically a trained booster.

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` and ``imp``, sorted by ``imp`` in ascending order.
    """
    fi = model.feature_importance()
    fn = model.feature_name()
    importance = pd.DataFrame({'name': fn, 'imp': fi})
    return importance.sort_values('imp')


def feature_importance(models):
    """Combine the feature importances of several models into one table.

    Parameters
    ----------
    models : iterable
        Models accepted by ``feat_imp``.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns ``name``, ``imp_0`` ... ``imp_{k-1}``
        (one per model, outer-joined on ``name``) and ``sum``, sorted by
        ``sum`` in descending order.
    """
    fi = pd.DataFrame(columns=['name'])
    for i, model in enumerate(models):
        fi_tmp = feat_imp(model).rename(columns={'imp': 'imp_{}'.format(i)})
        fi = pd.merge(fi, fi_tmp, on=['name'], how='outer')

    # Sum only the importance columns. Summing the whole frame would include
    # the string 'name' column, which raises TypeError on pandas >= 2.0 and
    # only worked on older versions because non-numeric columns were dropped.
    imp_columns = [c for c in fi.columns if c != 'name']
    fi['sum'] = fi[imp_columns].sum(axis=1)
    return fi.sort_values(['sum'], ascending=False)

In [24]:
# Show per-fold and summed feature importances, most important first.
feature_importance(models)

Unnamed: 0,name,imp_0,imp_1,imp_2,imp_3,imp_4,sum
40,max_login_interval,2178,3250,3581,2827,3265,15101
38,last_login_interval,2093,2857,3356,2601,3080,13987
39,horizontal_file_size,2116,2813,3058,2595,2758,13340
37,first_login_interval,1872,2807,3190,2486,2757,13112
36,total_minute,1459,2219,2606,1937,2215,10436
34,horizontal_converted_file_size,1194,1817,2158,1594,1778,8541
35,day,1246,1777,1951,1571,1688,8233
33,horizontal_duration,1025,1155,1356,1166,1203,5905
32,mst_user_type_id,812,997,1097,904,955,4765
31,login_frequency,724,953,1115,891,1004,4687


In [25]:
%%time
# Predict the test set with every fold model at its best iteration,
# then average the per-fold predictions into the final score.
pred_list = []
for model in models:
    print('best_iteration', model.best_iteration)
    pred = model.predict(test_feat_df, num_iteration = model.best_iteration)
    pred_list.append(pred)
    
pred = np.mean(pred_list, axis=0)

best_iteration 628
best_iteration 874
best_iteration 999
best_iteration 792
best_iteration 884
CPU times: user 3min 48s, sys: 2.85 s, total: 3min 50s
Wall time: 1min 30s


In [26]:
# Sanity check: exactly one prediction per test row.
assert len(pred) == len(test_df)

In [27]:
# Write the averaged predictions as a single-column 'target' CSV without the index.
sub_df = pd.DataFrame({ 'target': pred })
sub_df.to_csv(os.path.join(OUTPUT_DIR, 'submission.csv'), index=False)

In [29]:
# Run summary: number of feature columns and the out-of-fold PR-AUC.
print('- feature=', feature_count)
print('- score=', score)

- feature= 41
- score= 0.21458819940873314


#### simple_2
- feature= 41
- score= 0.214588
- publicLB= 0.1907