- add cross values summary info
- add xgb ts model for ensemble

In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from datetime import timedelta
from sklearn import preprocessing
from tqdm.notebook import tqdm_notebook
from sklearn.metrics import average_precision_score
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')
tqdm_notebook.pandas(desc="progress: ")

In [2]:
DIR = "../input/20201115at/"
train = pd.read_csv(DIR+"train.csv")
test = pd.read_csv(DIR+"test.csv")
campaign = pd.read_csv(DIR+"campaign.csv")

#advertiser_video = pd.read_csv(DIR+"advertiser_video.csv")
#map_game_feed_native_video_assets = pd.read_csv(DIR+'map_game_feed_native_video_assets.csv')
#advertiser_converted_video = pd.read_csv(DIR+"advertiser_converted_video.csv")

In [3]:
#cols = ["mst_advertiser_video_id", "mst_game_feed_id", "mst_video_template_id"]
#advertiser_converted_video = advertiser_converted_video[~advertiser_converted_video[cols].duplicated(keep="last")]

# preprocess

In [4]:
# with campaign
train = train.merge(campaign, left_on="campaign_id", right_on = "id", how="left").drop(["mst_advertiser_id", "id"], axis=1)
test = test.merge(campaign, left_on="campaign_id", right_on = "id", how="left").drop(["mst_advertiser_id", "id"], axis=1)

In [5]:
train["imp_at"] = pd.to_datetime(train["imp_at"])
test["imp_at"] = pd.to_datetime(test["imp_at"])
train["date"] = train["imp_at"].dt.date
test["date"] = test["imp_at"].dt.date
train["hour"] = train["imp_at"].dt.hour
test["hour"] = test["imp_at"].dt.hour

In [6]:
train["click_or_view_at"] = pd.to_datetime(train["click_or_view_at"])
train["cv_at"] = pd.to_datetime(train["cv_at"])

In [7]:
# drop because distribution is different
train.drop(['os_version'], axis=1, inplace=True)
test.drop(['os_version'], axis=1, inplace=True)

# noise features?
train.drop(['user_type_id', "category_id",  "video_template_id", "country_code", "game_feed_asset_type_id", "header_bidding"], axis=1, inplace=True)
test.drop(['user_type_id', "category_id",  "video_template_id", "country_code", "game_feed_asset_type_id", "header_bidding"], axis=1, inplace=True)

In [8]:
app_ranking = pd.concat([train, test]).groupby("app_id")["uid"].nunique().reset_index(drop=False)
app_ranking.columns = ["app_id", "app_users"]
train = train.merge(app_ranking, on="app_id", how="left")
test = test.merge(app_ranking, on="app_id", how="left")

media_app_rank = pd.concat([train, test]).groupby(["game_feed_id", "campaign_id"])["media_app_id"].nunique().reset_index(drop=False)
media_app_rank.columns = ["game_feed_id", "campaign_id", "madia_app_num"]
train = train.merge(media_app_rank, on=["game_feed_id", "campaign_id"], how="left")
test = test.merge(media_app_rank, on=["game_feed_id", "campaign_id"], how="left")

In [9]:
object_feats = [i for i in train.columns if "object" == str(train[i].dtype) 
                and i != "request_id" and i != "uid" and i != "date" and i != "hour"]

def get_non_overlapping(train: pd.DataFrame, test: pd.DataFrame, column: str):
    only_in_train = set(train[column].unique()) - set(test[column].unique())
    only_in_test = set(test[column].unique()) - set(train[column].unique())
    non_overlapping = only_in_train.union(only_in_test)
    return non_overlapping

def category2num(train: pd.DataFrame, test: pd.DataFrame, columns: list):
    train_ = train.copy()
    test_ = test.copy()
    for column in columns:
        non_overlapping = get_non_overlapping(train, test, column)
        try:
            if train[column].dtype == np.dtype("O"):
                # dtypeがobjectなら欠損は'missing' クラスにする
                train_[column] = train[column].fillna("missing")
                test_[column] = test[column].fillna("missing")
                train_[column] = train_[column].map(lambda x: x if x not in non_overlapping else "other")
                test_[column] = test_[column].map(lambda x: x if x not in non_overlapping else "other")
            else:
                # dtypeがint/floatなら欠損は'-1'とする
                train_[column] = train[column].fillna(-1)
                test_[column] = test[column].fillna(-1)
                train_[column] = train_[column].map(lambda x: x if x not in non_overlapping else -2)
                test_[column] = test_[column].map(lambda x: x if x not in non_overlapping else -2)

            le = preprocessing.LabelEncoder()
            concatenated = pd.concat([train_, test_], axis=0).reset_index(drop=True)
            le.fit(concatenated[column])
            train_[column] = le.transform(train_[column])
            test_[column] = le.transform(test_[column])
        except Exception:
            import pdb
            pdb.set_trace()
    return train_, test_

train, test = category2num(train, test, object_feats)

In [10]:
# drop because the exist only in train
train.drop(['click_or_view_at', 'cv_at','last_cv_interval', 'last_paid_interval'], axis=1, inplace=True)

# feature engineering

In [11]:
train["type"] = "train"
test["type"] = "test"
all_df = pd.concat([train, test])
all_df = all_df.sort_values(["uid", "imp_at"]).reset_index(drop=True)

In [12]:
all_df["count"] = 1
all_df["count"] = all_df.groupby("uid")["count"].cumsum()

In [13]:
def _create_whole_imp_at_feature(whole_df, var) -> pd.DataFrame:

    imp_at = pd.to_datetime(whole_df['imp_at']) + timedelta(hours=9) # utc -> asia/tokyo
    out_df = pd.DataFrame()
    out_df['hours'] = imp_at.dt.hour
    out_df['dayofweek'] = imp_at.dt.dayofweek
    out_df['hour_zone'] = pd.cut(out_df['hours'].values, bins=[-np.inf, 6, 12, 18, np.inf]).codes

    def _create_pivot(input_df, c, column='dayofweek', values='hours'):
        _df = pd.pivot_table(data=input_df, index=[c], columns=[column], values=values, aggfunc='count')
        _df = _df.fillna(0)
        _df.columns = [column + '=' + str(x) for x in _df.columns]

        # index ごとに正規化して割合にする
        _df = _df.div(_df.sum(axis=1), axis=0)
        return pd.merge(input_df[c], _df, on=c, how='left').drop(columns=[c]).add_prefix(c + '_')

    # uid での集約
    for c in [var]:
        _df = out_df.copy()
        _df[c] = whole_df[c]
        _feat = pd.concat([
            _create_pivot(_df, c),
            _create_pivot(_df, c, column='hour_zone', values='dayofweek')  
        ], axis=1)
        out_df = pd.concat([out_df, _feat], axis=1)

    out_df['request_id'] = whole_df['request_id']
    return out_df

user_imp_df = _create_whole_imp_at_feature(all_df, "uid")
user_imp_df.drop(["hours","dayofweek","hour_zone"], axis=1, inplace=True)
all_df = all_df.merge(user_imp_df, on="request_id", how="left")

In [14]:
camp_num = all_df.groupby("date")["campaign_id"].nunique().reset_index(drop=False)
camp_num.columns=["date", "camp_num"]
all_df = all_df.merge(camp_num, on="date", how="left")

user_num = all_df.groupby("date")["uid"].nunique().reset_index(drop=False)
user_num.columns=["date", "user_num"]
all_df = all_df.merge(user_num, on="date", how="left")

all_df.drop(["date", "hour"], axis=1, inplace=True)

In [15]:
train = all_df[all_df.type=="train"].drop("type", axis=1).reset_index(drop=True)
test = all_df[all_df.type=="test"].drop("type", axis=1).reset_index(drop=True)

In [16]:
train = train.sort_values("imp_at").reset_index(drop=True)
test = test.sort_values("imp_at").reset_index(drop=True)

In [17]:
ce_feats = ["uid"]
for c in ce_feats:
    freq = pd.concat([train[c], test[c]]).value_counts()
    train[c] = train[c].map(freq)
    test[c] = test[c].map(freq)

# separation for validation

In [18]:
train.drop(['request_id'], axis=1, inplace=True)
test.drop(['request_id'], axis=1, inplace=True)

In [19]:
mask = train.imp_at > "2020-05-21"
valid_df = train.loc[mask].copy()
train_df = train.loc[~mask].copy()

In [20]:
train = train.drop(["imp_at"], axis=1) 
test = test.drop(["imp_at"], axis=1) 

valid_df = valid_df.drop(["imp_at"], axis=1)
train_df = train_df.drop(["imp_at"], axis=1)

In [21]:
test = test.drop("target", axis=1)

In [22]:
train.shape, test.shape 

((1997595, 40), (390095, 39))

In [23]:
train.columns

Index(['target', 'adnw_id', 'adspot_id', 'adspot_video_format_id',
       'advertiser_id', 'app_id', 'auction_type_id', 'campaign_id',
       'first_login_interval', 'frequency', 'from_click', 'game_feed_id',
       'game_template_id', 'is_interstitial', 'item_id', 'last_login_interval',
       'login_frequency', 'max_login_interval', 'media_app_id', 'os', 'pos',
       'uid', 'mst_advertiser_order_id', 'mst_user_type_id', 'app_users',
       'madia_app_num', 'count', 'uid_dayofweek=0', 'uid_dayofweek=1',
       'uid_dayofweek=2', 'uid_dayofweek=3', 'uid_dayofweek=4',
       'uid_dayofweek=5', 'uid_dayofweek=6', 'uid_hour_zone=0',
       'uid_hour_zone=1', 'uid_hour_zone=2', 'uid_hour_zone=3', 'camp_num',
       'user_num'],
      dtype='object')

In [24]:
object_feats = [i for i in object_feats if i in train.columns] 
object_feats

['advertiser_id',
 'app_id',
 'campaign_id',
 'game_feed_id',
 'game_template_id',
 'media_app_id',
 'mst_advertiser_order_id']

In [25]:
del all_df, user_imp_df

# stratified lgb

In [26]:
# stratified k fold
n_folds = 5
def pr_auc_metric(y_predicted, y_true):
    return 'pr_auc', average_precision_score(y_true.get_label(), y_predicted), True

def modelling(seed):
    print("seed:", seed)
    lgbm_params = {'objective': 'binary', 'metric': 'None', 'boosting_type': 'gbdt', 
               'tree_learner': 'serial', 'learning_rate': 0.1, "num_leaves": 10, 'random_seed':seed,'max_depth': 5}
    
    y = train.target.copy()
    X = train.drop(['target'],axis=1).copy()
    
    skf=StratifiedKFold(n_splits = n_folds, shuffle=True, random_state=0)
    
    valid = np.zeros([X.shape[0]])
    features_list = [i for i in X.columns]
    pred_value = np.zeros(test.shape[0])
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    
    for i , (train_index, val_index) in enumerate(skf.split(X, y)):
        print("fold: ", i)
        
        X_train = X.iloc[train_index,:]
        y_train = y.iloc[train_index]

        X_valid = X.iloc[val_index,:]
        y_valid = y.iloc[val_index]
        
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        
        model = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
           num_boost_round=10000,early_stopping_rounds=20,verbose_eval = 30,
                          feval=pr_auc_metric, categorical_feature = sorted(object_feats))
        
        valid_predict = model.predict(X_valid, num_iteration = model.best_iteration)
        valid[val_index] = valid_predict
        pred_value += model.predict(test, num_iteration = model.best_iteration)
        feature_importance_df["Fold_"+str(i+1)] = model.feature_importance()
    
    feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]
    
    return pred_value, valid, feature_importance_df

validation = np.zeros([train.shape[0]])
pred_value = np.zeros([test.shape[0]])
seeds = [0]
for seed_ in seeds:
    test_pred, oof_predict, feature_importance_df = modelling(seed_)
    validation += oof_predict / len(seeds)
    pred_value += test_pred / len(seeds)

score = average_precision_score(train.target, validation)
print(score)

seed: 0
fold:  0
Training until validation scores don't improve for 20 rounds
[30]	training's pr_auc: 0.266824	valid_1's pr_auc: 0.261274
[60]	training's pr_auc: 0.290312	valid_1's pr_auc: 0.279244
[90]	training's pr_auc: 0.303866	valid_1's pr_auc: 0.285887
[120]	training's pr_auc: 0.312721	valid_1's pr_auc: 0.288434
[150]	training's pr_auc: 0.321673	valid_1's pr_auc: 0.291614
[180]	training's pr_auc: 0.330722	valid_1's pr_auc: 0.294979
[210]	training's pr_auc: 0.338105	valid_1's pr_auc: 0.296171
[240]	training's pr_auc: 0.344596	valid_1's pr_auc: 0.297017
[270]	training's pr_auc: 0.351574	valid_1's pr_auc: 0.29803
[300]	training's pr_auc: 0.356813	valid_1's pr_auc: 0.298545
[330]	training's pr_auc: 0.362825	valid_1's pr_auc: 0.298982
[360]	training's pr_auc: 0.36814	valid_1's pr_auc: 0.299464
Early stopping, best iteration is:
[350]	training's pr_auc: 0.366459	valid_1's pr_auc: 0.299616
fold:  1
Training until validation scores don't improve for 20 rounds
[30]	training's pr_auc: 0.266

In [27]:
feature_importance_df.sort_values("Average", ascending=False)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Average,Std,Cv
4,app_id,796,772,567,495,682,662.4,116.076871,0.175237
6,campaign_id,463,436,383,345,421,409.6,41.374388,0.101012
17,media_app_id,416,391,276,249,339,334.2,64.166658,0.192001
10,game_feed_id,243,219,170,129,181,188.4,39.595959,0.21017
20,uid,165,172,156,135,158,157.2,12.448293,0.079188
3,advertiser_id,159,159,136,119,146,143.8,15.118201,0.105134
21,mst_advertiser_order_id,111,102,69,77,102,92.2,16.216041,0.175879
25,count,93,98,78,72,90,86.2,9.682975,0.112331
24,madia_app_num,75,88,58,56,76,70.6,12.026637,0.170349
14,last_login_interval,73,63,53,43,49,56.2,10.628264,0.189115


# ts lgb

In [28]:
def pr_auc_metric(y_predicted, y_true):
    return 'pr_auc', average_precision_score(y_true.get_label(), y_predicted), True

def modelling_ts(seed):
    print("seed:", seed)
    lgbm_params = {'objective': 'binary', 'metric': 'None', 'boosting_type': 'gbdt', 
               'tree_learner': 'serial', 'learning_rate': 0.1, "num_leaves": 10, 'random_seed':seed,'max_depth': 5}
    y_train = train_df.target.copy()
    X_train = train_df.drop(['target'],axis=1).copy()
    y_valid = valid_df.target.copy()
    X_valid = valid_df.drop(['target'],axis=1).copy()
    
    valid_lgb = pd.DataFrame(np.zeros([X_train.shape[0]]))
    real = np.array([])
    features_list = [i for i in X_train.columns]
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
        
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        
    model = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
       num_boost_round=10000,early_stopping_rounds=20,verbose_eval = 10, feval=pr_auc_metric, categorical_feature = sorted(object_feats))
    
    valid_predict = model.predict(X_valid, num_iteration = model.best_iteration)
    feature_importance_df["Importance"] = model.feature_importance()
        
    score = average_precision_score(y_valid, valid_predict)
    print("average precision score = {}".format(score))
    pred_value = model.predict(test, num_iteration = model.best_iteration)
    
    return pred_value, valid_predict, feature_importance_df

validation_ts = np.zeros([valid_df.shape[0]])
pred_value_ts = np.zeros([test.shape[0]])
seeds = [0]
for seed_ in seeds:
    test_pred, oof_predict, feature_importance_df = modelling_ts(seed_)
    validation_ts += oof_predict / len(seeds)
    pred_value_ts += test_pred / len(seeds)

ts_score = average_precision_score(valid_df.target, validation_ts)
print(ts_score)

seed: 0
Training until validation scores don't improve for 20 rounds
[10]	training's pr_auc: 0.251729	valid_1's pr_auc: 0.193945
[20]	training's pr_auc: 0.267496	valid_1's pr_auc: 0.202704
[30]	training's pr_auc: 0.279604	valid_1's pr_auc: 0.208789
[40]	training's pr_auc: 0.288693	valid_1's pr_auc: 0.212989
[50]	training's pr_auc: 0.295939	valid_1's pr_auc: 0.215513
[60]	training's pr_auc: 0.304917	valid_1's pr_auc: 0.218331
[70]	training's pr_auc: 0.310358	valid_1's pr_auc: 0.219703
[80]	training's pr_auc: 0.315263	valid_1's pr_auc: 0.221439
[90]	training's pr_auc: 0.31883	valid_1's pr_auc: 0.222125
[100]	training's pr_auc: 0.324017	valid_1's pr_auc: 0.224034
[110]	training's pr_auc: 0.327504	valid_1's pr_auc: 0.224547
[120]	training's pr_auc: 0.330139	valid_1's pr_auc: 0.225157
[130]	training's pr_auc: 0.333199	valid_1's pr_auc: 0.225695
[140]	training's pr_auc: 0.336918	valid_1's pr_auc: 0.226271
[150]	training's pr_auc: 0.3397	valid_1's pr_auc: 0.226472
[160]	training's pr_auc: 0.3

In [29]:
feature_importance_df.sort_values("Importance", ascending=False)

Unnamed: 0,Feature,Importance
4,app_id,383
6,campaign_id,309
17,media_app_id,152
20,uid,126
3,advertiser_id,112
10,game_feed_id,84
25,count,65
21,mst_advertiser_order_id,62
24,madia_app_num,42
22,mst_user_type_id,36


# stratified xgboost

In [30]:
# https://xgboost.readthedocs.io/en/latest/parameter.html
def pr_auc_metric(y_predicted, y_true):
    return 'pr_auc', - average_precision_score(y_true.get_label(), y_predicted)

n_folds=5
np.random.seed(0)
def modelling_xgb():
    y = train.target
    X = train.drop(['target'],axis=1)
    xgb_params = {"objective" : "binary:logistic", "max_depth" : 5, "learning_rate" : 0.1} #, "tree_method": "gpu_hist"
    
    skf=StratifiedKFold(n_splits = n_folds, shuffle=True, random_state=0)
    
    valid = np.zeros([X.shape[0]])
    pred_value = np.zeros(test.shape[0])
    X_test = xgb.DMatrix(test.copy())
    
    for i , (train_index, val_index) in enumerate(skf.split(X, y)):
        print("fold: ", i)
        
        X_train = X.iloc[train_index,:]
        y_train = y.iloc[train_index]

        X_valid = X.iloc[val_index,:]
        y_valid = y.iloc[val_index]
        
        xgb_train = xgb.DMatrix(X_train, label = y_train)
        xgb_eval = xgb.DMatrix(X_valid, label = y_valid)
        watchlist = [(xgb_train, "train"), (xgb_eval, "eval")]
        num_boost_round = 100000
        
        model = xgb.train(
        xgb_params, xgb_train, num_boost_round, watchlist,
        early_stopping_rounds=10, verbose_eval = 50,
        feval=pr_auc_metric,
    )
                
        valid_predict = model.predict(xgb_eval, ntree_limit = model.best_ntree_limit)
        valid[val_index] = valid_predict
        pred_value += model.predict(X_test, ntree_limit = model.best_ntree_limit)
    
    return pred_value, valid

pred_value_xgb, validation_xgb = modelling_xgb()

score_xgb = average_precision_score(train.target, validation_xgb)
print(score_xgb)

fold:  0
[0]	train-error:0.03270	eval-error:0.03269	train-pr_auc:-0.14571	eval-pr_auc:-0.14742
Multiple eval metrics have been passed: 'eval-pr_auc' will be used for early stopping.

Will train until eval-pr_auc hasn't improved in 10 rounds.
[50]	train-error:0.03252	eval-error:0.03258	train-pr_auc:-0.25421	eval-pr_auc:-0.25173
[100]	train-error:0.03218	eval-error:0.03218	train-pr_auc:-0.29044	eval-pr_auc:-0.28501
[150]	train-error:0.03209	eval-error:0.03209	train-pr_auc:-0.30442	eval-pr_auc:-0.29535
[200]	train-error:0.03197	eval-error:0.03206	train-pr_auc:-0.31593	eval-pr_auc:-0.30166
[250]	train-error:0.03190	eval-error:0.03203	train-pr_auc:-0.32545	eval-pr_auc:-0.30576
[300]	train-error:0.03182	eval-error:0.03202	train-pr_auc:-0.33245	eval-pr_auc:-0.30801
[350]	train-error:0.03174	eval-error:0.03199	train-pr_auc:-0.33924	eval-pr_auc:-0.31055
[400]	train-error:0.03167	eval-error:0.03199	train-pr_auc:-0.34601	eval-pr_auc:-0.31279
[450]	train-error:0.03158	eval-error:0.03200	train-pr_a

# xgb ts

In [31]:
# https://xgboost.readthedocs.io/en/latest/parameter.html
def pr_auc_metric(y_predicted, y_true):
    return 'pr_auc', - average_precision_score(y_true.get_label(), y_predicted)

n_folds=5
np.random.seed(0)
def modelling_xgb_ts():
    xgb_params = {"objective" : "binary:logistic", "max_depth" : 5, "learning_rate" : 0.1,}#"tree_method": "gpu_hist"
    
    y_train = train_df.target.copy()
    X_train = train_df.drop(['target'],axis=1).copy()
    y_valid = valid_df.target.copy()
    X_valid = valid_df.drop(['target'],axis=1).copy()
        
    X_test = xgb.DMatrix(test.copy())
        
    xgb_train = xgb.DMatrix(X_train, label = y_train)
    xgb_eval = xgb.DMatrix(X_valid, label = y_valid)
    watchlist = [(xgb_train, "train"), (xgb_eval, "eval")]
    num_boost_round = 10
        
    model = xgb.train(
        xgb_params, xgb_train, num_boost_round, watchlist,
        early_stopping_rounds=100000, verbose_eval = 50,
        feval=pr_auc_metric,
    )
                
    valid_predict = model.predict(xgb_eval, ntree_limit = model.best_ntree_limit)
    pred_value = model.predict(X_test, ntree_limit = model.best_ntree_limit)
    
    return pred_value, valid_predict

pred_value_xgb_ts, validation_xgb_ts = modelling_xgb_ts()

score_xgb_ts = average_precision_score(valid_df.target, validation_xgb_ts)
print(score_xgb_ts)

[0]	train-error:0.03232	eval-error:0.03439	train-pr_auc:-0.14996	eval-pr_auc:-0.11973
Multiple eval metrics have been passed: 'eval-pr_auc' will be used for early stopping.

Will train until eval-pr_auc hasn't improved in 100000 rounds.
[9]	train-error:0.03235	eval-error:0.03437	train-pr_auc:-0.21613	eval-pr_auc:-0.15424
0.15424526817510653


# submission

In [32]:
lgb_sub = pd.read_csv(DIR + "atmaCup7__sample_submission.csv")
lgb_sub["target"] = (pred_value+pred_value_ts+pred_value_xgb+pred_value_xgb_ts)/4
lgb_sub.to_csv("atmacup7_"+str(score)[:-10]+".csv", index = False)