- Refactoring: the `_create_whole_imp_at_feature` features are disabled (the call is commented out) to keep results reproducible.

In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from datetime import timedelta
from sklearn import preprocessing
from tqdm.notebook import tqdm_notebook
from sklearn.metrics import average_precision_score
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

# Suppress every warning category for the rest of the notebook.
warnings.filterwarnings('ignore')
# Enable tqdm's pandas integration (progress_* methods) with a notebook progress bar;
# no progress_* call is visible in this notebook.
tqdm_notebook.pandas(desc="progress: ")
# Use fully-qualified option names: the short patterns "max_rows"/"max_columns"
# match more than one option on recent pandas (e.g. display.* and styler.render.*)
# and make set_option raise OptionError.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

In [2]:
# Competition input directory (trailing slash is relied on by later `DIR + name` usages).
DIR = "../input/20201115at/"
# Read the three raw tables in one pass.
train, test, campaign = (
    pd.read_csv(os.path.join(DIR, fname))
    for fname in ("train.csv", "test.csv", "campaign.csv")
)

#advertiser_video = pd.read_csv(DIR+"advertiser_video.csv")
#map_game_feed_native_video_assets = pd.read_csv(DIR+'map_game_feed_native_video_assets.csv')
#advertiser_converted_video = pd.read_csv(DIR+"advertiser_converted_video.csv")

# preprocess

In [3]:
# Users that appear in exactly one of the two splits.
_train_uids, _test_uids = set(train['uid']), set(test['uid'])
train_only_uid = list(_train_uids - _test_uids)
test_only_uid = list(_test_uids - _train_uids)

In [4]:
# with campaign
def _attach_campaign(impressions):
    """Left-join the campaign table on campaign_id and drop the join/advertiser id columns."""
    joined = impressions.merge(campaign, left_on="campaign_id", right_on="id", how="left")
    return joined.drop(["mst_advertiser_id", "id"], axis=1)

train = _attach_campaign(train)
test = _attach_campaign(test)

In [5]:
# Parse the impression timestamp and derive its calendar date for both splits.
for _frame in (train, test):
    _frame["imp_at"] = pd.to_datetime(_frame["imp_at"])
    _frame["date"] = _frame["imp_at"].dt.date

In [6]:
# These timestamp columns are only parsed for train.
for _ts_col in ("click_or_view_at", "cv_at"):
    train[_ts_col] = pd.to_datetime(train[_ts_col])

In [7]:
# drop because distribution is different
_shifted_cols = ['os_version']
# noise features?
_noisy_cols = ['user_type_id', "category_id",  "video_template_id", "country_code", "game_feed_asset_type_id", "header_bidding"]

for _frame in (train, test):
    _frame.drop(_shifted_cols, axis=1, inplace=True)
    _frame.drop(_noisy_cols, axis=1, inplace=True)

In [8]:
# Aggregates computed over train+test together. Only the columns that the
# aggregations need are concatenated, and only once.
_agg_cols = ["app_id", "uid", "game_feed_id", "campaign_id", "media_app_id", "imp_at"]
_all_rows = pd.concat([train[_agg_cols], test[_agg_cols]], ignore_index=True)

# Number of distinct users seen per app.
app_ranking = _all_rows.groupby("app_id")["uid"].nunique().rename("app_users").reset_index()
train = train.merge(app_ranking, on="app_id", how="left")
test = test.merge(app_ranking, on="app_id", how="left")

# Number of distinct media apps per (game_feed_id, campaign_id) pair.
# The column name "madia_app_num" (sic) is kept so downstream feature names do not change.
media_app_rank = (
    _all_rows.groupby(["game_feed_id", "campaign_id"])["media_app_id"]
    .nunique()
    .rename("madia_app_num")
    .reset_index()
)
train = train.merge(media_app_rank, on=["game_feed_id", "campaign_id"], how="left")
test = test.merge(media_app_rank, on=["game_feed_id", "campaign_id"], how="left")

# Earliest impression date per campaign. Taking the minimum (instead of the
# first row in file order) makes the value independent of how the CSVs are
# sorted and guarantees days_from_start >= 0.
campaign_start_date = (
    _all_rows.groupby("campaign_id")["imp_at"]
    .min()
    .dt.date
    .rename("first_date")
    .reset_index()
)
train = train.merge(campaign_start_date, on=["campaign_id"], how="left")
test = test.merge(campaign_start_date, on=["campaign_id"], how="left")

del _all_rows

In [9]:
# Stack (imp_at, uid) of train then test under a fresh 0..n-1 index.
tmp_all = pd.concat([train[["imp_at", "uid"]], test[["imp_at", "uid"]]], ignore_index=True)
# Per uid: current imp_at minus the next row's imp_at, in seconds ...
_seconds_vs_next_row = tmp_all.groupby("uid")["imp_at"].diff(-1).dt.total_seconds()
# ... converted to days and sign-flipped.
tmp_all["imp_at_diff"] = _seconds_vs_next_row / 3600 / 24 * -1

In [10]:
# tmp_all was built as concat([train, test]) with a fresh 0..n-1 index, so its rows
# line up positionally with train followed by test. Assign by position instead of
# merging on (imp_at, uid): that key is not guaranteed unique, and a merge would
# multiply rows whenever a user has several impressions with the same timestamp.
_n_train = len(train)
assert len(tmp_all) == _n_train + len(test), "tmp_all must be train rows followed by test rows"
_imp_at_diff = tmp_all["imp_at_diff"].to_numpy()
train["imp_at_diff"] = _imp_at_diff[:_n_train]
test["imp_at_diff"] = _imp_at_diff[_n_train:]
del tmp_all, _imp_at_diff, _n_train

In [11]:
# Whole days between the impression date and the campaign's first_date.
for _frame in (train, test):
    _elapsed = _frame["date"] - _frame["first_date"]
    _frame["days_from_start"] = _elapsed.apply(lambda delta: delta.days)

In [12]:
# Id columns that are turned into per-user "documents" for TF-IDF.
ids = ["advertiser_id", "campaign_id"]

def join(df):
    """Return the elements of ``df`` as one space-separated string (each element passed through str)."""
    return " ".join(map(str, df))

# One row per impression (train then test) holding the id columns plus uid.
concatenated_ = pd.concat([train[ids + ["uid"]], test[ids + ["uid"]]], axis=0).reset_index(drop=True)

tfidf_dfs = []
tfidf_svd_feats = []
for id_ in ids:
    print(id_)
    # Each user becomes a "document" made of the ids they were shown.
    docs = concatenated_.groupby("uid")[id_].apply(join)

    # Vocabulary is capped at 80% of the number of distinct ids.
    vocab_cap = int(concatenated_[id_].nunique() * 0.8)
    tfidf_matrix = TfidfVectorizer(max_features=vocab_cap).fit_transform(docs)

    # Advertiser documents are reduced to 20 components, everything else to 50.
    n_components = 20 if id_ == "advertiser_id" else 50
    reducer = TruncatedSVD(n_components=n_components, random_state=0)
    reduced = reducer.fit_transform(tfidf_matrix)

    component_names = [f"tfidf_{id_}_{i}" for i in range(n_components)]
    # Index by uid so the frame can later be merged on "uid".
    tfidf_dfs.append(pd.DataFrame(reduced, columns=component_names, index=docs.index))
    tfidf_svd_feats += component_names

advertiser_id
campaign_id


In [13]:
def add_LDA_feats(train, test, key, values, n_components=5, random_state=0):
    """Build LDA topic features for ``key`` from its co-occurrence counts with ``values``.

    The (key, values) pairs of train and test are stacked, cross-tabulated into a
    key x values count matrix, and decomposed with LatentDirichletAllocation.

    Parameters
    ----------
    train, test : DataFrames that both contain the ``key`` and ``values`` columns.
    key : column whose distinct values become the rows of the result.
    values : column whose distinct values act as the "vocabulary".
    n_components : number of LDA topics (default 5, the previously hard-coded value).
    random_state : seed passed to LatentDirichletAllocation (default 0).

    Returns
    -------
    DataFrame indexed by the distinct ``key`` values, with columns
    ``lda_<key>_<values>_<i>`` for ``i`` in ``range(n_components)``.
    """
    # Concatenate once and reuse for both crosstab axes.
    pairs = pd.concat([train[[key, values]], test[[key, values]]])
    cross_info = pd.crosstab(pairs[key], pairs[values])

    lda = LatentDirichletAllocation(n_components=n_components, random_state=random_state)
    topics = lda.fit_transform(cross_info)

    # Column count is derived from n_components so it can never disagree with the model.
    topic_names = ["lda_" + str(key) + "_" + str(values) + "_" + str(i) for i in range(n_components)]
    return pd.DataFrame(topics, columns=topic_names, index=cross_info.index)

#uid_adnw_id_df = add_LDA_feats(train, test, key='uid', values='adnw_id')
#train = train.merge(uid_adnw_id_df, on="uid", how="left", right_index=True).reset_index(drop=True)
#test = test.merge(uid_adnw_id_df, on="uid", how="left", right_index=True).reset_index(drop=True)

#del advertiser_id_uid_df

In [14]:
def _popular_ids(frame, key, min_rate=0.1, min_count=100):
    """Return the ``key`` values whose mean target exceeds ``min_rate`` on more than ``min_count`` rows.

    NOTE(review): the rate is computed from the training target over all rows,
    so the resulting flag is derived from the label itself.
    """
    stats = frame.groupby(key)["target"].agg(["mean", "size"])
    keep = (stats["mean"] > min_rate) & (stats["size"] > min_count)
    return list(stats.index[keep])


popular_game_feed = _popular_ids(train, "game_feed_id")
print(len(popular_game_feed))

# Vectorised membership test instead of a per-row `x in list` scan.
train["popular_game"] = train["game_feed_id"].isin(popular_game_feed).astype("int64")
test["popular_game"] = test["game_feed_id"].isin(popular_game_feed).astype("int64")

popular_campaign = _popular_ids(train, "campaign_id")

train["popular_campaign"] = train["campaign_id"].isin(popular_campaign).astype("int64")
test["popular_campaign"] = test["campaign_id"].isin(popular_campaign).astype("int64")

154


In [15]:
# Object-typed columns to label-encode, minus identifiers and date-like helper columns.
_not_categorical = {"request_id", "uid", "date", "hour", "first_date"}
object_feats = [
    col for col in train.columns
    if str(train[col].dtype) == "object" and col not in _not_categorical
]

def get_non_overlapping(train: pd.DataFrame, test: pd.DataFrame, column: str):
    """Return the set of ``column`` values that occur in exactly one of ``train`` / ``test``."""
    train_values = set(train[column].unique())
    test_values = set(test[column].unique())
    return train_values.symmetric_difference(test_values)

def category2num(train: pd.DataFrame, test: pd.DataFrame, columns: list):
    """Label-encode ``columns`` consistently across train and test.

    For each column:
      * missing values become ``"missing"`` (object dtype) or ``-1`` (numeric dtype);
      * values that occur in only one of train/test are collapsed into
        ``"other"`` (object dtype) or ``-2`` (numeric dtype);
      * a LabelEncoder is fitted on the combined train+test values and applied to both.

    The inputs are not modified; encoded copies are returned as ``(train_, test_)``.
    Any error raised while encoding propagates to the caller (the column being
    processed is the last name printed).
    """
    train_ = train.copy()
    test_ = test.copy()
    for column in columns:
        print(column)
        non_overlapping = get_non_overlapping(train, test, column)

        if train[column].dtype == np.dtype("O"):
            # object dtype: missing values get their own 'missing' class
            missing_label, other_label = "missing", "other"
        else:
            # int/float dtype: missing values are encoded as -1
            missing_label, other_label = -1, -2

        def _collapse(value, _rare=non_overlapping, _other=other_label):
            # Values seen in only one split carry no transferable signal.
            return value if value not in _rare else _other

        train_col = train[column].fillna(missing_label).map(_collapse)
        test_col = test[column].fillna(missing_label).map(_collapse)

        # Fit on this column only; concatenating the full frames for every
        # column is needlessly expensive.
        le = preprocessing.LabelEncoder()
        le.fit(pd.concat([train_col, test_col], axis=0).reset_index(drop=True))
        train_[column] = le.transform(train_col)
        test_[column] = le.transform(test_col)
    return train_, test_

# Replace train/test with copies in which every column listed in object_feats is label-encoded.
train, test = category2num(train, test, object_feats)

advertiser_id
app_id
campaign_id
game_feed_id
game_template_id
media_app_id
mst_advertiser_order_id


In [16]:
# Mean last_paid_interval per user, computed from train only.
last_paid_interval_df = (
    train.groupby("uid")["last_paid_interval"]
    .mean()
    .rename("last_paid_interval_mean")
    .reset_index()
)

train = train.merge(last_paid_interval_df, on="uid", how="left")
test = test.merge(last_paid_interval_df, on="uid", how="left")

# Users present in only one split get the sentinel -1 in that split.
_train_only_rows = train['uid'].isin(train_only_uid)
_test_only_rows = test['uid'].isin(test_only_uid)
train.loc[_train_only_rows, 'last_paid_interval_mean'] = -1
test.loc[_test_only_rows, 'last_paid_interval_mean'] = -1

In [17]:
# drop because they exist only in train
_train_only_cols = ['click_or_view_at', 'cv_at', 'last_cv_interval', 'last_paid_interval']
train.drop(columns=_train_only_cols, inplace=True)

# feature engineering

In [18]:
# ok
# Running impression number per uid over train followed by test (row order).
all_df = pd.concat([train, test], ignore_index=True).assign(count=1)
all_df["count"] = all_df.groupby("uid")["count"].cumsum()

_count_by_request = all_df[["request_id", "count"]]
train = train.merge(_count_by_request, on="request_id", how="left")
test = test.merge(_count_by_request, on="request_id", how="left")

In [19]:
# Attach the per-uid TF-IDF/SVD components (first and second entries of tfidf_dfs, in that order).
for _slot in (0, 1):
    train = train.merge(tfidf_dfs[_slot], on="uid", how="left")
    test = test.merge(tfidf_dfs[_slot], on="uid", how="left")

In [20]:
# Unbind the TF-IDF frames and the concatenated id frame so their memory can be reclaimed.
del tfidf_dfs, concatenated_

In [21]:
def _create_whole_imp_at_feature(whole_df, var) -> pd.DataFrame:
    """Build time-of-impression features, plus per-``var`` distributions over them.

    Parameters
    ----------
    whole_df : DataFrame with columns ``imp_at``, ``request_id`` and ``var``.
    var : name of the column to aggregate by (e.g. ``"uid"``).

    Returns
    -------
    DataFrame sharing ``whole_df``'s index with columns
    ``hours``, ``dayofweek``, ``hour_zone`` (all computed after shifting imp_at
    by +9 hours), then for every group of ``var`` the share of its rows falling
    on each day of week (``<var>_dayofweek=<d>``) and in each hour zone
    (``<var>_hour_zone=<z>``), and finally ``request_id``.

    The result is aligned to ``whole_df.index`` whatever that index is; the
    features are looked up per row rather than joined via a fresh RangeIndex.
    """
    imp_at = pd.to_datetime(whole_df['imp_at']) + timedelta(hours=9) # utc -> asia/tokyo
    out_df = pd.DataFrame(index=whole_df.index)
    out_df['hours'] = imp_at.dt.hour
    out_df['dayofweek'] = imp_at.dt.dayofweek
    # Four zones: (-inf, 6], (6, 12], (12, 18], (18, inf) -> codes 0..3
    out_df['hour_zone'] = pd.cut(out_df['hours'].values, bins=[-np.inf, 6, 12, 18, np.inf]).codes

    def _create_pivot(input_df, c, column='dayofweek', values='hours'):
        """Per-row share of group ``c``'s rows in each category of ``column``."""
        _df = pd.pivot_table(data=input_df, index=[c], columns=[column], values=values, aggfunc='count')
        _df = _df.fillna(0)
        _df.columns = [column + '=' + str(x) for x in _df.columns]

        # normalise each group's counts into proportions
        _df = _df.div(_df.sum(axis=1), axis=0)

        # Look the group's row up for every input row, then restore the input
        # index so the later column-wise concat aligns row for row.
        per_row = _df.reindex(input_df[c].to_numpy())
        per_row.index = input_df.index
        return per_row.add_prefix(c + '_')

    # aggregate by `var`
    keyed = out_df.copy()
    keyed[var] = whole_df[var]
    group_feats = pd.concat([
        _create_pivot(keyed, var),
        _create_pivot(keyed, var, column='hour_zone', values='dayofweek')
    ], axis=1)
    out_df = pd.concat([out_df, group_feats], axis=1)

    out_df['request_id'] = whole_df['request_id']
    return out_df

#user_imp_df = _create_whole_imp_at_feature(all_df, "uid")
#user_imp_df.drop(["hours","dayofweek","hour_zone"], axis=1, inplace=True)
#train = pd.merge(train, user_imp_df, on="request_id", how="left")
#test = pd.merge(test, user_imp_df, on="request_id", how="left")

In [22]:
# Distinct campaigns and distinct users per calendar date (train + test).
_by_date = all_df.groupby("date")
camp_num = _by_date["campaign_id"].nunique().rename("camp_num").reset_index()
user_num = _by_date["uid"].nunique().rename("user_num").reset_index()

del _by_date
del all_df

# Join both per-date counts, then drop the helper/identifier columns.
_no_longer_needed = ["date", "first_date", 'request_id', "imp_at"]
train = (
    train.merge(camp_num, on="date", how="left")
    .merge(user_num, on="date", how="left")
    .drop(_no_longer_needed, axis=1)
)
test = (
    test.merge(camp_num, on="date", how="left")
    .merge(user_num, on="date", how="left")
    .drop(_no_longer_needed, axis=1)
)

In [23]:
# Count encoding: replace each value by its number of occurrences in train + test.
ce_feats = ["uid"]
for feat in ce_feats:
    occurrences = pd.concat([train[feat], test[feat]]).value_counts()
    train[feat], test[feat] = train[feat].map(occurrences), test[feat].map(occurrences)

# stratified lgb

In [24]:
# Show (rows, columns) of both frames; train is expected to have one more column (target) than test.
print(train.shape, test.shape)

(1997607, 104) (390095, 103)


In [25]:
print("all columns: ", train.columns)
# Keep only the categorical features that are still present in train.
_remaining_cols = set(train.columns)
object_feats = [feat for feat in object_feats if feat in _remaining_cols]
print("categorical feats: ", object_feats)

all columns:  Index(['target', 'adnw_id', 'adspot_id', 'adspot_video_format_id',
       'advertiser_id', 'app_id', 'auction_type_id', 'campaign_id',
       'first_login_interval', 'frequency',
       ...
       'tfidf_campaign_id_42', 'tfidf_campaign_id_43', 'tfidf_campaign_id_44',
       'tfidf_campaign_id_45', 'tfidf_campaign_id_46', 'tfidf_campaign_id_47',
       'tfidf_campaign_id_48', 'tfidf_campaign_id_49', 'camp_num', 'user_num'],
      dtype='object', length=104)
categorical feats:  ['advertiser_id', 'app_id', 'campaign_id', 'game_feed_id', 'game_template_id', 'media_app_id', 'mst_advertiser_order_id']


In [26]:
# stratified k fold
n_folds = 5
def pr_auc_metric(y_predicted, y_true):
    """Custom LightGBM eval function reporting average precision (PR-AUC).

    ``y_predicted`` holds the predicted scores; ``y_true`` is the evaluation
    dataset object, whose labels are read via ``get_label()``.
    Returns ``(metric_name, value, is_higher_better)``.
    """
    labels = y_true.get_label()
    pr_auc = average_precision_score(labels, y_predicted)
    return 'pr_auc', pr_auc, True

def modelling(seed):
    """Train LightGBM with stratified K-fold CV on the global ``train`` and predict the global ``test``.

    Parameters
    ----------
    seed : value passed to LightGBM as ``random_seed`` (the fold split itself always uses random_state=0).

    Returns
    -------
    pred_value : test predictions averaged over the ``n_folds`` fold models.
    valid : out-of-fold predictions for every training row.
    feature_importance_df : per-fold feature importances with their
        Average, Std and Cv (= Std / Average) columns.
    """
    print("seed:", seed)
    lgbm_params = {
        'objective': 'binary',
        'metric': 'None',  # only the custom feval is evaluated
        'boosting_type': 'gbdt',
        'tree_learner': 'serial',
        'learning_rate': 0.1,
        "num_leaves": 10,
        'random_seed': seed,
        'max_depth': 5,
    }

    labels = train.target
    features = train.drop(['target'], axis=1)

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)

    oof_pred = np.zeros([features.shape[0]])
    test_pred = np.zeros(test.shape[0])
    feature_importance_df = pd.DataFrame(list(features.columns), columns=["Feature"])

    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(features, labels)):
        print("fold: ", fold)

        X_holdout = features.iloc[holdout_rows, :]
        fit_set = lgb.Dataset(features.iloc[fit_rows, :], labels.iloc[fit_rows])
        holdout_set = lgb.Dataset(X_holdout, labels.iloc[holdout_rows], reference=fit_set)

        booster = lgb.train(
            lgbm_params,
            fit_set,
            valid_sets=[fit_set, holdout_set],
            num_boost_round=10000,
            early_stopping_rounds=20,
            verbose_eval=30,
            feval=pr_auc_metric,
            categorical_feature=sorted(object_feats),
        )

        # Predict with the best iteration found by early stopping.
        best_iter = booster.best_iteration
        oof_pred[holdout_rows] = booster.predict(X_holdout, num_iteration=best_iter)
        test_pred += booster.predict(test, num_iteration=best_iter) / n_folds
        feature_importance_df["Fold_" + str(fold + 1)] = booster.feature_importance()

    # Columns 1..n_folds are the per-fold importances.
    fold_importances = feature_importance_df.iloc[:, 1:n_folds + 1]
    feature_importance_df["Average"] = np.mean(fold_importances, axis=1)
    feature_importance_df["Std"] = np.std(fold_importances, axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

    return test_pred, oof_pred, feature_importance_df

# Average out-of-fold and test predictions over all seeds, then score the OOF predictions.
seeds = [0]
_n_seeds = len(seeds)
validation = np.zeros([train.shape[0]])
pred_value = np.zeros([test.shape[0]])
for seed_ in seeds:
    test_pred, oof_predict, feature_importance_df = modelling(seed_)
    validation += oof_predict / _n_seeds
    pred_value += test_pred / _n_seeds

score = average_precision_score(train.target, validation)
print(score)

seed: 0
fold:  0
Training until validation scores don't improve for 20 rounds
[30]	training's pr_auc: 0.342643	valid_1's pr_auc: 0.347127
[60]	training's pr_auc: 0.367939	valid_1's pr_auc: 0.366066
[90]	training's pr_auc: 0.379651	valid_1's pr_auc: 0.371379
[120]	training's pr_auc: 0.384647	valid_1's pr_auc: 0.37064
Early stopping, best iteration is:
[104]	training's pr_auc: 0.384187	valid_1's pr_auc: 0.373879
fold:  1
Training until validation scores don't improve for 20 rounds
[30]	training's pr_auc: 0.342769	valid_1's pr_auc: 0.336813
[60]	training's pr_auc: 0.367599	valid_1's pr_auc: 0.357059
[90]	training's pr_auc: 0.381987	valid_1's pr_auc: 0.365745
[120]	training's pr_auc: 0.391176	valid_1's pr_auc: 0.368737
Early stopping, best iteration is:
[113]	training's pr_auc: 0.390257	valid_1's pr_auc: 0.370744
fold:  2
Training until validation scores don't improve for 20 rounds
[30]	training's pr_auc: 0.341097	valid_1's pr_auc: 0.341282
[60]	training's pr_auc: 0.366319	valid_1's pr_auc

- seed: 0
- fold:  0
- Training until validation scores don't improve for 20 rounds
- [30]	training's pr_auc: 0.342643	valid_1's pr_auc: 0.347127
- [60]	training's pr_auc: 0.367939	valid_1's pr_auc: 0.366066
- [90]	training's pr_auc: 0.379651	valid_1's pr_auc: 0.371379
- [120]	training's pr_auc: 0.384647	valid_1's pr_auc: 0.37064
- Early stopping, best iteration is:
- [104]	training's pr_auc: 0.384187	valid_1's pr_auc: 0.373879
- fold:  1
- Training until validation scores don't improve for 20 rounds
- [30]	training's pr_auc: 0.342769	valid_1's pr_auc: 0.336813
- [60]	training's pr_auc: 0.367599	valid_1's pr_auc: 0.357059
- [90]	training's pr_auc: 0.381987	valid_1's pr_auc: 0.365745
- [120]	training's pr_auc: 0.391176	valid_1's pr_auc: 0.368737
- Early stopping, best iteration is:
- [113]	training's pr_auc: 0.390257	valid_1's pr_auc: 0.370744

In [27]:
# Show up to 100 features, ordered by their "Average" importance (highest first).
feature_importance_df.sort_values("Average", ascending=False).head(100)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Average,Std,Cv
6,campaign_id,188,206,264,250,317,245.0,45.475268,0.185613
4,app_id,142,153,253,214,364,225.2,80.372632,0.356894
17,media_app_id,41,50,117,89,168,93.0,46.411206,0.499045
29,last_paid_interval_mean,72,71,94,87,96,84.0,10.639549,0.126661
25,imp_at_diff,65,70,90,84,98,81.4,12.289833,0.150981
3,advertiser_id,62,63,76,74,85,72.0,8.602325,0.119477
10,game_feed_id,32,36,71,56,96,58.2,23.566077,0.404915
21,mst_advertiser_order_id,23,26,41,43,49,36.4,10.11138,0.277785
26,days_from_start,25,29,42,37,43,35.2,7.110556,0.202004
39,tfidf_advertiser_id_8,26,29,34,34,41,32.8,5.114685,0.155936


# stratified xgb

In [28]:
def pr_auc_metric_xgb(y_predicted, y_true):
    """Custom XGBoost eval function returning the *negated* average precision.

    ``y_true`` is the evaluation DMatrix (labels are read via ``get_label()``).
    Returns ``(metric_name, value)``.

    This deliberately has its own name: the LightGBM metric ``pr_auc_metric``
    returns a 3-tuple and is looked up by ``modelling`` at call time, so
    redefining it here would break any later re-run of the LightGBM cell.
    """
    return 'pr_auc', - average_precision_score(y_true.get_label(), y_predicted)

n_folds=5
np.random.seed(0)
def modelling_xgb():
    """Train XGBoost with stratified K-fold CV on the global ``train`` and predict the global ``test``.

    Returns
    -------
    pred_value : test predictions averaged over the ``n_folds`` fold models.
    valid : out-of-fold predictions for every training row.
    """
    y = train.target
    X = train.drop(['target'],axis=1)
    xgb_params = {"objective" : "binary:logistic", "max_depth" : 5, "learning_rate" : 0.1} #, "tree_method": "gpu_hist"

    skf=StratifiedKFold(n_splits = n_folds, shuffle=True, random_state=0)

    valid = np.zeros([X.shape[0]])
    pred_value = np.zeros(test.shape[0])
    X_test = xgb.DMatrix(test)

    for i , (train_index, val_index) in enumerate(skf.split(X, y)):
        print("fold: ", i)

        X_train = X.iloc[train_index,:]
        y_train = y.iloc[train_index]

        X_valid = X.iloc[val_index,:]
        y_valid = y.iloc[val_index]

        xgb_train = xgb.DMatrix(X_train, label = y_train)
        xgb_eval = xgb.DMatrix(X_valid, label = y_valid)
        # Early stopping monitors the last entry of the watchlist ("eval").
        watchlist = [(xgb_train, "train"), (xgb_eval, "eval")]
        num_boost_round = 100000

        model = xgb.train(
            xgb_params, xgb_train, num_boost_round, watchlist,
            early_stopping_rounds=10, verbose_eval = 50,
            feval=pr_auc_metric_xgb,
        )

        # Predict with the trees up to the best iteration only.
        valid_predict = model.predict(xgb_eval, ntree_limit = model.best_ntree_limit)
        valid[val_index] = valid_predict
        pred_value += model.predict(X_test, ntree_limit = model.best_ntree_limit) / n_folds

    return pred_value, valid

#pred_value_xgb, validation_xgb = modelling_xgb()
#score_xgb = average_precision_score(train.target, validation_xgb)
#print(score_xgb)

# submission

In [29]:
# Fill the sample submission with the averaged LightGBM test predictions.
lgb_sub = pd.read_csv(DIR + "atmaCup7__sample_submission.csv")
lgb_sub["target"] = pred_value
# Tag the file with the CV score at fixed precision. Slicing str(score)[:-10]
# depended on how many digits the float repr happened to have and could
# produce an empty or inconsistent suffix.
lgb_sub.to_csv(f"atmacup7_{score:.6f}.csv", index = False)