- add click view per day per user info

In [1]:
!pip install jpholiday 

Collecting jpholiday
  Downloading jpholiday-0.1.4-py3-none-any.whl (8.5 kB)
Installing collected packages: jpholiday
Successfully installed jpholiday-0.1.4


In [2]:
import os
import jpholiday
import numpy as np
import pandas as pd
import warnings
import lightgbm as lgb
from sklearn import preprocessing
from datetime import timedelta
from tqdm.notebook import tqdm_notebook
from sklearn.metrics import average_precision_score
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')
tqdm_notebook.pandas(desc="progress: ")

In [3]:
DIR = "../input/20201115at/"
train = pd.read_csv(DIR+"train.csv")
test = pd.read_csv(DIR+"test.csv")
campaign = pd.read_csv(DIR+"campaign.csv")

#advertiser_video = pd.read_csv(DIR+"advertiser_video.csv")
#map_game_feed_native_video_assets = pd.read_csv(DIR+'map_game_feed_native_video_assets.csv')

#advertiser_converted_video = pd.read_csv(DIR+"advertiser_converted_video.csv")

# preprocess

In [4]:
# with campaign
train = train.merge(campaign, left_on="campaign_id", right_on = "id", how="left").drop(["mst_advertiser_id", "id"], axis=1)
test = test.merge(campaign, left_on="campaign_id", right_on = "id", how="left").drop(["mst_advertiser_id", "id"], axis=1)

# with map_gae_feed_native_video_assets
#train = train.merge(map_game_feed_native_video_assets, left_on=["game_feed_id"], right_on = ["mst_game_feed_id"], how="left").drop(["mst_game_feed_id"], axis=1)
#test = test.merge(map_game_feed_native_video_assets, left_on=["game_feed_id"], right_on = ["mst_game_feed_id"], how="left").drop(["mst_game_feed_id"], axis=1)

# with advertiser_video horizontal
#horizontal = advertiser_video.copy()
#horizontal.columns = [f"horizontal_{c}" if c not in ["id", "mst_advertiser_id"] else c for c in horizontal.columns]
#train = pd.merge(train, horizontal[["id", "mst_advertiser_id", "horizontal_duration"]], left_on=["horizontal_mst_advertiser_video_id", "advertiser_id"], 
#               right_on=["id", "mst_advertiser_id"], how="left").drop(columns=["id", "mst_advertiser_id"])
#test = pd.merge(test, horizontal[["id", "mst_advertiser_id", "horizontal_duration"]], left_on=["horizontal_mst_advertiser_video_id", "advertiser_id"], 
#               right_on=["id", "mst_advertiser_id"], how="left").drop(columns=["id", "mst_advertiser_id"])

In [5]:
train["imp_at"] = pd.to_datetime(train["imp_at"])
test["imp_at"] = pd.to_datetime(test["imp_at"])

In [6]:
train["click_or_view_at"] = pd.to_datetime(train["click_or_view_at"])
train["click_or_view_day"] = train["click_or_view_at"].dt.dayofweek
click_view_df = train.groupby("uid")["click_or_view_day"].value_counts().unstack(fill_value=0)
click_view_df.columns = ["cv"+str(i) for i in range(7)]
click_view_df = click_view_df.reset_index(drop=False)

train = train.merge(click_view_df, on="uid", how="left")
test = test.merge(click_view_df, on="uid", how="left")

In [7]:
# drop because distribution is different
train.drop(['os_version'], axis=1, inplace=True)
test.drop(['os_version'], axis=1, inplace=True)

# noise features?
train.drop(['user_type_id', "category_id",  "video_template_id", "country_code", "game_feed_asset_type_id"], axis=1, inplace=True)
test.drop(['user_type_id', "category_id",  "video_template_id", "country_code", "game_feed_asset_type_id"], axis=1, inplace=True)

# drop because the exist only in train
train.drop(['click_or_view_at', 'cv_at','last_cv_interval', 'last_paid_interval'], axis=1, inplace=True)

In [8]:
object_feats = [i for i in train.columns if "object" == str(train[i].dtype) and i != "request_id"]

def get_non_overlapping(train: pd.DataFrame, test: pd.DataFrame, column: str):
    only_in_train = set(train[column].unique()) - set(test[column].unique())
    only_in_test = set(test[column].unique()) - set(train[column].unique())
    non_overlapping = only_in_train.union(only_in_test)
    return non_overlapping


def category2num(train: pd.DataFrame, test: pd.DataFrame, columns: list):
    train_ = train.copy()
    test_ = test.copy()
    for column in columns:
        non_overlapping = get_non_overlapping(train, test, column)
        try:
            if train[column].dtype == np.dtype("O"):
                # dtypeがobjectなら欠損は'missing' クラスにする
                train_[column] = train[column].fillna("missing")
                test_[column] = test[column].fillna("missing")
                train_[column] = train_[column].map(lambda x: x if x not in non_overlapping else "other")
                test_[column] = test_[column].map(lambda x: x if x not in non_overlapping else "other")
            else:
                # dtypeがint/floatなら欠損は'-1'とする
                train_[column] = train[column].fillna(-1)
                test_[column] = test[column].fillna(-1)
                train_[column] = train_[column].map(lambda x: x if x not in non_overlapping else -2)
                test_[column] = test_[column].map(lambda x: x if x not in non_overlapping else -2)

            le = preprocessing.LabelEncoder()
            concatenated = pd.concat([train_, test_], axis=0).reset_index(drop=True)
            le.fit(concatenated[column])
            train_[column] = le.transform(train_[column])
            test_[column] = le.transform(test_[column])
        except Exception:
            import pdb
            pdb.set_trace()
    return train_, test_

train, test = category2num(train, test, object_feats)

# feature engineering

In [9]:
train["type"] = "train"
test["type"] = "test"
all_df = pd.concat([train, test])
all_df = all_df.sort_values(["uid", "imp_at"]).reset_index(drop=True)

In [10]:
all_df["count"] = 1
all_df["count"] = all_df.groupby("uid")["count"].cumsum()

In [11]:
def _create_whole_imp_at_feature(whole_df) -> pd.DataFrame:

    imp_at = pd.to_datetime(whole_df['imp_at']) + timedelta(hours=9) # utc -> asia/tokyo
    out_df = pd.DataFrame()
    out_df['hours'] = imp_at.dt.hour
    out_df['dayofweek'] = imp_at.dt.dayofweek
    out_df['hour_zone'] = pd.cut(out_df['hours'].values, bins=[-np.inf, 6, 12, 18, np.inf]).codes

    def _create_pivot(input_df, c, column='dayofweek', values='hours'):
        _df = pd.pivot_table(data=input_df, index=[c], columns=[column], values=values, aggfunc='count')
        _df = _df.fillna(0)
        _df.columns = [column + '=' + str(x) for x in _df.columns]

        # index ごとに正規化して割合にする
        _df = _df.div(_df.sum(axis=1), axis=0)
        return pd.merge(input_df[c], _df, on=c, how='left').drop(columns=[c]).add_prefix(c + '_')

    # uid での集約
    for c in ['uid']:
        _df = out_df.copy()
        _df[c] = whole_df[c]
        _feat = pd.concat([
            _create_pivot(_df, c),
            _create_pivot(_df, c, column='hour_zone', values='dayofweek')  
        ], axis=1)
        out_df = pd.concat([out_df, _feat], axis=1)

    out_df['request_id'] = whole_df['request_id']
    return out_df

user_imp_df = _create_whole_imp_at_feature(all_df)
user_imp_df.drop(["hours","dayofweek","hour_zone"], axis=1, inplace=True)
all_df = all_df.merge(user_imp_df, on="request_id", how="left")

In [12]:
#all_df["date"] = all_df["imp_at"].dt.date
#all_df["dif_login"] = all_df.groupby("uid")["date"].progress_apply(lambda x: x.diff(1))
#all_df["dif_login"] = all_df["dif_login"].progress_apply(lambda x: x.days)
#all_df["dif_login"] = all_df["dif_login"].fillna(0)
#all_df["total_logdays"] = all_df["dif_login"].progress_apply(lambda x: x if x ==0 or x==1 else 1)
#all_df["total_logdays"] = all_df.groupby("uid")["total_logdays"].progress_apply(lambda x: x.cumsum()) + 1

In [13]:
train = all_df[all_df.type=="train"].drop("type", axis=1).reset_index(drop=True)
test = all_df[all_df.type=="test"].drop("type", axis=1).reset_index(drop=True)

In [14]:
train = train.sort_values("imp_at").reset_index(drop=True)
test = test.sort_values("imp_at").reset_index(drop=True)

# separation for validation

In [15]:
train.drop(['request_id'], axis=1, inplace=True)
test.drop(['request_id'], axis=1, inplace=True)

In [16]:
mask = train.imp_at > "2020-05-21"
valid_df = train.loc[mask].copy()
train_df = train.loc[~mask].copy()

In [17]:
train = train.drop(["imp_at"], axis=1)
test = test.drop(["imp_at"], axis=1)

valid_df = valid_df.drop(["imp_at"], axis=1)
train_df = train_df.drop(["imp_at"], axis=1)

In [18]:
test = test.drop("target", axis=1)

In [19]:
train.shape, test.shape 

((1997595, 45), (390095, 44))

In [20]:
train.head()

Unnamed: 0,target,adnw_id,adspot_id,adspot_video_format_id,advertiser_id,app_id,auction_type_id,campaign_id,first_login_interval,frequency,...,uid_dayofweek=1,uid_dayofweek=2,uid_dayofweek=3,uid_dayofweek=4,uid_dayofweek=5,uid_dayofweek=6,uid_hour_zone=0,uid_hour_zone=1,uid_hour_zone=2,uid_hour_zone=3
0,0.0,4,1111,2.0,14,5653,1.0,256,1490644.0,0,...,0.132734,0.15512,0.146319,0.13759,0.148722,0.148673,0.19987,0.27165,0.261339,0.267141
1,0.0,28,1081,2.0,86,3127,2.0,56,790412.0,2,...,0.132734,0.15512,0.146319,0.13759,0.148722,0.148673,0.19987,0.27165,0.261339,0.267141
2,0.0,42,1123,3.0,4,4947,1.0,76,1506105.0,0,...,0.132734,0.15512,0.146319,0.13759,0.148722,0.148673,0.19987,0.27165,0.261339,0.267141
3,0.0,8,1113,2.0,15,4725,1.0,47,184283.0,0,...,0.132734,0.15512,0.146319,0.13759,0.148722,0.148673,0.19987,0.27165,0.261339,0.267141
4,0.0,22,1061,,95,67,4.0,214,1999495.0,2,...,0.132734,0.15512,0.146319,0.13759,0.148722,0.148673,0.19987,0.27165,0.261339,0.267141


# stratified modelling

In [21]:
object_feats = [i for i in object_feats if i in train.columns]

In [22]:
# stratified k fold
n_folds = 5
def pr_auc_metric(y_predicted, y_true):
    return 'pr_auc', average_precision_score(y_true.get_label(), y_predicted), True

def modelling(seed):
    print("seed:", seed)
    lgbm_params = {'objective': 'binary', 'metric': 'None', 'boosting_type': 'gbdt', 
               'tree_learner': 'serial', 'learning_rate': 0.1, "num_leaves": 10, 'random_seed':seed,'max_depth': 5}
    
    y = train.target.copy()
    X = train.drop(['target'],axis=1).copy()
    
    skf=StratifiedKFold(n_splits = n_folds, shuffle=True, random_state=0)
    
    valid = np.zeros([X.shape[0]])
    features_list = [i for i in X.columns]
    pred_value = np.zeros(test.shape[0])
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    
    for i , (train_index, val_index) in enumerate(skf.split(X, y)):
        print("fold: ", i)
        
        X_train = X.iloc[train_index,:]
        y_train = y.iloc[train_index]

        X_valid = X.iloc[val_index,:]
        y_valid = y.iloc[val_index]
        
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        
        model = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
           num_boost_round=10000,early_stopping_rounds=20,verbose_eval = 10,
                          feval=pr_auc_metric, categorical_feature = sorted(object_feats))
        
        valid_predict = model.predict(X_valid, num_iteration = model.best_iteration)
        valid[val_index] = valid_predict
        pred_value += model.predict(test, num_iteration = model.best_iteration)
        feature_importance_df["Fold_"+str(i+1)] = model.feature_importance()
    
    feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]
    
    return pred_value, valid, feature_importance_df

validation = np.zeros([train.shape[0]])
pred_value = np.zeros([test.shape[0]])
seeds = [0]
for seed_ in seeds:
    test_pred, oof_predict, feature_importance_df = modelling(seed_)
    validation += oof_predict / len(seeds)
    pred_value += test_pred / len(seeds)

score = average_precision_score(train.target, validation)
print(score)

seed: 0
fold:  0
Training until validation scores don't improve for 20 rounds
[10]	training's pr_auc: 0.233935	valid_1's pr_auc: 0.2281
[20]	training's pr_auc: 0.249051	valid_1's pr_auc: 0.241284
[30]	training's pr_auc: 0.261595	valid_1's pr_auc: 0.252622
[40]	training's pr_auc: 0.271774	valid_1's pr_auc: 0.260299
[50]	training's pr_auc: 0.280146	valid_1's pr_auc: 0.266033
[60]	training's pr_auc: 0.287492	valid_1's pr_auc: 0.269771
[70]	training's pr_auc: 0.292549	valid_1's pr_auc: 0.271179
[80]	training's pr_auc: 0.297514	valid_1's pr_auc: 0.272928
[90]	training's pr_auc: 0.301679	valid_1's pr_auc: 0.274057
[100]	training's pr_auc: 0.305564	valid_1's pr_auc: 0.276069
[110]	training's pr_auc: 0.308526	valid_1's pr_auc: 0.276587
[120]	training's pr_auc: 0.311226	valid_1's pr_auc: 0.277379
[130]	training's pr_auc: 0.314205	valid_1's pr_auc: 0.278126
[140]	training's pr_auc: 0.31704	valid_1's pr_auc: 0.278365
[150]	training's pr_auc: 0.319662	valid_1's pr_auc: 0.279094
[160]	training's pr

In [23]:
feature_importance_df.sort_values("Average", ascending=False)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Average,Std,Cv
4,app_id,615,453,388,463,726,529.0,123.464975,0.233393
6,campaign_id,357,302,249,287,425,324.0,61.26663,0.189095
18,media_app_id,351,230,205,229,380,279.0,71.78022,0.257277
10,game_feed_id,206,151,122,138,245,172.4,46.002609,0.266836
3,advertiser_id,158,134,141,136,169,147.6,13.632315,0.09236
21,uid,116,102,80,105,139,108.4,19.252013,0.177602
22,mst_advertiser_order_id,78,73,65,68,88,74.4,8.114185,0.109062
15,last_login_interval,73,54,47,52,76,60.4,11.774549,0.194943
32,count,77,48,36,53,83,59.4,17.805617,0.299758
23,mst_user_type_id,58,47,48,52,71,55.2,8.795453,0.159338


# ts modelling

In [24]:
n_folds = 5
def pr_auc_metric(y_predicted, y_true):
    return 'pr_auc', average_precision_score(y_true.get_label(), y_predicted), True

def modelling_ts(seed):
    print("seed:", seed)
    lgbm_params = {'objective': 'binary', 'metric': 'None', 'boosting_type': 'gbdt', 
               'tree_learner': 'serial', 'learning_rate': 0.1, "num_leaves": 10, 'random_seed':seed,'max_depth': 5}
    y_train = train_df.target.copy()
    X_train = train_df.drop(['target'],axis=1).copy()
    y_valid = valid_df.target.copy()
    X_valid = valid_df.drop(['target'],axis=1).copy()
    
    valid_lgb = pd.DataFrame(np.zeros([X_train.shape[0]]))
    real = np.array([])
    features_list = [i for i in X_train.columns]
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
        
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        
    model = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
       num_boost_round=10000,early_stopping_rounds=20,verbose_eval = 10, feval=pr_auc_metric, categorical_feature = sorted(object_feats))
    
    valid_predict = model.predict(X_valid, num_iteration = model.best_iteration)
    feature_importance_df["Importance"] = model.feature_importance()
        
    score = average_precision_score(y_valid, valid_predict)
    print("average precision score = {}".format(score))
    pred_value = model.predict(test, num_iteration = model.best_iteration)
    
    return pred_value, valid_predict, feature_importance_df

validation_ts = np.zeros([valid_df.shape[0]])
pred_value_ts = np.zeros([test.shape[0]])
seeds = [0]
for seed_ in seeds:
    test_pred, oof_predict, feature_importance_df = modelling_ts(seed_)
    validation_ts += oof_predict / len(seeds)
    pred_value_ts += test_pred / len(seeds)

score = average_precision_score(valid_df.target, validation_ts)
print(score)

seed: 0
Training until validation scores don't improve for 20 rounds
[10]	training's pr_auc: 0.237918	valid_1's pr_auc: 0.18574
[20]	training's pr_auc: 0.25841	valid_1's pr_auc: 0.197972
[30]	training's pr_auc: 0.277056	valid_1's pr_auc: 0.204446
[40]	training's pr_auc: 0.290366	valid_1's pr_auc: 0.207064
[50]	training's pr_auc: 0.299401	valid_1's pr_auc: 0.20981
[60]	training's pr_auc: 0.306531	valid_1's pr_auc: 0.21046
[70]	training's pr_auc: 0.311276	valid_1's pr_auc: 0.211123
[80]	training's pr_auc: 0.316176	valid_1's pr_auc: 0.211877
[90]	training's pr_auc: 0.320718	valid_1's pr_auc: 0.213354
[100]	training's pr_auc: 0.324304	valid_1's pr_auc: 0.214298
[110]	training's pr_auc: 0.327883	valid_1's pr_auc: 0.214747
[120]	training's pr_auc: 0.33044	valid_1's pr_auc: 0.215016
[130]	training's pr_auc: 0.333849	valid_1's pr_auc: 0.215033
[140]	training's pr_auc: 0.33641	valid_1's pr_auc: 0.215549
[150]	training's pr_auc: 0.338673	valid_1's pr_auc: 0.215891
[160]	training's pr_auc: 0.3415

In [25]:
#feature_importance_df.sort_values("Importance", ascending=False)

# submission

In [26]:
lgb_sub = pd.read_csv(DIR + "atmaCup7__sample_submission.csv")
lgb_sub["target"] = pred_value
lgb_sub.to_csv("atmacup7_lgb"+str(score)[:-10]+".csv", index = False)