- Add the number of unique users per app (`app_users`) as a feature

In [1]:
!pip install jpholiday 

Collecting jpholiday
  Downloading jpholiday-0.1.4-py3-none-any.whl (8.5 kB)
Installing collected packages: jpholiday
Successfully installed jpholiday-0.1.4


In [2]:
import os
import warnings
import jpholiday
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from datetime import timedelta
from sklearn import preprocessing
from tqdm.notebook import tqdm_notebook
from sklearn.metrics import average_precision_score
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')
tqdm_notebook.pandas(desc="progress: ")

In [3]:
# Folder that holds every competition CSV read by this notebook.
DIR = "../input/20201115at/"

# Impression logs (train / test) and the campaign master table.
train, test, campaign = (
    pd.read_csv(f"{DIR}{name}.csv") for name in ("train", "test", "campaign")
)

# Video master tables; consumed by merge_all().
advertiser_video = pd.read_csv(f"{DIR}advertiser_video.csv")
map_game_feed_native_video_assets = pd.read_csv(f"{DIR}map_game_feed_native_video_assets.csv")
advertiser_converted_video = pd.read_csv(f"{DIR}advertiser_converted_video.csv")

In [4]:
# Keep only the last row for each (video, game feed, template) key combination.
cols = ["mst_advertiser_video_id", "mst_game_feed_id", "mst_video_template_id"]
advertiser_converted_video = advertiser_converted_video.drop_duplicates(subset=cols, keep="last")

# preprocess

In [5]:
# Left-join the campaign table onto both splits, then discard the campaign
# key ("id") and "mst_advertiser_id".
train = pd.merge(
    train, campaign, how="left", left_on="campaign_id", right_on="id"
).drop(columns=["mst_advertiser_id", "id"])
test = pd.merge(
    test, campaign, how="left", left_on="campaign_id", right_on="id"
).drop(columns=["mst_advertiser_id", "id"])

In [6]:
def merge_all(
    df: pd.DataFrame, map_game_feed_native_video_assets: pd.DataFrame,
    advertiser_video: pd.DataFrame, advertiser_converted_video: pd.DataFrame,
) -> pd.DataFrame:
    """Left-join the video master tables onto an impression frame.

    Join order: game-feed asset map, then ``advertiser_video`` for the
    horizontal and vertical video ids, then ``advertiser_converted_video`` for
    the horizontal and vertical video ids. Non-key columns of the video tables
    are prefixed with ``horizontal_`` / ``vertical_`` (plus ``converted_`` for
    the converted table). The right-hand key columns are dropped after every
    join.

    Returns the enriched frame; the inputs are not modified.
    """

    def _attach(base, table, prefix, left_keys, right_keys):
        # Prefix every non-key column of `table`, left-join it onto `base`
        # and remove the right-hand key columns from the result.
        renamed = table.rename(
            columns=lambda name: name if name in right_keys else f"{prefix}{name}"
        )
        joined = pd.merge(base, renamed, how="left", left_on=left_keys, right_on=right_keys)
        return joined.drop(columns=right_keys)

    # game feed -> horizontal / vertical advertiser video ids
    res = pd.merge(
        df,
        map_game_feed_native_video_assets,
        how="left",
        left_on="game_feed_id",
        right_on="mst_game_feed_id",
    ).drop(columns=["mst_game_feed_id"])

    orientations = ("horizontal", "vertical")

    # raw advertiser video attributes
    for side in orientations:
        res = _attach(
            res,
            advertiser_video,
            f"{side}_",
            [f"{side}_mst_advertiser_video_id", "advertiser_id"],
            ["id", "mst_advertiser_id"],
        )

    # converted video attributes
    for side in orientations:
        res = _attach(
            res,
            advertiser_converted_video,
            f"{side}_converted_",
            [f"{side}_mst_advertiser_video_id", "game_feed_id", "video_template_id"],
            ["mst_advertiser_video_id", "mst_game_feed_id", "mst_video_template_id"],
        )

    return res

#train = merge_all(train, map_game_feed_native_video_assets, advertiser_video, advertiser_converted_video)
#test = merge_all(test, map_game_feed_native_video_assets, advertiser_video, advertiser_converted_video)

#train["duration"] = train["vertical_duration"].fillna(0) + train["horizontal_duration"].fillna(0)
#train.drop(["vertical_duration", "horizontal_duration"], axis=1, inplace=True)

#test["duration"] = test["vertical_duration"].fillna(0) + test["horizontal_duration"].fillna(0)
#test.drop(["vertical_duration", "horizontal_duration"], axis=1, inplace=True)

# filesize
#train["filesize"] = train["vertical_file_size"].fillna(0) + train["horizontal_file_size"].fillna(0)
#train.drop(["vertical_file_size", "horizontal_file_size"], axis=1, inplace=True)

#test["filesize"] = test["vertical_file_size"].fillna(0) + test["horizontal_file_size"].fillna(0)
#test.drop(["vertical_file_size", "horizontal_file_size"], axis=1, inplace=True)

# converted filesize
#train["converted_filesize"] = train["vertical_converted_file_size"].fillna(0.0) + train["horizontal_converted_file_size"].fillna(0)
#train.drop(["vertical_converted_file_size", "horizontal_converted_file_size"], axis=1, inplace=True)

#test["converted_filesize"] = test["horizontal_converted_file_size"].fillna(0.0) + test["vertical_converted_file_size"].fillna(0)
#test.drop(["vertical_converted_file_size", "horizontal_converted_file_size"], axis=1, inplace=True)

# bitrate
#train["bitrate"] = train["vertical_converted_bitrate"].fillna(0) + train["horizontal_converted_bitrate"].fillna(0)
#train.drop(["vertical_converted_bitrate", "horizontal_converted_bitrate"], axis=1, inplace=True)

#test["bitrate"] = test["vertical_converted_bitrate"].fillna(0) + test["horizontal_converted_bitrate"].fillna(0)
#test.drop(["vertical_converted_bitrate", "horizontal_converted_bitrate"], axis=1, inplace=True)

#new_cols = ['horizontal_mst_advertiser_video_id',  'vertical_mst_advertiser_video_id',
#            'horizontal_width', 'horizontal_height', 
#            'vertical_width', 'vertical_height','horizontal_converted_width', 'horizontal_converted_height', 
#            'horizontal_converted_rectangle_type', 'vertical_converted_width', 'vertical_converted_height', 
#            'vertical_converted_rectangle_type']
#train.drop(new_cols, axis=1, inplace=True)
#test.drop(new_cols, axis=1, inplace=True)

In [7]:
# Parse the impression timestamp, then derive the calendar date and hour from it.
train = train.assign(imp_at=pd.to_datetime(train["imp_at"])).assign(
    date=lambda frame: frame["imp_at"].dt.date,
    hour=lambda frame: frame["imp_at"].dt.hour,
)
test = test.assign(imp_at=pd.to_datetime(test["imp_at"])).assign(
    date=lambda frame: frame["imp_at"].dt.date,
    hour=lambda frame: frame["imp_at"].dt.hour,
)

In [8]:
# Convert the two train-only timestamp columns to datetime.
train[["click_or_view_at", "cv_at"]] = train[["click_or_view_at", "cv_at"]].apply(pd.to_datetime)

In [9]:
# os_version: dropped because its distribution is different (original author's note).
# The remaining columns were marked by the author as possible noise features.
train = train.drop(columns=[
    "os_version",
    "user_type_id", "category_id", "video_template_id",
    "country_code", "game_feed_asset_type_id", "header_bidding",
])
test = test.drop(columns=[
    "os_version",
    "user_type_id", "category_id", "video_template_id",
    "country_code", "game_feed_asset_type_id", "header_bidding",
])

In [10]:
# Number of distinct uids seen per app across train and test combined.
app_ranking = (
    pd.concat([train, test])
    .groupby("app_id")["uid"]
    .nunique()
    .rename("app_users")
    .reset_index()
)
train = pd.merge(train, app_ranking, how="left", on="app_id")
test = pd.merge(test, app_ranking, how="left", on="app_id")

#'campaign_id',
# 'game_template_id',
# 'mst_advertiser_order_id'

In [11]:
# Object-dtype columns that will be label-encoded; identifier and time columns are excluded.
object_feats = [
    col
    for col in train.columns
    if str(train[col].dtype) == "object"
    and col not in ("request_id", "uid", "date", "hour")
]

def get_non_overlapping(train: pd.DataFrame, test: pd.DataFrame, column: str):
    """Return the set of values of ``column`` that occur in only one of the two frames."""
    train_values = set(train[column].unique())
    test_values = set(test[column].unique())
    # Symmetric difference == (train - test) | (test - train).
    return train_values ^ test_values

def category2num(train: pd.DataFrame, test: pd.DataFrame, columns: list):
    """Label-encode ``columns`` consistently across ``train`` and ``test``.

    For every column:
      * missing values become ``"missing"`` (object dtype) or ``-1`` (otherwise);
      * values that appear in only one of the two frames are collapsed into
        ``"other"`` (object dtype) or ``-2`` (otherwise);
      * a single ``LabelEncoder`` is fitted on train and test values together
        and applied to both.

    Parameters
    ----------
    train, test : pd.DataFrame
        Frames containing every name in ``columns``. They are not modified.
    columns : list
        Names of the columns to encode.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        Encoded copies of ``train`` and ``test``.

    Raises
    ------
    RuntimeError
        If a column cannot be encoded; the original exception is chained.
        (Previously the function dropped into ``pdb``, which blocks
        non-interactive runs.)
    """
    train_ = train.copy()
    test_ = test.copy()
    for column in columns:
        non_overlapping = get_non_overlapping(train, test, column)
        try:
            if train[column].dtype == np.dtype("O"):
                # object dtype: missing values get their own 'missing' class
                train_[column] = train[column].fillna("missing")
                test_[column] = test[column].fillna("missing")
                train_[column] = train_[column].map(lambda x: x if x not in non_overlapping else "other")
                test_[column] = test_[column].map(lambda x: x if x not in non_overlapping else "other")
            else:
                # int/float dtype: missing values are encoded as -1
                train_[column] = train[column].fillna(-1)
                test_[column] = test[column].fillna(-1)
                train_[column] = train_[column].map(lambda x: x if x not in non_overlapping else -2)
                test_[column] = test_[column].map(lambda x: x if x not in non_overlapping else -2)

            le = preprocessing.LabelEncoder()
            # Fit on this column only; concatenating the full frames for every
            # column copied all other columns for nothing.
            le.fit(pd.concat([train_[column], test_[column]], axis=0))
            train_[column] = le.transform(train_[column])
            test_[column] = le.transform(test_[column])
        except Exception as exc:
            raise RuntimeError(
                f"category2num failed while encoding column {column!r}"
            ) from exc
    return train_, test_

# Replace the object-dtype feature columns of both splits with integer labels.
train, test = category2num(train, test, object_feats)

In [12]:
# drop because they exist only in train
train = train.drop(columns=["click_or_view_at", "cv_at", "last_cv_interval", "last_paid_interval"])

# feature engineering

In [13]:
# Tag each split so the rows can be separated again after the joint feature engineering.
train["type"], test["type"] = "train", "test"
all_df = (
    pd.concat([train, test])
    .sort_values(["uid", "imp_at"])
    .reset_index(drop=True)
)

In [14]:
# Running impression number per uid (1, 2, 3, ...) in the current row order of all_df.
all_df["count"] = all_df.assign(count=1).groupby("uid")["count"].cumsum()

In [15]:
def _create_whole_imp_at_feature(whole_df, var) -> pd.DataFrame:

    imp_at = pd.to_datetime(whole_df['imp_at']) + timedelta(hours=9) # utc -> asia/tokyo
    out_df = pd.DataFrame()
    out_df['hours'] = imp_at.dt.hour
    out_df['dayofweek'] = imp_at.dt.dayofweek
    out_df['hour_zone'] = pd.cut(out_df['hours'].values, bins=[-np.inf, 6, 12, 18, np.inf]).codes

    def _create_pivot(input_df, c, column='dayofweek', values='hours'):
        _df = pd.pivot_table(data=input_df, index=[c], columns=[column], values=values, aggfunc='count')
        _df = _df.fillna(0)
        _df.columns = [column + '=' + str(x) for x in _df.columns]

        # index ごとに正規化して割合にする
        _df = _df.div(_df.sum(axis=1), axis=0)
        return pd.merge(input_df[c], _df, on=c, how='left').drop(columns=[c]).add_prefix(c + '_')

    # uid での集約
    for c in [var]:
        _df = out_df.copy()
        _df[c] = whole_df[c]
        _feat = pd.concat([
            _create_pivot(_df, c),
            _create_pivot(_df, c, column='hour_zone', values='dayofweek')  
        ], axis=1)
        out_df = pd.concat([out_df, _feat], axis=1)

    out_df['request_id'] = whole_df['request_id']
    return out_df

# Per-uid share features; the helper columns are dropped and the rest is joined back by request_id.
user_imp_df = _create_whole_imp_at_feature(all_df, "uid").drop(
    columns=["hours", "dayofweek", "hour_zone"]
)
all_df = pd.merge(all_df, user_imp_df, how="left", on="request_id")

In [16]:
# Number of distinct campaigns per calendar date.
camp_num = all_df.groupby("date")["campaign_id"].nunique().rename("camp_num").reset_index()
all_df = pd.merge(all_df, camp_num, how="left", on="date")

# Number of distinct uids per calendar date.
user_num = all_df.groupby("date")["uid"].nunique().rename("user_num").reset_index()
all_df = pd.merge(all_df, user_num, how="left", on="date")

# date / hour were only needed as helpers up to this point.
all_df = all_df.drop(columns=["date", "hour"])

In [17]:
# Separate the combined frame back into its two splits and remove the marker column.
train = all_df.loc[all_df["type"] == "train"].drop(columns="type").reset_index(drop=True)
test = all_df.loc[all_df["type"] == "test"].drop(columns="type").reset_index(drop=True)

In [18]:
# Order both splits chronologically by impression time.
# NOTE(review): after this the test rows are no longer in their original file
# order; anything that relies on positional alignment with test.csv should be
# verified.
train = train.sort_values("imp_at").reset_index(drop=True)
test = test.sort_values("imp_at").reset_index(drop=True)

In [19]:
# Count encoding: replace each value by its number of occurrences in train + test.
ce_feats = ["uid"]
for c in ce_feats:
    freq = pd.concat([train[c], test[c]], axis=0).value_counts()
    train[c], test[c] = train[c].map(freq), test[c].map(freq)

# separation for validation

In [20]:
# request_id is a row identifier, not a model feature.
train = train.drop(columns=["request_id"])
test = test.drop(columns=["request_id"])

In [21]:
# Time-based hold-out: rows after 2020-05-21 form the validation part, the rest the training part.
mask = train["imp_at"] > "2020-05-21"
train_df = train[~mask].copy()
valid_df = train[mask].copy()

In [22]:
# The raw timestamp has served its purpose (sorting / splitting); remove it from every frame.
train = train.drop(columns=["imp_at"])
test = test.drop(columns=["imp_at"])

valid_df = valid_df.drop(columns=["imp_at"])
train_df = train_df.drop(columns=["imp_at"])

In [23]:
# Remove the target column from test (it came along from the concat with train).
test = test.drop(columns="target")

In [24]:
train.shape, test.shape 

((1997595, 39), (390095, 38))

In [25]:
train.columns

Index(['target', 'adnw_id', 'adspot_id', 'adspot_video_format_id',
       'advertiser_id', 'app_id', 'auction_type_id', 'campaign_id',
       'first_login_interval', 'frequency', 'from_click', 'game_feed_id',
       'game_template_id', 'is_interstitial', 'item_id', 'last_login_interval',
       'login_frequency', 'max_login_interval', 'media_app_id', 'os', 'pos',
       'uid', 'mst_advertiser_order_id', 'mst_user_type_id', 'app_users',
       'count', 'uid_dayofweek=0', 'uid_dayofweek=1', 'uid_dayofweek=2',
       'uid_dayofweek=3', 'uid_dayofweek=4', 'uid_dayofweek=5',
       'uid_dayofweek=6', 'uid_hour_zone=0', 'uid_hour_zone=1',
       'uid_hour_zone=2', 'uid_hour_zone=3', 'camp_num', 'user_num'],
      dtype='object')

In [26]:
# Keep only the categorical feature names that are still columns of train.
object_feats = [feat for feat in object_feats if feat in train.columns]
object_feats

['advertiser_id',
 'app_id',
 'campaign_id',
 'game_feed_id',
 'game_template_id',
 'media_app_id',
 'mst_advertiser_order_id']

In [27]:
# Drop the names of the large intermediate frames; they are not used below.
del all_df, user_imp_df

# stratified lgb

In [28]:
# stratified k fold
n_folds = 5


def pr_auc_metric(y_predicted, y_true):
    """Custom LightGBM ``feval``: average precision (PR-AUC).

    ``y_true`` is the dataset object handed over by the trainer; its labels
    are read with ``get_label()``. Returns ``(name, value, True)``, where the
    final ``True`` is the higher-is-better flag.
    """
    labels = y_true.get_label()
    ap = average_precision_score(labels, y_predicted)
    return 'pr_auc', ap, True

def modelling(seed):
    """Train LightGBM with stratified K-fold CV on the global ``train`` frame.

    Reads the module-level ``train``, ``test``, ``n_folds`` and
    ``object_feats``. Each fold model is early-stopped on its validation fold
    using ``pr_auc_metric``.

    Parameters
    ----------
    seed : int
        Passed to LightGBM as ``random_seed``. The fold split itself always
        uses ``random_state=0``.

    Returns
    -------
    pred_value : np.ndarray
        Test predictions averaged over the ``n_folds`` fold models.
    valid : np.ndarray
        Out-of-fold predictions, positionally aligned with ``train``.
    feature_importance_df : pd.DataFrame
        Per-fold feature importances plus their ``Average``, ``Std`` and
        ``Cv`` (Std / Average).
    """
    print("seed:", seed)
    lgbm_params = {'objective': 'binary', 'metric': 'None', 'boosting_type': 'gbdt', 
               'tree_learner': 'serial', 'learning_rate': 0.1, "num_leaves": 10, 'random_seed':seed,'max_depth': 5}
    
    y = train.target.copy()
    X = train.drop(['target'],axis=1).copy()
    
    skf=StratifiedKFold(n_splits = n_folds, shuffle=True, random_state=0)
    
    valid = np.zeros([X.shape[0]])
    features_list = [i for i in X.columns]
    pred_value = np.zeros(test.shape[0])
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    
    for i , (train_index, val_index) in enumerate(skf.split(X, y)):
        print("fold: ", i)
        
        X_train = X.iloc[train_index,:]
        y_train = y.iloc[train_index]

        X_valid = X.iloc[val_index,:]
        y_valid = y.iloc[val_index]
        
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        
        model = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
           num_boost_round=10000,early_stopping_rounds=20,verbose_eval = 30,
                          feval=pr_auc_metric, categorical_feature = sorted(object_feats))
        
        valid_predict = model.predict(X_valid, num_iteration = model.best_iteration)
        valid[val_index] = valid_predict
        # Average (not sum) the fold models so the result stays a probability
        # in [0, 1] and can be blended with other models on an equal footing.
        pred_value += model.predict(test, num_iteration = model.best_iteration) / n_folds
        feature_importance_df["Fold_"+str(i+1)] = model.feature_importance()
    
    # Columns 1..n_folds are the Fold_* columns added in the loop above.
    feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]
    
    return pred_value, valid, feature_importance_df

# Seed-average the stratified LightGBM run (a single seed at the moment).
seeds = [0]
validation = np.zeros(train.shape[0])
pred_value = np.zeros(test.shape[0])
for seed_ in seeds:
    test_pred, oof_predict, feature_importance_df = modelling(seed_)
    validation = validation + oof_predict / len(seeds)
    pred_value = pred_value + test_pred / len(seeds)

# Out-of-fold average precision over the whole training set.
score = average_precision_score(train["target"], validation)
print(score)

seed: 0
fold:  0
Training until validation scores don't improve for 20 rounds
[30]	training's pr_auc: 0.268626	valid_1's pr_auc: 0.263553
[60]	training's pr_auc: 0.289384	valid_1's pr_auc: 0.276239
[90]	training's pr_auc: 0.302522	valid_1's pr_auc: 0.283454
[120]	training's pr_auc: 0.313047	valid_1's pr_auc: 0.287489
[150]	training's pr_auc: 0.320987	valid_1's pr_auc: 0.29044
[180]	training's pr_auc: 0.328624	valid_1's pr_auc: 0.292342
[210]	training's pr_auc: 0.336107	valid_1's pr_auc: 0.294477
[240]	training's pr_auc: 0.341841	valid_1's pr_auc: 0.295171
[270]	training's pr_auc: 0.34733	valid_1's pr_auc: 0.296092
[300]	training's pr_auc: 0.354018	valid_1's pr_auc: 0.296875
[330]	training's pr_auc: 0.358091	valid_1's pr_auc: 0.296127
Early stopping, best iteration is:
[324]	training's pr_auc: 0.358903	valid_1's pr_auc: 0.297754
fold:  1
Training until validation scores don't improve for 20 rounds
[30]	training's pr_auc: 0.266308	valid_1's pr_auc: 0.265062
[60]	training's pr_auc: 0.2888

In [29]:
feature_importance_df.sort_values("Average", ascending=False)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Average,Std,Cv
4,app_id,737,886,713,712,292,668.0,198.807445,0.297616
6,campaign_id,438,500,450,418,263,413.8,80.10593,0.193586
17,media_app_id,366,470,361,331,136,332.8,109.045679,0.327661
10,game_feed_id,227,274,223,207,73,200.8,67.688699,0.337095
20,uid,163,191,178,165,112,161.8,26.858146,0.165996
3,advertiser_id,185,179,164,152,110,158.0,26.630809,0.168549
24,count,105,130,102,95,47,95.8,27.110146,0.282987
21,mst_advertiser_order_id,94,119,105,95,52,93.0,22.387497,0.240726
14,last_login_interval,81,88,64,59,25,63.4,21.950854,0.346228
7,first_login_interval,66,90,52,64,26,59.6,20.838426,0.349638


# lgb with a time-based hold-out split

In [30]:
def pr_auc_metric(y_predicted, y_true):
    """Custom LightGBM ``feval``: average precision (PR-AUC).

    Reads the labels from ``y_true`` with ``get_label()`` and returns
    ``(name, value, True)``; the final ``True`` is the higher-is-better flag.
    """
    labels = y_true.get_label()
    ap = average_precision_score(labels, y_predicted)
    return 'pr_auc', ap, True

def modelling_ts(seed):
    """Train one LightGBM model on ``train_df`` and validate on ``valid_df``.

    Reads the module-level ``train_df``, ``valid_df``, ``test`` and
    ``object_feats``. Training is early-stopped on ``valid_df`` using
    ``pr_auc_metric``, and the validation average precision is printed.

    Parameters
    ----------
    seed : int
        Passed to LightGBM as ``random_seed``.

    Returns
    -------
    pred_value : np.ndarray
        Predictions for ``test``.
    valid_predict : np.ndarray
        Predictions for ``valid_df``.
    feature_importance_df : pd.DataFrame
        Columns ``Feature`` and ``Importance``.
    """
    print("seed:", seed)
    lgbm_params = {'objective': 'binary', 'metric': 'None', 'boosting_type': 'gbdt', 
               'tree_learner': 'serial', 'learning_rate': 0.1, "num_leaves": 10, 'random_seed':seed,'max_depth': 5}
    y_train = train_df.target.copy()
    X_train = train_df.drop(['target'],axis=1).copy()
    y_valid = valid_df.target.copy()
    X_valid = valid_df.drop(['target'],axis=1).copy()
    
    feature_importance_df = pd.DataFrame(list(X_train.columns), columns=["Feature"])
        
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        
    model = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
       num_boost_round=10000,early_stopping_rounds=20,verbose_eval = 10, feval=pr_auc_metric, categorical_feature = sorted(object_feats))
    
    valid_predict = model.predict(X_valid, num_iteration = model.best_iteration)
    feature_importance_df["Importance"] = model.feature_importance()
        
    score = average_precision_score(y_valid, valid_predict)
    print("average precision score = {}".format(score))
    pred_value = model.predict(test, num_iteration = model.best_iteration)
    
    return pred_value, valid_predict, feature_importance_df

# Seed-average the time-split LightGBM run (a single seed at the moment).
seeds = [0]
validation_ts = np.zeros(valid_df.shape[0])
pred_value_ts = np.zeros(test.shape[0])
for seed_ in seeds:
    test_pred, oof_predict, feature_importance_df = modelling_ts(seed_)
    validation_ts = validation_ts + oof_predict / len(seeds)
    pred_value_ts = pred_value_ts + test_pred / len(seeds)

# Average precision on the time-based hold-out.
ts_score = average_precision_score(valid_df["target"], validation_ts)
print(ts_score)

seed: 0
Training until validation scores don't improve for 20 rounds
[10]	training's pr_auc: 0.253792	valid_1's pr_auc: 0.194218
[20]	training's pr_auc: 0.267442	valid_1's pr_auc: 0.201826
[30]	training's pr_auc: 0.279788	valid_1's pr_auc: 0.208617
[40]	training's pr_auc: 0.289771	valid_1's pr_auc: 0.21245
[50]	training's pr_auc: 0.298863	valid_1's pr_auc: 0.216081
[60]	training's pr_auc: 0.304348	valid_1's pr_auc: 0.21785
[70]	training's pr_auc: 0.308938	valid_1's pr_auc: 0.219115
[80]	training's pr_auc: 0.313683	valid_1's pr_auc: 0.22092
[90]	training's pr_auc: 0.317084	valid_1's pr_auc: 0.221964
[100]	training's pr_auc: 0.321071	valid_1's pr_auc: 0.222909
[110]	training's pr_auc: 0.324957	valid_1's pr_auc: 0.223915
[120]	training's pr_auc: 0.328823	valid_1's pr_auc: 0.225041
[130]	training's pr_auc: 0.331761	valid_1's pr_auc: 0.225572
[140]	training's pr_auc: 0.33502	valid_1's pr_auc: 0.225803
[150]	training's pr_auc: 0.33769	valid_1's pr_auc: 0.22633
[160]	training's pr_auc: 0.3419

In [31]:
feature_importance_df.sort_values("Importance", ascending=False)

Unnamed: 0,Feature,Importance
4,app_id,444
6,campaign_id,329
17,media_app_id,212
20,uid,141
10,game_feed_id,112
3,advertiser_id,102
21,mst_advertiser_order_id,78
24,count,66
22,mst_user_type_id,41
14,last_login_interval,40


# stratified xgboost

In [32]:
# https://xgboost.readthedocs.io/en/latest/parameter.html
def pr_auc_metric(y_predicted, y_true):
    """Custom XGBoost ``feval``: negated average precision.

    Reads the labels from ``y_true`` with ``get_label()`` and returns
    ``(name, value)``. The value is negated, presumably so that a lower
    value means a better model for early stopping (confirm against the
    xgboost version in use).
    """
    labels = y_true.get_label()
    negative_ap = - average_precision_score(labels, y_predicted)
    return 'pr_auc', negative_ap


n_folds = 5
np.random.seed(0)
def modelling_xgb():
    """Train XGBoost with stratified K-fold CV on the global ``train`` frame.

    Reads the module-level ``train``, ``test`` and ``n_folds``. Each fold
    model is early-stopped on its validation fold using ``pr_auc_metric``.

    Returns
    -------
    pred_value : np.ndarray
        Test predictions averaged over the ``n_folds`` fold models.
    valid : np.ndarray
        Out-of-fold predictions, positionally aligned with ``train``.
    """
    y = train.target
    X = train.drop(['target'],axis=1)
    xgb_params = {"objective" : "binary:logistic", "max_depth" : 5, "learning_rate" : 0.1, "tree_method": "gpu_hist"}
    
    skf=StratifiedKFold(n_splits = n_folds, shuffle=True, random_state=0)
    
    valid = np.zeros([X.shape[0]])
    pred_value = np.zeros(test.shape[0])
    X_test = xgb.DMatrix(test.copy())
    
    for i , (train_index, val_index) in enumerate(skf.split(X, y)):
        print("fold: ", i)
        
        X_train = X.iloc[train_index,:]
        y_train = y.iloc[train_index]

        X_valid = X.iloc[val_index,:]
        y_valid = y.iloc[val_index]
        
        xgb_train = xgb.DMatrix(X_train, label = y_train)
        xgb_eval = xgb.DMatrix(X_valid, label = y_valid)
        watchlist = [(xgb_train, "train"), (xgb_eval, "eval")]
        num_boost_round = 100000
        
        model = xgb.train(
        xgb_params, xgb_train, num_boost_round, watchlist,
        early_stopping_rounds=10, verbose_eval = 50,
        feval=pr_auc_metric,
    )
                
        valid_predict = model.predict(xgb_eval, ntree_limit = model.best_ntree_limit)
        valid[val_index] = valid_predict
        # Average (not sum) the fold models so the result stays a probability
        # in [0, 1] and can be blended with other models on an equal footing.
        pred_value += model.predict(X_test, ntree_limit = model.best_ntree_limit) / n_folds
    
    return pred_value, valid

# Fit the XGBoost folds, then score the out-of-fold predictions on the full training set.
pred_value_xgb, validation_xgb = modelling_xgb()

score_xgb = average_precision_score(train["target"], validation_xgb)
print(score_xgb)

fold:  0
[0]	train-error:0.03273	eval-error:0.03268	train-pr_auc:-0.14573	eval-pr_auc:-0.14584
Multiple eval metrics have been passed: 'eval-pr_auc' will be used for early stopping.

Will train until eval-pr_auc hasn't improved in 10 rounds.
[50]	train-error:0.03249	eval-error:0.03251	train-pr_auc:-0.25610	eval-pr_auc:-0.25362
[100]	train-error:0.03217	eval-error:0.03217	train-pr_auc:-0.28790	eval-pr_auc:-0.28167
[150]	train-error:0.03207	eval-error:0.03211	train-pr_auc:-0.30226	eval-pr_auc:-0.29164
[200]	train-error:0.03198	eval-error:0.03208	train-pr_auc:-0.31355	eval-pr_auc:-0.29789
[250]	train-error:0.03188	eval-error:0.03203	train-pr_auc:-0.32331	eval-pr_auc:-0.30313
[300]	train-error:0.03178	eval-error:0.03199	train-pr_auc:-0.33129	eval-pr_auc:-0.30622
[350]	train-error:0.03171	eval-error:0.03195	train-pr_auc:-0.33887	eval-pr_auc:-0.30905
[400]	train-error:0.03165	eval-error:0.03196	train-pr_auc:-0.34545	eval-pr_auc:-0.31083
[450]	train-error:0.03155	eval-error:0.03200	train-pr_a

# submission

In [33]:
# Blend the three models with a plain mean and write the submission file.
lgb_sub = pd.read_csv(DIR + "atmaCup7__sample_submission.csv")
# NOTE(review): predictions are assigned by position. The test rows were
# re-sorted during feature engineering (by uid/imp_at, then by imp_at) and
# request_id was dropped, so this assumes the sample submission follows the
# same row order -- verify before submitting.
lgb_sub["target"] = (pred_value+pred_value_ts+pred_value_xgb)/3
# Fixed-precision score in the file name; slicing str(score) depended on the
# length of the float's repr and could produce an empty score.
lgb_sub.to_csv(f"atmacup7_{score:.6f}.csv", index = False)