- go back to version 71
- average LightGBM predictions over 3 seeds

In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from numba import jit
from datetime import timedelta
from sklearn import preprocessing
from tqdm.notebook import tqdm_notebook
from sklearn.metrics import average_precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

# Suppress every warning for the remainder of the run (library deprecation noise
# would otherwise flood the notebook output).
warnings.filterwarnings('ignore')
# Enable tqdm's pandas integration with a shared progress-bar label.
tqdm_notebook.pandas(desc="progress: ")
# Widen the notebook display limits.
# Fully qualified option names are required: pandas treats the key as a regex
# pattern, and the bare "max_rows"/"max_columns" patterns also match
# "styler.render.max_rows"/"styler.render.max_columns" on recent releases,
# which raises OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 100)

In [2]:
# Directory holding the competition CSV files (also reused when the sample
# submission is read at the end of the notebook).
DIR = "../input/20201115at/"
# train / test rows and the campaign master table that is joined onto both
# via campaign_id.
train = pd.read_csv(DIR+"train.csv")
test = pd.read_csv(DIR+"test.csv")
campaign = pd.read_csv(DIR+"campaign.csv")

#advertiser_video = pd.read_csv(DIR+"advertiser_video.csv")
#map_game_feed_native_video_assets = pd.read_csv(DIR+'map_game_feed_native_video_assets.csv')
#advertiser_converted_video = pd.read_csv(DIR+"advertiser_converted_video.csv")

In [3]:
def target_encoding(X_train, y_train, X_test, col_name, replace=True, option = "mean"):
    """Target-encode ``col_name`` with out-of-fold statistics for the train rows.

    Test rows are encoded with the statistic computed on all of ``X_train``;
    train rows are encoded out-of-fold (5 stratified folds, fixed seed) so a row
    never sees its own target.

    Args:
        X_train: training frame (not modified; a copy is returned).
        y_train: target values aligned with ``X_train``.
        X_test: test frame (not modified; a copy is returned).
        col_name: categorical column to encode.
        replace: if True overwrite ``col_name``; otherwise write ``te_<col_name>``.
        option: ``"sum"`` aggregates the target by sum, anything else by mean.

    Returns:
        ``(X_train, X_test)`` copies carrying the encoded column.
    """
    how = "sum" if option == "sum" else "mean"
    out_col = col_name if replace else f'te_{col_name}'

    encoded_train = X_train.copy()
    encoded_test = X_test.copy()
    pairs = pd.DataFrame({'trans_col': encoded_train[col_name], 'target': y_train})

    # Test side: statistic over the whole training set.
    overall_stat = pairs.groupby('trans_col')['target'].agg(how)
    encoded_test[out_col] = encoded_test[col_name].map(overall_stat)

    # Train side: statistic fitted on the other folds only.
    oof_values = np.zeros(encoded_train.shape[0])
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for fit_idx, holdout_idx in splitter.split(encoded_train, y_train):
        fold_stat = pairs.iloc[fit_idx, :].groupby('trans_col')['target'].agg(how)
        oof_values[holdout_idx] = encoded_train[col_name].iloc[holdout_idx].map(fold_stat)

    encoded_train[out_col] = oof_values
    if not replace and col_name != "uid":
        # Categories absent from test get NaN in the train encoding.
        train_only_class = list(set(encoded_train[col_name]) - set(encoded_test[col_name]))
        encoded_train.loc[encoded_train[col_name].isin(train_only_class), out_col] = np.nan

    return encoded_train, encoded_test

# preprocess

In [4]:
# uids that appear in exactly one of the two splits
train_only_uid = list(set(train['uid']).difference(set(test['uid'])))
test_only_uid = list(set(test['uid']).difference(set(train['uid'])))

In [5]:
# Attach the campaign master attributes to every row, then discard the join key
# ("id") and the advertiser key that came along with it.
_campaign_extra_cols = ["mst_advertiser_id", "id"]
train = pd.merge(train, campaign, how="left", left_on="campaign_id", right_on="id").drop(_campaign_extra_cols, axis=1)
test = pd.merge(test, campaign, how="left", left_on="campaign_id", right_on="id").drop(_campaign_extra_cols, axis=1)

In [6]:
# Parse the impression timestamp and derive its calendar date on both splits.
for _frame in (train, test):
    _frame["imp_at"] = pd.to_datetime(_frame["imp_at"])
    _frame["date"] = _frame["imp_at"].dt.date
del _frame  # do not keep an extra reference to the test frame alive

In [7]:
# Timestamp columns that are parsed for train only.
for _ts_col in ("click_or_view_at", "cv_at"):
    train[_ts_col] = pd.to_datetime(train[_ts_col])

In [8]:
# os_version: dropped because its distribution differs between train and test
_shifted_cols = ['os_version']
# suspected noise features
_noisy_cols = ['user_type_id', "category_id",  "video_template_id", "country_code", "game_feed_asset_type_id", "header_bidding"]

for _frame in (train, test):
    _frame.drop(_shifted_cols, axis=1, inplace=True)
    _frame.drop(_noisy_cols, axis=1, inplace=True)
del _frame  # do not keep an extra reference to the test frame alive

In [9]:
# Aggregates computed over train+test together. Only the columns each aggregate
# needs are concatenated, instead of copying both full frames three times.

# app_users: number of distinct users seen per app.
app_ranking = (
    pd.concat([train[["app_id", "uid"]], test[["app_id", "uid"]]])
    .groupby("app_id")["uid"].nunique()
    .reset_index(drop=False)
)
app_ranking.columns = ["app_id", "app_users"]
train = train.merge(app_ranking, on="app_id", how="left")
test = test.merge(app_ranking, on="app_id", how="left")

# madia_app_num: number of distinct media apps per (game_feed_id, campaign_id).
# (The column-name spelling is kept as-is because it is a model feature name.)
_media_cols = ["game_feed_id", "campaign_id", "media_app_id"]
media_app_rank = (
    pd.concat([train[_media_cols], test[_media_cols]])
    .groupby(["game_feed_id", "campaign_id"])["media_app_id"].nunique()
    .reset_index(drop=False)
)
media_app_rank.columns = ["game_feed_id", "campaign_id", "madia_app_num"]
train = train.merge(media_app_rank, on=["game_feed_id", "campaign_id"], how="left")
test = test.merge(media_app_rank, on=["game_feed_id", "campaign_id"], how="left")

# first_date: earliest impression date of each campaign.
# The previous `.groupby("campaign_id").first()` returned whichever row happened
# to come first in file order (and aggregated every column to do so); taking the
# minimum impression timestamp does not depend on row order.
campaign_start_date = (
    pd.concat([train[["campaign_id", "imp_at"]], test[["campaign_id", "imp_at"]]])
    .groupby("campaign_id")["imp_at"].min()
    .dt.date
    .reset_index(drop=False)
)
campaign_start_date.columns = ["campaign_id", "first_date"]
train = train.merge(campaign_start_date, on=["campaign_id"], how="left")
test = test.merge(campaign_start_date, on=["campaign_id"], how="left")

In [10]:
# imp_at_diff: days until the same user's next impression, following the row
# order of train followed by test (NaN for a user's last row).
# The result is written back by position. Merging it back on ("imp_at", "uid")
# would multiply rows whenever a user has several impressions with the same
# timestamp, because that key is not guaranteed to be unique.
_n_train = len(train)
tmp_all = pd.concat([train[["imp_at", "uid"]], test[["imp_at", "uid"]]]).reset_index(drop=True)
_imp_at_diff = tmp_all.groupby("uid")["imp_at"].diff(-1).dt.total_seconds() / 3600 / 24 * -1
train["imp_at_diff"] = _imp_at_diff.iloc[:_n_train].to_numpy()
test["imp_at_diff"] = _imp_at_diff.iloc[_n_train:].to_numpy()

del tmp_all, _imp_at_diff

In [11]:
# imp_at_diff_by_user_advertiser: days until the same user's next impression for
# the same advertiser, following the row order of train followed by test.
# Written back by position: the key ("imp_at", "uid", "advertiser_id") is not
# guaranteed to be unique, so a merge on it could duplicate rows.
_n_train = len(train)
_diff_keys = ["imp_at", "uid", "advertiser_id"]
tmp_all = pd.concat([train[_diff_keys], test[_diff_keys]]).reset_index(drop=True)
_diff_by_adv = tmp_all.groupby(["uid", "advertiser_id"])["imp_at"].diff(-1).dt.total_seconds() / 3600 / 24 * -1
train["imp_at_diff_by_user_advertiser"] = _diff_by_adv.iloc[:_n_train].to_numpy()
test["imp_at_diff_by_user_advertiser"] = _diff_by_adv.iloc[_n_train:].to_numpy()

del tmp_all, _diff_by_adv

In [12]:
# Mean / std of max_login_interval per advertiser over train+test.
_login_cols = ["advertiser_id", "max_login_interval"]
tmp_all = pd.concat([train[_login_cols], test[_login_cols]]).reset_index(drop=True)
max_login_interval = (
    tmp_all.groupby("advertiser_id")["max_login_interval"]
    .agg(["mean", "std"])
    .add_prefix("max_login_interval_")
)

train = pd.merge(train, max_login_interval, how="left", on="advertiser_id")
test = pd.merge(test, max_login_interval, how="left", on="advertiser_id")
del tmp_all, max_login_interval

In [13]:
# Per-date summary statistics of imp_at_diff, computed from train rows only and
# joined onto both splits.
imp_at_diff_stats_by_date = (
    train.groupby("date")["imp_at_diff"]
    .agg(["mean", "std", "max", "min"])
    .add_prefix("imp_at_diff_by_date_")
)

train = pd.merge(train, imp_at_diff_stats_by_date, how="left", on=["date"])
test = pd.merge(test, imp_at_diff_stats_by_date, how="left", on=["date"])
del imp_at_diff_stats_by_date

In [14]:
# Temporary hour-of-day key, used only for the join below.
for _frame in (train, test):
    _frame["hour"] = _frame["imp_at"].dt.hour
del _frame  # do not keep an extra reference to the test frame alive

# Per-hour summary statistics of imp_at_diff, computed from train rows only.
imp_at_diff_stats_by_hour = (
    train.groupby("hour")["imp_at_diff"]
    .agg(["mean", "std", "max", "min"])
    .add_prefix("imp_at_diff_by_hour_")
)

train = pd.merge(train, imp_at_diff_stats_by_hour, how="left", on=["hour"])
test = pd.merge(test, imp_at_diff_stats_by_hour, how="left", on=["hour"])
del imp_at_diff_stats_by_hour
del train["hour"], test["hour"]

In [15]:
# Whole days elapsed between the campaign's first_date and the row's impression
# date. The subtraction is element-wise, so `.days` is read from each resulting
# difference individually.
train["days_from_start"] = (train["date"] - train["first_date"]).apply(lambda x: x.days)
test["days_from_start"] = (test["date"] - test["first_date"]).apply(lambda x: x.days)

In [16]:
ids = ["advertiser_id", "campaign_id"]

def join(df):
    """Return the elements of ``df`` as one space-separated string.

    Every element is passed through ``str`` first, so non-string ids are fine.
    """
    return " ".join(map(str, df))

# Per-user TF-IDF embeddings of the ids a user was shown.
# For each id column: build one "document" per uid (the space-joined ids from
# train+test), vectorise it with TF-IDF and compress it with truncated SVD.
concatenated_ = pd.concat([train[ids + ["uid"]], test[ids + ["uid"]]], axis=0).reset_index(drop=True)

tfidf_dfs = []        # one uid-indexed DataFrame of SVD components per id column
tfidf_svd_feats = []  # names of all generated component columns
for id_ in ids:
    print(id_)
    docs = concatenated_.groupby("uid")[id_].apply(join)
    # Cap the vocabulary at 80% of the number of distinct ids.
    max_features = int(concatenated_[id_].nunique() * 0.8)
    # NOTE(review): TfidfVectorizer is used with its default tokenizer and
    # lower-casing; confirm that the raw id strings survive as single tokens.
    tv = TfidfVectorizer(max_features=max_features)
    X = tv.fit_transform(docs)

    # 20 components for advertiser_id, 50 for every other id column.
    if id_ == "advertiser_id":
        n_components = 20
    else:
        n_components = 50
    svd = TruncatedSVD(n_components=n_components, random_state=0)
    X = svd.fit_transform(X)
    df = pd.DataFrame(X, columns=[f"tfidf_{id_}_{i}" for i in range(n_components)])
    # Index by uid so the frame can later be merged on "uid".
    df.index = docs.index
    tfidf_dfs.append(df)
    tfidf_svd_feats += [f"tfidf_{id_}_{i}" for i in range(n_components)]

advertiser_id
campaign_id


In [17]:
def add_LDA_feats(train, test, key, values, n_components=3):
    """Add LDA topic weights of the ``key`` x ``values`` co-occurrence matrix.

    A crosstab of ``key`` against ``values`` is built over train+test, an LDA
    model is fitted on it, and the topic weights of each ``key`` are joined
    onto both frames as ``lda_<key>_<values>_<i>`` columns.

    Args:
        train: training frame (not modified).
        test: test frame (not modified).
        key: column whose categories become the LDA "documents".
        values: column whose categories become the LDA "words".
        n_components: number of LDA topics (default 3, the original constant).

    Returns:
        ``(train, test)`` as new frames with the topic columns appended.
    """
    # Build the concatenated key/value pairs once and reuse them.
    pairs = pd.concat([train[[key, values]], test[[key, values]]])
    cross_info = pd.crosstab(pairs[key], pairs[values])

    lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
    topic_cols = ["lda_"+str(key)+"_"+str(values)+"_"+str(i) for i in range(n_components)]
    out = pd.DataFrame(lda.fit_transform(cross_info), columns=topic_cols)
    # Index by key so the merges below can match on the index level name.
    out.index = cross_info.index

    # merge() returns new frames, so the inputs do not need defensive copies.
    train = train.merge(out, on=key, how="left")
    test = test.merge(out, on=key, how="left")
    return train, test

# train, test = add_LDA_feats(train, test, key='uid', values='advertiser_id')

In [18]:
# popular_game: 1 for game feeds whose train conversion rate exceeds 0.1 with
# more than 100 train rows, else 0.
game_feed_target = train.groupby("game_feed_id")["target"].mean().reset_index(drop=False)
game_feed_counts = train["game_feed_id"].value_counts().reset_index(drop=False)
game_feed_counts.columns = ["game_feed_id", "game_feed_counts"]
game_feed_target = game_feed_target.merge(game_feed_counts, on ="game_feed_id", how="left")
popular_game_feed = list(game_feed_target[(game_feed_target.target > 0.1) & (game_feed_target.game_feed_counts > 100)]["game_feed_id"])

# Vectorised membership test; the previous per-row `x in list` inside apply()
# scanned the whole list for every row.
train["popular_game"] = train["game_feed_id"].isin(popular_game_feed).astype(int)
test["popular_game"] = test["game_feed_id"].isin(popular_game_feed).astype(int)

# popular_campaign: same rule applied per campaign.
camp_target = train.groupby("campaign_id")["target"].mean().reset_index(drop=False)
camp_counts = train["campaign_id"].value_counts().reset_index(drop=False)
camp_counts.columns = ["campaign_id", "campaign_counts"]
camp_target = camp_target.merge(camp_counts, on ="campaign_id", how="left")
popular_campaign = list(camp_target[(camp_target.target > 0.1) & (camp_target.campaign_counts > 100)]["campaign_id"])

train["popular_campaign"] = train["campaign_id"].isin(popular_campaign).astype(int)
test["popular_campaign"] = test["campaign_id"].isin(popular_campaign).astype(int)

In [19]:
#y_train = train['target'].copy()
#train, test = target_encoding(train, y_train, test, "advertiser_id", False, "mean")
#train, test = target_encoding(train, y_train, test, "app_id", False, "mean")
#train, test = target_encoding(train, y_train, test, "campaign_id", False, "mean")
#train, test = target_encoding(train, y_train, test, "item_id", False, "mean")
#train, test = target_encoding(train, y_train, test, "mst_advertiser_order_id", False, "mean")
#del y_train

In [20]:
def shift(xs, n):
    """Shift ``xs`` by ``n`` positions along its first axis, padding with NaN.

    Args:
        xs: array-like input.
        n: positive values move elements towards the end (the first ``n`` slots
            become NaN), negative values move them towards the start (the last
            ``|n|`` slots become NaN), and ``0`` returns an unshifted copy.

    Returns:
        A new array of the same shape. Integer input is promoted to float
        because NaN cannot be stored in an integer array.
    """
    xs = np.asarray(xs)
    if np.issubdtype(xs.dtype, np.integer):
        # Assigning NaN into an integer array raises ValueError.
        xs = xs.astype(float)
    if n == 0:
        # `xs[:-0]` is empty, so the general branch cannot handle a zero shift.
        return xs.copy()

    e = np.empty_like(xs)
    if n > 0:
        e[:n] = np.nan
        e[n:] = xs[:-n]
    else:
        e[n:] = np.nan
        e[:n] = xs[-n:]
    return e

def same_category_ornot(a):
    """Return a boolean array marking elements equal to their predecessor.

    ``a`` is compared element-wise with ``shift(a, 1)``, so the first position
    is compared against the NaN padding.

    The former ``@jit`` decorator was removed: this function only delegates to
    the plain-Python ``shift`` helper, which Numba cannot compile in nopython
    mode, so the decorator could fail at call time and never gave a speed-up.
    """
    return shift(a,1) == a

def make_category_to_check_next_is_same(X_train, X_test, col_name):
    """Flag rows whose ``col_name`` equals the same user's previous row.

    Adds an integer column ``<col_name>_next`` (1 = same value as the preceding
    row of that ``uid`` in the frame's row order, 0 otherwise; a user's first
    row is always 0) to copies of both frames.

    The flag is computed with ``groupby("uid").shift(1)``, which stays aligned
    with the original row index. The earlier ``groupby.apply`` + ``np.hstack``
    approach returned values in group-sorted order and therefore attached them
    to the wrong rows unless the frame was already sorted by ``uid``.

    Args:
        X_train: training frame containing ``uid`` and ``col_name``.
        X_test: test frame containing ``uid`` and ``col_name``.
        col_name: column to compare between consecutive rows of a user.

    Returns:
        ``(X_train, X_test)`` copies with the new flag column.
    """
    X_train = X_train.copy()
    X_test = X_test.copy()

    flag_col = str(col_name)+"_next"
    for frame in (X_train, X_test):
        previous = frame.groupby("uid")[col_name].shift(1)
        frame[flag_col] = previous.eq(frame[col_name]).astype(int)

    return X_train, X_test

# train, test = make_category_to_check_next_is_same(train, test, "app_id")

In [21]:
# Object-typed columns to label-encode; identifiers and date helper columns are
# left out.
_not_categorical = {"request_id", "uid", "date", "hour", "first_date"}
object_feats = [
    col for col in train.columns
    if str(train[col].dtype) == "object" and col not in _not_categorical
]

def get_non_overlapping(train: pd.DataFrame, test: pd.DataFrame, column: str):
    """Return the values of ``column`` that occur in only one of the two frames."""
    train_values = set(train[column].unique())
    test_values = set(test[column].unique())
    return train_values.symmetric_difference(test_values)

def category2num(train: pd.DataFrame, test: pd.DataFrame, columns: list):
    """Label-encode ``columns`` consistently across train and test.

    For every column, missing values get a dedicated class, values that occur
    in only one of the two frames are collapsed into a shared "other" class,
    and the result is integer-encoded with a ``LabelEncoder`` fitted on the
    values of both frames.

    Args:
        train: training frame (not modified).
        test: test frame (not modified).
        columns: names of the columns to encode.

    Returns:
        ``(train_, test_)`` copies with the listed columns replaced by codes.
    """
    train_ = train.copy()
    test_ = test.copy()
    for column in columns:
        print(column)
        non_overlapping = get_non_overlapping(train, test, column)
        if train[column].dtype == np.dtype("O"):
            # object dtype: missing values become the 'missing' class
            train_[column] = train[column].fillna("missing")
            test_[column] = test[column].fillna("missing")
            train_[column] = train_[column].map(lambda x: x if x not in non_overlapping else "other")
            test_[column] = test_[column].map(lambda x: x if x not in non_overlapping else "other")

        else:
            # int/float dtype: missing values become -1, one-sided values -2
            train_[column] = train[column].fillna(-1)
            test_[column] = test[column].fillna(-1)
            train_[column] = train_[column].map(lambda x: x if x not in non_overlapping else -2)
            test_[column] = test_[column].map(lambda x: x if x not in non_overlapping else -2)

        le = preprocessing.LabelEncoder()
        # Fit on this column only; concatenating the complete frames for every
        # column copied all train+test data once per iteration.
        le.fit(pd.concat([train_[column], test_[column]], axis=0).reset_index(drop=True))
        train_[column] = le.transform(train_[column])
        test_[column] = le.transform(test_[column])
    return train_, test_

train, test = category2num(train, test, object_feats)

advertiser_id
app_id
campaign_id
game_feed_id
game_template_id
media_app_id
mst_advertiser_order_id


In [22]:
# Per-user mean of last_paid_interval, computed from train rows and joined onto
# both splits.
last_paid_interval_df = (
    train.groupby("uid")["last_paid_interval"].mean()
    .rename("last_paid_interval_mean")
    .reset_index(drop=False)
)

train = pd.merge(train, last_paid_interval_df, how="left", on="uid")
test = pd.merge(test, last_paid_interval_df, how="left", on="uid")

# Users that occur in only one split get the sentinel -1 on both sides.
_train_only_rows = train['uid'].isin(train_only_uid)
_test_only_rows = test['uid'].isin(test_only_uid)
train.loc[_train_only_rows, 'last_paid_interval_mean'] = -1
test.loc[_test_only_rows, 'last_paid_interval_mean'] = -1
del _train_only_rows, _test_only_rows

In [23]:
# dropped because these columns exist only in train
_train_only_cols = ['click_or_view_at', 'cv_at','last_cv_interval', 'last_paid_interval']
train.drop(columns=_train_only_cols, inplace=True)

# feature engineering

In [24]:
# count: running number of rows seen so far for each uid, following the row
# order of train followed by test (1 for a user's first row).
all_df = pd.concat([train, test], ignore_index=True)
all_df["count"] = 1
all_df["count"] = all_df.groupby("uid")["count"].cumsum()

# Bring the counter back via request_id.
# NOTE(review): assumes request_id is unique across train+test; otherwise these
# merges would duplicate rows - confirm.
train = pd.merge(train, all_df[["request_id", "count"]], on="request_id", how="left")
test = pd.merge(test, all_df[["request_id", "count"]], on="request_id", how="left")

In [25]:
# Join the per-user TF-IDF/SVD components (first two entries of tfidf_dfs).
for _tfidf in (tfidf_dfs[0], tfidf_dfs[1]):
    train = train.merge(_tfidf, how="left", on="uid")
    test = test.merge(_tfidf, how="left", on="uid")
del _tfidf  # drop the extra reference so the frame can be freed later

In [26]:
# Number of distinct campaigns / users per date over train+test.
camp_num = all_df.groupby("date")["campaign_id"].nunique().rename("camp_num").reset_index(drop=False)
user_num = all_df.groupby("date")["uid"].nunique().rename("user_num").reset_index(drop=False)

for _daily in (camp_num, user_num):
    train = train.merge(_daily, how="left", on="date")
    test = test.merge(_daily, how="left", on="date")
del _daily  # drop the extra reference so the frame can be freed later

In [27]:
# te_uid: out-of-fold sum of the target per uid (test uses the full-train sum).
train, test = target_encoding(train, train['target'].copy(), test, "uid", replace=False, option="sum")

In [28]:
# Count encoding: replace each value by its frequency over train+test.
ce_feats = ["uid"]
for c in ce_feats:
    freq = pd.concat([train[c], test[c]]).value_counts()
    train[c], test[c] = train[c].map(freq), test[c].map(freq)

In [29]:
# Release intermediate frames that are no longer needed.
del all_df, camp_num, user_num, tfidf_dfs, concatenated_

# Helper / identifier columns that must not reach the model.
_helper_cols = ["date", "first_date", 'request_id', "imp_at"]
train.drop(columns=_helper_cols, inplace=True)
test.drop(columns=_helper_cols, inplace=True)

# stratified lgb

In [30]:
print(train.shape, test.shape)

(1997847, 116) (390095, 115)


In [31]:
print("all columns: ", train.columns)
# Keep only the categorical feature names that are still present in train;
# this list is later passed to LightGBM as categorical_feature.
object_feats = [i for i in object_feats if i in train.columns]
print("categorical feats: ", object_feats) 

all columns:  Index(['target', 'adnw_id', 'adspot_id', 'adspot_video_format_id',
       'advertiser_id', 'app_id', 'auction_type_id', 'campaign_id',
       'first_login_interval', 'frequency',
       ...
       'tfidf_campaign_id_43', 'tfidf_campaign_id_44', 'tfidf_campaign_id_45',
       'tfidf_campaign_id_46', 'tfidf_campaign_id_47', 'tfidf_campaign_id_48',
       'tfidf_campaign_id_49', 'camp_num', 'user_num', 'te_uid'],
      dtype='object', length=116)
categorical feats:  ['advertiser_id', 'app_id', 'campaign_id', 'game_feed_id', 'game_template_id', 'media_app_id', 'mst_advertiser_order_id']


In [32]:
# stratified k fold
n_folds = 5
def pr_auc_metric(y_predicted, y_true):
    """Custom LightGBM eval function: average precision (PR-AUC).

    Args:
        y_predicted: predicted scores.
        y_true: dataset object exposing the labels through ``get_label()``.

    Returns:
        ``('pr_auc', score, True)``; the trailing ``True`` marks the metric as
        higher-is-better.
    """
    labels = y_true.get_label()
    score = average_precision_score(labels, y_predicted)
    return 'pr_auc', score, True

def modelling(seed):
    """Train LightGBM with stratified K-fold CV for one seed.

    Reads the module-level ``train``, ``test``, ``n_folds`` and ``object_feats``.

    Args:
        seed: value passed to LightGBM as ``random_seed``. The fold split itself
            uses a fixed ``random_state=0``, so folds are identical across seeds.

    Returns:
        Tuple ``(pred_value, valid, feature_importance_df)``:
        ``pred_value`` is the test prediction averaged over the folds,
        ``valid`` the out-of-fold prediction for every train row, and
        ``feature_importance_df`` the per-fold feature importances with their
        ``Average``, ``Std`` and ``Cv`` (Std / Average) columns.
    """
    print("seed:", seed)
    lgbm_params = {'objective': 'binary', 'metric': 'None', 'boosting_type': 'gbdt', 
               'tree_learner': 'serial', 'learning_rate': 0.1, "num_leaves": 10, 'random_seed':seed,'max_depth': 5,
               } 
    
    y = train.target
    X = train.drop(['target'],axis=1)
    
    skf=StratifiedKFold(n_splits = n_folds, shuffle=True, random_state=0)
    
    valid = np.zeros([X.shape[0]])
    features_list = [i for i in X.columns]
    # Select the test columns in training order once, up front, so a different
    # column order (or extra columns) in `test` cannot silently misalign
    # features at prediction time.
    X_test = test[features_list]
    pred_value = np.zeros(X_test.shape[0])
    feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])
    
    for i , (train_index, val_index) in enumerate(skf.split(X, y)):
        print("fold: ", i)
        
        X_train = X.iloc[train_index,:]
        y_train = y.iloc[train_index]
        
        X_valid = X.iloc[val_index,:]
        y_valid = y.iloc[val_index]
        
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        
        # 'metric' is 'None', so early stopping is driven by the custom PR-AUC
        # eval function only.
        model = lgb.train(lgbm_params, lgb_train,valid_sets=[lgb_train, lgb_eval],
           num_boost_round=10000,early_stopping_rounds=20,verbose_eval = 30,
                          feval=pr_auc_metric, categorical_feature = object_feats)
        
        valid_predict = model.predict(X_valid, num_iteration = model.best_iteration)
        valid[val_index] = valid_predict
        pred_value += model.predict(X_test, num_iteration = model.best_iteration) / n_folds
        feature_importance_df["Fold_"+str(i+1)] = model.feature_importance()
    
    # Columns 1..n_folds are the per-fold importances added in the loop above.
    feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
    feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]
    
    return pred_value, valid, feature_importance_df

# Seed averaging: run the CV pipeline once per seed and average both the
# out-of-fold and the test predictions with equal weights.
seeds = [0,1,2]
validation = np.zeros([train.shape[0]])
pred_value = np.zeros([test.shape[0]])
for seed_ in seeds:
    # feature_importance_df keeps only the importances of the last seed
    test_pred, oof_predict, feature_importance_df = modelling(seed_)
    validation = validation + oof_predict / len(seeds)
    pred_value = pred_value + test_pred / len(seeds)

# CV score of the seed-averaged out-of-fold predictions
score = average_precision_score(train.target, validation)
print(score)

seed: 0
fold:  0
Training until validation scores don't improve for 20 rounds
[30]	training's pr_auc: 0.377339	valid_1's pr_auc: 0.369668
[60]	training's pr_auc: 0.400858	valid_1's pr_auc: 0.387293
[90]	training's pr_auc: 0.414143	valid_1's pr_auc: 0.393005
[120]	training's pr_auc: 0.42277	valid_1's pr_auc: 0.395107
[150]	training's pr_auc: 0.430109	valid_1's pr_auc: 0.397857
Early stopping, best iteration is:
[151]	training's pr_auc: 0.430555	valid_1's pr_auc: 0.398204
fold:  1
Training until validation scores don't improve for 20 rounds
[30]	training's pr_auc: 0.372334	valid_1's pr_auc: 0.372372
[60]	training's pr_auc: 0.399944	valid_1's pr_auc: 0.393695
[90]	training's pr_auc: 0.410516	valid_1's pr_auc: 0.398172
[120]	training's pr_auc: 0.420818	valid_1's pr_auc: 0.402367
[150]	training's pr_auc: 0.428283	valid_1's pr_auc: 0.404408
[180]	training's pr_auc: 0.434251	valid_1's pr_auc: 0.404962
[210]	training's pr_auc: 0.440673	valid_1's pr_auc: 0.405808
Early stopping, best iteration 

In [33]:
feature_importance_df.sort_values("Average", ascending=False).head(150).reset_index(drop=True)

Unnamed: 0,Feature,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Average,Std,Cv
0,campaign_id,243,230,202,205,265,229.0,23.655866,0.103301
1,app_id,229,187,167,143,281,201.4,48.783604,0.242222
2,imp_at_diff_by_user_advertiser,103,100,97,83,105,97.6,7.787169,0.079787
3,media_app_id,86,85,68,56,120,83.0,21.61481,0.260419
4,advertiser_id,81,69,71,75,78,74.8,4.4,0.058824
5,last_paid_interval_mean,67,58,64,54,73,63.2,6.675328,0.105622
6,game_feed_id,77,49,45,40,87,59.6,18.778711,0.315079
7,te_uid,58,53,47,50,58,53.2,4.354308,0.081848
8,mst_advertiser_order_id,33,38,34,26,37,33.6,4.223742,0.125707
9,tfidf_advertiser_id_8,29,27,28,26,30,28.0,1.414214,0.050508


# stratified xgb

In [34]:
# def pr_auc_metric(y_predicted, y_true):
#     return 'pr_auc', - average_precision_score(y_true.get_label(), y_predicted)

# n_folds=5
# np.random.seed(0)
# def modelling_xgb():
#     y = train.target
#     X = train.drop(['target'],axis=1)
#     xgb_params = {"objective" : "binary:logistic", "max_depth" : 5, "learning_rate" : 0.1, "tree_method": "gpu_hist"}
    
#     skf=StratifiedKFold(n_splits = n_folds, shuffle=True, random_state=0)
    
#     valid = np.zeros([X.shape[0]])
#     pred_value = np.zeros(test.shape[0])
#     X_test = xgb.DMatrix(test)
    
#     for i , (train_index, val_index) in enumerate(skf.split(X, y)):
#         print("fold: ", i)
        
#         X_train = X.iloc[train_index,:]
#         y_train = y.iloc[train_index]
        
#         X_valid = X.iloc[val_index,:]
#         y_valid = y.iloc[val_index]
        
#         xgb_train = xgb.DMatrix(X_train, label = y_train)
#         xgb_eval = xgb.DMatrix(X_valid, label = y_valid)
#         watchlist = [(xgb_train, "train"), (xgb_eval, "eval")]
#         num_boost_round = 100000
        
#         model = xgb.train(
#         xgb_params, xgb_train, num_boost_round, watchlist,
#         early_stopping_rounds=10, verbose_eval = 50,
#         feval=pr_auc_metric,
#     )
                
#         valid_predict = model.predict(xgb_eval, ntree_limit = model.best_ntree_limit)
#         valid[val_index] = valid_predict
#         pred_value += model.predict(X_test, ntree_limit = model.best_ntree_limit) / n_folds
    
#     return pred_value, valid

# pred_value_xgb, validation_xgb = modelling_xgb()
# score_xgb = average_precision_score(train.target, validation_xgb)
# print(score_xgb)

# submission

In [35]:
# Write the seed-averaged LightGBM prediction into the sample submission.
# (The XGBoost blend is disabled together with its training cell.)
lgb_sub = pd.read_csv(DIR + "atmaCup7__sample_submission.csv")
lgb_sub["target"] = pred_value
# Tag the file with the CV score at a fixed precision. The former
# `str(score)[:-10]` slice depended on the length of the float's repr and
# produced an empty or truncated tag for short representations.
lgb_sub.to_csv(f"atmacup7_{score:.5f}.csv", index = False)