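- Merge all data (note: the `merge_all` calls in this notebook are currently commented out, so the model is trained on the raw train/test columns only)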

In [1]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.metrics import average_precision_score

In [2]:
# Directory holding the competition CSV files (relative to the working directory).
DIR = "../input/20201115at/"

# Train / test tables.
train = pd.read_csv(f"{DIR}train.csv")
test = pd.read_csv(f"{DIR}test.csv")

# Auxiliary lookup tables; these are the inputs merge_all() expects.
campaign = pd.read_csv(f"{DIR}campaign.csv")
map_game_feed_native_video_assets = pd.read_csv(
    f"{DIR}map_game_feed_native_video_assets.csv"
)
advertiser_video = pd.read_csv(f"{DIR}advertiser_video.csv")
advertiser_converted_video = pd.read_csv(f"{DIR}advertiser_converted_video.csv")

# preprocess

In [3]:
def _left_join_prefixed(
    res: pd.DataFrame,
    table: pd.DataFrame,
    left_keys: list,
    right_keys: list,
    prefix: str,
) -> pd.DataFrame:
    """Left-join ``table`` onto ``res`` and prefix ``table``'s non-key columns.

    Every column of ``table`` that is not listed in ``right_keys`` is renamed
    to ``prefix + column``; the right-hand key columns are dropped from the
    result once the join is done.
    """
    # rename() returns a new frame, so the caller's ``table`` is never mutated.
    renamed = table.rename(
        columns=lambda c: c if c in right_keys else f"{prefix}{c}"
    )
    merged = pd.merge(
        res, renamed, left_on=left_keys, right_on=right_keys, how="left"
    )
    return merged.drop(columns=right_keys)  # remove the lookup table's keys


def merge_all(
    df: pd.DataFrame,
    campaign: pd.DataFrame,
    map_game_feed_native_video_assets: pd.DataFrame,
    advertiser_video: pd.DataFrame,
    advertiser_converted_video: pd.DataFrame,
) -> pd.DataFrame:
    """Enrich ``df`` with campaign and video information via successive left joins.

    Join order (all ``how="left"``, so every row of ``df`` is kept):

    1. ``campaign`` on ``campaign_id == id``.
    2. ``map_game_feed_native_video_assets`` on
       ``game_feed_id == mst_game_feed_id``.
    3. ``advertiser_video`` twice, once through
       ``horizontal_mst_advertiser_video_id`` and once through
       ``vertical_mst_advertiser_video_id`` (together with ``advertiser_id``);
       its non-key columns get a ``horizontal_`` / ``vertical_`` prefix.
    4. ``advertiser_converted_video`` twice, keyed on the horizontal / vertical
       video id plus ``game_feed_id`` and ``video_template_id``; its non-key
       columns get a ``horizontal_converted_`` / ``vertical_converted_`` prefix.

    The key columns of each right-hand table are dropped after its join.
    None of the input frames is modified.

    Returns:
        A new DataFrame with the columns of ``df`` followed by the joined columns.
    """
    # merge df and campaign
    res = pd.merge(
        df, campaign, left_on="campaign_id", right_on="id", how="left"
    ).drop(columns=["id", "mst_advertiser_id"])  # remove campaign keys

    # merge res and map_game_feed_native_video_assets
    res = pd.merge(
        res,
        map_game_feed_native_video_assets,
        left_on="game_feed_id",
        right_on="mst_game_feed_id",
        how="left",
    ).drop(columns=["mst_game_feed_id"])  # remove mapping-table key

    orientations = ("horizontal", "vertical")

    # merge res and advertiser_video (horizontal case, then vertical case)
    video_keys = ["id", "mst_advertiser_id"]
    for orientation in orientations:
        res = _left_join_prefixed(
            res,
            advertiser_video,
            left_keys=[f"{orientation}_mst_advertiser_video_id", "advertiser_id"],
            right_keys=video_keys,
            prefix=f"{orientation}_",
        )

    # merge res and advertiser_converted_video (horizontal case, then vertical case)
    converted_keys = [
        "mst_advertiser_video_id",
        "mst_game_feed_id",
        "mst_video_template_id",
    ]
    for orientation in orientations:
        res = _left_join_prefixed(
            res,
            advertiser_converted_video,
            left_keys=[
                f"{orientation}_mst_advertiser_video_id",
                "game_feed_id",
                "video_template_id",
            ],
            right_keys=converted_keys,
            prefix=f"{orientation}_converted_",
        )

    return res

In [4]:
# Parse imp_at into datetimes; the train/validation split later compares it to a date.
train = train.assign(imp_at=pd.to_datetime(train["imp_at"]))

In [5]:
# Columns removed from train only (presumably absent from test -- TODO confirm).
train_only_cols = ["click_or_view_at", "cv_at", "last_cv_interval", "last_paid_interval"]

# request_id is removed from both frames.
train = train.drop(columns=train_only_cols + ["request_id"])
test = test.drop(columns=["request_id"])

In [6]:
# NOTE: the merge_all() enrichment is disabled here, so every following cell
# works on the raw train/test columns only. Uncomment both lines to enable it.
#train = merge_all(train, campaign, map_game_feed_native_video_assets, advertiser_video, advertiser_converted_video)
#test = merge_all(test, campaign, map_game_feed_native_video_assets, advertiser_video, advertiser_converted_video)

In [7]:
# Every object-dtype column of train is label-encoded; missing values are
# replaced by the "-999" sentinel string first.
object_feats = [col for col in train.columns if str(train[col].dtype) == "object"]
train[object_feats] = train[object_feats].fillna("-999")
test[object_feats] = test[object_feats].fillna("-999")

# Fit each encoder on train and test together so both frames share the same codes
# and no test value is unseen at transform time.
for col in object_feats:
    print(col)
    encoder = preprocessing.LabelEncoder()
    encoder.fit(list(pd.concat([train[col], test[col]])))
    train[col], test[col] = (
        encoder.transform(list(train[col])),
        encoder.transform(list(test[col])),
    )

advertiser_id
app_id
campaign_id
country_code
game_feed_id
game_template_id
media_app_id
os_version
uid
video_template_id


In [8]:
# Preview the first rows of train after label encoding.
train.head()

Unnamed: 0,imp_at,target,adnw_id,adspot_id,adspot_video_format_id,advertiser_id,app_id,auction_type_id,campaign_id,category_id,...,last_login_interval,login_frequency,max_login_interval,media_app_id,os,os_version,pos,uid,user_type_id,video_template_id
0,2020-04-28 00:04:14.639,0,4,1111,2.0,20,14872,1.0,403,8497,...,11320.0,10.0,2607.0,0,1,87,0,273693,1,6
1,2020-04-28 00:09:13.759,0,28,1081,2.0,121,8301,2.0,91,0,...,12589.0,2.0,13540.0,8995,2,152,1,427139,2,4
2,2020-04-28 00:09:14.708,0,42,1123,3.0,5,12948,1.0,114,0,...,10254.0,9.0,33725.0,347,1,55,0,1005266,1,0
3,2020-04-28 00:10:40.291,0,8,1113,2.0,22,12379,1.0,75,0,...,39618.0,4.0,7377.0,6145,1,91,0,492541,1,7
4,2020-04-28 00:11:45.895,0,22,1061,,135,196,4.0,331,0,...,35652.0,5.0,164759.0,2190,1,71,0,698493,1,0


In [9]:
# Time-based hold-out: rows with imp_at later than 2020-05-21 00:00 become the
# validation set, everything else stays in train.
mask = train["imp_at"] > "2020-05-21"
valid = train[mask].copy()
train = train[~mask]

In [10]:
# (rows, columns) of the train and validation splits.
train.shape, valid.shape

((1568715, 30), (428880, 30))

In [11]:
# Columns that must not reach the model. imp_at was only needed for the time split.
remove_features = ["imp_at"]

# Disabled experiment: additionally drop constant (zero-variance) columns.
# NOTE: X_train is only created in the modelling cell, so this loop cannot be
# re-enabled at this position as written.
#for i in X_train.columns:
#    if (X_train[i].std() == 0) and i not in remove_features:
#        remove_features.append(i)

train, valid, test = (
    frame.drop(columns=remove_features) for frame in (train, valid, test)
)

In [12]:
# Shapes again, after dropping remove_features.
train.shape, valid.shape

((1568715, 29), (428880, 29))

In [13]:
# Preview train once more, now without the imp_at column.
train.head()

Unnamed: 0,target,adnw_id,adspot_id,adspot_video_format_id,advertiser_id,app_id,auction_type_id,campaign_id,category_id,country_code,...,last_login_interval,login_frequency,max_login_interval,media_app_id,os,os_version,pos,uid,user_type_id,video_template_id
0,0,4,1111,2.0,20,14872,1.0,403,8497,1,...,11320.0,10.0,2607.0,0,1,87,0,273693,1,6
1,0,28,1081,2.0,121,8301,2.0,91,0,1,...,12589.0,2.0,13540.0,8995,2,152,1,427139,2,4
2,0,42,1123,3.0,5,12948,1.0,114,0,1,...,10254.0,9.0,33725.0,347,1,55,0,1005266,1,0
3,0,8,1113,2.0,22,12379,1.0,75,0,1,...,39618.0,4.0,7377.0,6145,1,91,0,492541,1,7
4,0,22,1061,,135,196,4.0,331,0,1,...,35652.0,5.0,164759.0,2190,1,71,0,698493,1,0


# modelling

In [14]:
# LightGBM training parameters, passed unchanged to lgb.train().
lgbm_params = dict(
    objective="binary",
    metric="None",  # the string "None": evaluation is supplied through a custom feval
    boosting_type="gbdt",
    tree_learner="serial",
    learning_rate=0.03,
    num_leaves=10,
    random_seed=44,
    max_depth=5,
)

def pr_auc_metric(y_predicted, y_true):
    """Custom evaluation function for ``lgb.train(feval=...)``: average precision.

    Args:
        y_predicted: predicted scores for the evaluated dataset.
        y_true: despite its name, an object exposing ``get_label()``
            (LightGBM passes the evaluated ``Dataset`` here), not a label array.

    Returns:
        A ``(name, value, is_higher_better)`` tuple: ``"pr_auc"``, the average
        precision score, and ``True``.
    """
    labels = y_true.get_label()
    value = average_precision_score(labels, y_predicted)
    return "pr_auc", value, True

# Separate the target from the features for the training and hold-out frames.
# drop() already returns a new frame, so no extra copy is needed.
y_train = train["target"].copy()
X_train = train.drop(columns=["target"])
y_valid = valid["target"].copy()
X_valid = valid.drop(columns=["target"])

# Placeholders kept for compatibility; they are not used by the code in this cell.
valid_lgb = pd.DataFrame(np.zeros([X_train.shape[0]]))
real = np.array([])

# Feature order used for training; reused below to align the test frame.
features_list = list(X_train.columns)
feature_importance_df = pd.DataFrame(features_list, columns=["Feature"])

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

# Train with early stopping, evaluated with the custom PR-AUC feval; the
# label-encoded object columns are declared as categorical features.
model = lgb.train(
    lgbm_params,
    lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    num_boost_round=10000,
    early_stopping_rounds=20,
    verbose_eval=10,
    feval=pr_auc_metric,
    categorical_feature=sorted(object_feats),
)

valid_predict = model.predict(X_valid, num_iteration=model.best_iteration)
feature_importance_df["Importance"] = model.feature_importance()

# Leftover from a cross-validated variant (n_folds is not defined in this notebook).
#feature_importance_df["Average"] = np.mean(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
#feature_importance_df["Std"] = np.std(feature_importance_df.iloc[:,1:n_folds+1], axis=1)
#feature_importance_df["Cv"] = feature_importance_df["Std"] / feature_importance_df["Average"]

score = average_precision_score(y_valid, valid_predict)
print("average precision score = {}".format(score))

# Predict on exactly the training features, in the training order. Selecting by
# name guards against test carrying extra or differently ordered columns, and a
# missing feature surfaces as a KeyError.
pred_value = model.predict(test[features_list], num_iteration=model.best_iteration)

New categorical_feature is ['advertiser_id', 'app_id', 'campaign_id', 'country_code', 'game_feed_id', 'game_template_id', 'media_app_id', 'os_version', 'uid', 'video_template_id']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 20 rounds
[10]	training's pr_auc: 0.206436	valid_1's pr_auc: 0.17013
[20]	training's pr_auc: 0.211127	valid_1's pr_auc: 0.174428
[30]	training's pr_auc: 0.217629	valid_1's pr_auc: 0.180438
[40]	training's pr_auc: 0.218033	valid_1's pr_auc: 0.180603
[50]	training's pr_auc: 0.219806	valid_1's pr_auc: 0.182111
[60]	training's pr_auc: 0.223073	valid_1's pr_auc: 0.184023
[70]	training's pr_auc: 0.225921	valid_1's pr_auc: 0.186256
[80]	training's pr_auc: 0.22803	valid_1's pr_auc: 0.188128
[90]	training's pr_auc: 0.229867	valid_1's pr_auc: 0.189727
[100]	training's pr_auc: 0.232297	valid_1's pr_auc: 0.19132
[110]	training's pr_auc: 0.234481	valid_1's pr_auc: 0.192553
[120]	training's pr_auc: 0.236548	valid_1's pr_auc: 0.193783
[130]	training's pr_auc: 0.238408	valid_1's pr_auc: 0.19457
[140]	training's pr_auc: 0.239846	valid_1's pr_auc: 0.195464
[150]	training's pr_auc: 0.24174	valid_1's pr_auc: 0.196642
[160]	training's pr_auc: 0.243227	vali

In [15]:
# Features ranked by the importance reported by the trained model, highest first.
feature_importance_df.sort_values(by="Importance", ascending=False)

Unnamed: 0,Feature,Importance
6,campaign_id,801
4,app_id,790
13,game_feed_id,773
25,uid,500
21,media_app_id,286
3,advertiser_id,273
9,first_login_interval,135
20,max_login_interval,114
19,login_frequency,98
18,last_login_interval,98


# submission

In [16]:
# Fill the sample submission with the test predictions.
lgb_sub = pd.read_csv(DIR + "atmaCup7__sample_submission.csv")
lgb_sub["target"] = pred_value

# Tag the file name with the validation score at a fixed six decimals.
# Slicing str(score)[:-10] gave a precision that depended on the length of the
# float's repr, and an empty tag for short reprs such as "0.25".
lgb_sub.to_csv("atmacup7_lgb{:.6f}.csv".format(score), index=False)