In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np
import gc
import json
import xgboost as xgb
import pickle
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score

In [2]:
# Column name -> pandas dtype string for the feature table.
# Columns sharing a dtype are grouped with dict.fromkeys and unpacked in the
# same order as the hand-written literal, so the resulting mapping is the same.
dtypes = {
    **dict.fromkeys(['app', 'device', 'os', 'channel'], 'uint16'),
    **dict.fromkeys(['hour', 'is_attributed'], 'uint8'),
    # *_count columns
    **dict.fromkeys(
        [
            'ip_day_hour_count',
            'ip_os_day_hour_count',
            'ip_app_day_hour_count',
            'ip_app_os_day_hour_count',
            'app_day_hour_count',
            'ip_device_os_count',
            'ip_app_device_os_count',
        ],
        'uint32',
    ),
    # *_mean columns
    **dict.fromkeys(
        [
            'ip_device_os_mean',
            'ip_app_device_os_mean',
            'ip_app_device_mean',
            'app_device_os_mean',
        ],
        'float16',
    ),
    # *_time2nextclick / *_time2previousclick columns
    **dict.fromkeys(
        [
            'ip_device_os_time2nextclick',
            'ip_app_device_os_time2nextclick',
            'ip_app_device_time2nextclick',
            'ip_device_os_time2previousclick',
            'ip_app_device_os_time2previousclick',
            'ip_app_device_time2previousclick',
        ],
        'int32',
    ),
    # *_countfromfuture / *_countfrompast columns
    **dict.fromkeys(
        [
            'ip_device_os_countfromfuture',
            'ip_app_device_os_countfromfuture',
            'ip_app_device_countfromfuture',
            'ip_device_os_countfrompast',
            'ip_app_device_os_countfrompast',
            'ip_app_device_countfrompast',
        ],
        'uint32',
    ),
    # *_lasttimediff / *_firsttimediff columns
    **dict.fromkeys(
        [
            'ip_device_os_lasttimediff',
            'ip_app_device_os_lasttimediff',
            'ip_app_device_lasttimediff',
            'ip_device_os_firsttimediff',
            'ip_app_device_os_firsttimediff',
            'ip_app_device_firsttimediff',
        ],
        'int32',
    ),
    # matrixFact_* and *_regression columns
    **dict.fromkeys(
        [
            'matrixFact_user_iposdeviceapp_item_app',
            'matrixFact_user_ip_item_appdeviceos',
            'matrixFact_user_ipchannel_item_appdeviceos',
            'ip_device_os_regression',
            'ip_app_device_os_regression',
            'ip_app_device_regression',
            'ip_app_device_os_channel_regression',
        ],
        'float16',
    ),
    # attributed_* columns and the last matrixFact column
    **dict.fromkeys(['attributed_timediffmax', 'attributed_timediffmin'], 'int32'),
    **dict.fromkeys(
        ['attributed_timediff', 'matrixFact_user_ipappdeviceos_item_channel'],
        'float16',
    ),
}

# Load evaluation file

In [3]:
# Location of the evaluation feature table (absolute, machine-specific path).
evl_path = '/home/kai/data/kaggle/talkingdata/wl/data/equalhour/day9_features_equalhour_supplementV3_feature42.csv'

# NOTE(review): the `dtypes` mapping is not passed to read_csv here, so pandas
# infers the column dtypes — confirm that is intended.
evl = pd.read_csv(filepath_or_buffer=evl_path)
print('loading evl done!')

# Label column as a NumPy array; the scoring cells pass it to roc_auc_score.
groudtruth = evl.loc[:, 'is_attributed'].values

loading evl done!


# LightGBM

In [5]:
# Evaluate a saved LightGBM booster on `evl` at several tree counts and report
# the ROC AUC of each prediction and of their element-wise mean.
model_path = '/home/kai/data/kaggle/talkingdata/wl/data/lightgbm/'
model_name = 'paramsCombo_1_lr0.05_scale99_ntree500'
feature_file = '/home/kai/data/kaggle/talkingdata/wl/data/lightgbm/featurecolsV3_col38_scale99.json'
# Tree counts passed to predict(num_iteration=...); predictions are averaged below.
num_trees_lightgbm = [300, 370, 470]

modelfile = model_path + model_name
model_load_lightgbm = lgb.Booster(model_file=modelfile)
print('load model done!')

# Context manager closes the JSON file instead of leaking the handle.
with open(feature_file) as feature_fh:
    feature_col_load_lightgbm = json.load(feature_fh)
print('load feature cols done!')

# NOTE(review): columns are selected with the booster's own feature_name(),
# not with feature_col_load_lightgbm — confirm the two lists agree.
# The selection does not depend on the tree count, so it is done once.
evl_features_lightgbm = evl[model_load_lightgbm.feature_name()]

preds_lightgbm = []
for ntree in num_trees_lightgbm:
    ntree = int(ntree)
    print('-------')
    cur_pred = model_load_lightgbm.predict(evl_features_lightgbm, num_iteration=ntree)
    print('predict done!')
    cur_roc = roc_auc_score(groudtruth, cur_pred)
    preds_lightgbm.append(cur_pred)
    print('predicting... {} done! roc is {}'.format(ntree, cur_roc))
# Release the column subset once every prediction has been made.
del evl_features_lightgbm

# Ensemble: element-wise mean over the per-tree-count predictions.
pred_lightgbm = np.mean(preds_lightgbm, axis=0)
roc_lightgbm = roc_auc_score(groudtruth, pred_lightgbm)
print('overall roc: {}'.format(roc_lightgbm))

load model done!
load feature cols done!
-------
predict done!
predicting... 300 done! roc is 0.9865661195463861
-------
predict done!
predicting... 370 done! roc is 0.9872490201775088
-------
predict done!
predicting... 470 done! roc is 0.9881203090924928
overall roc: 0.9875862021306904


# XGBoost

In [17]:
# Evaluate a pickled XGBoost classifier on `evl` at several tree limits and
# report the ROC AUC of each prediction and of their element-wise mean.
model_path = '/home/kai/data/kaggle/talkingdata/wl/data/xgboost/'
model_name = 'all_suppelement_xgbtree_900_depth7_scale99_lr0.1'
model_postfix = '.pickle.dat'
feature_file_xgb = model_path + 'xgb-featurecolsV3_col38_scale99.json'
# NOTE(review): the recorded run printed the same AUC for 250, 350 and 450,
# which suggests the limit stops having an effect somewhere above 150 trees —
# confirm, since identical predictions are then averaged three times.
num_trees_load_xgboost = [80, 120, 150, 250, 350, 450]

modelfile = model_path + model_name + model_postfix
# pickle.load executes arbitrary code from the file: only load trusted models.
# Context manager closes the file instead of leaking the handle.
with open(modelfile, "rb") as model_fh:
    model_load_xgb = pickle.load(model_fh)
model_load_xgb.set_params(n_jobs=8, nthread=8)
print('load model done!')

with open(feature_file_xgb) as feature_fh:
    feature_col_load_xgb = json.load(feature_fh)
print('load feature cols done!')

# The feature matrix does not depend on the tree limit, so build it once.
evl_features_xgb = evl[feature_col_load_xgb].values

preds_xgb = []
for ntree in num_trees_load_xgboost:
    ntree = int(ntree)
    print('-----------')
    # Column 1 of predict_proba is kept as the score.
    cur_pred = model_load_xgb.predict_proba(evl_features_xgb, ntree_limit=ntree)[:, 1]
    preds_xgb.append(cur_pred)
    cur_roc = roc_auc_score(groudtruth, cur_pred)
    print('predicting... {} done! roc is {}'.format(ntree, cur_roc))
# Release the feature matrix once every prediction has been made.
del evl_features_xgb

# Ensemble: element-wise mean over the per-tree-limit predictions.
pred_xgb = np.mean(preds_xgb, axis=0)
roc_xgb = roc_auc_score(groudtruth, pred_xgb)
print('overall roc: {}'.format(roc_xgb))

load model done!
load feature cols done!
-----------
predicting... 80 done! roc is 0.9834868698550314
-----------
predicting... 120 done! roc is 0.9845585483539724
-----------
predicting... 150 done! roc is 0.9851541042537096
-----------
predicting... 250 done! roc is 0.9853402461429296
-----------
predicting... 350 done! roc is 0.9853402461429296
-----------
predicting... 450 done! roc is 0.9853402461429296
overall roc: 0.9850219470181852


# CatBoost

In [24]:
# Evaluate a saved CatBoost classifier on `evl` at several tree counts and
# report the ROC AUC of each prediction and of their element-wise mean.
model_path = '/home/kai/data/kaggle/talkingdata/wl/data/catboost/'
model_name = 'all_suppelement_tree_1200_depth6_scale398_lr0.05'
model_postfix = ''
feature_file_catboost = model_path + 'catboost-featurecolsV3_col38_scale398.json'
# Upper tree bounds passed as ntree_end; predictions are averaged below.
num_trees_load_catboost = [300, 650, 850, 1100, 1200]

modelfile = model_path + model_name + model_postfix
model_load_catboost = CatBoostClassifier().load_model(fname=modelfile)
model_load_catboost.set_params(thread_count=30)
print('load model done!')

# Context manager closes the JSON file instead of leaking the handle.
with open(feature_file_catboost) as feature_fh:
    feature_col_load_catboost = json.load(feature_fh)
print('load feature cols done!')

# The feature matrix does not depend on the tree count, so build it once.
evl_features_catboost = evl[feature_col_load_catboost].values

preds_catboost = []
for ntree in num_trees_load_catboost:
    ntree = int(ntree)
    print('-----------')
    # Column 1 of predict_proba is kept as the score.
    cur_pred = model_load_catboost.predict_proba(
        evl_features_catboost, ntree_start=0, ntree_end=ntree
    )[:, 1]
    preds_catboost.append(cur_pred)
    cur_roc = roc_auc_score(groudtruth, cur_pred)
    print('predicting... {} done! roc is {}'.format(ntree, cur_roc))
# Release the feature matrix once every prediction has been made.
del evl_features_catboost

# Ensemble: element-wise mean over the per-tree-count predictions.
pred_catboost = np.mean(preds_catboost, axis=0)
roc_catboost = roc_auc_score(groudtruth, pred_catboost)
print('overall roc: {}'.format(roc_catboost))

load model done!
load feature cols done!
-----------
predicting... 300 done! roc is 0.9823135788741154
-----------
predicting... 650 done! roc is 0.9834321113945278
-----------
predicting... 850 done! roc is 0.9837594762372315
-----------
predicting... 1100 done! roc is 0.9840574219877543
-----------
predicting... 1200 done! roc is 0.9841536069873347
overall roc: 0.9836434675247574


# XGBoost LR 0.35 ff 0.7 ntree160


In [17]:
# Load the lr0.35 / ff0.7 XGBoost model. Only loading is active in this cell;
# the evaluation code below is commented out.
# NOTE(review): this rebinds model_load_xgb and feature_file_xgb, which the
# earlier XGBoost cell also assigns.
model_path = '/home/kai/data/kaggle/talkingdata/wl/data/xgboost/'
model_name = 'all_suppelement_xgbtree_160_depth7_scale99_lr0.35_ff0.7'
model_postfix = '.pickle.dat'
# feature_file_xgb and num_trees_load_xgboost are only read by the
# commented-out evaluation code below.
feature_file_xgb = model_path + 'xgb-featurecolsV3_col38_scale99_lr0.35_ff0.7.json'
num_trees_load_xgboost = [ 160]

modelfile = model_path + model_name + model_postfix
# pickle.load executes arbitrary code from the file: only load trusted models.
# Context manager closes the file instead of leaking the handle.
with open(modelfile, "rb") as model_fh:
    model_load_xgb = pickle.load(model_fh)
model_load_xgb.set_params(n_jobs=8, nthread=8)
print('load model done!')

# feature_col_load_xgb = json.load(open(feature_file_xgb))
# print('load feature cols done!')

# preds_xgb = []
# for ntree in num_trees_load_xgboost:
#     ntree = int(ntree)
#     print('-----------')
#     cur_pred = model_load_xgb.predict_proba(evl[feature_col_load_xgb].values, ntree_limit=ntree)[:,1]
#     preds_xgb.append(cur_pred)
#     cur_roc = roc_auc_score(groudtruth, cur_pred)
#     print('predicting... {} done! roc is {}'.format(ntree, cur_roc))

# pred_xgb = np.mean(preds_xgb, axis=0)
# roc_xgb = roc_auc_score(groudtruth, pred_xgb)
# print('overall roc: {}'.format(roc_xgb))

load model done!


# CatBoost

In [11]:
# Evaluate the "catboost2" CatBoost model on `evl` at several tree counts and
# report the ROC AUC of each prediction and of their element-wise mean.
model_path = '/home/kai/data/kaggle/talkingdata/wl/data/catboost/'
model_name = 'catboost2-all_suppelement_tree_200_depth7_scale99_lr0.35_ff0.7'
model_postfix = ''
feature_file_catboost = model_path + 'catboost2-featurecolsV3_col38_depth7_scale99_tree200_lr0.35.json'
# Upper tree bounds passed as ntree_end; predictions are averaged below.
num_trees_load_catboost = [120, 140, 160, 180, 200]

modelfile = model_path + model_name + model_postfix
model_load_catboost = CatBoostClassifier().load_model(fname=modelfile)
model_load_catboost.set_params(thread_count=30)
print('load model done!')

# Context manager closes the JSON file instead of leaking the handle.
with open(feature_file_catboost) as feature_fh:
    feature_col_load_catboost = json.load(feature_fh)
print('load feature cols done!')

# The feature matrix does not depend on the tree count, so build it once.
evl_features_catboost = evl[feature_col_load_catboost].values

preds_catboost = []
for ntree in num_trees_load_catboost:
    ntree = int(ntree)
    print('-----------')
    # Column 1 of predict_proba is kept as the score.
    cur_pred = model_load_catboost.predict_proba(
        evl_features_catboost, ntree_start=0, ntree_end=ntree
    )[:, 1]
    preds_catboost.append(cur_pred)
    cur_roc = roc_auc_score(groudtruth, cur_pred)
    print('predicting... {} done! roc is {}'.format(ntree, cur_roc))
# Release the feature matrix once every prediction has been made.
del evl_features_catboost

# Ensemble: element-wise mean over the per-tree-count predictions.
pred_catboost = np.mean(preds_catboost, axis=0)
roc_catboost = roc_auc_score(groudtruth, pred_catboost)
print('overall roc: {}'.format(roc_catboost))

load model done!
load feature cols done!
-----------
predicting... 120 done! roc is 0.9834432295981801
-----------
predicting... 140 done! roc is 0.9836116512851736
-----------
predicting... 160 done! roc is 0.9837413865165421
-----------
predicting... 180 done! roc is 0.9838806717241653
-----------
predicting... 200 done! roc is 0.984021132773208
overall roc: 0.983770181016828


# LightGBM encoding500

In [4]:
# Evaluate the onehot500 LightGBM booster on `evl` at several tree counts and
# report the ROC AUC of each prediction and of their element-wise mean.
model_path = '/home/kai/data/kaggle/talkingdata/wl/data/lightgbm/'
model_name = 'lightgbm_paramsCombo_1_lr0.05_scale99_ntree500_ff0.5_bf0.7_onehot500'
feature_file = '/home/kai/data/kaggle/talkingdata/wl/data/lightgbm/lightgbm-featurecolsV3_col38_scale99_honehot500.json'
# Tree counts passed to predict(num_iteration=...); predictions are averaged below.
num_trees_lightgbm = [300, 370, 470]

modelfile = model_path + model_name
model_load_lightgbm = lgb.Booster(model_file=modelfile)
print('load model done!')

# Context manager closes the JSON file instead of leaking the handle.
with open(feature_file) as feature_fh:
    feature_col_load_lightgbm = json.load(feature_fh)
print('load feature cols done!')

# NOTE(review): columns are selected with the booster's own feature_name(),
# not with feature_col_load_lightgbm — confirm the two lists agree.
# The selection does not depend on the tree count, so it is done once.
evl_features_lightgbm = evl[model_load_lightgbm.feature_name()]

preds_lightgbm = []
for ntree in num_trees_lightgbm:
    ntree = int(ntree)
    print('-------')
    cur_pred = model_load_lightgbm.predict(evl_features_lightgbm, num_iteration=ntree)
    print('predict done!')
    cur_roc = roc_auc_score(groudtruth, cur_pred)
    preds_lightgbm.append(cur_pred)
    print('predicting... {} done! roc is {}'.format(ntree, cur_roc))
# Release the column subset once every prediction has been made.
del evl_features_lightgbm

# Ensemble: element-wise mean over the per-tree-count predictions.
pred_lightgbm = np.mean(preds_lightgbm, axis=0)
roc_lightgbm = roc_auc_score(groudtruth, pred_lightgbm)
print('overall roc: {}'.format(roc_lightgbm))

load model done!
load feature cols done!
-------
predict done!
predicting... 300 done! roc is 0.9852844480551558
-------
predict done!
predicting... 370 done! roc is 0.985885154379836
-------
predict done!
predicting... 470 done! roc is 0.9866509382041232
overall roc: 0.9860918655504829


# CatBoost 0504 12:15am

In [4]:
# Evaluate the onehot200 CatBoost model on `evl` at several tree counts and
# report the ROC AUC of each prediction and of their element-wise mean.
model_path = '/home/kai/data/kaggle/talkingdata/wl/data/catboost/'
model_name = 'all_suppelement_tree_200_depth6_scale99_lr0.35_ff0.6_onehot200'
model_postfix = ''
feature_file_catboost = model_path + 'catboost-featurecolsV3_col38_depth6_scale99_tree200_lr0.35_ff0.6_onehot200.json'
# Upper tree bounds passed as ntree_end; predictions are averaged below.
num_trees_load_catboost = [120, 140, 160, 180, 200]

modelfile = model_path + model_name + model_postfix
model_load_catboost = CatBoostClassifier().load_model(fname=modelfile)
model_load_catboost.set_params(thread_count=30)
print('load model done!')

# Context manager closes the JSON file instead of leaking the handle.
with open(feature_file_catboost) as feature_fh:
    feature_col_load_catboost = json.load(feature_fh)
print('load feature cols done!')

# The feature matrix does not depend on the tree count, so build it once.
evl_features_catboost = evl[feature_col_load_catboost].values

preds_catboost = []
for ntree in num_trees_load_catboost:
    ntree = int(ntree)
    print('-----------')
    # Column 1 of predict_proba is kept as the score.
    cur_pred = model_load_catboost.predict_proba(
        evl_features_catboost, ntree_start=0, ntree_end=ntree
    )[:, 1]
    preds_catboost.append(cur_pred)
    cur_roc = roc_auc_score(groudtruth, cur_pred)
    print('predicting... {} done! roc is {}'.format(ntree, cur_roc))
# Release the feature matrix once every prediction has been made.
del evl_features_catboost

# Ensemble: element-wise mean over the per-tree-count predictions.
pred_catboost = np.mean(preds_catboost, axis=0)
roc_catboost = roc_auc_score(groudtruth, pred_catboost)
print('overall roc: {}'.format(roc_catboost))

load model done!
load feature cols done!
-----------
predicting... 120 done! roc is 0.9830265201861706
-----------
predicting... 140 done! roc is 0.9832312648211872
-----------
predicting... 160 done! roc is 0.9833264578291416
-----------
predicting... 180 done! roc is 0.9834340634021058
-----------
predicting... 200 done! roc is 0.9835420255731265
overall roc: 0.9833353425287115
