In [6]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
import numpy as np
import gc
import json

In [7]:
# Column -> dtype mapping handed to pandas.read_csv when the feature CSV is
# loaded. Columns are listed in groups sharing the same dtype; the flattened
# result keeps the columns in the order they are listed here.
dtypes = {
    column: dtype_name
    for dtype_name, columns in [
        # raw click attributes
        ('uint16', ['app', 'device', 'os', 'channel']),
        ('uint8', ['hour', 'is_attributed']),
        # group counts
        ('uint32', [
            'ip_day_hour_count',
            'ip_os_day_hour_count',
            'ip_app_day_hour_count',
            'ip_app_os_day_hour_count',
            'app_day_hour_count',
            'ip_device_os_count',
            'ip_app_device_os_count',
        ]),
        # group means
        ('float16', [
            'ip_device_os_mean',
            'ip_app_device_os_mean',
            'ip_app_device_mean',
            'app_device_os_mean',
        ]),
        # time to next / previous click
        ('int32', [
            'ip_device_os_time2nextclick',
            'ip_app_device_os_time2nextclick',
            'ip_app_device_time2nextclick',
            'ip_device_os_time2previousclick',
            'ip_app_device_os_time2previousclick',
            'ip_app_device_time2previousclick',
        ]),
        # counts from future / past
        ('uint32', [
            'ip_device_os_countfromfuture',
            'ip_app_device_os_countfromfuture',
            'ip_app_device_countfromfuture',
            'ip_device_os_countfrompast',
            'ip_app_device_os_countfrompast',
            'ip_app_device_countfrompast',
        ]),
        # last / first time differences
        ('int32', [
            'ip_device_os_lasttimediff',
            'ip_app_device_os_lasttimediff',
            'ip_app_device_lasttimediff',
            'ip_device_os_firsttimediff',
            'ip_app_device_os_firsttimediff',
            'ip_app_device_firsttimediff',
        ]),
        # matrix-factorisation and regression features
        ('float16', [
            'matrixFact_user_iposdeviceapp_item_app',
            'matrixFact_user_ip_item_appdeviceos',
            'matrixFact_user_ipchannel_item_appdeviceos',
            'ip_device_os_regression',
            'ip_app_device_os_regression',
            'ip_app_device_regression',
            'ip_app_device_os_channel_regression',
        ]),
        # attributed time differences
        ('int32', ['attributed_timediffmax', 'attributed_timediffmin']),
        ('float16', [
            'attributed_timediff',
            'matrixFact_user_ipappdeviceos_item_channel',
        ]),
    ]
    for column in columns
}

In [12]:
# Where the trained CatBoost model and its companion files are stored.
model_path = '/home/kai/data/kaggle/talkingdata/wl/data/catboost/'
# File name of the saved model; also reused below and in the submission name.
model_name = 'all_suppelement_tree_1500_depth6_scale99_lr0.1_ff0.5_onehot50_l2reg9'
model_postfix = ''
# JSON file with the feature-column list; its name embeds the model name.
feature_file = '{}catboost-featurecolsV3_{}.json'.format(model_path, model_name)
# Tree-count cut-offs used when predicting.
num_trees_load = [1100, 1300, 1500]

In [13]:
# Score the test features with the saved CatBoost model, average the
# predictions obtained at several tree-count cut-offs, and write a gzipped
# submission CSV. Relies on `model_path`, `model_name`, `model_postfix`,
# `feature_file`, `num_trees_load` and `dtypes` defined in earlier cells.
modelfile = model_path + model_name + model_postfix
model_load = CatBoostClassifier().load_model(fname=modelfile)

print('load model done!')

# Tree counts currently come from `num_trees_load` in the config cell; the
# former .npy loader is kept below for reference only.
# num_trees_load = np.load(model_path+prefix.format(1)+'.npy')
if len(num_trees_load) == 0:
    # Averaging an empty list would silently produce a NaN submission.
    raise ValueError('num_trees_load is empty; nothing to predict with')

print('load number of trees done!')

load_path = '/home/kai/data/kaggle/talkingdata/wl/data/equalhour/'
file_format = '{}_features_supplementV3_feature42.csv'
test = pd.read_csv(load_path + file_format.format('test'), dtype=dtypes)
print('load test feature done!')


# Context manager so the file handle is closed as soon as the list is read.
with open(feature_file) as feature_fh:
    feature_col_load = json.load(feature_fh)
print('load feature cols done!')

preds = []
submission_postfix = '_ntree'

# The feature slice is identical for every cut-off, so take it once instead
# of re-copying the columns on each iteration.
test_features = test[feature_col_load]

for ntree in num_trees_load:
    ntree = int(ntree)  # accept numpy integers as well as plain ints
    # Column 1 of predict_proba is used as the positive-class probability.
    preds.append(
        model_load.predict_proba(test_features, ntree_start=0, ntree_end=ntree)[:, 1]
    )
    print('predicting... {} done!'.format(ntree))
    submission_postfix += '_{}'.format(ntree)
# Element-wise mean across the per-cut-off prediction vectors.
pred = np.mean(preds, axis=0)


print('getting submission')
# Only click_id is needed from the raw test file, so skip parsing the rest.
df_test_raw = pd.read_csv('/home/kai/data/kaggle/talkingdata/data/test.csv',
                          usecols=['click_id'])
print('loading file done!')
# Predictions are paired with click_id purely by row position.
# NOTE(review): this assumes the feature CSV rows are in the same order as
# test.csv -- confirm against the feature-generation step.
if len(df_test_raw) != len(pred):
    raise ValueError(
        'row count mismatch: test.csv has {} rows but {} predictions were made'
        .format(len(df_test_raw), len(pred))
    )
df_sub = pd.DataFrame({
    'click_id': df_test_raw['click_id'],
    'is_attributed': pred,
})
print('predicting file done!')
submission_name = model_path + model_name + submission_postfix + '.csv.gz'
df_sub.to_csv(submission_name, compression='gzip', index=False)

    

load model done!
load number of trees done!
load test feature done!
load feature cols done!
predicting... 1100 done!
predicting... 1300 done!
predicting... 1500 done!
getting submission
loading file done!
predicting file done!


In [5]:
# Preview the first 20 rows of the submission frame as a quick sanity check.
df_sub.head(n=20)

Unnamed: 0,click_id,is_attributed
0,0,0.054963
1,1,0.008781
2,2,0.001767
3,3,0.010516
4,4,0.006437
5,5,0.003287
6,6,0.033037
7,7,0.105608
8,9,0.067955
9,8,0.00489
