In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
import numpy as np
import gc

In [2]:
dtypes = {
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'hour'          : 'uint8',
        'is_attributed' : 'uint8', 
        'ip_day_hour_count': 'uint32', 
        'ip_os_day_hour_count': 'uint32', 
        'ip_app_day_hour_count': 'uint32', 
        'ip_app_os_day_hour_count': 'uint32', 
        'app_day_hour_count': 'uint32', 
        'ip_device_os_count': 'uint32', 
        'ip_app_device_os_count': 'uint32', 
        'ip_device_os_mean': 'float16',
        'ip_app_device_os_mean': 'float16',
        'ip_app_device_mean': 'float16',
        'app_device_os_mean': 'float16',
        'ip_device_os_time2nextclick': 'int32',
        'ip_app_device_os_time2nextclick': 'int32',
        'ip_app_device_time2nextclick': 'int32',
        'ip_device_os_time2previousclick': 'int32',
        'ip_app_device_os_time2previousclick': 'int32',
        'ip_app_device_time2previousclick': 'int32',
        'ip_device_os_countfromfuture': 'uint32', 
        'ip_app_device_os_countfromfuture': 'uint32', 
        'ip_app_device_countfromfuture': 'uint32', 
        'ip_device_os_countfrompast': 'uint32', 
        'ip_app_device_os_countfrompast': 'uint32', 
        'ip_app_device_countfrompast': 'uint32', 
        'ip_device_os_lasttimediff': 'int32',
        'ip_app_device_os_lasttimediff': 'int32',
        'ip_app_device_lasttimediff': 'int32',
        'ip_device_os_firsttimediff': 'int32',
        'ip_app_device_os_firsttimediff': 'int32',
        'ip_app_device_firsttimediff': 'int32',
        'matrixFact_user_iposdeviceapp_item_app': 'float16',
        'matrixFact_user_ip_item_appdeviceos': 'float16',
        'matrixFact_user_ipchannel_item_appdeviceos': 'float16',
        'ip_device_os_regression': 'float16',
        'ip_app_device_os_regression': 'float16',
        'ip_app_device_regression': 'float16',
        'ip_app_device_os_channel_regression': 'float16', 
        'attributed_timediffmax':'int32',
        'attributed_timediffmin':'int32',
        'attributed_timediff':'float16',
        'matrixFact_user_ipappdeviceos_item_channel': 'float16'
    
        } 

In [3]:
df_all = pd.read_csv('/home/kai/talkingdata/data/ALL_features_supplementV3_feature42.csv',dtype=dtypes)
target = 'is_attributed'
extra = ['ip_app_device_os_channel_regression']
feature_cols = list(set(df_all.columns) - set([target]) -set(extra))

In [4]:
feature_cols

['ip_app_os_day_hour_count',
 'ip_os_day_hour_count',
 'channel',
 'ip_day_hour_count',
 'device',
 'app_device_os_mean',
 'matrixFact_user_ipchannel_item_appdeviceos',
 'ip_app_device_time2previousclick',
 'attributed_timediff',
 'ip_app_device_os_countfrompast',
 'ip_app_device_os_countfromfuture',
 'ip_app_device_os_count',
 'os',
 'ip_app_device_countfromfuture',
 'ip_device_os_time2nextclick',
 'ip_device_os_mean',
 'matrixFact_user_iposdeviceapp_item_app',
 'ip_app_device_lasttimediff',
 'ip_device_os_lasttimediff',
 'ip_app_device_os_mean',
 'ip_app_device_firsttimediff',
 'ip_app_device_countfrompast',
 'ip_device_os_count',
 'ip_app_device_os_time2previousclick',
 'app',
 'ip_device_os_time2previousclick',
 'ip_app_device_mean',
 'hour',
 'ip_device_os_firsttimediff',
 'ip_app_device_os_lasttimediff',
 'ip_app_device_os_time2nextclick',
 'app_day_hour_count',
 'ip_app_day_hour_count',
 'ip_device_os_countfromfuture',
 'ip_app_device_time2nextclick',
 'matrixFact_user_ip_item_a

In [5]:
from sklearn.metrics import roc_auc_score

target = 'is_attributed'

# params = {
# #         'eval_metric': ROCMetric(), 
#     'eval_metric': 'AUC',
#          'learning_rate':0.35, 
# #          'od_type':'Iter',
# #          'od_wait':40,
#          'loss_function':'Logloss', 
#          'depth':7, 
# #          'thread_count':90, 
#          'iterations':200,
#          'scale_pos_weight': 99,
#         'l2_leaf_reg': 6,
#     'leaf_estimation_method': 'Gradient',
#     'rsm': 0.7,
# #     'max_ctr_complexity':2,
#         }

params = {
    'eval_metric': 'AUC',
         'learning_rate':0.35, 
         'loss_function':'Logloss', 
         'depth':7, 
         'iterations':200,
         'scale_pos_weight': 99,
        'l2_leaf_reg': 9,
        'one_hot_max_size': 50,
    'leaf_estimation_method': 'Gradient',
    'rsm': 0.6,
        }

categorical_col = [ 'app', 'device', 'os', 'channel', 'hour']


category_index = [feature_cols.index(cat) for cat in categorical_col]

In [None]:
def train_catboost(x_train, x_val, feature_cols, category_index, params, best_round = None, target='is_attributed'):
    param = params.copy()    
    print('Start training')
    model = CatBoostClassifier(**param)
    model.fit(x_train[feature_cols],x_train[target],  
              cat_features=category_index, 
              use_best_model=True, 
              verbose_eval=1 )
    return model

# Train Catboost

In [None]:
save_model_name = '/home/kai/talkingdata/data/all_suppelement_tree_200_depth7_scale99_lr0.35_ff0.6_onehot50'
model = train_catboost(df_all, None, feature_cols,category_index,params,None,target)
model.save_model(save_model_name)

Start training
You should provide test set for use best model. use_best_model parameter swiched to false value.
0:	learn: 0.9688646	total: 2m 46s	remaining: 9h 12m 55s
1:	learn: 0.9719039	total: 5m 28s	remaining: 9h 2m 1s
2:	learn: 0.9734741	total: 8m 14s	remaining: 9h 58s
3:	learn: 0.9738883	total: 11m 5s	remaining: 9h 3m 22s
4:	learn: 0.9761802	total: 13m 52s	remaining: 9h 48s
5:	learn: 0.9774496	total: 16m 37s	remaining: 8h 57m 35s
6:	learn: 0.9778132	total: 19m 43s	remaining: 9h 3m 56s
7:	learn: 0.9789622	total: 23m 50s	remaining: 9h 32m 15s
8:	learn: 0.9792355	total: 27m 8s	remaining: 9h 36m 6s
9:	learn: 0.9796439	total: 30m 22s	remaining: 9h 37m 16s
10:	learn: 0.9799557	total: 35m 14s	remaining: 10h 5m 29s
11:	learn: 0.9805175	total: 39m 20s	remaining: 10h 16m 24s
12:	learn: 0.9807606	total: 42m 56s	remaining: 10h 17m 48s
13:	learn: 0.9809596	total: 46m 6s	remaining: 10h 12m 29s
14:	learn: 0.9811934	total: 49m 12s	remaining: 10h 6m 48s
15:	learn: 0.9816037	total: 52m 15s	remainin

In [None]:
import json
feature_file = '/home/kai/talkingdata/data/catboost-featurecolsV3_col38_depth7_scale99_tree200_lr0.35_onehot50.json'

with open(feature_file, 'w') as outfile:
    json.dump(feature_cols, outfile)

In [None]:
df_all.head(20)