In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
import numpy as np
import gc

In [2]:
dtypes = {
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'hour'          : 'uint8',
        'is_attributed' : 'uint8', 
        'ip_day_hour_count': 'uint32', 
        'ip_os_day_hour_count': 'uint32', 
        'ip_app_day_hour_count': 'uint32', 
        'ip_app_os_day_hour_count': 'uint32', 
        'app_day_hour_count': 'uint32', 
        'ip_device_os_count': 'uint32', 
        'ip_app_device_os_count': 'uint32', 
        'ip_device_os_mean': 'float16',
        'ip_app_device_os_mean': 'float16',
        'ip_app_device_mean': 'float16',
        'app_device_os_mean': 'float16',
        'ip_device_os_time2nextclick': 'int32',
        'ip_app_device_os_time2nextclick': 'int32',
        'ip_app_device_time2nextclick': 'int32',
        'ip_device_os_time2previousclick': 'int32',
        'ip_app_device_os_time2previousclick': 'int32',
        'ip_app_device_time2previousclick': 'int32',
        'ip_device_os_countfromfuture': 'uint32', 
        'ip_app_device_os_countfromfuture': 'uint32', 
        'ip_app_device_countfromfuture': 'uint32', 
        'ip_device_os_countfrompast': 'uint32', 
        'ip_app_device_os_countfrompast': 'uint32', 
        'ip_app_device_countfrompast': 'uint32', 
        'ip_device_os_lasttimediff': 'int32',
        'ip_app_device_os_lasttimediff': 'int32',
        'ip_app_device_lasttimediff': 'int32',
        'ip_device_os_firsttimediff': 'int32',
        'ip_app_device_os_firsttimediff': 'int32',
        'ip_app_device_firsttimediff': 'int32',
        'matrixFact_user_iposdeviceapp_item_app': 'float16',
        'matrixFact_user_ip_item_appdeviceos': 'float16',
        'matrixFact_user_ipchannel_item_appdeviceos': 'float16',
        'ip_device_os_regression': 'float16',
        'ip_app_device_os_regression': 'float16',
        'ip_app_device_regression': 'float16',
        'ip_app_device_os_channel_regression': 'float16', 
        'attributed_timediffmax':'int32',
        'attributed_timediffmin':'int32',
        'attributed_timediff':'float16',
        'matrixFact_user_ipappdeviceos_item_channel': 'float16'
    
        } 

# Load Data

In [3]:
df_all = pd.read_csv('/home/kai/data/kaggle/talkingdata/wl/data/equalhour/ALL_features_supplementV3_feature42.csv',dtype=dtypes)
target = 'is_attributed'
extra = ['ip_app_device_os_channel_regression']
feature_cols = list(set(df_all.columns) - set([target]) -set(extra))

# Define ROC eval Metrix

In [4]:
from sklearn.metrics import roc_auc_score

class ROCMetric(object):
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        return True

    def evaluate(self, approxes, target, weight):
        approx = approxes[0]
        roc = roc_auc_score(np.array(target), np.array(approx))
        return roc, 1

In [5]:
target = 'is_attributed'

params = {
#         'eval_metric': ROCMetric(), 
    'eval_metric': 'Logloss',
         'learning_rate':0.05, 
#          'od_type':'Iter',
#          'od_wait':40,
         'loss_function':'Logloss', 
         'depth':6, 
         'thread_count':2, 
         'iterations':1200,
         'scale_pos_weight': 398,
        'l2_leaf_reg': 6,
    'leaf_estimation_method': 'Gradient',
#     'max_ctr_complexity':2,
        }


categorical_col = [ 'app', 'device', 'os', 'channel', 'hour']


category_index = [feature_cols.index(cat) for cat in categorical_col]

In [6]:
# def train_catboost(x_train, x_val, feature_cols, category_index, params, best_round = None, target='is_attributed'):
#     param = params.copy()
# #     y_train = x_train[target].values
# #     y_val = x_val[target].values
#     print('Building pool...')
#     train_pool = Pool(x_train[feature_cols], x_train[target], cat_features=category_index)
#     print('train pool done!')
#     val_pool = Pool(x_val[feature_cols], x_val[target], cat_features=category_index)
#     print('val pool done!')
#     if best_round is not None:
#         param['iterations'] = best_round
#         del param['od_type']
#         del param['od_wait']
#     print('start training')
    
#     print('Get train pool and val pool')
#     model = CatBoostClassifier(**param)
#     model.fit(train_pool,  eval_set=val_pool, use_best_model=True, verbose_eval=10 )
#     return model

In [7]:
def train_catboost(x_train, x_val, feature_cols, category_index, params, best_round = None, target='is_attributed'):
    param = params.copy()    
    print('Start training')
    model = CatBoostClassifier(**param)
    model.fit(x_train[feature_cols],x_train[target],  
              cat_features=category_index, 
              use_best_model=True, 
              verbose_eval=1 )
    return model

# Train CatBoost

In [8]:
save_model_name = '/home/kai/data/kaggle/talkingdata/wl/data/catboost/all_suppelement_tree_1200'
model = train_catboost(df_all, None, feature_cols,category_index,params,None,target)
model.save_model(save_model_name)

Start training
You should provide test set for use best model. use_best_model parameter swiched to false value.
0:	learn: 0.6058109	total: 2m 44s	remaining: 2d 6h 43m 48s
1:	learn: 0.5303534	total: 5m 20s	remaining: 2d 5h 19m 32s
2:	learn: 0.4653169	total: 7m 57s	remaining: 2d 4h 55m 16s
3:	learn: 0.4093420	total: 10m 8s	remaining: 2d 2h 34m 27s
4:	learn: 0.3611906	total: 12m 27s	remaining: 2d 1h 38m 32s
5:	learn: 0.3197572	total: 15m 29s	remaining: 2d 3h 23m 27s
6:	learn: 0.2840695	total: 18m 31s	remaining: 2d 4h 35m 54s
7:	learn: 0.2532829	total: 20m 51s	remaining: 2d 3h 47m 53s
8:	learn: 0.2266705	total: 24m 13s	remaining: 2d 5h 26m 7s
9:	learn: 0.2036112	total: 27m 29s	remaining: 2d 6h 31m 31s
10:	learn: 0.1835770	total: 30m 15s	remaining: 2d 6h 29m 54s
11:	learn: 0.1661206	total: 32m 33s	remaining: 2d 5h 43m 24s
12:	learn: 0.1508639	total: 34m 51s	remaining: 2d 5h 2m 59s
13:	learn: 0.1374878	total: 37m 19s	remaining: 2d 4h 41m 55s
14:	learn: 0.1257227	total: 40m 30s	remaining: 2d 

KeyboardInterrupt: 

In [None]:
# for day in day_list:
#     train_day = list(set(day_list)-set([day]))
#     file_name = 'concat_{}{}_val_{}_allday_supplement_V2_attributediff_depth6.npy'.format(train_day[0],train_day[1],day)
#     print(file_name)
#     trainset = pd.concat([df_dict[train_day[0]],df_dict[train_day[1]]])
#     valset = df_dict[day]
#     print('building train val done!')
#     gc.collect()
#     model = train_catboost(trainset, valset, feature_cols, category_index, params)
#     best_round = model.tree_count_
    
#     df_all = pd.concat([trainset, valset])
#     del trainset
#     gc.collect()
#     model = train_catboost(trainset, valset, feature_cols, category_index, params, best_round)
#     del df_all
#     gc.collect()
    
#     pred = model.predict(df_dict['test'][feature_cols])
#     np.save(load_path+'catboost/'+file_name, pred)
    
#     # prediction
#     df_test_raw = pd.read_csv('/home/kai/data/kaggle/talkingdata/data/test.csv')
#     print('loading file done!')
#     df_sub = pd.DataFrame()
#     df_sub['click_id'] = df_test_raw['click_id']
#     df_sub['is_attributed'] = pred
#     print('predicting file done!')
#     df_sub.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/submission/catboost/concat_{}{}_val_{}_allday_supplement_V2_attributediff_depth6.csv.gz'.format(train_day[0],train_day[1],day), compression='gzip', index=False)

    

concat_day8day9_val_day7_allday_supplement_V2_attributediff_depth6.npy
building train val done!
Building pool...
train pool done!
val pool done!
start training
Get train pool and val pool
0:	learn: 0.9668847	test: 0.9655616	best: 0.9655616 (0)	total: 4m 57s	remaining: 17d 5h 28m 25s
