In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
import numpy as np
import gc

In [2]:
# Load the pre-computed feature table and derive the list of model input columns.
df_all = pd.read_feather('/home/kai/talkingdata/data/ALL_features_supplementV3_feature42.ftr')
target = 'is_attributed'
extra = ['ip_app_device_os_channel_regression']

# Keep the DataFrame's own column order instead of going through set arithmetic:
# set iteration order for strings changes between interpreter sessions, which
# would reshuffle the feature order (and every positional index derived from
# it) on each kernel restart and make training runs non-reproducible.
excluded_cols = {target, *extra}
feature_cols = [col for col in dict.fromkeys(df_all.columns) if col not in excluded_cols]

In [3]:
# Build, for each day, the list of row positions that fall inside three windows
# described by column pairs of hourdistri.csv.
# NOTE(review): the column names suggest hours 4-6, 9-11 and 13-15 — confirm
# against the script that produced hourdistri.csv.
path = '/home/kai/talkingdata/data/'
df_hour = pd.read_csv(path + 'hourdistri.csv', index_col='Unnamed: 0')

hour_windows = [
    ('4start', '6end0sec'),
    ('9start', '11end0sec'),
    ('13start', '15end0sec'),
]

index = {}
for day in ['day7', 'day8', 'day9']:
    # Concatenate the three half-open ranges [start, end) in window order.
    index[day] = [
        position
        for start_col, end_col in hour_windows
        for position in range(df_hour.loc[day, start_col], df_hour.loc[day, end_col])
    ]

In [4]:

# Train on the day8 rows and validate on the day9 rows.
# Alternative kept for reference (train on day7 + day8):
#   trainset = df_all.iloc[index['day7']+index['day8']]
trainset, valset = df_all.iloc[index['day8']], df_all.iloc[index['day9']]

# The full table is no longer needed once the two subsets exist; drop the
# reference and run the garbage collector.
del df_all
gc.collect()

16

In [7]:

# Candidate values for each tuned CatBoost hyper-parameter.
scale_pos_weight = [50, 99, 400]
learning_rate = [0.15, 0.3]
one_hot_max_size = [3, 10, 50, 200]
depth = [3, 6, 9]
l2_leaf_reg = [6, 9, 20, 50]

# One row per combination, in the same order as nested loops over
# scale_pos_weight > learning_rate > one_hot_max_size > depth > l2_leaf_reg.
# 'ROC' and 'iterations' start at 0 and are filled in by the search loop;
# get_parameters treats ROC == 0 as "not evaluated yet".
list_of_parameter = [
    {'scale_pos_weight': sw,
     'learning_rate': lr,
     'one_hot_max_size': oh,
     'ROC': 0,
     'depth': d,
     'iterations': 0,
     'l2_leaf_reg': l2}
    for sw in scale_pos_weight
    for lr in learning_rate
    for oh in one_hot_max_size
    for d in depth
    for l2 in l2_leaf_reg
]

grid_csv = '/home/kai/talkingdata/data/catboostgirdsearchparams-result.csv'
try:
    # The search loop saves its progress to this file after every trial, and
    # each trial takes hours. Re-running this cell must therefore not reset
    # the file: reuse it when it already exists.
    df_grid = pd.read_csv(grid_csv)
    print('Found existing grid file with %d rows (%d not evaluated yet); '
          'delete it to start a new search.' % (len(df_grid), int((df_grid['ROC'] == 0).sum())))
except FileNotFoundError:
    df_grid = pd.DataFrame(list_of_parameter)
    df_grid.to_csv(grid_csv, index=False)

## Define parameters

In [8]:
from sklearn.metrics import roc_auc_score

target = 'is_attributed'

# Base CatBoost settings. get_parameters overlays the grid-row values for
# learning_rate, depth, scale_pos_weight, l2_leaf_reg and one_hot_max_size on
# top of this dict, so the values given here for those keys are only defaults.
params_raw = dict(
    eval_metric='AUC',
    learning_rate=0.35,
    loss_function='Logloss',
    depth=7,
    iterations=1200,
    scale_pos_weight=99,
    l2_leaf_reg=9,
    one_hot_max_size=50,
    leaf_estimation_method='Gradient',
    rsm=0.6,
    # Overfitting detector: stop after 40 iterations without improvement.
    od_type='Iter',
    od_wait=40,
)

categorical_col = ['app', 'device', 'os', 'channel', 'hour']

# Positions of the categorical columns inside feature_cols; these are passed
# to CatBoost as cat_features, so they must match the column order used for
# training.
category_index = list(map(feature_cols.index, categorical_col))

In [9]:
def train_catboost(x_train, x_val, feature_cols, category_index, params, best_round=None, target='is_attributed'):
    """Fit a CatBoostClassifier on ``x_train`` and evaluate it on ``x_val``.

    Parameters
    ----------
    x_train, x_val : objects indexable by ``feature_cols`` and ``target``
        (DataFrames in this notebook) holding training and validation rows.
    feature_cols : list of column names used as model inputs.
    category_index : positions within ``feature_cols`` of the categorical
        features; forwarded as ``cat_features``.
    params : dict of CatBoostClassifier constructor arguments; a copy is used,
        so the caller's dict is not modified here.
    best_round : accepted for interface compatibility; not used by this function.
    target : name of the label column.

    Returns
    -------
    The fitted model. ``use_best_model=True`` is passed to ``fit`` and
    progress is logged every iteration (``verbose_eval=1``).
    """
    settings = params.copy()
    print('Start training')

    train_features, train_labels = x_train[feature_cols], x_train[target]
    validation_pair = (x_val[feature_cols], x_val[target])

    classifier = CatBoostClassifier(**settings)
    classifier.fit(
        train_features,
        train_labels,
        eval_set=validation_pair,
        cat_features=category_index,
        use_best_model=True,
        verbose_eval=1,
    )
    return classifier

def get_parameters(df, param):
    """Pick one not-yet-evaluated grid row at random and merge it into ``param``.

    A row counts as not evaluated when its ``ROC`` value equals 0.

    Parameters
    ----------
    df : DataFrame of grid rows with at least the columns ``ROC``,
        ``iterations``, ``depth``, ``one_hot_max_size`` and ``l2_leaf_reg``.
    param : dict of base parameters; it is copied, never modified.

    Returns
    -------
    ``(params, num)`` where ``params`` is the base dict updated with the chosen
    row (minus ``ROC`` and ``iterations``) and ``num`` is that row's index
    label, or ``(None, None)`` when every row already has a non-zero ``ROC``.
    """
    merged = param.copy()
    pending = df[df.ROC == 0]

    if len(pending) == 0:
        return (None, None)

    # Shuffle a copy of the pending labels and take the first one.
    labels = pending.index.values.copy()
    np.random.shuffle(labels)
    num = labels[0]

    overrides = pending.loc[num].to_dict()
    # Bookkeeping columns are not model parameters.
    for bookkeeping_key in ('ROC', 'iterations'):
        overrides.pop(bookkeeping_key)
    # Extracting a row yields floats for these columns when the frame mixes
    # int and float columns; cast them back to int before use.
    for int_key in ('depth', 'one_hot_max_size', 'l2_leaf_reg'):
        overrides[int_key] = int(overrides[int_key])

    merged.update(overrides)
    return (merged, num)
    

# Train CatBoost grid search

In [None]:
# Resumable random-order grid search: repeatedly pick an unevaluated row of the
# grid file, train with it, record validation AUC and tree count, and save the
# file after every trial so an interrupted run can continue where it stopped.
grid_csv = '/home/kai/talkingdata/data/catboostgirdsearchparams-result.csv'
df_grid = pd.read_csv(grid_csv)

# A grid file with no results yet has only integer zeros in 'ROC', so pandas
# reads the column as int64. Cast it up front so float AUC values can be stored
# without an incompatible-dtype assignment (deprecated in recent pandas).
df_grid['ROC'] = df_grid['ROC'].astype(float)

# Use a dedicated name for the chosen grid row: the notebook's `index` variable
# already holds the per-day row positions and must not be overwritten here.
params, grid_row = get_parameters(df_grid, params_raw)
counter = 0
while grid_row is not None:
    counter += 1
    print('=================================================')
    print(counter)
    print(grid_row)
    print(params)

    model = train_catboost(trainset, valset, feature_cols, category_index, params)
    # Number of trees kept in the returned model.
    best_round = model.tree_count_

    ROC = roc_auc_score(valset[target].values, model.predict_proba(valset[feature_cols])[:,1])
    df_grid.loc[grid_row, 'ROC'] = ROC
    df_grid.loc[grid_row, 'iterations'] = best_round
    # Persist after each trial; get_parameters skips rows whose ROC is non-zero.
    df_grid.to_csv(grid_csv, index=False)
    params, grid_row = get_parameters(df_grid, params_raw)

    gc.collect()

print('done!')

1
9
{'eval_metric': 'AUC', 'learning_rate': 0.14999999999999999, 'loss_function': 'Logloss', 'depth': 9, 'iterations': 1200, 'scale_pos_weight': 50.0, 'l2_leaf_reg': 9, 'one_hot_max_size': 3, 'leaf_estimation_method': 'Gradient', 'rsm': 0.6, 'od_type': 'Iter', 'od_wait': 40}
Start training
0:	learn: 0.9611702	test: 0.9549804	best: 0.9549804 (0)	total: 28.3s	remaining: 9h 25m 10s
1:	learn: 0.9665345	test: 0.9595807	best: 0.9595807 (1)	total: 55.2s	remaining: 9h 11m 31s
2:	learn: 0.9672374	test: 0.9600668	best: 0.9600668 (2)	total: 1m 22s	remaining: 9h 7m 21s
3:	learn: 0.9745130	test: 0.9678320	best: 0.9678320 (3)	total: 1m 51s	remaining: 9h 14m 27s
4:	learn: 0.9767294	test: 0.9698479	best: 0.9698479 (4)	total: 2m 18s	remaining: 9h 11m 9s
5:	learn: 0.9780815	test: 0.9707005	best: 0.9707005 (5)	total: 2m 47s	remaining: 9h 15m 32s
6:	learn: 0.9790781	test: 0.9722156	best: 0.9722156 (6)	total: 3m 17s	remaining: 9h 19m 56s
7:	learn: 0.9788683	test: 0.9722403	best: 0.9722403 (7)	total: 3m 47s

84:	learn: 0.9862414	test: 0.9803129	best: 0.9803129 (84)	total: 44m 12s	remaining: 9h 39m 54s
85:	learn: 0.9862834	test: 0.9803719	best: 0.9803719 (85)	total: 44m 44s	remaining: 9h 39m 39s
86:	learn: 0.9863141	test: 0.9803925	best: 0.9803925 (86)	total: 45m 16s	remaining: 9h 39m 9s
87:	learn: 0.9863543	test: 0.9804427	best: 0.9804427 (87)	total: 45m 47s	remaining: 9h 38m 41s
88:	learn: 0.9863669	test: 0.9804470	best: 0.9804470 (88)	total: 46m 19s	remaining: 9h 38m 22s
89:	learn: 0.9863835	test: 0.9804482	best: 0.9804482 (89)	total: 46m 50s	remaining: 9h 37m 46s
90:	learn: 0.9864079	test: 0.9804700	best: 0.9804700 (90)	total: 47m 22s	remaining: 9h 37m 21s
91:	learn: 0.9864436	test: 0.9804778	best: 0.9804778 (91)	total: 47m 53s	remaining: 9h 36m 42s
92:	learn: 0.9864916	test: 0.9805269	best: 0.9805269 (92)	total: 48m 25s	remaining: 9h 36m 22s
93:	learn: 0.9865033	test: 0.9805131	best: 0.9805269 (92)	total: 48m 57s	remaining: 9h 35m 59s
94:	learn: 0.9865360	test: 0.9805501	best: 0.980550

168:	learn: 0.9879273	test: 0.9811213	best: 0.9811220 (167)	total: 1h 28m 31s	remaining: 9h
169:	learn: 0.9879615	test: 0.9811186	best: 0.9811220 (167)	total: 1h 29m 3s	remaining: 8h 59m 36s
170:	learn: 0.9879823	test: 0.9811233	best: 0.9811233 (170)	total: 1h 29m 34s	remaining: 8h 59m 4s
171:	learn: 0.9879897	test: 0.9811200	best: 0.9811233 (170)	total: 1h 30m 6s	remaining: 8h 58m 30s
172:	learn: 0.9880053	test: 0.9811108	best: 0.9811233 (170)	total: 1h 30m 38s	remaining: 8h 58m 2s
173:	learn: 0.9880267	test: 0.9811130	best: 0.9811233 (170)	total: 1h 31m 9s	remaining: 8h 57m 30s
174:	learn: 0.9880414	test: 0.9811305	best: 0.9811305 (174)	total: 1h 31m 42s	remaining: 8h 57m 6s
175:	learn: 0.9880567	test: 0.9811369	best: 0.9811369 (175)	total: 1h 32m 11s	remaining: 8h 56m 23s
176:	learn: 0.9880741	test: 0.9811418	best: 0.9811418 (176)	total: 1h 32m 43s	remaining: 8h 55m 55s
177:	learn: 0.9880933	test: 0.9811474	best: 0.9811474 (177)	total: 1h 33m 15s	remaining: 8h 55m 28s
178:	learn: 0.

251:	learn: 0.9891425	test: 0.9813819	best: 0.9813819 (251)	total: 2h 12m 8s	remaining: 8h 17m 8s
252:	learn: 0.9891681	test: 0.9813807	best: 0.9813819 (251)	total: 2h 12m 41s	remaining: 8h 16m 39s
253:	learn: 0.9891914	test: 0.9813856	best: 0.9813856 (253)	total: 2h 13m 11s	remaining: 8h 16m 2s
254:	learn: 0.9892057	test: 0.9813803	best: 0.9813856 (253)	total: 2h 13m 43s	remaining: 8h 15m 35s
255:	learn: 0.9892191	test: 0.9813800	best: 0.9813856 (253)	total: 2h 14m 16s	remaining: 8h 15m 9s
256:	learn: 0.9892341	test: 0.9813731	best: 0.9813856 (253)	total: 2h 14m 47s	remaining: 8h 14m 36s
257:	learn: 0.9892512	test: 0.9813711	best: 0.9813856 (253)	total: 2h 15m 20s	remaining: 8h 14m 7s
258:	learn: 0.9892609	test: 0.9813747	best: 0.9813856 (253)	total: 2h 15m 51s	remaining: 8h 13m 36s
259:	learn: 0.9892699	test: 0.9813760	best: 0.9813856 (253)	total: 2h 16m 24s	remaining: 8h 13m 9s
260:	learn: 0.9892779	test: 0.9813791	best: 0.9813856 (253)	total: 2h 16m 55s	remaining: 8h 12m 38s
261:	l

41:	learn: 0.9872936	test: 0.9803939	best: 0.9804098 (40)	total: 21m 54s	remaining: 10h 3m 52s
42:	learn: 0.9873672	test: 0.9804149	best: 0.9804149 (42)	total: 22m 27s	remaining: 10h 4m 28s
43:	learn: 0.9874568	test: 0.9804370	best: 0.9804370 (43)	total: 23m	remaining: 10h 4m 28s
44:	learn: 0.9875124	test: 0.9804182	best: 0.9804370 (43)	total: 23m 32s	remaining: 10h 4m 20s
45:	learn: 0.9875785	test: 0.9804399	best: 0.9804399 (45)	total: 24m 5s	remaining: 10h 4m 20s
46:	learn: 0.9876231	test: 0.9804523	best: 0.9804523 (46)	total: 24m 36s	remaining: 10h 3m 45s
47:	learn: 0.9876773	test: 0.9804524	best: 0.9804524 (47)	total: 25m 8s	remaining: 10h 3m 13s
48:	learn: 0.9877247	test: 0.9804394	best: 0.9804524 (47)	total: 25m 39s	remaining: 10h 2m 42s
49:	learn: 0.9877492	test: 0.9804440	best: 0.9804524 (47)	total: 26m 9s	remaining: 10h 1m 49s
50:	learn: 0.9878334	test: 0.9804459	best: 0.9804524 (47)	total: 26m 41s	remaining: 10h 1m 14s
51:	learn: 0.9879205	test: 0.9804849	best: 0.9804849 (51)

128:	learn: 0.9909109	test: 0.9802698	best: 0.9806600 (88)	total: 1h 7m 44s	remaining: 9h 22m 23s
129:	learn: 0.9909291	test: 0.9802631	best: 0.9806600 (88)	total: 1h 8m 16s	remaining: 9h 21m 54s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 0.9806600322
bestIteration = 88

Shrink model to first 89 iterations.
3
175
{'eval_metric': 'AUC', 'learning_rate': 0.29999999999999999, 'loss_function': 'Logloss', 'depth': 6, 'iterations': 1200, 'scale_pos_weight': 99.0, 'l2_leaf_reg': 50, 'one_hot_max_size': 50, 'leaf_estimation_method': 'Gradient', 'rsm': 0.6, 'od_type': 'Iter', 'od_wait': 40}
Start training
0:	learn: 0.9587637	test: 0.9501355	best: 0.9501355 (0)	total: 22.4s	remaining: 7h 27m 20s
1:	learn: 0.9703040	test: 0.9629225	best: 0.9629225 (1)	total: 45.3s	remaining: 7h 31m 48s
2:	learn: 0.9720810	test: 0.9645696	best: 0.9645696 (2)	total: 1m 8s	remaining: 7h 36m 48s
3:	learn: 0.9752058	test: 0.9673017	best: 0.9673017 (3)	total: 1m 31s	remaining: 7h 37m 14s
4:	learn

81:	learn: 0.9862826	test: 0.9804892	best: 0.9804919 (80)	total: 35m 16s	remaining: 8h 1m 2s
82:	learn: 0.9863084	test: 0.9804754	best: 0.9804919 (80)	total: 35m 42s	remaining: 8h 34s
83:	learn: 0.9863265	test: 0.9805089	best: 0.9805089 (83)	total: 36m 9s	remaining: 8h 20s
84:	learn: 0.9863358	test: 0.9805114	best: 0.9805114 (84)	total: 36m 36s	remaining: 8h 7s
85:	learn: 0.9863412	test: 0.9805090	best: 0.9805114 (84)	total: 37m 2s	remaining: 7h 59m 47s
86:	learn: 0.9863626	test: 0.9805189	best: 0.9805189 (86)	total: 37m 27s	remaining: 7h 59m 14s
87:	learn: 0.9863731	test: 0.9805247	best: 0.9805247 (87)	total: 37m 53s	remaining: 7h 58m 50s
88:	learn: 0.9864022	test: 0.9805331	best: 0.9805331 (88)	total: 38m 20s	remaining: 7h 58m 33s
89:	learn: 0.9864156	test: 0.9805546	best: 0.9805546 (89)	total: 38m 46s	remaining: 7h 58m 16s
90:	learn: 0.9864384	test: 0.9805904	best: 0.9805904 (90)	total: 39m 12s	remaining: 7h 57m 46s
91:	learn: 0.9864627	test: 0.9806188	best: 0.9806188 (91)	total: 39

166:	learn: 0.9875785	test: 0.9809910	best: 0.9809910 (166)	total: 1h 12m 36s	remaining: 7h 29m 8s
167:	learn: 0.9875964	test: 0.9810055	best: 0.9810055 (167)	total: 1h 13m 3s	remaining: 7h 28m 48s
168:	learn: 0.9876053	test: 0.9810098	best: 0.9810098 (168)	total: 1h 13m 29s	remaining: 7h 28m 21s
169:	learn: 0.9876143	test: 0.9810032	best: 0.9810098 (168)	total: 1h 13m 55s	remaining: 7h 27m 56s
170:	learn: 0.9876266	test: 0.9810007	best: 0.9810098 (168)	total: 1h 14m 21s	remaining: 7h 27m 28s
171:	learn: 0.9876411	test: 0.9810001	best: 0.9810098 (168)	total: 1h 14m 48s	remaining: 7h 27m 4s
172:	learn: 0.9876464	test: 0.9809933	best: 0.9810098 (168)	total: 1h 15m 14s	remaining: 7h 26m 40s
173:	learn: 0.9876607	test: 0.9809889	best: 0.9810098 (168)	total: 1h 15m 41s	remaining: 7h 26m 17s
174:	learn: 0.9876708	test: 0.9809951	best: 0.9810098 (168)	total: 1h 16m 7s	remaining: 7h 25m 52s
175:	learn: 0.9876843	test: 0.9809927	best: 0.9810098 (168)	total: 1h 16m 33s	remaining: 7h 25m 26s
176:

36:	learn: 0.9861065	test: 0.9801440	best: 0.9801440 (36)	total: 18m 49s	remaining: 9h 51m 52s
37:	learn: 0.9861947	test: 0.9801520	best: 0.9801520 (37)	total: 19m 22s	remaining: 9h 52m 19s
38:	learn: 0.9862529	test: 0.9801644	best: 0.9801644 (38)	total: 19m 52s	remaining: 9h 51m 39s
39:	learn: 0.9863313	test: 0.9802383	best: 0.9802383 (39)	total: 20m 23s	remaining: 9h 51m 22s
40:	learn: 0.9863640	test: 0.9802373	best: 0.9802383 (39)	total: 20m 53s	remaining: 9h 50m 44s
41:	learn: 0.9864251	test: 0.9802846	best: 0.9802846 (41)	total: 21m 24s	remaining: 9h 50m 24s
42:	learn: 0.9865112	test: 0.9803052	best: 0.9803052 (42)	total: 21m 56s	remaining: 9h 50m 31s
43:	learn: 0.9865596	test: 0.9803284	best: 0.9803284 (43)	total: 22m 28s	remaining: 9h 50m 16s
44:	learn: 0.9865955	test: 0.9803333	best: 0.9803333 (44)	total: 22m 58s	remaining: 9h 49m 47s
45:	learn: 0.9866330	test: 0.9804164	best: 0.9804164 (45)	total: 23m 30s	remaining: 9h 49m 45s
46:	learn: 0.9867081	test: 0.9805034	best: 0.98050