In [1]:
import numpy as np
import pandas as pd
import catboost as cat
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
 
# Pandas display settings: five decimals for floats and up to 200 rows per frame.
# The fully qualified key 'display.precision' is used on purpose: the bare
# 'precision' pattern also matches 'styler.format.precision' on newer pandas
# releases, which makes set_option raise OptionError (pattern matched multiple keys).
pd.set_option('display.precision', 5)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.options.display.max_rows = 200

In [2]:
# Read the train / test splits and report their raw shapes.
train_df = pd.read_csv('data/train_data_1120.csv')
test_df = pd.read_csv('data/test_data_1120.csv')
print(train_df.shape, test_df.shape, sep='\n')

# Columns that are dropped from both frames before modelling.
non_feature_cols = ['Unnamed: 0', 'request_id', 'uuid']

# Keep the label and the test row ids before they are removed from the frames.
train_target = train_df['action']
test_id = test_df['Unnamed: 0']

# Leave only model features in both frames (the label is also removed from train).
train_df = train_df.drop(columns=['action'] + non_feature_cols)
test_df = test_df.drop(columns=non_feature_cols)

(4206064, 24)
(192000, 23)


In [3]:
# Show the column names currently held in the training frame.
train_df.columns

Index(['poi_id', 'request_cate_id', 'device_type', 'gender', 'job',
       'cate_level1', 'cate_level2', 'cate_level3', 'area_id', 'latitude_req',
       'longitude_req', 'age', 'avg_price', 'poi_star', 'longitude_poi',
       'latitude_poi', 'distance', 'poi_cnt_deal', 'poi_avg_discount',
       'request_time_second'],
      dtype='object')

In [4]:
# Summary statistics for every training column; include='all' asks pandas to
# describe all dtypes rather than numeric columns only.
train_df.describe(include = 'all')

Unnamed: 0,poi_id,request_cate_id,device_type,gender,job,cate_level1,cate_level2,cate_level3,area_id,latitude_req,longitude_req,age,avg_price,poi_star,longitude_poi,latitude_poi,distance,poi_cnt_deal,poi_avg_discount,request_time_second
count,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0,4206064.0
mean,8604.99485,0.41689,0.42652,0.3637,0.76298,0.00098,4.57454,27.33124,1152.82388,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0
std,8138.06885,2.29427,0.57297,0.48106,0.84138,0.04163,5.60262,37.34551,1397.19556,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12.39752,-33.40891,-2.24384,-0.28045,-6.47669,-16.13778,-4.88344,-0.15472,-0.82632,-5.95836,-3.63785
25%,1964.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,99.0,-0.64002,-0.37751,-0.52455,-0.13815,-0.63194,-0.38485,-0.64021,-0.14685,-0.51899,-0.62437,-0.77382
50%,6062.0,0.0,0.0,0.0,1.0,0.0,2.0,12.0,569.0,-0.0862,0.02989,0.0,-0.04792,0.24478,0.02857,-0.08644,-0.13583,-0.36533,-0.0,0.19642
75%,13189.0,0.0,1.0,1.0,1.0,0.0,7.0,37.0,1737.0,0.70939,0.7265,0.16316,0.02843,0.82925,0.74369,0.70291,-0.10089,-0.058,0.67393,0.72484
max,40117.0,64.0,15.0,1.0,3.0,5.0,52.0,280.0,7017.0,4.81803,8.64741,6.86838,379.23678,0.82925,2.44985,3.10245,104.80274,3.62993,2.62619,2.03416


In [5]:
# Show the column names currently held in the test frame (for comparison with train).
test_df.columns

Index(['poi_id', 'request_cate_id', 'device_type', 'gender', 'job',
       'cate_level1', 'cate_level2', 'cate_level3', 'area_id', 'latitude_req',
       'longitude_req', 'age', 'avg_price', 'poi_star', 'longitude_poi',
       'latitude_poi', 'distance', 'poi_cnt_deal', 'poi_avg_discount',
       'request_time_second'],
      dtype='object')

In [6]:
# Summary statistics for every test column; include='all' asks pandas to
# describe all dtypes rather than numeric columns only.
test_df.describe(include = 'all')

Unnamed: 0,poi_id,request_cate_id,device_type,gender,job,cate_level1,cate_level2,cate_level3,area_id,latitude_req,longitude_req,age,avg_price,poi_star,longitude_poi,latitude_poi,distance,poi_cnt_deal,poi_avg_discount,request_time_second
count,192000.0,192000.0,192000.0,192000.0,192000.0,192000.0,192000.0,192000.0,192000.0,192000.0,192000.0,192000.0,192000.0,192000.0,192000.0,192000.0,192000.0,192000.0,192000.0,192000.0
mean,10056.34934,0.39197,0.42924,0.38554,0.74985,0.00151,4.56885,27.78676,1196.97389,-0.01421,0.02061,-0.12406,-0.00189,0.00691,0.02344,-0.0137,-0.01487,-0.03342,-0.01098,-0.10678
std,9554.10874,2.21903,0.56953,0.48672,0.83441,0.05941,5.66271,37.68687,1483.54306,1.01608,1.01044,0.96797,0.85655,0.94671,0.98603,1.01479,1.11298,0.9824,1.01852,0.9684
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.47999,-33.30018,-2.24384,-0.28045,-6.47669,-5.51368,-2.1769,-0.15472,-0.82632,-5.95836,-3.63759
25%,2114.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,100.0,-0.79209,-0.24847,-0.69648,-0.12427,-0.63194,-0.25558,-0.79282,-0.14707,-0.51899,-0.65067,-0.82785
50%,6993.0,0.0,0.0,0.0,1.0,0.0,2.0,13.0,572.0,-0.08927,0.02857,-0.12406,-0.04445,0.24478,0.02563,-0.09016,-0.13629,-0.36533,-0.00962,0.01625
75%,15639.0,0.0,1.0,1.0,1.0,0.0,7.0,38.0,1772.0,0.72833,0.72432,-0.00877,0.02843,0.82925,0.74121,0.72767,-0.10485,-0.058,0.68253,0.61763
max,40117.0,64.0,10.0,1.0,3.0,5.0,50.0,278.0,7015.0,4.47889,5.21102,6.86838,67.10067,0.82925,2.44985,2.83904,99.98669,3.62993,2.62619,2.03409


In [8]:
# Cross-validation configuration: 5 stratified, shuffled folds with a fixed seed.
n_splits = 5
random_state = 2000

# Materialise the (train_idx, valid_idx) pairs once so they can be iterated later.
fold_splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
splits = list(fold_splitter.split(train_df, train_target))

cols = train_df.columns.tolist()

# Columns passed to CatBoost as categorical features.
sparse_features = [
    'request_cate_id',
    'device_type',
    'gender',
    'job',
    'cate_level1',
    'cate_level2',
    'cate_level3',
]

# Placeholder frame plus zero-filled buffers for out-of-fold and test predictions.
feature_importance_df = pd.DataFrame()
oof = np.zeros(train_df.shape[0])
predictions = np.zeros(test_df.shape[0])

In [9]:
# Settings shared by the CatBoost model built in every fold.
num_round = 100000
catboost_params = dict(
    iterations=num_round,
    depth=7,
    learning_rate=0.1,
    verbose=1000,
    loss_function='Logloss',
    eval_metric='AUC',
    early_stopping_rounds=2000,
    random_seed=random_state,
    task_type='GPU',
    devices='2',
    cat_features=sparse_features,
    bootstrap_type='Poisson',
    subsample=0.99,
)

# Train one model per fold: the held-out fold is the eval set, its predicted
# probabilities fill the matching rows of `oof`, and the test-set probabilities
# are averaged over folds into `predictions`.
for i, (train_idx, valid_idx) in enumerate(splits):
    print('Folder', i)
    x_tr = train_df.iloc[train_idx]
    y_tr = train_target.iloc[train_idx]
    x_valid = train_df.iloc[valid_idx]
    y_valid = train_target.iloc[valid_idx]

    clf = cat.CatBoostClassifier(**catboost_params)
    clf.fit(
        x_tr,
        y_tr,
        eval_set=cat.Pool(x_valid, y_valid, cat_features=sparse_features),
    )

    print(f'best score {clf.get_best_score()}')
    print(f'best_iteration {clf.get_best_iteration()}')

    # Column 1 of predict_proba is taken as the score for each row.
    oof[valid_idx] = clf.predict_proba(x_valid)[:, 1]
    predictions += clf.predict_proba(test_df)[:, 1] / n_splits

    # Release the fold's training slice before the next iteration.
    del x_tr, y_tr

# ROC AUC of the out-of-fold scores against the full training target.
print(metrics.roc_auc_score(train_target.values, oof))

Folder 0




0:	learn: 0.5666763	test: 0.5707949	best: 0.5707949 (0)	total: 377ms	remaining: 10h 27m 50s
1000:	learn: 0.6800184	test: 0.6466253	best: 0.6466607 (900)	total: 4m 47s	remaining: 7h 53m 51s
2000:	learn: 0.7035335	test: 0.6471917	best: 0.6473765 (1700)	total: 11m 4s	remaining: 9h 2m 45s
3000:	learn: 0.7238027	test: 0.6464683	best: 0.6473765 (1700)	total: 17m 55s	remaining: 9h 39m 23s
bestTest = 0.6473765373
bestIteration = 1700
Shrink model to first 1701 iterations.
best score {'learn': {'Logloss': 0.1020005882132671, 'AUC': 0.737351804971695}, 'validation': {'Logloss': 0.10870304436866762, 'AUC': 0.647376537322998}}
best_iteration 1700
Folder 1
0:	learn: 0.6076787	test: 0.6073602	best: 0.6073602 (0)	total: 240ms	remaining: 6h 39m 52s
1000:	learn: 0.6800132	test: 0.6474082	best: 0.6474591 (995)	total: 3m 50s	remaining: 6h 20m 30s
2000:	learn: 0.7042183	test: 0.6468134	best: 0.6474591 (995)	total: 7m 50s	remaining: 6h 24m 24s
bestTest = 0.6474590898
bestIteration = 995
Shrink model to fir

In [10]:
# Assemble the two prediction tables that are written to disk afterwards:
# the averaged test predictions keyed by the test row id, and the
# out-of-fold train predictions.
submission = pd.DataFrame({'ID': test_id, 'action': predictions})

# `oof` is rebound here from the 1-D score array to a one-column frame.
# Flattening through np.asarray(...).ravel() gives the same frame on the first
# run and keeps the cell re-runnable: on a second run `oof` is already a
# DataFrame, which would otherwise be passed as the column value instead of a
# 1-D array.
oof = pd.DataFrame({'action': np.asarray(oof).ravel()})

In [11]:
# Write the test-set predictions to CSV without the DataFrame index column.
# NOTE(review): the '64875' suffix presumably encodes the CV AUC — confirm.
submission.to_csv('./catboost_oof_test_64875.csv', index = False)

In [12]:
# Write the out-of-fold train predictions to CSV. index=False is not passed
# here, so the row index is written as an extra unnamed first column
# (unlike the submission file).
oof.to_csv('./catboost_oof_train_64875.csv')

In [13]:
# Pair each training column with its importance score from `clf`.
# `clf` is reassigned in every fold of the training loop, so these scores come
# from the last fold's model only, not an average over folds.
feature_importance_df = pd.DataFrame({'column': train_df.columns.values, 'score': clf.get_feature_importance()})

In [14]:
# Display the features ranked from highest to lowest importance score.
feature_importance_df.sort_values(by = ['score'], ascending=False)

Unnamed: 0,column,score
6,cate_level2,17.14319
16,distance,16.57573
1,request_cate_id,7.15165
7,cate_level3,5.79496
11,age,5.47012
19,request_time_second,5.26936
12,avg_price,4.78181
0,poi_id,4.70915
15,latitude_poi,3.8188
18,poi_avg_discount,3.79332
