In [1]:
import numpy as np
import pandas as pd
import catboost as cat
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
 
pd.set_option('precision', 5)
pd.set_option('display.float_format', lambda x: '%.5f' % x) 
pd.options.display.max_rows = 200

In [2]:
train_df = pd.read_csv('./join_train_data_1030.csv')
test_df = pd.read_csv('./join_test_data_1030.csv')
print(train_df.shape)
print(test_df.shape)

(4198717, 22)
(192000, 21)


In [3]:
train_df.drop(train_df[train_df['latitude_req'].isnull().values==True].index, inplace=True)
train_df.isnull().sum()

poi_id                   0
request_id               0
pos                      0
time                     0
action                   0
uuid                     0
request_cate_id          0
request_time             0
latitude_req             0
longitude_req            0
device_type           1656
gender              317910
age                1582700
job                      0
cate_level1              0
cate_level2              0
cate_level3              0
area_id                  0
avg_price              230
poi_star              1441
longitude_poi            0
latitude_poi             0
dtype: int64

In [4]:
def geodistance(lat_req, lng_req, lat_poi, lng_poi):
    dlon = lng_req - lng_poi
    dlat = lat_req - lat_poi
    a = np.sin(dlat / 2) ** 2 + np.cos(lat_req) * np.cos(lat_poi) * np.sin(dlon / 2) ** 2
    distance = 2 * np.arcsin(np.sqrt(a)) * 6371 * 1000 # 地球平均半径，6371km
    distance = round(distance / 1000, 3)
    return distance

train_df['distance'] = geodistance(train_df['latitude_req'], train_df['longitude_req'], train_df['latitude_poi'], train_df['longitude_poi'])

In [5]:
print(train_df.shape)
train_df.describe(include='all')

(4084229, 23)


Unnamed: 0,poi_id,request_id,pos,time,action,uuid,request_cate_id,request_time,latitude_req,longitude_req,...,job,cate_level1,cate_level2,cate_level3,area_id,avg_price,poi_star,longitude_poi,latitude_poi,distance
count,4084229.0,4084229.0,4084229.0,4084229,4084229.0,4084229.0,4084229.0,4084229,4084229.0,4084229.0,...,4084229.0,4084229.0,4084229.0,4084229.0,4084229.0,4083999.0,4082788.0,4084229.0,4084229.0,4084229.0
unique,,,,22,,,,83540,,,...,,,,,,,,,,
top,,,,2019-05-09,,,,18:09:46,,,...,,,,,,,,,,
freq,,,,234552,,,,190,,,...,,,,,,,,,,
mean,9.263368834749047e+18,9.22527318217901e+18,21.14924,,0.02206,9.226826470747374e+18,141.89224,,31.38623,113.3746,...,1.83516,225.88114,470.24831,1079.40207,13402.06174,81.12429,44.32228,113.3834,31.38107,805.98
std,5.297128961152608e+18,5.32358510307789e+18,35.31436,,0.14687,5.324229550683126e+18,8462.79463,,6.26835,7.10072,...,1.00699,6.61876,682.16487,977.55815,10982.30053,290.18962,6.84505,6.91954,6.2524,1986.41766
min,736732185668513.9,6427683153551.0,1.0,,0.0,1568445154080.0,1.0,,-46.37095,-124.04854,...,0.0,3.0,10.0,8.0,5.0,0.0,0.0,75.15138,0.83036,0.006
25%,4.711116669438944e+18,4.619597329051458e+18,3.0,,0.0,4.619358645950457e+18,1.0,,27.38022,110.72561,...,1.0,226.0,201.0,153.0,4793.0,42.0,40.0,110.77155,27.382,76.834
50%,9.28071791450426e+18,9.225329840154776e+18,8.0,,0.0,9.228463794754765e+18,1.0,,30.84668,113.57921,...,1.0,226.0,229.0,383.0,13325.0,67.0,46.0,113.57769,30.84059,187.597
75%,1.3846860483431373e+19,1.3834059611531037e+19,28.0,,0.0,1.3833390155377547e+19,1.0,,35.82945,118.53784,...,3.0,226.0,315.0,2032.0,19553.0,89.0,50.0,118.53264,35.76437,542.13


In [6]:
train_target = train_df['action']
train_target.value_counts()

0    3994144
1      90085
Name: action, dtype: int64

In [None]:
train_df.drop(['action'], axis = 1, inplace=True)

In [7]:
test_id = test_df['ID']
test_df.drop(['ID'], axis = 1, inplace=True)

In [10]:
train_df.drop(['pos'], axis = 1, inplace=True)
print(train_df.shape)
train_df.describe()

(4084229, 21)


Unnamed: 0,poi_id,request_id,uuid,request_cate_id,latitude_req,longitude_req,gender,age,job,cate_level1,cate_level2,cate_level3,area_id,avg_price,poi_star,longitude_poi,latitude_poi,distance
count,4084229.0,4084229.0,4084229.0,4084229.0,4084229.0,4084229.0,3766319.0,2501529.0,4084229.0,4084229.0,4084229.0,4084229.0,4084229.0,4083999.0,4082788.0,4084229.0,4084229.0,4084229.0
mean,9.263368834749047e+18,9.22527318217901e+18,9.226826470747374e+18,141.89224,31.38623,113.3746,0.39509,28.05078,1.83516,225.88114,470.24831,1079.40207,13402.06174,81.12429,44.32228,113.3834,31.38107,805.98
std,5.297128961152608e+18,5.32358510307789e+18,5.324229550683126e+18,8462.79463,6.26835,7.10072,0.48887,7.42726,1.00699,6.61876,682.16487,977.55815,10982.30053,290.18962,6.84505,6.91954,6.2524,1986.41766
min,736732185668513.9,6427683153551.0,1568445154080.0,1.0,-46.37095,-124.04854,0.0,15.0,0.0,3.0,10.0,8.0,5.0,0.0,0.0,75.15138,0.83036,0.006
25%,4.711116669438944e+18,4.619597329051458e+18,4.619358645950457e+18,1.0,27.38022,110.72561,0.0,22.0,1.0,226.0,201.0,153.0,4793.0,42.0,40.0,110.77155,27.382,76.834
50%,9.28071791450426e+18,9.225329840154776e+18,9.228463794754765e+18,1.0,30.84668,113.57921,0.0,27.0,1.0,226.0,229.0,383.0,13325.0,67.0,46.0,113.57769,30.84059,187.597
75%,1.3846860483431373e+19,1.3834059611531037e+19,1.3833390155377547e+19,1.0,35.82945,118.53784,1.0,32.0,3.0,226.0,315.0,2032.0,19553.0,89.0,50.0,118.53264,35.76437,542.13
max,1.8446645066501652e+19,1.844674189648272e+19,1.8446742462670256e+19,8389617.0,61.6106,174.80572,1.0,68.0,3.0,1853.0,2413.0,2548.0,80019.0,109353.0,50.0,130.34367,50.79686,19968.795


In [11]:
print(test_df.shape)
test_df.describe()

(192000, 20)


Unnamed: 0,poi_id,request_id,uuid,request_cate_id,latitude_req,longitude_req,gender,age,job,cate_level1,cate_level2,cate_level3,area_id,avg_price,poi_star,longitude_poi,latitude_poi
count,192000.0,192000.0,191997.0,191997.0,186971.0,186971.0,177738.0,119496.0,191997.0,192000.0,192000.0,192000.0,192000.0,191993.0,191916.0,192000.0,192000.0
mean,9.264012196760375e+18,9.22701235131364e+18,9.228547745081668e+18,297.2851,31.29658,113.52134,0.41648,27.32943,1.82684,225.92268,443.81416,1088.54853,13388.74947,80.26317,44.37211,113.52783,31.29958
std,5.268318366026228e+18,5.330354692299017e+18,5.326838760505931e+18,38324.72048,6.37037,7.17691,0.49298,7.13652,0.99297,12.69359,651.78327,975.51403,10943.51506,246.80755,6.48042,6.83354,6.34942
min,1015769069232803.0,1746252198753.0,46993148784627.0,1.0,-40.61593,-123.27593,0.0,15.0,0.0,3.0,10.0,8.0,5.0,0.0,0.0,75.1537,17.76474
25%,4.781846136635041e+18,4.604571799849832e+18,4.6065762247883105e+18,1.0,26.42698,111.6128,0.0,22.0,1.0,226.0,201.0,153.0,4804.0,45.0,40.0,111.59408,26.42478
50%,9.28279533202496e+18,9.23852234550644e+18,9.247145681530706e+18,1.0,30.82624,113.56936,0.0,26.0,1.0,226.0,229.0,383.0,13305.0,68.0,46.0,113.54299,30.82119
75%,1.378164034747402e+19,1.3854574578587183e+19,1.3847680929835655e+19,1.0,35.94066,118.52135,1.0,31.0,3.0,226.0,315.0,2032.0,19455.0,89.0,50.0,118.50223,35.93822
max,1.8446645066501652e+19,1.8446480806651652e+19,1.844672235756096e+19,8389617.0,59.4834,150.3866,1.0,68.0,3.0,1853.0,2413.0,2547.0,37662.0,19415.0,50.0,130.34367,49.14874


In [12]:
test_df.fillna(value={'latitude_req': 31.29658, 'longitude_req': 113.52134}, inplace=True)
test_df.isnull().sum()

poi_id                 0
request_id             0
time                   0
uuid                   3
request_cate_id        3
request_time           3
latitude_req           0
longitude_req          0
device_type           86
gender             14262
age                72504
job                    3
cate_level1            0
cate_level2            0
cate_level3            0
area_id                0
avg_price              7
poi_star              84
longitude_poi          0
latitude_poi           0
dtype: int64

In [13]:
test_df['distance'] = geodistance(test_df['latitude_req'], test_df['longitude_req'], test_df['latitude_poi'], test_df['longitude_poi'])

In [14]:
train_df.describe(include = 'all')

Unnamed: 0,poi_id,request_id,time,uuid,request_cate_id,request_time,latitude_req,longitude_req,device_type,gender,...,job,cate_level1,cate_level2,cate_level3,area_id,avg_price,poi_star,longitude_poi,latitude_poi,distance
count,4084229.0,4084229.0,4084229,4084229.0,4084229.0,4084229,4084229.0,4084229.0,4082573,3766319.0,...,4084229.0,4084229.0,4084229.0,4084229.0,4084229.0,4083999.0,4082788.0,4084229.0,4084229.0,4084229.0
unique,,,22,,,83540,,,18,,...,,,,,,,,,,
top,,,2019-05-09,,,18:09:46,,,ANDROID,,...,,,,,,,,,,
freq,,,234552,,,190,,,2397828,,...,,,,,,,,,,
mean,9.263368834749047e+18,9.22527318217901e+18,,9.226826470747374e+18,141.89224,,31.38623,113.3746,,0.39509,...,1.83516,225.88114,470.24831,1079.40207,13402.06174,81.12429,44.32228,113.3834,31.38107,805.98
std,5.297128961152608e+18,5.32358510307789e+18,,5.324229550683126e+18,8462.79463,,6.26835,7.10072,,0.48887,...,1.00699,6.61876,682.16487,977.55815,10982.30053,290.18962,6.84505,6.91954,6.2524,1986.41766
min,736732185668513.9,6427683153551.0,,1568445154080.0,1.0,,-46.37095,-124.04854,,0.0,...,0.0,3.0,10.0,8.0,5.0,0.0,0.0,75.15138,0.83036,0.006
25%,4.711116669438944e+18,4.619597329051458e+18,,4.619358645950457e+18,1.0,,27.38022,110.72561,,0.0,...,1.0,226.0,201.0,153.0,4793.0,42.0,40.0,110.77155,27.382,76.834
50%,9.28071791450426e+18,9.225329840154776e+18,,9.228463794754765e+18,1.0,,30.84668,113.57921,,0.0,...,1.0,226.0,229.0,383.0,13325.0,67.0,46.0,113.57769,30.84059,187.597
75%,1.3846860483431373e+19,1.3834059611531037e+19,,1.3833390155377547e+19,1.0,,35.82945,118.53784,,1.0,...,3.0,226.0,315.0,2032.0,19553.0,89.0,50.0,118.53264,35.76437,542.13


In [15]:
train_df.drop(['request_id', 'uuid', 'poi_id'], axis = 1, inplace=True)
print(train_df.shape)

(4084229, 18)


In [16]:
test_df.describe(include = 'all')

Unnamed: 0,poi_id,request_id,time,uuid,request_cate_id,request_time,latitude_req,longitude_req,device_type,gender,...,job,cate_level1,cate_level2,cate_level3,area_id,avg_price,poi_star,longitude_poi,latitude_poi,distance
count,192000.0,192000.0,192000,191997.0,191997.0,191997,192000.0,192000.0,191914,177738.0,...,191997.0,192000.0,192000.0,192000.0,192000.0,191993.0,191916.0,192000.0,192000.0,192000.0
unique,,,1,,,53964,,,13,,...,,,,,,,,,,
top,,,2019-05-24,,,17:31:07,,,ANDROID,,...,,,,,,,,,,
freq,,,192000,,,19,,,113785,,...,,,,,,,,,,
mean,9.264012196760375e+18,9.22701235131364e+18,,9.228547745081668e+18,297.2851,,31.29658,113.52134,,0.41648,...,1.82684,225.92268,443.81416,1088.54853,13388.74947,80.26317,44.37211,113.52783,31.29958,954.35981
std,5.268318366026228e+18,5.330354692299017e+18,,5.326838760505931e+18,38324.72048,,6.28639,7.08229,,0.49298,...,0.99297,12.69359,651.78327,975.51403,10943.51506,246.80755,6.48042,6.83354,6.34942,2401.6953
min,1015769069232803.0,1746252198753.0,,46993148784627.0,1.0,,-40.61593,-123.27593,,0.0,...,0.0,3.0,10.0,8.0,5.0,0.0,0.0,75.1537,17.76474,0.052
25%,4.781846136635041e+18,4.604571799849832e+18,,4.6065762247883105e+18,1.0,,27.25724,111.70764,,0.0,...,1.0,226.0,201.0,153.0,4804.0,45.0,40.0,111.59408,26.42478,77.04975
50%,9.28279533202496e+18,9.23852234550644e+18,,9.247145681530706e+18,1.0,,30.89952,113.52134,,0.0,...,1.0,226.0,229.0,383.0,13305.0,68.0,46.0,113.54299,30.82119,191.17
75%,1.378164034747402e+19,1.3854574578587183e+19,,1.3847680929835655e+19,1.0,,35.71931,118.405,,1.0,...,3.0,226.0,315.0,2032.0,19455.0,89.0,50.0,118.50223,35.93822,563.78725


In [17]:
test_df.drop(['request_id', 'uuid', 'poi_id'], axis = 1, inplace=True)
print(test_df.shape)

(192000, 18)


In [18]:
def change_data_type(x):
    train_df[x] = train_df[x].astype('category')
    test_df[x] = test_df[x].astype('category')

In [19]:
# change_data_type('time')
# change_data_type('request_time')
change_data_type('device_type')
train_df.drop(['time', 'request_time'], axis = 1, inplace=True)
test_df.drop(['time', 'request_time'], axis = 1, inplace=True)

In [81]:
n_splits = 5
random_state = 2000
# np.random.seed(random_state)
splits = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state).split(train_df, train_target))

cols = list(train_df.columns)
feature_importance_df = pd.DataFrame()
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))

In [29]:
train_df['device_type'].value_counts()

ANDROID          2397828
IPHONE           1634815
WEIXINPROGRAM      28298
TOUCH              12067
IPAD                6278
WANDIE              2672
(NONE)               221
ORGANIC              203
WAP                   92
REFERRAL              79
LITE                   6
WECHAT                 4
IOSWEB                 3
ANDROIDWEB             3
MT                     1
DSPPC                  1
WAP?                   1
(NOT%20SET)            1
Name: device_type, dtype: int64

In [35]:
train_df.loc[train_df['device_type'] == 'WAP?', ['device_type']] = 'WAP'

In [38]:
train_df['device_type'].value_counts().index

CategoricalIndex(['ANDROID', 'IPHONE', 'WEIXINPROGRAM', 'TOUCH', 'IPAD',
                  'WANDIE', '(NONE)', 'ORGANIC', 'WAP', 'REFERRAL', 'LITE',
                  'WECHAT', 'IOSWEB', 'ANDROIDWEB', 'MT', 'DSPPC',
                  '(NOT%20SET)', 'WAP?'],
                 categories=['(NONE)', '(NOT%20SET)', 'ANDROID', 'ANDROIDWEB', 'DSPPC', 'IOSWEB', 'IPAD', 'IPHONE', ...], ordered=False, dtype='category')

In [50]:
train_df['device_type'].fillna('(NONE)', inplace = True)

test_df['device_type'].fillna('(NONE)', inplace = True)

In [52]:
test_df['device_type'].value_counts()

ANDROID          113785
IPHONE            75876
WEIXINPROGRAM      1374
TOUCH               492
IPAD                218
WANDIE              134
(NONE)               93
ORGANIC              12
WAP                   7
REFERRAL              4
UNKNOWN               2
LITE                  2
(NOT%20SET)           1
Name: device_type, dtype: int64

In [53]:
test_df.loc[test_df['device_type'] == 'UNKNOWN', ['device_type']] = '(NONE)'

In [58]:
le = LabelEncoder()
le.fit(['ANDROID', 'IPHONE', 'WEIXINPROGRAM', 'TOUCH', 'IPAD',
                  'WANDIE', '(NONE)', 'ORGANIC', 'WAP', 'REFERRAL', 'LITE',
                  'WECHAT', 'IOSWEB', 'ANDROIDWEB', 'MT', 'DSPPC',
                  '(NOT%20SET)', ])
train_df['device_type'] = le.transform(train_df['device_type'])
test_df['device_type'] = le.transform(test_df['device_type'])

In [63]:
train_df['device_type'].value_counts()

2     2397828
7     1634815
16      28298
12      12067
6        6278
13       2672
0        1877
10        203
14         93
11         79
8           6
15          4
5           3
3           3
9           1
4           1
1           1
Name: device_type, dtype: int64

In [59]:
train_df.columns

Index(['request_cate_id', 'latitude_req', 'longitude_req', 'device_type',
       'gender', 'age', 'job', 'cate_level1', 'cate_level2', 'cate_level3',
       'area_id', 'avg_price', 'poi_star', 'longitude_poi', 'latitude_poi',
       'distance'],
      dtype='object')

In [None]:
for i, (train_idx, valid_idx) in enumerate(splits):
    print('Folder', i)
    x_tr, y_tr = train_df.iloc[train_idx], train_target.iloc[train_idx]
    x_valid, y_valid = train_df.iloc[valid_idx], train_target.iloc[valid_idx]
    
    # x_tr, y_tr = augment(x_tr, y_tr, cols)
    num_round = 200000  
    clf = cat.CatBoostClassifier(iterations=num_round, depth=5,\
                                 learning_rate=0.01, verbose=1000, \
                                 loss_function='Logloss', eval_metric='AUC', \
                                early_stopping_rounds=4000, random_seed=random_state, \
                                task_type='GPU', devices='0',\
                                 bootstrap_type='Poisson', subsample=0.99)
    
    # cat_features = [0,3,4,5,6,7,8,9,10]
    clf.fit(x_tr, y_tr, eval_set=cat.Pool(x_valid, y_valid))
    
    print(f'best score {clf.get_best_score()}')
    print(f'best_iteration {clf.get_best_iteration()}')
    oof[valid_idx] = clf.predict_proba(x_valid)[:, 1]
    predictions += clf.predict_proba(test_df)[:, 1] / n_splits
    del x_tr
    del y_tr
    
print(metrics.roc_auc_score(train_target.values, oof))

Folder 0
0:	learn: 0.5449741	test: 0.5459926	best: 0.5459926 (0)	total: 41.6ms	remaining: 2h 18m 48s
1000:	learn: 0.6342542	test: 0.6314708	best: 0.6314708 (1000)	total: 42.8s	remaining: 2h 21m 41s
2000:	learn: 0.6397316	test: 0.6348179	best: 0.6348192 (1999)	total: 1m 25s	remaining: 2h 20m 53s
3000:	learn: 0.6434036	test: 0.6364992	best: 0.6365023 (2998)	total: 2m 8s	remaining: 2h 20m 17s
4000:	learn: 0.6463072	test: 0.6373951	best: 0.6373954 (3998)	total: 2m 50s	remaining: 2h 19m 36s
5000:	learn: 0.6489049	test: 0.6380996	best: 0.6381001 (4997)	total: 3m 33s	remaining: 2h 18m 58s
6000:	learn: 0.6512462	test: 0.6385928	best: 0.6385928 (6000)	total: 4m 16s	remaining: 2h 18m 16s
7000:	learn: 0.6533557	test: 0.6389212	best: 0.6389219 (6998)	total: 4m 59s	remaining: 2h 17m 36s
8000:	learn: 0.6553465	test: 0.6392301	best: 0.6392301 (8000)	total: 5m 42s	remaining: 2h 16m 54s
9000:	learn: 0.6572298	test: 0.6395029	best: 0.6395036 (8999)	total: 6m 25s	remaining: 2h 16m 13s
10000:	learn: 0.659

2000:	learn: 0.6397686	test: 0.6359541	best: 0.6359541 (2000)	total: 1m 25s	remaining: 2h 21m 17s
3000:	learn: 0.6433868	test: 0.6375438	best: 0.6375438 (3000)	total: 2m 8s	remaining: 2h 20m 39s


In [71]:
submission = pd.DataFrame({'ID': test_id, 'action': predictions})
oof = pd.DataFrame({'action': oof})

In [77]:
submission.to_csv('./catboost_oof_test_63954.csv', index = False)

In [76]:
oof.to_csv('./catboost_oof_train_63954.csv')

In [74]:
submission.shape

(192000, 2)

In [75]:
submission.describe()

Unnamed: 0,ID,action
count,192000.0,192000.0
mean,95999.5,0.0224
std,55425.77018,0.01085
min,0.0,0.00159
25%,47999.75,0.01516
50%,95999.5,0.02121
75%,143999.25,0.0282
max,191999.0,0.19051


In [80]:
submission.isnull().sum()

ID        0
action    0
dtype: int64