In [1]:
import numpy as np
import pandas as pd
import catboost as cat
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
 
pd.set_option('precision', 5)
pd.set_option('display.float_format', lambda x: '%.5f' % x) 
pd.options.display.max_rows = 200

In [2]:
train_df = pd.read_csv('data/train/train_beh.csv')
test_df = pd.read_csv('data/test/test_beh_b.csv')
print(train_df.shape)
print(test_df.shape)

(934282, 5)
(95669, 4)


In [3]:
train_df.head(5)

Unnamed: 0,id,flag,page_no,Unnamed: 3,page_tm
0,U64F0C9,0,SZA,2019-06-30 12:44:27,
1,U64F0C9,0,CQE,2019-06-25 16:15:11,
2,U64F0C9,0,AAO,2019-06-30 12:44:17,
3,U64F0C9,0,CQE,2019-06-17 13:50:12,
4,U64F0C9,0,AAO,2019-06-17 13:50:08,


In [4]:
test_df.head(5)

Unnamed: 0,id,page_no,Unnamed: 2,page_tm
0,U441F8F,CQA,2019-06-11 22:27:25,
1,U441F8F,XAI,2019-06-17 17:42:39,
2,U441F8F,CQA,2019-06-17 17:42:02,
3,U441F8F,XAI,2019-06-17 17:42:35,
4,U441F8F,CQE,2019-06-17 17:42:49,


In [5]:
train_df.describe(include='all')

Unnamed: 0,id,flag,page_no,Unnamed: 3,page_tm
count,934282,934282.0,934282,934282,0.0
unique,11913,,29,690024,
top,U0015B2,,CQA,2019-06-26 13:35:04,
freq,3617,,337400,9,
mean,,0.15059,,,
std,,0.35765,,,
min,,0.0,,,
25%,,0.0,,,
50%,,0.0,,,
75%,,0.0,,,


In [6]:
train_target = train_df['flag']
test_id = test_df['id']
train_df.drop(['flag','id', 'page_tm'], axis = 1, inplace=True)
test_df.drop(['id', 'page_tm'], axis = 1, inplace=True)

In [7]:
print(train_df.shape)
print(test_df.shape)

(934282, 2)
(95669, 2)


In [8]:
train_df.columns

Index(['page_no', 'Unnamed: 3'], dtype='object')

In [9]:
test_df.isnull().sum()

page_no       0
Unnamed: 2    0
dtype: int64

In [10]:
# 时间转换为秒数
from datetime import datetime
from datetime import timedelta
now = datetime.now()
now

datetime.datetime(2020, 5, 10, 12, 51, 58, 322797)

In [11]:
delta = (now - pd.to_datetime(train_df['Unnamed: 3'])).dt

In [12]:
train_df['tm_sec'] = delta.days * 24 * 60 * 60 + delta.seconds
train_df['tm_day'] = delta.days

In [13]:
delta = (now - pd.to_datetime(test_df['Unnamed: 2'])).dt
test_df['tm_sec'] = delta.days * 24 * 60 * 60 + delta.seconds
test_df['tm_day'] = delta.days

In [14]:
test_df.describe(include = 'all')

Unnamed: 0,page_no,Unnamed: 2,tm_sec,tm_day
count,95669,95669,95669.0,95669.0
unique,29,85223,,
top,CQA,2019-06-14 16:45:53,,
freq,34735,6,,
mean,,,28433695.42368,328.54768
std,,,742420.61881,8.60665
min,,,27175950.0,314.0
25%,,,27752387.0,321.0
50%,,,28414145.0,328.0
75%,,,29055519.0,336.0


In [15]:
train_df.drop(['Unnamed: 3'], axis = 1, inplace=True)
test_df.drop(['Unnamed: 2'], axis = 1, inplace=True)

In [16]:
n_splits = 5
random_state = 2000
# np.random.seed(random_state)
splits = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state).split(train_df, train_target))

cols = list(train_df.columns)
sparse_features = ['page_no']

feature_importance_df = pd.DataFrame()
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))

In [17]:
for i, (train_idx, valid_idx) in enumerate(splits):
    print('Folder', i)
    x_tr, y_tr = train_df.iloc[train_idx], train_target.iloc[train_idx]
    x_valid, y_valid = train_df.iloc[valid_idx], train_target.iloc[valid_idx]
    
    # x_tr, y_tr = augment(x_tr, y_tr, cols)
    num_round = 100000  
    clf = cat.CatBoostClassifier(iterations=num_round, depth=7,\
                                 l2_leaf_reg=4, learning_rate=0.1, verbose=1000, \
                                 loss_function='Logloss', eval_metric='AUC', \
                                early_stopping_rounds=2000, random_seed=random_state, \
                                task_type='GPU', devices='1',\
                                 cat_features=sparse_features,\
                                 bootstrap_type='Poisson', subsample=0.99)
    
    clf.fit(x_tr, y_tr, eval_set=cat.Pool(x_valid, y_valid, cat_features=sparse_features))
    
    print(f'best score {clf.get_best_score()}')
    print(f'best_iteration {clf.get_best_iteration()}')
    oof[valid_idx] = clf.predict_proba(x_valid)[:, 1]
    predictions += clf.predict_proba(test_df)[:, 1] / n_splits
    del x_tr
    del y_tr
    
print(metrics.roc_auc_score(train_target.values, oof))

Folder 0
0:	learn: 0.5299682	test: 0.5299339	best: 0.5299339 (0)	total: 34.1ms	remaining: 56m 47s
1000:	learn: 0.5805946	test: 0.5573977	best: 0.5574389 (997)	total: 33.9s	remaining: 55m 55s
2000:	learn: 0.5929674	test: 0.5587432	best: 0.5587453 (1999)	total: 1m 7s	remaining: 55m 21s
3000:	learn: 0.6022959	test: 0.5593033	best: 0.5593932 (2600)	total: 1m 41s	remaining: 54m 49s
4000:	learn: 0.6100204	test: 0.5593918	best: 0.5594115 (3122)	total: 2m 16s	remaining: 54m 23s
5000:	learn: 0.6162933	test: 0.5592768	best: 0.5594990 (4120)	total: 2m 50s	remaining: 53m 54s
6000:	learn: 0.6219206	test: 0.5590261	best: 0.5594990 (4120)	total: 3m 24s	remaining: 53m 19s
bestTest = 0.5594989657
bestIteration = 4120
Shrink model to first 4121 iterations.
best score {'learn': {'Logloss': 0.4081622319965214, 'AUC': 0.6226595938205719}, 'validation': {'Logloss': 0.41997448249195907, 'AUC': 0.5594989657402039}}
best_iteration 4120
Folder 1
0:	learn: 0.5290775	test: 0.5272362	best: 0.5272362 (0)	total: 40.

In [18]:
# 0.5600358550554558
train_df = pd.read_csv('data/train/train_beh.csv')
train_id = train_df.id

In [19]:
beh_feature_test_pred = pd.DataFrame({'id': test_id, 'beh_pred': predictions})
beh_feature_train_pred = pd.DataFrame({'id': train_id, 'beh_pred': oof})

In [20]:
beh_feature_test_pred.to_csv('data/beh_feature_test_pred.csv', index = False)
beh_feature_train_pred.to_csv('data/beh_feature_train_pred.csv')

In [21]:
table = beh_feature_train_pred.groupby('id').max()

In [22]:
table.beh_pred

id
U0001B8   0.18592
U000437   0.19843
U0015B2   0.44844
U0016FF   0.19797
U001945   0.19260
            ...  
UFFF2E7   0.20977
UFFF441   0.25839
UFFF7F4   0.23511
UFFFC56   0.30409
UFFFF38   0.30440
Name: beh_pred, Length: 11913, dtype: float64

In [23]:
beh_train_pred = pd.DataFrame({'id': table.index, 'beh_pred': table.beh_pred})

In [24]:
beh_train_pred

Unnamed: 0_level_0,id,beh_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1
U0001B8,U0001B8,0.18592
U000437,U000437,0.19843
U0015B2,U0015B2,0.44844
U0016FF,U0016FF,0.19797
U001945,U001945,0.19260
...,...,...
UFFF2E7,UFFF2E7,0.20977
UFFF441,UFFF441,0.25839
UFFF7F4,UFFF7F4,0.23511
UFFFC56,UFFFC56,0.30409


In [25]:
beh_train_pred.to_csv('data/beh_feature_train_pred.csv', index = False)

In [26]:
ttable = beh_feature_test_pred.groupby('id').max()

In [27]:
ttable

Unnamed: 0_level_0,beh_pred
id,Unnamed: 1_level_1
U0013E0,0.23085
U002CBC,0.28586
U003DF1,0.33399
U005066,0.16667
U008890,0.18918
...,...
U547028,0.15168
U547D67,0.23720
U5480E9,0.20935
U548AFC,0.17644


In [28]:
beh_test_pred = pd.DataFrame({'id': ttable.index, 'beh_pred': ttable.beh_pred})

In [29]:
beh_test_pred

Unnamed: 0_level_0,id,beh_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1
U0013E0,U0013E0,0.23085
U002CBC,U002CBC,0.28586
U003DF1,U003DF1,0.33399
U005066,U005066,0.16667
U008890,U008890,0.18918
...,...,...
U547028,U547028,0.15168
U547D67,U547D67,0.23720
U5480E9,U5480E9,0.20935
U548AFC,U548AFC,0.17644


In [30]:
beh_test_pred.to_csv('data/beh_feature_test_pred.csv', index = False)

In [31]:
table = beh_feature_train_pred.groupby('id').mean()
beh_train_pred = pd.DataFrame({'id': table.index, 'beh_pred_mean': table.beh_pred})
ttable = beh_feature_test_pred.groupby('id').mean()
beh_test_pred = pd.DataFrame({'id': ttable.index, 'beh_pred_mean': ttable.beh_pred})

beh_train_pred.to_csv('data/beh_feature_train_pred_mean.csv', index = False)
beh_test_pred.to_csv('data/beh_feature_test_pred_mean.csv', index = False)

In [32]:
table = beh_feature_train_pred.groupby('id').sum()
beh_train_pred = pd.DataFrame({'id': table.index, 'beh_pred_sum': table.beh_pred})
ttable = beh_feature_test_pred.groupby('id').sum()
beh_test_pred = pd.DataFrame({'id': ttable.index, 'beh_pred_sum': ttable.beh_pred})

beh_train_pred.to_csv('data/beh_feature_train_pred_sum.csv', index = False)
beh_test_pred.to_csv('data/beh_feature_test_pred_sum.csv', index = False)