In [1]:
import numpy as np
import pandas as pd
import catboost as cat
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
 
# Notebook-wide pandas display settings: five decimals for floats and up to
# 200 rows per rendered frame.
# The fully qualified 'display.precision' key is used because the bare
# 'precision' pattern also matches 'styler.format.precision' in newer pandas
# and is rejected there as ambiguous.
pd.set_option('display.precision', 5)
pd.set_option('display.float_format', '{:.5f}'.format)
pd.set_option('display.max_rows', 200)

In [2]:
# Read the train and test transaction tables, then report (rows, columns) of each.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train/train_trd.csv', 'data/test/test_trd_b.csv')
)
for frame in (train_df, test_df):
    print(frame.shape)

(1367211, 8)
(142645, 7)


In [3]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,id,flag,Dat_Flg1_Cd,Dat_Flg3_Cd,Trx_Cod1_Cd,Trx_Cod2_Cd,trx_tm,cny_trx_amt
0,UFDC88A,0,B,B,1,116,2019-06-20 07:15:28,-127.99
1,UFDC88A,0,B,B,1,116,2019-06-16 10:09:13,-55.88
2,UFDC88A,0,B,B,1,136,2019-05-14 16:11:32,-557.0
3,UFDC88A,0,B,B,1,136,2019-05-19 21:54:40,-77.8
4,UFDC88A,0,B,B,1,113,2019-06-18 08:23:59,-271.62


In [4]:
# Preview the first five test rows.
test_df.head(n=5)

Unnamed: 0,id,Dat_Flg1_Cd,Dat_Flg3_Cd,Trx_Cod1_Cd,Trx_Cod2_Cd,trx_tm,cny_trx_amt
0,U452CA2,B,B,1,130,2019-06-01 00:29:32,-5.0
1,U452CA2,B,B,1,136,2019-05-15 00:00:00,-37.19
2,U452CA2,C,B,3,309,2019-05-30 13:22:08,249.47
3,U452CA2,B,B,1,108,2019-05-30 14:37:13,-29.94
4,U452CA2,B,B,1,136,2019-06-21 00:00:00,-111.31


In [5]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns (count / unique / top / freq) next to the numeric ones.
train_df.describe(include='all')

Unnamed: 0,id,flag,Dat_Flg1_Cd,Dat_Flg3_Cd,Trx_Cod1_Cd,Trx_Cod2_Cd,trx_tm,cny_trx_amt
count,1367211,1367211.0,1367211,1367211,1367211.0,1367211.0,1367211,1367211.0
unique,31993,,2,3,,,1088223,
top,UBD9C48,,B,A,,,2019-06-01 00:29:49,
freq,5836,,1068119,695630,,,704,
mean,,0.15735,,,1.47515,166.65003,,4.89295
std,,0.36413,,,0.77402,71.77708,,69971.66252
min,,0.0,,,1.0,101.0,,-23900000.0
25%,,0.0,,,1.0,117.0,,-200.0
50%,,0.0,,,1.0,134.0,,-24.0
75%,,0.0,,,2.0,209.0,,-2.0


In [6]:
# Keep the label column aside (it is the y passed to the classifier later),
# then list the columns that are still in the training frame.
train_target = train_df.loc[:, 'flag']
train_df.columns

Index(['id', 'flag', 'Dat_Flg1_Cd', 'Dat_Flg3_Cd', 'Trx_Cod1_Cd',
       'Trx_Cod2_Cd', 'trx_tm', 'cny_trx_amt'],
      dtype='object')

In [7]:
# Remove the label from the feature frame (in place) so it cannot be used as an input.
train_df.drop(columns='flag', inplace=True)

In [8]:
# Sanity check: train and test should now have the same number of columns.
for frame in (train_df, test_df):
    print(frame.shape)

(1367211, 7)
(142645, 7)


In [9]:
# Pull the 'id' column out of both frames: pop() returns the column and
# removes it from the frame in place, so it is kept for later grouping but
# is not a model feature.
train_id = train_df.pop('id')
test_id = test_df.pop('id')

In [10]:
# Columns remaining after the label and id were removed.
train_df.columns

Index(['Dat_Flg1_Cd', 'Dat_Flg3_Cd', 'Trx_Cod1_Cd', 'Trx_Cod2_Cd', 'trx_tm',
       'cny_trx_amt'],
      dtype='object')

In [11]:
# Column dtypes; 'trx_tm' is still a plain object (string) column at this point.
train_df.dtypes

Dat_Flg1_Cd     object
Dat_Flg3_Cd     object
Trx_Cod1_Cd      int64
Trx_Cod2_Cd      int64
trx_tm          object
cny_trx_amt    float64
dtype: object

In [12]:
# Count missing values per column in the test frame.
test_df.isna().sum()

Dat_Flg1_Cd    0
Dat_Flg3_Cd    0
Trx_Cod1_Cd    0
Trx_Cod2_Cd    0
trx_tm         0
cny_trx_amt    0
dtype: int64

In [13]:
# Convert transaction timestamps into elapsed seconds / days.
from datetime import datetime
from datetime import timedelta
# Reference instant that every transaction time is subtracted from.
# It is pinned to the wall-clock time captured in this cell's recorded output
# rather than datetime.now(): with a moving reference, trx_tm_sec and
# trx_tm_day change on every execution, so the features (and anything trained
# on them) cannot be reproduced on a re-run.
now = datetime(2020, 5, 10, 12, 49, 48, 55792)
now

datetime.datetime(2020, 5, 10, 12, 49, 48, 55792)

In [14]:
# Inspect the raw timestamp strings before parsing them.
train_df.loc[:, 'trx_tm']

0          2019-06-20 07:15:28
1          2019-06-16 10:09:13
2          2019-05-14 16:11:32
3          2019-05-19 21:54:40
4          2019-06-18 08:23:59
                  ...         
1367206    2019-06-24 15:16:12
1367207    2019-05-24 15:16:12
1367208    2019-05-24 15:16:12
1367209    2019-06-24 15:16:12
1367210    2019-06-01 11:52:22
Name: trx_tm, Length: 1367211, dtype: object

In [15]:
# Parse the training timestamps and keep the .dt accessor of (now - timestamp),
# from which the day / second components are read in the next cell.
train_trx_time = pd.to_datetime(train_df['trx_tm'])
delta = (now - train_trx_time).dt

In [16]:
# Elapsed time since each transaction, as whole seconds and as whole days.
# trx_tm_sec is assigned first so the column order of the frame is unchanged.
whole_days = delta.days
train_df['trx_tm_sec'] = whole_days * 86400 + delta.seconds  # 86400 = 24 * 60 * 60
train_df['trx_tm_day'] = whole_days

In [17]:
# Same elapsed-time features for the test frame, measured from the same `now`.
test_trx_time = pd.to_datetime(test_df['trx_tm'])
delta = (now - test_trx_time).dt
whole_days = delta.days
test_df['trx_tm_sec'] = whole_days * 86400 + delta.seconds  # 86400 = 24 * 60 * 60
test_df['trx_tm_day'] = whole_days

In [18]:
# Distinct elapsed-second values, in order of first appearance.
pd.unique(train_df['trx_tm_sec'])

array([28100060, 28435235, 31264696, ..., 27725616, 30404016, 29725046])

In [19]:
# The raw timestamp string is replaced by trx_tm_sec / trx_tm_day; drop it
# from both frames in place.
for frame in (train_df, test_df):
    frame.drop(columns='trx_tm', inplace=True)

In [20]:
# Cross-validation configuration.
n_splits, random_state = 5, 2000

# Materialise the (train_idx, valid_idx) pairs once so the folds are fixed.
# NOTE(review): folds are stratified per row; 'id' was removed from train_df
# earlier, so rows sharing an id can land in different folds - confirm that
# this is intended.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
splits = [fold for fold in skf.split(train_df, train_target)]

cols = train_df.columns.tolist()
# Columns handed to CatBoost as categorical features.
sparse_features = ['Dat_Flg1_Cd', 'Dat_Flg3_Cd', 'Trx_Cod1_Cd', 'Trx_Cod2_Cd']

feature_importance_df = pd.DataFrame()
# Per-row out-of-fold predictions (train) and fold-averaged predictions (test).
oof = np.zeros(train_df.shape[0])
predictions = np.zeros(test_df.shape[0])

In [21]:
# Train one CatBoost classifier per fold. Each model fills the out-of-fold
# slots of `oof` for its validation rows and adds 1/n_splits of its test
# probabilities to `predictions`.
num_round = 100000
catboost_params = dict(
    iterations=num_round,
    depth=7,
    l2_leaf_reg=4,
    learning_rate=0.1,
    verbose=1000,
    loss_function='Logloss',
    eval_metric='AUC',
    early_stopping_rounds=2000,
    random_seed=random_state,
    task_type='GPU',
    devices='2',
    cat_features=sparse_features,
    bootstrap_type='Poisson',
    subsample=0.99,
)

for i, (train_idx, valid_idx) in enumerate(splits):
    print('Folder', i)
    x_tr = train_df.iloc[train_idx]
    y_tr = train_target.iloc[train_idx]
    x_valid = train_df.iloc[valid_idx]
    y_valid = train_target.iloc[valid_idx]

    clf = cat.CatBoostClassifier(**catboost_params)

    # The validation fold is used as the eval set for early stopping.
    valid_pool = cat.Pool(x_valid, y_valid, cat_features=sparse_features)
    clf.fit(x_tr, y_tr, eval_set=valid_pool)

    print(f'best score {clf.get_best_score()}')
    print(f'best_iteration {clf.get_best_iteration()}')

    # Column 1 of predict_proba is the positive-class probability.
    valid_proba = clf.predict_proba(x_valid)
    oof[valid_idx] = valid_proba[:, 1]
    test_proba = clf.predict_proba(test_df)
    predictions += test_proba[:, 1] / n_splits

    # Drop the fold's training slices before the next iteration.
    del x_tr, y_tr

# Overall AUC of the out-of-fold predictions.
print(metrics.roc_auc_score(train_target.values, oof))

Folder 0
0:	learn: 0.6153222	test: 0.6159795	best: 0.6159795 (0)	total: 61.8ms	remaining: 1h 42m 56s
1000:	learn: 0.6496538	test: 0.6354358	best: 0.6354358 (1000)	total: 1m 1s	remaining: 1h 41m 17s
2000:	learn: 0.6615166	test: 0.6369233	best: 0.6369233 (2000)	total: 2m 4s	remaining: 1h 41m 35s
3000:	learn: 0.6709138	test: 0.6377584	best: 0.6377698 (2995)	total: 3m 7s	remaining: 1h 41m 9s
4000:	learn: 0.6789054	test: 0.6380326	best: 0.6381088 (3848)	total: 4m 10s	remaining: 1h 40m 16s
5000:	learn: 0.6862119	test: 0.6382243	best: 0.6382243 (5000)	total: 5m 14s	remaining: 1h 39m 33s
6000:	learn: 0.6925678	test: 0.6382032	best: 0.6383160 (5500)	total: 6m 26s	remaining: 1h 40m 54s
7000:	learn: 0.6981902	test: 0.6379768	best: 0.6383160 (5500)	total: 7m 29s	remaining: 1h 39m 36s
bestTest = 0.6383160353
bestIteration = 5500
Shrink model to first 5501 iterations.
best score {'learn': {'Logloss': 0.39814504767007264, 'AUC': 0.7009848654270172}, 'validation': {'Logloss': 0.41765682770266566, 'AUC

In [22]:
# 0.6386233616870445  (presumably the out-of-fold AUC noted from a run - TODO confirm)
# Pair every transaction-level prediction with the id of the row it belongs to.
trd_feature_train_pred = pd.DataFrame(dict(id=train_id, trd_pred=oof))
trd_feature_test_pred = pd.DataFrame(dict(id=test_id, trd_pred=predictions))

In [23]:
# Collapse the per-transaction predictions to one row per id using the
# maximum, then write the train and test tables to CSV.
table = trd_feature_train_pred.groupby('id').max()
ttable = trd_feature_test_pred.groupby('id').max()

trd_train_pred = pd.DataFrame(dict(id=table.index, trd_pred=table['trd_pred']))
trd_test_pred = pd.DataFrame(dict(id=ttable.index, trd_pred=ttable['trd_pred']))

for frame, csv_path in (
    (trd_train_pred, 'data/trd_feature_train_pred_cat.csv'),
    (trd_test_pred, 'data/trd_feature_test_pred_cat.csv'),
):
    frame.to_csv(csv_path, index=False)

In [24]:
# Collapse the per-transaction predictions to one row per id using the
# mean, then write the train and test tables to CSV.
table = trd_feature_train_pred.groupby('id').mean()
ttable = trd_feature_test_pred.groupby('id').mean()

trd_train_pred = pd.DataFrame(dict(id=table.index, trd_pred_mean=table['trd_pred']))
trd_test_pred = pd.DataFrame(dict(id=ttable.index, trd_pred_mean=ttable['trd_pred']))

for frame, csv_path in (
    (trd_train_pred, 'data/trd_feature_train_pred_mean_cat.csv'),
    (trd_test_pred, 'data/trd_feature_test_pred_mean_cat.csv'),
):
    frame.to_csv(csv_path, index=False)

In [25]:
# Collapse the per-transaction predictions to one row per id using the
# sum, then write the train and test tables to CSV.
table = trd_feature_train_pred.groupby('id').sum()
ttable = trd_feature_test_pred.groupby('id').sum()

trd_train_pred = pd.DataFrame(dict(id=table.index, trd_pred_sum=table['trd_pred']))
trd_test_pred = pd.DataFrame(dict(id=ttable.index, trd_pred_sum=ttable['trd_pred']))

for frame, csv_path in (
    (trd_train_pred, 'data/trd_feature_train_pred_sum_cat.csv'),
    (trd_test_pred, 'data/trd_feature_test_pred_sum_cat.csv'),
):
    frame.to_csv(csv_path, index=False)