In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score
import xgboost as xgb
import lightgbm as lgb
import catboost as cab
from tqdm import tqdm
from sklearn import base

import warnings
warnings.filterwarnings('ignore')

In [63]:
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

In [64]:
train_df.shape, test_df.shape

((620356, 13), (206785, 12))

In [65]:
train_df.head()

Unnamed: 0,uuid,eid,udmap,common_ts,x1,x2,x3,x4,x5,x6,x7,x8,target
0,0,26,"{""key3"":""67804"",""key2"":""650""}",1689673468244,4,0,41,107,206,1,0,1,0
1,1,26,"{""key3"":""67804"",""key2"":""484""}",1689082941469,4,0,41,24,283,4,8,1,0
2,2,8,unknown,1689407393040,4,0,41,71,288,4,7,1,0
3,3,11,unknown,1689467815688,1,3,41,17,366,1,6,1,0
4,4,26,"{""key3"":""67804"",""key2"":""650""}",1689491751442,0,3,41,92,383,4,8,1,0


In [66]:
# 去重：删除掉除了uuid标识外其他特征字段都相同的重复样本
train_df = train_df.drop_duplicates(subset=[col for col in train_df.columns if col != 'uuid'])
train_df.shape

(617688, 13)

In [67]:
train_df.head()

Unnamed: 0,uuid,eid,udmap,common_ts,x1,x2,x3,x4,x5,x6,x7,x8,target
0,0,26,"{""key3"":""67804"",""key2"":""650""}",1689673468244,4,0,41,107,206,1,0,1,0
1,1,26,"{""key3"":""67804"",""key2"":""484""}",1689082941469,4,0,41,24,283,4,8,1,0
2,2,8,unknown,1689407393040,4,0,41,71,288,4,7,1,0
3,3,11,unknown,1689467815688,1,3,41,17,366,1,6,1,0
4,4,26,"{""key3"":""67804"",""key2"":""650""}",1689491751442,0,3,41,92,383,4,8,1,0


In [68]:
df = pd.concat([train_df, test_df]).reset_index(drop=True)

In [69]:
keys = ['key' + str(i) for i in range(1, 10)]
keys_df = []
for v in tqdm(df['udmap'].values):
    if v == 'unknown':
        keys_df.append([-1]*9)
    else:
        v = eval(v)
        this_key = []
        for key in keys:
            if key in v.keys():
                this_key.append(int(v[key]))
            else:
                this_key.append(-1)
        keys_df.append(this_key)
keys_df = pd.DataFrame(keys_df, columns=keys)

100%|███████████████████████████████████████████████████████████████████████| 824473/824473 [00:09<00:00, 91466.79it/s]


In [70]:
# keys_df

In [71]:
df = pd.concat([df, keys_df], axis=1)

### 提取时间，交叉特征

In [72]:
df['tm'] = pd.to_datetime(df['common_ts'],unit='ms')
df['day'] = df['tm'].dt.day
df['hour'] = df['tm'].dt.hour
df['dayofweek'] = df['tm'].dt.dayofweek

df['x5_x7'] = df['x5'].astype(str) + '_' + df['x7'].astype(str)
df['x5_x4'] = df['x5'].astype(str) + '_' + df['x4'].astype(str)
df['x7_x4'] = df['x7'].astype(str) + '_' + df['x4'].astype(str)

# df['x8_x6'] = df['x8'].astype(str) + '_' + df['x6'].astype(str)
# df['x8_x7'] = df['x8'].astype(str) + '_' + df['x7'].astype(str)
# df['x6_x7'] = df['x6'].astype(str) + '_' + df['x7'].astype(str)
# df['x8_x6_x7'] = df['x8'].astype(str) + '_' + df['x6'].astype(str) + '_' + df['x7'].astype(str)
# df['hour_x8_x6_x7'] = df['hour'].astype(str) + '_' + df['x8'].astype(str) + '_' + df['x6'].astype(str) + '_' + df['x7'].astype(str)

x_std = np.std(df[['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8']], axis=1)
x_mean = np.mean(df[['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8']], axis=1)
df['x_std'] = x_std
df['x_cv'] = x_std / x_mean

df['hour_x7'] = df['hour'].astype(str) + '_' + df['x7'].astype(str)
df['hour_x7_x5'] = df['hour_x7'] + '_' + df['x5'].astype(str)
df['x_all'] = df['x1'].astype(str) + '_'+ df['x2'].astype(str) + '_' + df['x3'].astype(str) + '_' + df['x4'].astype(str)+ '_' + df['x5'].astype(str)+ '_' + df['x6'].astype(str)+ '_' + df['x7'].astype(str)+ '_' + df['x8'].astype(str)

df['hour_x_all'] = df['hour'].astype(str) + '_' + df['x_all']
df['day_hour_x_all'] = df['day'].astype(str) + '_' + df['hour'].astype(str) + '_' + df['x_all']

df['day_hour'] = df['day'].astype(str) + '_' + df['hour'].astype(str)
df['day_hour_x7'] = df['day'].astype(str) + '_' + df['hour'].astype(str) + '_' + df['x7'].astype(str)
df['day_hour_x7_x5'] = df['day'].astype(str) + '_' + df['hour_x7'] + '_' + df['x5'].astype(str)

# df['eid_hour_x_all'] = df['eid'].astype(str) + '_' + df['hour'].astype(str) + '_' + df['x_all']
# df['day_eid_hour_x_all'] = df['day'].astype(str) + '_' + df['eid'].astype(str) + '_' + df['hour'].astype(str) + '_' + df['x_all']

df['hour_x5'] = df['hour'].astype(str) + '_' + df['x5'].astype(str)
df['day_hour_x5'] = df['day'].astype(str) + '_' + df['hour'].astype(str) + '_' + df['x5'].astype(str)

# df['hour_key3'] = df['hour'].astype(str) + '_' + df['key3'].astype(str)
# df['day_hour_key3'] = df['day'].astype(str) + '_' + df['hour'].astype(str) + '_' + df['key3'].astype(str)

# 'x8_x6', 'x8_x7', 'x6_x7', 'x8_x6_x7'
for f in ['x5_x7', 'x5_x4', 'x7_x4', 'hour_x7', 'hour_x7_x5', 'x_all', 'hour_x_all', 'day_hour_x_all', 'hour_x5', 'day_hour', 'day_hour_x7', 'day_hour_x7_x5', 'day_hour_x5']:
    df[f] = df[f].factorize()[0]

### rank排序，统计特征

In [73]:
df = df.reset_index()
df = df.sort_values(by=['common_ts'])
for f in ['eid', 'x4', 'x5', 'x7', 'hour', 'hour_x7_x5', 'hour_x7', 'x5_x7', 'x_all', 'hour_x_all', 'day_hour_x_all', 'hour_x5', 'day_hour', 'day_hour_x7', 'day_hour_x7_x5', 'day_hour_x5']:
    df[f+'_sort_ts_rank'] = df.groupby(f)['common_ts'].rank()
df = df.sort_values(by=['index'])

In [74]:
for f in ['eid', 'x4', 'x5', 'x7', 'hour', 'hour_x7_x5', 'hour_x7', 'x5_x7', 'x_all', 'hour_x_all', 'day_hour_x_all', 'hour_x5', 'day_hour', 'day_hour_x7', 'day_hour_x7_x5', 'day_hour_x5']:
    df[f+'_ts_std'] = df.groupby([f])['common_ts'].transform('std')
    df[f+'_ts_skew'] = df.groupby([f])['common_ts'].transform('skew')
    kurt_map = df.groupby([f])['common_ts'].apply(lambda x: x.kurt())
    df[f+'_ts_kurt'] = df[f].map(kurt_map)
    
    max_value = df.groupby([f])['common_ts'].transform('max')
    min_value = df.groupby([f])['common_ts'].transform('min')
    count_value = df.groupby([f])['common_ts'].transform('count')
    df[f+'_ts_freq'] = (max_value - min_value) / count_value
    
    count_value = df.groupby([f])['common_ts'].transform('nunique')
    df[f+'_tss_freq'] = (max_value - min_value) / count_value
    
    count_value = df.groupby([f])['eid'].transform('nunique')
    df[f+'_eid_freq'] = (max_value - min_value) / count_value

In [75]:
train_df = df[~df['target'].isnull()]
test_df = df[df['target'].isnull()]

In [76]:
"""目标编码"""
class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):

    def __init__(self, colnames,targetName,n_fold=5,verbosity=True,discardOriginal_col=False):

        self.colnames   = colnames
        self.targetName = targetName
        self.n_fold     = n_fold
        self.verbosity  = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        return self


    def transform(self,X):

        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)

        mean_of_target = X[self.targetName].mean()
        # kf = KFold(n_splits = self.n_fold, shuffle = True, random_state=2023)
        skf = StratifiedKFold(n_splits = self.n_fold, shuffle = True, random_state=2023)



        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
        X[col_mean_name] = np.nan

        # for tr_ind, val_ind in kf.split(X):
        #     X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind] 
        #     X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(X_tr.groupby(self.colnames)[self.targetName].mean())
        
        for tr_ind, val_ind in skf.split(X,X[self.targetName]):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind] 
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(X_tr.groupby(self.colnames)[self.targetName].mean())

        X[col_mean_name].fillna(mean_of_target, inplace = True)

        if self.verbosity:

            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name, self.targetName,
                                                                                  np.corrcoef(X[self.targetName].values, encoded_feature)[0][1]))
        if self.discardOriginal_col:
            X = X.drop(self.targetName, axis=1)
            

        return X
    
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self,train,colNames,encodedName):
        
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName
        
        
    def fit(self, X, y=None):
        return self

    def transform(self,X):


        mean = self.train[[self.colNames,self.encodedName]].groupby(self.colNames).mean().reset_index() 
        
        dd = {}
        for index, row in mean.iterrows():
            dd[row[self.colNames]] = row[self.encodedName]

        
        X[self.encodedName] = X[self.colNames]
        X = X.replace({self.encodedName: dd})

        return X

In [77]:
# tar_cols = ['x_all', 'hour_x5', 'hour_x7']
# target = 'target'
# for col in tar_cols:
#     target_encoder = KFoldTargetEncoderTrain(col, target, n_fold=5)
#     train_df = target_encoder.fit_transform(train_df)

# for col in tar_cols:
#     target_encoder_test = KFoldTargetEncoderTest(train_df, col, col+'_Kfold_Target_Enc')
#     test_df = target_encoder_test.fit_transform(test_df)

In [78]:
train_df.shape, test_df.shape

((617688, 170), (206785, 170))

In [79]:
feats = [f for f in train_df if f not in ['target', 'udmap', 'uuid',  'key4', 'key5', 'key6', 'key7', 'key8', 'key9', 'tm', 'index']]

In [80]:
x = train_df[feats]
y = train_df['target']
test_x = test_df[feats]

In [81]:
seed = 2222
n_fold = 5
skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
oof = np.zeros(len(y))
pred_y = np.zeros(len(test_x))
feats_weight = pd.DataFrame(x.columns.values, columns=['feat_name'])
feats_weight['importance'] = 0

lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.03,
    'metric': 'auc',
    'max_depth': 8,
    'num_leaves': 64,
    'feature_fraction':0.8,
    'bagging_fraction':0.9,
    'bagging_freq': 4,
    'verbose': -1,
    # 'lambda_l2':10,
    'seed': seed,
    # 'is_unbalance': True,
    # 'scale_pos_weight': 64
}

xgb_params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist',
    'learning_rate': 0.03,  # 0.03, 0.1
    # 'eval_metric': 'auc',
    'max_depth': 8,  # 8, 6
    'subsample': 0.9,
    'colsample_bytree': 0.85,
    'min_child_weight': 10,
    # 'verbosity': 0,
    # 'reg_lambda': 10,
    'seed': seed,
    # 'scale_pos_weight': 74
}

cab_params = {
    'learning_rate': 0.01,
    'loss_function': "Logloss",
    'eval_metric': "AUC",
    'depth': 6,
    'min_data_in_leaf': 20,
    'random_seed': seed,
    'logging_level': 'Silent',
    'use_best_model': True,
    'one_hot_max_size': 5,   #类别数量多于此数将使用ordered target statistics编码方法,默认值为2。
    'boosting_type':"Ordered", #Ordered 或者Plain,数据量较少时建议使用Ordered,训练更慢但能够缓解梯度估计偏差。
    'max_ctr_complexity': 2, #特征组合的最大特征数量，设置为1取消特征组合，设置为2只做两个特征的组合,默认为4。
    'nan_mode': 'Min',
    'scale_pos_weight': 74
}

In [82]:
for fold, (train_idx, val_idx) in enumerate(skf.split(x, y)):
    print(f'==========fold {fold}==========')
    train_x, train_y = x.iloc[train_idx], y.iloc[train_idx]
    val_x, val_y = x.iloc[val_idx], y.iloc[val_idx]
    
    model = lgb.LGBMClassifier(**lgb_params, n_estimators=30000, n_jobs=-1)
    model.fit(train_x, train_y, eval_set=(val_x, val_y), eval_metric='auc', verbose=2000, early_stopping_rounds=200)
    
    oof[val_idx] = model.predict(val_x, num_iteration=model.best_iteration_)
    pred_y += model.predict(test_x, num_iteration=model.best_iteration_) / n_fold
    feats_weight['importance'] += model.feature_importances_ / n_fold

[2000]	valid_0's auc: 0.975511
[4000]	valid_0's auc: 0.983216
[6000]	valid_0's auc: 0.985244
[8000]	valid_0's auc: 0.986032
[2000]	valid_0's auc: 0.973951
[4000]	valid_0's auc: 0.98194
[6000]	valid_0's auc: 0.984219
[8000]	valid_0's auc: 0.984918
[10000]	valid_0's auc: 0.985216
[2000]	valid_0's auc: 0.97375
[4000]	valid_0's auc: 0.981755
[6000]	valid_0's auc: 0.983726
[8000]	valid_0's auc: 0.984417
[10000]	valid_0's auc: 0.984797
[2000]	valid_0's auc: 0.975426
[4000]	valid_0's auc: 0.983018
[6000]	valid_0's auc: 0.985092
[8000]	valid_0's auc: 0.985683
[2000]	valid_0's auc: 0.97449
[4000]	valid_0's auc: 0.982968
[6000]	valid_0's auc: 0.985231
[8000]	valid_0's auc: 0.986098
[10000]	valid_0's auc: 0.986392


In [383]:
# for fold, (train_idx, val_idx) in enumerate(skf.split(x, y)):
#     print(f'==========fold {fold}==========')
#     train_x, train_y = x.iloc[train_idx], y.iloc[train_idx]
#     val_x, val_y = x.iloc[val_idx], y.iloc[val_idx]
    
#     model = xgb.XGBClassifier(**xgb_params, n_estimators=30000, n_jobs=-1)
#     model.fit(train_x, train_y, eval_set=[(val_x, val_y)], eval_metric='auc', verbose=2000, early_stopping_rounds=200)
    
#     oof[val_idx] = model.predict(val_x, iteration_range=(0, model.best_iteration))
#     pred_y += model.predict(test_x, iteration_range=(0, model.best_iteration)) / n_fold
#     feats_weight['importance'] += model.feature_importances_ / n_fold

[0]	validation_0-auc:0.79795
[2000]	validation_0-auc:0.98101
[4000]	validation_0-auc:0.98525
[6000]	validation_0-auc:0.98611
[6165]	validation_0-auc:0.98611
[0]	validation_0-auc:0.79881
[2000]	validation_0-auc:0.97951
[4000]	validation_0-auc:0.98415
[6000]	validation_0-auc:0.98495
[6856]	validation_0-auc:0.98499
[0]	validation_0-auc:0.79730
[2000]	validation_0-auc:0.97934
[4000]	validation_0-auc:0.98389
[6000]	validation_0-auc:0.98463
[6670]	validation_0-auc:0.98469
[0]	validation_0-auc:0.79864
[2000]	validation_0-auc:0.98053
[4000]	validation_0-auc:0.98492
[6000]	validation_0-auc:0.98564
[6684]	validation_0-auc:0.98567
[0]	validation_0-auc:0.80133
[2000]	validation_0-auc:0.98057
[4000]	validation_0-auc:0.98527
[6000]	validation_0-auc:0.98613
[6872]	validation_0-auc:0.98619


In [84]:
# score = roc_auc_score(y, oof)
score = f1_score(y, oof)
score

0.8928593236691377

In [86]:
print(feats_weight.sort_values(by='importance', ascending=False).iloc[:50])

                       feat_name  importance
45         day_hour_sort_ts_rank     12403.8
46      day_hour_x7_sort_ts_rank     12107.8
112       day_hour_x_all_ts_skew     11591.0
113       day_hour_x_all_ts_kurt     11475.2
111        day_hour_x_all_ts_std     11232.8
48      day_hour_x5_sort_ts_rank     10358.8
107           hour_x_all_ts_kurt     10149.2
106           hour_x_all_ts_skew      9578.6
41            x_all_sort_ts_rank      9106.8
101                x_all_ts_kurt      8972.0
25                day_hour_x_all      8813.0
100                x_all_ts_skew      8319.8
47   day_hour_x7_x5_sort_ts_rank      8198.2
137       day_hour_x7_x5_ts_kurt      7935.0
142          day_hour_x5_ts_skew      7788.6
24                    hour_x_all      7736.8
136       day_hour_x7_x5_ts_skew      7721.4
23                         x_all      7699.0
105            hour_x_all_ts_std      7523.2
116      day_hour_x_all_eid_freq      7511.0
143          day_hour_x5_ts_kurt      7276.4
39        

In [87]:
submit_df = pd.read_csv('./提交示例.csv')
submit_df['target'] = (pred_y > 0.5).astype(int)

In [88]:
submit_df.target.value_counts()

0    180912
1     25873
Name: target, dtype: int64

In [89]:
submit_df.to_csv('./submit/lgb_{:.6f}.csv'.format(score), index=False)

In [62]:
train_df['day_hour_x_all'].value_counts()

327       261
861       256
606       187
2222      182
1142      174
         ... 
89505       1
89507       1
89508       1
89514       1
143993      1
Name: day_hour_x_all, Length: 143994, dtype: int64

In [222]:
df.groupby(['x5'])['common_ts'].agg('count')
df.groupby(['x5'])['common_ts'].transform('nunique')

0          1375
1          4909
2         47266
3         15984
4          4209
          ...  
824468      956
824469    20555
824470    21166
824471    24345
824472    24345
Name: common_ts, Length: 824473, dtype: int64

In [224]:
df.groupby(['x5'])['eid'].transform('nunique')

0         26
1         34
2         35
3         35
4         33
          ..
824468    31
824469    35
824470    35
824471    35
824472    35
Name: eid, Length: 824473, dtype: int64

In [97]:
np.array(['a', 'b', 'c']) == np.array(['a', 'c', 'c'])

array([ True, False,  True])