In [1]:
from tqdm import tqdm
import os
import numpy as np
import time
import random
import pandas as pd
from scipy import sparse
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

# 读取数据集

In [2]:
data_org_dir = 'data/data_tencent/'
data_prep_dir = 'data/preprocess/'
# data_prep_dir = 'data/sample/'
sample_test = False

In [3]:
if sample_test:
    df_data = pd.read_csv(data_prep_dir + 'train_feat_merge_mini.csv')
else:
    df_data = pd.read_csv(data_prep_dir + 'train_feat_merge.csv')

train_idx_lst = list(df_data[df_data['n_parts'] != 1].index)
valid_idx_lst = list(df_data[df_data['n_parts'] == 1].index)


In [4]:
df_data.columns

Index(['n_parts', 'aid', 'uid', 'label', 'LBS', 'age', 'appIdAction',
       'appIdInstall', 'carrier', 'consumptionAbility', 'ct', 'education',
       'gender', 'house', 'interest1', 'interest2', 'interest3', 'interest4',
       'interest5', 'kw1', 'kw2', 'kw3', 'marriageStatus', 'os', 'topic1',
       'topic2', 'topic3', 'advertiserId', 'campaignId', 'creativeId',
       'creativeSize', 'adCategoryId', 'productId', 'productType'],
      dtype='object')

# 缺失率筛选

In [5]:
# 缺失情况统计
total = len(df_data)
for col_name in list(df_data.columns):
    df_missing = (df_data[df_data[col_name] == '-1'])
    
    print(col_name, ':', round(len(df_missing)/total, 2) * 100, '%')

n_parts : 0.0 %
aid : 0.0 %
uid : 0.0 %
label : 0.0 %
LBS : 0.0 %
age : 0.0 %
appIdAction : 98.0 %
appIdInstall : 98.0 %
carrier : 0.0 %
consumptionAbility : 0.0 %
ct : 0.0 %
education : 0.0 %
gender : 0.0 %
house : 0.0 %
interest1 : 9.0 %
interest2 : 34.0 %
interest3 : 97.0 %
interest4 : 98.0 %
interest5 : 25.0 %
kw1 : 10.0 %
kw2 : 3.0 %
kw3 : 95.0 %
marriageStatus : 0.0 %
os : 0.0 %
topic1 : 9.0 %
topic2 : 4.0 %
topic3 : 95.0 %
advertiserId : 0.0 %
campaignId : 0.0 %
creativeId : 0.0 %
creativeSize : 0.0 %
adCategoryId : 0.0 %
productId : 0.0 %
productType : 0.0 %


## 删除特征

1. 删除category特别大特别稀疏特征: 'appIdInstall', 'appIdAction', 'marriageStatus';
2. 删除缺失情况严重特征: 'interest3', 'interest4', 'kw3', 'topic3';


In [6]:
feat2drop = ['appIdInstall', 'appIdAction', 'marriageStatus', 'interest3', 'interest4', 'kw3', 'topic3']
df_data.drop(feat2drop, axis=1, inplace=True)


In [7]:
df_data.columns

Index(['n_parts', 'aid', 'uid', 'label', 'LBS', 'age', 'carrier',
       'consumptionAbility', 'ct', 'education', 'gender', 'house', 'interest1',
       'interest2', 'interest5', 'kw1', 'kw2', 'os', 'topic1', 'topic2',
       'advertiserId', 'campaignId', 'creativeId', 'creativeSize',
       'adCategoryId', 'productId', 'productType'],
      dtype='object')

# 定长离散特征处理
- 用户类: 'LBS', 'age', 'carrier', 'consumptionAbility', 'education', 'gender', 'house', 'os'
- 广告类: 'aid', 'advertiserId', 'campaignId', 'creativeId', 'creativeSize', 'adCategoryId','productId', 'productType'



In [8]:
discrete_feat_lst = ['LBS', 'age', 'carrier', 'consumptionAbility', 'education', 'gender', 'house', 'os',
                     'aid', 'advertiserId', 'campaignId', 'creativeId', 'creativeSize', 'adCategoryId',
                     'productId', 'productType']
t0 = time.time()
for discrete_feat in tqdm(discrete_feat_lst):
    enc = LabelEncoder()
    try:
        df_data[discrete_feat] = enc.fit_transform(df_data[discrete_feat].apply(int))
    except:
        df_data[discrete_feat] = enc.fit_transform(df_data[discrete_feat])


100%|██████████| 16/16 [01:02<00:00,  3.93s/it]


In [9]:
def one_hot_encoding(df_feat2enc):
    """

    :param df_feat2enc:
    :type df_feat2enc:
    :return:
    :rtype:
    """
    
    one_hot_enc.fit(df_feat2enc.values.reshape(-1, 1))
    feat_enc_arr = one_hot_enc.transform(df_feat2enc.values.reshape(-1, 1))#.toarray()

    return feat_enc_arr

# 数据量太大了, 一下子肝不完, 必须分开
discrete_feat_lst1 = ['LBS', 'age', 'carrier', 'consumptionAbility', 'education', 'gender']
discrete_feat_lst2 = ['aid', 'advertiserId', 'campaignId', 'creativeId', 'creativeSize']
discrete_feat_lst3 = ['adCategoryId', 'productId', 'productType', 'house', 'os']

discrete_feat = [discrete_feat_lst1, discrete_feat_lst2, discrete_feat_lst3]
# 初始化
one_hot_enc = OneHotEncoder()

x_train = pd.DataFrame()
x_valid = pd.DataFrame()
###### 数据量很大的时候用这个 ######
for feat_lst in discrete_feat:
    
    for feat in tqdm(feat_lst):
        # 合并训练
        df_feat2enc = df_data[feat]
        one_hot_enc.fit(df_feat2enc.values.reshape(-1, 1))
        # 训练集
        train2enc = df_data.loc[train_idx_lst][feat]
        train_enc_arr = one_hot_enc.transform(train2enc.values.reshape(-1, 1))
        x_train = sparse.hstack([x_train, train_enc_arr])
        # 验证集
        valid2enc = df_data.loc[valid_idx_lst][feat]
        valid_enc_arr = one_hot_enc.transform(valid2enc.values.reshape(-1, 1))
        x_valid = sparse.hstack([x_valid, valid_enc_arr])
    
train_f_dir = data_prep_dir + 'train_x_sparse_onehot.npz'
sparse.save_npz(train_f_dir, x_train)

valid_f_dir = data_prep_dir + 'valid_x_sparse_onehot.npz'
sparse.save_npz(valid_f_dir, x_valid)

print(train_f_dir, 'is saved! Array shape is', x_train.shape)
print(valid_f_dir, 'is saved! Array shape is', x_valid.shape)

###### 测试数据用这个 ######
# train_x = np.empty([len(train_idx_lst), 1], dtype=int)
# valid_x = np.empty([len(valid_idx_lst), 1], dtype=int) 
# for feat in tqdm(discrete_feat_lst):
    
#     # 训练集
#     df_feat2enc_train = df_data.loc[train_idx_lst][feat]
#     train_enc_arr = one_hot_encoding(df_feat2enc_train)
#     train_x = sparse.hstack([train_x, train_enc_arr])
#     # 验证集
#     df_feat2enc_valid = df_data.loc[valid_idx_lst][feat]
#     valid_enc_arr = one_hot_encoding(df_feat2enc_valid)
#     valid_x = sparse.hstack([valid_x, valid_enc_arr])
    
# train_f_dir = data_prep_dir + 'train_x_sparse_onehot.npz'
# sparse.save_npz(train_f_dir, train_enc_arr)

# valid_f_dir = data_prep_dir + 'valid_x_sparse_onehot.npz'
# sparse.save_npz(valid_f_dir, valid_enc_arr)

# print(train_f_dir, 'is saved! Array shape is', train_x.shape)
# print(valid_f_dir, 'is saved! Array shape is', valid_x.shape)




100%|██████████| 6/6 [00:52<00:00,  8.73s/it]
100%|██████████| 5/5 [00:49<00:00,  9.98s/it]
100%|██████████| 5/5 [00:50<00:00, 10.15s/it]


data/preprocess/train_x_sparse_onehot.npz is saved! Array shape is (7038840, 1526)
data/preprocess/valid_x_sparse_onehot.npz is saved! Array shape is (1759974, 1526)


# 序列型特征处理
- 用户类: 'ct', 'interest1', 'interest2', 'interest5', 'kw1', 'kw2', 'os'，'topic1', 'topic2', 

In [10]:
sequence_feat_lst = ['ct', 'interest1', 'interest2', 'interest5', 'kw1', 'kw2', 'topic1', 'topic2']

## 构建序列长度特征
len_static_features = [x+'_len' for x in sequence_feat_lst]

In [11]:
# 初始化
x_train = pd.DataFrame()
x_valid = pd.DataFrame()

df_feat2len = df_data[sequence_feat_lst].applymap(lambda x: len(x.split(' ')))

# 构建训练集&验证集
df_train_enc = df_feat2len.loc[train_idx_lst].values
df_valid_enc = df_feat2len.loc[valid_idx_lst].values

x_train = sparse.hstack([x_train, df_train_enc])
x_valid = sparse.hstack([x_valid, df_valid_enc])

train_f_dir = data_prep_dir + 'train_x_sparse_seq_len.npz'
sparse.save_npz(train_f_dir, x_train)

valid_f_dir = data_prep_dir + 'valid_x_sparse_seq_len.npz'
sparse.save_npz(valid_f_dir, x_valid)

print(train_f_dir, 'is saved! Array shape is', x_train.shape)
print(valid_f_dir, 'is saved! Array shape is', x_valid.shape)

data/preprocess/train_x_sparse_seq_len.npz is saved! Array shape is (7038840, 8)
data/preprocess/valid_x_sparse_seq_len.npz is saved! Array shape is (1759974, 8)


In [12]:
df_feat2len.shape

(8798814, 8)

## 构建序列count特征

In [13]:
# 初始化
x_train = pd.DataFrame()
x_valid = pd.DataFrame()
cnt_enc = CountVectorizer()

for sequence_feat in tqdm(sequence_feat_lst[1:]):  # TODO: 'ct'未构建该类特征
    # 合并训练
    df_feat2cnt = df_data[sequence_feat]
    cnt_enc.fit(df_feat2cnt.values)
    # 训练集
    train2enc = df_data.loc[train_idx_lst][sequence_feat]
    train_enc_arr = cnt_enc.transform(train2enc.values)
    x_train = sparse.hstack([x_train, train_enc_arr])
    # 验证集
    valid2enc = df_data.loc[valid_idx_lst][sequence_feat]
    valid_enc_arr = cnt_enc.transform(valid2enc.values)
    x_valid = sparse.hstack([x_valid, valid_enc_arr])
    
train_f_dir = data_prep_dir + 'train_x_sparse_cntv.npz'
sparse.save_npz(train_f_dir, x_train)

valid_f_dir = data_prep_dir + 'valid_x_sparse_cntv.npz'
sparse.save_npz(valid_f_dir, x_valid)

print(train_f_dir, 'is saved! Array shape is', x_train.shape)
print(valid_f_dir, 'is saved! Array shape is', x_valid.shape)


  0%|          | 0/7 [03:33<?, ?it/s]


KeyboardInterrupt: 

# 用户id类特征处理（未处理）
- 用户类: 'uid'
另外，因为也存在量大稀疏的问题，但是比赛的问题就是针对用户对广告的点击，因此uid对模型来说还是很有必要进行建模的，但是uid不使用category的方式建模，而是使用count和转化率建模：
- uid_count : 对uid进行出现频次的建模。
- uid_pos_count : 对uid进行正样本中的出现频次的建模。
- uid_ad_features_pos_count : 对uid组合所有广告特征'ad_static_feature'进行正样本中的出现频次的建模。
- 对uid进行如此多的建模方式是为了能对uid进行更加详细的表述，因为category的每一个特征就是一对一的，而count特征是多对一的，存在大量的信息损失，因此需要进行更多不同角度的建模，才能更好地表述uid。
特征方面还有一些长尾处理，未出现id的统一映射，长度和count特征的未出现次数的取临近值等trick。

# 特征选择
## 加载&合并特征

In [6]:
feat_fname_train = ['train_x_sparse_onehot.npz', 'train_x_sparse_seq_len.npz', 'train_x_sparse_cntv.npz']
feat_fname_valid = ['valid_x_sparse_onehot.npz', 'valid_x_sparse_seq_len.npz', 'valid_x_sparse_cntv.npz']
x_train = pd.DataFrame()
x_valid = pd.DataFrame()
for tr, vl in zip(feat_fname_train, feat_fname_valid):
    train_f = sparse.load_npz(data_prep_dir + tr).tocsr()
    valid_f = sparse.load_npz(data_prep_dir + vl).tocsr()
    
    x_train = sparse.hstack((x_train, train_f)).tocsc()
    x_valid = sparse.hstack((x_valid, valid_f)).tocsc()
    print(tr, train_f.shape)
    print(vl, valid_f.shape)
    
print('x_train:', x_train.shape)
print('x_valid:', x_valid.shape)

print('Loading & merging features completed!')



train_x_sparse_onehot.npz (7038840, 1526)
valid_x_sparse_onehot.npz (1759974, 1526)
train_x_sparse_seq_len.npz (7038840, 8)
valid_x_sparse_seq_len.npz (1759974, 8)
train_x_sparse_cntv.npz (7038840, 323142)
valid_x_sparse_cntv.npz (1759974, 323142)
x_train: (7038840, 324676)
x_valid: (1759974, 324676)
Loading & merging features completed!


In [7]:
# construct y vector
y_train = np.array(df_data[df_data['n_parts'] != 1]['label'])
y_valid = np.array(df_data[df_data['n_parts'] == 1]['label'])

print('y_train:', y_train.shape)
print('y_valid:', y_valid.shape)

y_train: (7038840,)
y_valid: (1759974,)


## LGB选择特征

In [8]:
import pandas as pd
from lightgbm import LGBMClassifier
import time
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')



In [9]:
clf = LGBMClassifier(boosting_type='gbdt',
                     num_leaves=31, max_depth=-1, 
                     learning_rate=0.1, n_estimators=10000, 
                     subsample_for_bin=200000, objective=None,
                     class_weight=None, min_split_gain=0.0, 
                     min_child_weight=0.001,
                     min_child_samples=20, subsample=1.0, subsample_freq=1,
                     colsample_bytree=1.0,
                     reg_alpha=0.0, reg_lambda=0.0, random_state=None,
                     n_jobs=-1, silent=False)


In [10]:
clf.fit(x_train, y_train, eval_set=[(x_train, y_train),(x_valid, y_valid)], 
        eval_names =['train','valid'],
        eval_metric='auc', early_stopping_rounds=100)

[LightGBM] [Info] Number of positive: 337465, number of negative: 6701375
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 240984
[LightGBM] [Info] Number of data points in the train set: 7038840, number of used features: 120414
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047943 -> initscore=-2.988606
[LightGBM] [Info] Start training from score -2.988606
[1]	train's auc: 0.633328	train's binary_logloss: 0.189736	valid's auc: 0.632434	valid's binary_logloss: 0.189924
Training until validation scores don't improve for 100 rounds
[2]	train's auc: 0.642899	train's binary_logloss: 0.188124	valid's auc: 0.641948	valid's binary_logloss: 0.18831
[3]	train's auc: 0.651786	train's binary_logloss: 0.186984	valid's auc: 0.65073	valid's binary_logloss: 0.187172
[4]	train's auc: 0.65381	train's binary_logloss: 0.186042	valid's auc: 0.652756	valid's binary_logloss: 0.186225
[5]	train's auc: 0.6

[68]	train's auc: 0.711066	train's binary_logloss: 0.175881	valid's auc: 0.710173	valid's binary_logloss: 0.176176
[69]	train's auc: 0.71138	train's binary_logloss: 0.175826	valid's auc: 0.710459	valid's binary_logloss: 0.176124
[70]	train's auc: 0.711542	train's binary_logloss: 0.175784	valid's auc: 0.710615	valid's binary_logloss: 0.176086
[71]	train's auc: 0.711775	train's binary_logloss: 0.175744	valid's auc: 0.710852	valid's binary_logloss: 0.176045
[72]	train's auc: 0.712111	train's binary_logloss: 0.175693	valid's auc: 0.71116	valid's binary_logloss: 0.176
[73]	train's auc: 0.712392	train's binary_logloss: 0.175644	valid's auc: 0.711463	valid's binary_logloss: 0.175952
[74]	train's auc: 0.712606	train's binary_logloss: 0.175607	valid's auc: 0.711684	valid's binary_logloss: 0.17592
[75]	train's auc: 0.712826	train's binary_logloss: 0.175564	valid's auc: 0.711878	valid's binary_logloss: 0.17588
[76]	train's auc: 0.713025	train's binary_logloss: 0.175526	valid's auc: 0.712083	valid

[140]	train's auc: 0.724414	train's binary_logloss: 0.173507	valid's auc: 0.722513	valid's binary_logloss: 0.173975
[141]	train's auc: 0.72449	train's binary_logloss: 0.173488	valid's auc: 0.722587	valid's binary_logloss: 0.173957
[142]	train's auc: 0.724685	train's binary_logloss: 0.173466	valid's auc: 0.722752	valid's binary_logloss: 0.173938
[143]	train's auc: 0.724803	train's binary_logloss: 0.173444	valid's auc: 0.722849	valid's binary_logloss: 0.17392
[144]	train's auc: 0.724926	train's binary_logloss: 0.173426	valid's auc: 0.722954	valid's binary_logloss: 0.173903
[145]	train's auc: 0.72506	train's binary_logloss: 0.173403	valid's auc: 0.723069	valid's binary_logloss: 0.173883
[146]	train's auc: 0.725153	train's binary_logloss: 0.173383	valid's auc: 0.723134	valid's binary_logloss: 0.173866
[147]	train's auc: 0.725216	train's binary_logloss: 0.173365	valid's auc: 0.723204	valid's binary_logloss: 0.173849
[148]	train's auc: 0.725311	train's binary_logloss: 0.173346	valid's auc: 0

[211]	train's auc: 0.731072	train's binary_logloss: 0.172304	valid's auc: 0.727713	valid's binary_logloss: 0.173015
[212]	train's auc: 0.731153	train's binary_logloss: 0.172291	valid's auc: 0.727791	valid's binary_logloss: 0.173006
[213]	train's auc: 0.731209	train's binary_logloss: 0.172281	valid's auc: 0.727826	valid's binary_logloss: 0.172997
[214]	train's auc: 0.731292	train's binary_logloss: 0.172269	valid's auc: 0.727863	valid's binary_logloss: 0.172991
[215]	train's auc: 0.731367	train's binary_logloss: 0.172259	valid's auc: 0.727886	valid's binary_logloss: 0.172986
[216]	train's auc: 0.731428	train's binary_logloss: 0.172248	valid's auc: 0.727928	valid's binary_logloss: 0.172981
[217]	train's auc: 0.731497	train's binary_logloss: 0.172235	valid's auc: 0.72798	valid's binary_logloss: 0.172972
[218]	train's auc: 0.731575	train's binary_logloss: 0.17222	valid's auc: 0.728033	valid's binary_logloss: 0.172961
[219]	train's auc: 0.731657	train's binary_logloss: 0.172207	valid's auc: 

[282]	train's auc: 0.735903	train's binary_logloss: 0.171463	valid's auc: 0.730484	valid's binary_logloss: 0.172498
[283]	train's auc: 0.735951	train's binary_logloss: 0.171453	valid's auc: 0.730493	valid's binary_logloss: 0.172494
[284]	train's auc: 0.736058	train's binary_logloss: 0.171438	valid's auc: 0.730564	valid's binary_logloss: 0.172484
[285]	train's auc: 0.73613	train's binary_logloss: 0.171429	valid's auc: 0.730573	valid's binary_logloss: 0.172483
[286]	train's auc: 0.736219	train's binary_logloss: 0.171419	valid's auc: 0.730594	valid's binary_logloss: 0.172479
[287]	train's auc: 0.736277	train's binary_logloss: 0.17141	valid's auc: 0.73062	valid's binary_logloss: 0.172475
[288]	train's auc: 0.736338	train's binary_logloss: 0.1714	valid's auc: 0.730644	valid's binary_logloss: 0.172469
[289]	train's auc: 0.736383	train's binary_logloss: 0.171392	valid's auc: 0.730666	valid's binary_logloss: 0.172465
[290]	train's auc: 0.736429	train's binary_logloss: 0.171383	valid's auc: 0.7

[353]	train's auc: 0.739858	train's binary_logloss: 0.170818	valid's auc: 0.731902	valid's binary_logloss: 0.172225
[354]	train's auc: 0.739907	train's binary_logloss: 0.170809	valid's auc: 0.731916	valid's binary_logloss: 0.172222
[355]	train's auc: 0.73997	train's binary_logloss: 0.1708	valid's auc: 0.731933	valid's binary_logloss: 0.172219
[356]	train's auc: 0.740031	train's binary_logloss: 0.170791	valid's auc: 0.731936	valid's binary_logloss: 0.172217
[357]	train's auc: 0.740074	train's binary_logloss: 0.170783	valid's auc: 0.73194	valid's binary_logloss: 0.172216
[358]	train's auc: 0.740119	train's binary_logloss: 0.170774	valid's auc: 0.731962	valid's binary_logloss: 0.172212
[359]	train's auc: 0.74018	train's binary_logloss: 0.170766	valid's auc: 0.731981	valid's binary_logloss: 0.172209
[360]	train's auc: 0.740211	train's binary_logloss: 0.170758	valid's auc: 0.731995	valid's binary_logloss: 0.172205
[361]	train's auc: 0.740299	train's binary_logloss: 0.170744	valid's auc: 0.7

[424]	train's auc: 0.743325	train's binary_logloss: 0.17026	valid's auc: 0.732674	valid's binary_logloss: 0.172066
[425]	train's auc: 0.743358	train's binary_logloss: 0.170253	valid's auc: 0.732693	valid's binary_logloss: 0.172061
[426]	train's auc: 0.743416	train's binary_logloss: 0.170243	valid's auc: 0.732712	valid's binary_logloss: 0.172058
[427]	train's auc: 0.743482	train's binary_logloss: 0.170234	valid's auc: 0.732712	valid's binary_logloss: 0.172055
[428]	train's auc: 0.743557	train's binary_logloss: 0.170224	valid's auc: 0.732722	valid's binary_logloss: 0.172053
[429]	train's auc: 0.743611	train's binary_logloss: 0.170217	valid's auc: 0.732722	valid's binary_logloss: 0.172053
[430]	train's auc: 0.743663	train's binary_logloss: 0.17021	valid's auc: 0.732729	valid's binary_logloss: 0.172051
[431]	train's auc: 0.743714	train's binary_logloss: 0.170201	valid's auc: 0.732743	valid's binary_logloss: 0.172047
[432]	train's auc: 0.74375	train's binary_logloss: 0.170194	valid's auc: 0

[495]	train's auc: 0.746692	train's binary_logloss: 0.16972	valid's auc: 0.733467	valid's binary_logloss: 0.171913
[496]	train's auc: 0.746787	train's binary_logloss: 0.169709	valid's auc: 0.733546	valid's binary_logloss: 0.171904
[497]	train's auc: 0.746841	train's binary_logloss: 0.169701	valid's auc: 0.733563	valid's binary_logloss: 0.171901
[498]	train's auc: 0.746872	train's binary_logloss: 0.169694	valid's auc: 0.733573	valid's binary_logloss: 0.171898
[499]	train's auc: 0.746939	train's binary_logloss: 0.169685	valid's auc: 0.733572	valid's binary_logloss: 0.171898
[500]	train's auc: 0.747004	train's binary_logloss: 0.169679	valid's auc: 0.733565	valid's binary_logloss: 0.171897
[501]	train's auc: 0.747065	train's binary_logloss: 0.169672	valid's auc: 0.73356	valid's binary_logloss: 0.171897
[502]	train's auc: 0.74711	train's binary_logloss: 0.169665	valid's auc: 0.733559	valid's binary_logloss: 0.171896
[503]	train's auc: 0.747169	train's binary_logloss: 0.169656	valid's auc: 0

[566]	train's auc: 0.749961	train's binary_logloss: 0.169204	valid's auc: 0.734272	valid's binary_logloss: 0.171761
[567]	train's auc: 0.749984	train's binary_logloss: 0.169198	valid's auc: 0.734277	valid's binary_logloss: 0.171758
[568]	train's auc: 0.75001	train's binary_logloss: 0.169192	valid's auc: 0.734279	valid's binary_logloss: 0.171757
[569]	train's auc: 0.750055	train's binary_logloss: 0.169187	valid's auc: 0.73428	valid's binary_logloss: 0.171756
[570]	train's auc: 0.750108	train's binary_logloss: 0.16918	valid's auc: 0.73428	valid's binary_logloss: 0.171756
[571]	train's auc: 0.750142	train's binary_logloss: 0.169174	valid's auc: 0.734284	valid's binary_logloss: 0.171755
[572]	train's auc: 0.750212	train's binary_logloss: 0.169166	valid's auc: 0.734297	valid's binary_logloss: 0.171754
[573]	train's auc: 0.750254	train's binary_logloss: 0.169159	valid's auc: 0.734294	valid's binary_logloss: 0.171754
[574]	train's auc: 0.7503	train's binary_logloss: 0.169153	valid's auc: 0.73

[637]	train's auc: 0.752785	train's binary_logloss: 0.168748	valid's auc: 0.734535	valid's binary_logloss: 0.171677
[638]	train's auc: 0.752809	train's binary_logloss: 0.168742	valid's auc: 0.73455	valid's binary_logloss: 0.171676
[639]	train's auc: 0.75285	train's binary_logloss: 0.168737	valid's auc: 0.734556	valid's binary_logloss: 0.171674
[640]	train's auc: 0.752888	train's binary_logloss: 0.168729	valid's auc: 0.734576	valid's binary_logloss: 0.171672
[641]	train's auc: 0.752923	train's binary_logloss: 0.168723	valid's auc: 0.73457	valid's binary_logloss: 0.171671
[642]	train's auc: 0.752971	train's binary_logloss: 0.168717	valid's auc: 0.734566	valid's binary_logloss: 0.171671
[643]	train's auc: 0.753012	train's binary_logloss: 0.168711	valid's auc: 0.734566	valid's binary_logloss: 0.171671
[644]	train's auc: 0.75305	train's binary_logloss: 0.168706	valid's auc: 0.734566	valid's binary_logloss: 0.171671
[645]	train's auc: 0.753067	train's binary_logloss: 0.168699	valid's auc: 0.

[708]	train's auc: 0.755472	train's binary_logloss: 0.168314	valid's auc: 0.734888	valid's binary_logloss: 0.171603
[709]	train's auc: 0.755503	train's binary_logloss: 0.168308	valid's auc: 0.734893	valid's binary_logloss: 0.171602
[710]	train's auc: 0.755531	train's binary_logloss: 0.168303	valid's auc: 0.734904	valid's binary_logloss: 0.1716
[711]	train's auc: 0.755564	train's binary_logloss: 0.168297	valid's auc: 0.734902	valid's binary_logloss: 0.1716
[712]	train's auc: 0.755595	train's binary_logloss: 0.168292	valid's auc: 0.734901	valid's binary_logloss: 0.1716
[713]	train's auc: 0.755648	train's binary_logloss: 0.168282	valid's auc: 0.73492	valid's binary_logloss: 0.171597
[714]	train's auc: 0.755723	train's binary_logloss: 0.168272	valid's auc: 0.734923	valid's binary_logloss: 0.171596
[715]	train's auc: 0.755754	train's binary_logloss: 0.168266	valid's auc: 0.734934	valid's binary_logloss: 0.171594
[716]	train's auc: 0.755789	train's binary_logloss: 0.16826	valid's auc: 0.7349

[779]	train's auc: 0.758145	train's binary_logloss: 0.167876	valid's auc: 0.735232	valid's binary_logloss: 0.171527
[780]	train's auc: 0.758178	train's binary_logloss: 0.16787	valid's auc: 0.73523	valid's binary_logloss: 0.171527
[781]	train's auc: 0.758206	train's binary_logloss: 0.167865	valid's auc: 0.735225	valid's binary_logloss: 0.171527
[782]	train's auc: 0.758231	train's binary_logloss: 0.167859	valid's auc: 0.735233	valid's binary_logloss: 0.171525
[783]	train's auc: 0.758255	train's binary_logloss: 0.167853	valid's auc: 0.735234	valid's binary_logloss: 0.171523
[784]	train's auc: 0.758275	train's binary_logloss: 0.167848	valid's auc: 0.73524	valid's binary_logloss: 0.171522
[785]	train's auc: 0.758315	train's binary_logloss: 0.167842	valid's auc: 0.735242	valid's binary_logloss: 0.171522
[786]	train's auc: 0.758339	train's binary_logloss: 0.167837	valid's auc: 0.735242	valid's binary_logloss: 0.171522
[787]	train's auc: 0.758358	train's binary_logloss: 0.167832	valid's auc: 0

[850]	train's auc: 0.760667	train's binary_logloss: 0.167444	valid's auc: 0.735681	valid's binary_logloss: 0.171438
[851]	train's auc: 0.760714	train's binary_logloss: 0.167438	valid's auc: 0.735679	valid's binary_logloss: 0.171438
[852]	train's auc: 0.760739	train's binary_logloss: 0.167433	valid's auc: 0.735678	valid's binary_logloss: 0.171438
[853]	train's auc: 0.760775	train's binary_logloss: 0.167428	valid's auc: 0.735669	valid's binary_logloss: 0.171439
[854]	train's auc: 0.760811	train's binary_logloss: 0.167422	valid's auc: 0.735668	valid's binary_logloss: 0.171439
[855]	train's auc: 0.760847	train's binary_logloss: 0.167416	valid's auc: 0.735666	valid's binary_logloss: 0.171439
[856]	train's auc: 0.760862	train's binary_logloss: 0.167411	valid's auc: 0.735668	valid's binary_logloss: 0.171437
[857]	train's auc: 0.760904	train's binary_logloss: 0.167404	valid's auc: 0.735686	valid's binary_logloss: 0.171434
[858]	train's auc: 0.760934	train's binary_logloss: 0.167399	valid's auc

[921]	train's auc: 0.763214	train's binary_logloss: 0.167045	valid's auc: 0.73582	valid's binary_logloss: 0.171397
[922]	train's auc: 0.763239	train's binary_logloss: 0.167041	valid's auc: 0.735816	valid's binary_logloss: 0.171397
[923]	train's auc: 0.763265	train's binary_logloss: 0.167036	valid's auc: 0.735811	valid's binary_logloss: 0.171398
[924]	train's auc: 0.763302	train's binary_logloss: 0.167031	valid's auc: 0.735805	valid's binary_logloss: 0.171398
[925]	train's auc: 0.763321	train's binary_logloss: 0.167026	valid's auc: 0.735799	valid's binary_logloss: 0.171398
[926]	train's auc: 0.763343	train's binary_logloss: 0.167021	valid's auc: 0.735795	valid's binary_logloss: 0.171399
[927]	train's auc: 0.763384	train's binary_logloss: 0.167016	valid's auc: 0.735801	valid's binary_logloss: 0.171398
[928]	train's auc: 0.763412	train's binary_logloss: 0.167011	valid's auc: 0.735802	valid's binary_logloss: 0.171398
[929]	train's auc: 0.763446	train's binary_logloss: 0.167006	valid's auc:

[992]	train's auc: 0.765661	train's binary_logloss: 0.166631	valid's auc: 0.736141	valid's binary_logloss: 0.171331
[993]	train's auc: 0.765682	train's binary_logloss: 0.166626	valid's auc: 0.736147	valid's binary_logloss: 0.171329
[994]	train's auc: 0.765719	train's binary_logloss: 0.166621	valid's auc: 0.736146	valid's binary_logloss: 0.171329
[995]	train's auc: 0.765761	train's binary_logloss: 0.166616	valid's auc: 0.736146	valid's binary_logloss: 0.171329
[996]	train's auc: 0.765793	train's binary_logloss: 0.166611	valid's auc: 0.736152	valid's binary_logloss: 0.171328
[997]	train's auc: 0.765828	train's binary_logloss: 0.166606	valid's auc: 0.736147	valid's binary_logloss: 0.171329
[998]	train's auc: 0.765859	train's binary_logloss: 0.166602	valid's auc: 0.736151	valid's binary_logloss: 0.171328
[999]	train's auc: 0.765887	train's binary_logloss: 0.166597	valid's auc: 0.736146	valid's binary_logloss: 0.171329
[1000]	train's auc: 0.765917	train's binary_logloss: 0.166593	valid's au

[1063]	train's auc: 0.767807	train's binary_logloss: 0.166269	valid's auc: 0.736329	valid's binary_logloss: 0.171283
[1064]	train's auc: 0.767841	train's binary_logloss: 0.166263	valid's auc: 0.736324	valid's binary_logloss: 0.171283
[1065]	train's auc: 0.767889	train's binary_logloss: 0.166258	valid's auc: 0.736325	valid's binary_logloss: 0.171283
[1066]	train's auc: 0.767918	train's binary_logloss: 0.166253	valid's auc: 0.736323	valid's binary_logloss: 0.171282
[1067]	train's auc: 0.767948	train's binary_logloss: 0.166248	valid's auc: 0.73632	valid's binary_logloss: 0.171282
[1068]	train's auc: 0.767965	train's binary_logloss: 0.166242	valid's auc: 0.736319	valid's binary_logloss: 0.171281
[1069]	train's auc: 0.76799	train's binary_logloss: 0.166238	valid's auc: 0.736319	valid's binary_logloss: 0.171281
[1070]	train's auc: 0.768015	train's binary_logloss: 0.166234	valid's auc: 0.736326	valid's binary_logloss: 0.17128
[1071]	train's auc: 0.768054	train's binary_logloss: 0.166229	valid

[1134]	train's auc: 0.77003	train's binary_logloss: 0.165886	valid's auc: 0.736531	valid's binary_logloss: 0.171238
[1135]	train's auc: 0.770054	train's binary_logloss: 0.165882	valid's auc: 0.736537	valid's binary_logloss: 0.171237
[1136]	train's auc: 0.770082	train's binary_logloss: 0.165877	valid's auc: 0.736531	valid's binary_logloss: 0.171237
[1137]	train's auc: 0.770117	train's binary_logloss: 0.165872	valid's auc: 0.736529	valid's binary_logloss: 0.171238
[1138]	train's auc: 0.770143	train's binary_logloss: 0.165866	valid's auc: 0.736546	valid's binary_logloss: 0.171236
[1139]	train's auc: 0.770159	train's binary_logloss: 0.165862	valid's auc: 0.736546	valid's binary_logloss: 0.171235
[1140]	train's auc: 0.770177	train's binary_logloss: 0.165858	valid's auc: 0.736545	valid's binary_logloss: 0.171235
[1141]	train's auc: 0.770207	train's binary_logloss: 0.165853	valid's auc: 0.736544	valid's binary_logloss: 0.171235
[1142]	train's auc: 0.770235	train's binary_logloss: 0.165849	val

[1205]	train's auc: 0.772183	train's binary_logloss: 0.165523	valid's auc: 0.736635	valid's binary_logloss: 0.171212
[1206]	train's auc: 0.772228	train's binary_logloss: 0.165516	valid's auc: 0.736632	valid's binary_logloss: 0.171212
[1207]	train's auc: 0.772267	train's binary_logloss: 0.165509	valid's auc: 0.736634	valid's binary_logloss: 0.171211
[1208]	train's auc: 0.772293	train's binary_logloss: 0.165505	valid's auc: 0.736629	valid's binary_logloss: 0.171211
[1209]	train's auc: 0.772309	train's binary_logloss: 0.1655	valid's auc: 0.736639	valid's binary_logloss: 0.171209
[1210]	train's auc: 0.772336	train's binary_logloss: 0.165496	valid's auc: 0.736634	valid's binary_logloss: 0.17121
[1211]	train's auc: 0.772361	train's binary_logloss: 0.165492	valid's auc: 0.736627	valid's binary_logloss: 0.17121
[1212]	train's auc: 0.772401	train's binary_logloss: 0.165487	valid's auc: 0.736629	valid's binary_logloss: 0.17121
[1213]	train's auc: 0.772414	train's binary_logloss: 0.165482	valid's

[1276]	train's auc: 0.774359	train's binary_logloss: 0.165144	valid's auc: 0.736749	valid's binary_logloss: 0.171174
[1277]	train's auc: 0.774405	train's binary_logloss: 0.165137	valid's auc: 0.736742	valid's binary_logloss: 0.171175
[1278]	train's auc: 0.774429	train's binary_logloss: 0.165133	valid's auc: 0.736741	valid's binary_logloss: 0.171176
[1279]	train's auc: 0.774454	train's binary_logloss: 0.165129	valid's auc: 0.736739	valid's binary_logloss: 0.171176
[1280]	train's auc: 0.774478	train's binary_logloss: 0.165124	valid's auc: 0.736745	valid's binary_logloss: 0.171175
[1281]	train's auc: 0.77449	train's binary_logloss: 0.165119	valid's auc: 0.736744	valid's binary_logloss: 0.171175
[1282]	train's auc: 0.77454	train's binary_logloss: 0.16511	valid's auc: 0.736755	valid's binary_logloss: 0.171173
[1283]	train's auc: 0.774564	train's binary_logloss: 0.165106	valid's auc: 0.736756	valid's binary_logloss: 0.171172
[1284]	train's auc: 0.774605	train's binary_logloss: 0.165101	valid

[1347]	train's auc: 0.776576	train's binary_logloss: 0.164766	valid's auc: 0.736939	valid's binary_logloss: 0.171129
[1348]	train's auc: 0.776608	train's binary_logloss: 0.164762	valid's auc: 0.736938	valid's binary_logloss: 0.171129
[1349]	train's auc: 0.776626	train's binary_logloss: 0.164758	valid's auc: 0.736941	valid's binary_logloss: 0.171128
[1350]	train's auc: 0.77666	train's binary_logloss: 0.164753	valid's auc: 0.736945	valid's binary_logloss: 0.171127
[1351]	train's auc: 0.776696	train's binary_logloss: 0.164748	valid's auc: 0.736944	valid's binary_logloss: 0.171127
[1352]	train's auc: 0.776715	train's binary_logloss: 0.164744	valid's auc: 0.736947	valid's binary_logloss: 0.171127
[1353]	train's auc: 0.776725	train's binary_logloss: 0.16474	valid's auc: 0.736948	valid's binary_logloss: 0.171126
[1354]	train's auc: 0.776739	train's binary_logloss: 0.164736	valid's auc: 0.736947	valid's binary_logloss: 0.171126
[1355]	train's auc: 0.776767	train's binary_logloss: 0.164731	vali

[1418]	train's auc: 0.778639	train's binary_logloss: 0.16443	valid's auc: 0.736937	valid's binary_logloss: 0.17112
[1419]	train's auc: 0.778653	train's binary_logloss: 0.164426	valid's auc: 0.736936	valid's binary_logloss: 0.17112
[1420]	train's auc: 0.778693	train's binary_logloss: 0.16442	valid's auc: 0.736941	valid's binary_logloss: 0.171118
[1421]	train's auc: 0.77872	train's binary_logloss: 0.164415	valid's auc: 0.736934	valid's binary_logloss: 0.171118
[1422]	train's auc: 0.778723	train's binary_logloss: 0.164411	valid's auc: 0.736938	valid's binary_logloss: 0.171116
[1423]	train's auc: 0.778742	train's binary_logloss: 0.164407	valid's auc: 0.736942	valid's binary_logloss: 0.171115
[1424]	train's auc: 0.778778	train's binary_logloss: 0.164401	valid's auc: 0.73695	valid's binary_logloss: 0.171115
[1425]	train's auc: 0.778813	train's binary_logloss: 0.164396	valid's auc: 0.736951	valid's binary_logloss: 0.171114
[1426]	train's auc: 0.778833	train's binary_logloss: 0.164392	valid's 

[1489]	train's auc: 0.780451	train's binary_logloss: 0.164104	valid's auc: 0.73706	valid's binary_logloss: 0.171086
[1490]	train's auc: 0.780481	train's binary_logloss: 0.1641	valid's auc: 0.737052	valid's binary_logloss: 0.171087
[1491]	train's auc: 0.780525	train's binary_logloss: 0.164095	valid's auc: 0.737044	valid's binary_logloss: 0.171087
[1492]	train's auc: 0.780559	train's binary_logloss: 0.16409	valid's auc: 0.737044	valid's binary_logloss: 0.171087
[1493]	train's auc: 0.780584	train's binary_logloss: 0.164086	valid's auc: 0.737034	valid's binary_logloss: 0.171088
[1494]	train's auc: 0.780598	train's binary_logloss: 0.164082	valid's auc: 0.737031	valid's binary_logloss: 0.171089
[1495]	train's auc: 0.780609	train's binary_logloss: 0.164078	valid's auc: 0.737034	valid's binary_logloss: 0.171087
[1496]	train's auc: 0.780648	train's binary_logloss: 0.164073	valid's auc: 0.737035	valid's binary_logloss: 0.171087
[1497]	train's auc: 0.780694	train's binary_logloss: 0.164067	valid'

[1560]	train's auc: 0.78238	train's binary_logloss: 0.163775	valid's auc: 0.737075	valid's binary_logloss: 0.171074
[1561]	train's auc: 0.782414	train's binary_logloss: 0.16377	valid's auc: 0.737065	valid's binary_logloss: 0.171075
[1562]	train's auc: 0.782436	train's binary_logloss: 0.163767	valid's auc: 0.737062	valid's binary_logloss: 0.171075
[1563]	train's auc: 0.782462	train's binary_logloss: 0.163762	valid's auc: 0.737057	valid's binary_logloss: 0.171076
[1564]	train's auc: 0.782481	train's binary_logloss: 0.163758	valid's auc: 0.737053	valid's binary_logloss: 0.171076
[1565]	train's auc: 0.782515	train's binary_logloss: 0.163753	valid's auc: 0.737051	valid's binary_logloss: 0.171076
[1566]	train's auc: 0.782526	train's binary_logloss: 0.163749	valid's auc: 0.73705	valid's binary_logloss: 0.171077
[1567]	train's auc: 0.782568	train's binary_logloss: 0.163742	valid's auc: 0.737054	valid's binary_logloss: 0.171076
[1568]	train's auc: 0.78259	train's binary_logloss: 0.163738	valid'

[1631]	train's auc: 0.784191	train's binary_logloss: 0.163452	valid's auc: 0.737121	valid's binary_logloss: 0.171055
[1632]	train's auc: 0.784223	train's binary_logloss: 0.163448	valid's auc: 0.737134	valid's binary_logloss: 0.171053
[1633]	train's auc: 0.784245	train's binary_logloss: 0.163444	valid's auc: 0.737131	valid's binary_logloss: 0.171053
[1634]	train's auc: 0.784272	train's binary_logloss: 0.16344	valid's auc: 0.737132	valid's binary_logloss: 0.171053
[1635]	train's auc: 0.78429	train's binary_logloss: 0.163436	valid's auc: 0.737135	valid's binary_logloss: 0.171052
[1636]	train's auc: 0.784311	train's binary_logloss: 0.163432	valid's auc: 0.737132	valid's binary_logloss: 0.171052
[1637]	train's auc: 0.784326	train's binary_logloss: 0.163429	valid's auc: 0.737132	valid's binary_logloss: 0.171052
[1638]	train's auc: 0.784351	train's binary_logloss: 0.163425	valid's auc: 0.737127	valid's binary_logloss: 0.171053
[1639]	train's auc: 0.784364	train's binary_logloss: 0.163422	vali

[1702]	train's auc: 0.786159	train's binary_logloss: 0.163112	valid's auc: 0.737204	valid's binary_logloss: 0.171031
[1703]	train's auc: 0.786183	train's binary_logloss: 0.163108	valid's auc: 0.737203	valid's binary_logloss: 0.171031
[1704]	train's auc: 0.786202	train's binary_logloss: 0.163105	valid's auc: 0.737204	valid's binary_logloss: 0.171031
[1705]	train's auc: 0.786242	train's binary_logloss: 0.1631	valid's auc: 0.737208	valid's binary_logloss: 0.171031
[1706]	train's auc: 0.786276	train's binary_logloss: 0.163096	valid's auc: 0.737202	valid's binary_logloss: 0.171031
[1707]	train's auc: 0.786291	train's binary_logloss: 0.163091	valid's auc: 0.737211	valid's binary_logloss: 0.171029
[1708]	train's auc: 0.786318	train's binary_logloss: 0.163086	valid's auc: 0.737214	valid's binary_logloss: 0.171029
[1709]	train's auc: 0.786344	train's binary_logloss: 0.163082	valid's auc: 0.737212	valid's binary_logloss: 0.171028
[1710]	train's auc: 0.786373	train's binary_logloss: 0.163078	vali

[1773]	train's auc: 0.787948	train's binary_logloss: 0.162803	valid's auc: 0.737284	valid's binary_logloss: 0.171007
[1774]	train's auc: 0.787969	train's binary_logloss: 0.162799	valid's auc: 0.73729	valid's binary_logloss: 0.171007
[1775]	train's auc: 0.787994	train's binary_logloss: 0.162795	valid's auc: 0.737288	valid's binary_logloss: 0.171007
[1776]	train's auc: 0.788011	train's binary_logloss: 0.162792	valid's auc: 0.737295	valid's binary_logloss: 0.171006
[1777]	train's auc: 0.788037	train's binary_logloss: 0.162787	valid's auc: 0.737301	valid's binary_logloss: 0.171006
[1778]	train's auc: 0.788054	train's binary_logloss: 0.162782	valid's auc: 0.737305	valid's binary_logloss: 0.171005
[1779]	train's auc: 0.788074	train's binary_logloss: 0.162778	valid's auc: 0.737312	valid's binary_logloss: 0.171004
[1780]	train's auc: 0.788102	train's binary_logloss: 0.162774	valid's auc: 0.73731	valid's binary_logloss: 0.171004
[1781]	train's auc: 0.78813	train's binary_logloss: 0.162769	valid

[1844]	train's auc: 0.789666	train's binary_logloss: 0.162491	valid's auc: 0.737382	valid's binary_logloss: 0.170981
[1845]	train's auc: 0.789706	train's binary_logloss: 0.162484	valid's auc: 0.737389	valid's binary_logloss: 0.17098
[1846]	train's auc: 0.789729	train's binary_logloss: 0.16248	valid's auc: 0.737388	valid's binary_logloss: 0.17098
[1847]	train's auc: 0.789755	train's binary_logloss: 0.162477	valid's auc: 0.737392	valid's binary_logloss: 0.17098
[1848]	train's auc: 0.789777	train's binary_logloss: 0.162473	valid's auc: 0.737389	valid's binary_logloss: 0.17098
[1849]	train's auc: 0.789797	train's binary_logloss: 0.162469	valid's auc: 0.737393	valid's binary_logloss: 0.17098
[1850]	train's auc: 0.789829	train's binary_logloss: 0.162463	valid's auc: 0.737391	valid's binary_logloss: 0.17098
[1851]	train's auc: 0.789853	train's binary_logloss: 0.16246	valid's auc: 0.737393	valid's binary_logloss: 0.17098
[1852]	train's auc: 0.78987	train's binary_logloss: 0.162455	valid's auc:

[1915]	train's auc: 0.791461	train's binary_logloss: 0.162171	valid's auc: 0.737518	valid's binary_logloss: 0.170958
[1916]	train's auc: 0.791485	train's binary_logloss: 0.162167	valid's auc: 0.737514	valid's binary_logloss: 0.170959
[1917]	train's auc: 0.791503	train's binary_logloss: 0.162164	valid's auc: 0.73751	valid's binary_logloss: 0.17096
[1918]	train's auc: 0.791523	train's binary_logloss: 0.16216	valid's auc: 0.737514	valid's binary_logloss: 0.170959
[1919]	train's auc: 0.791536	train's binary_logloss: 0.162156	valid's auc: 0.73752	valid's binary_logloss: 0.170958
[1920]	train's auc: 0.791556	train's binary_logloss: 0.162152	valid's auc: 0.737521	valid's binary_logloss: 0.170958
[1921]	train's auc: 0.791585	train's binary_logloss: 0.162148	valid's auc: 0.737523	valid's binary_logloss: 0.170958
[1922]	train's auc: 0.791611	train's binary_logloss: 0.162143	valid's auc: 0.737515	valid's binary_logloss: 0.17096
[1923]	train's auc: 0.791641	train's binary_logloss: 0.162138	valid's

[1986]	train's auc: 0.793082	train's binary_logloss: 0.161874	valid's auc: 0.737578	valid's binary_logloss: 0.170948
[1987]	train's auc: 0.79314	train's binary_logloss: 0.161862	valid's auc: 0.737627	valid's binary_logloss: 0.170941
[1988]	train's auc: 0.793163	train's binary_logloss: 0.161858	valid's auc: 0.737629	valid's binary_logloss: 0.170941
[1989]	train's auc: 0.793182	train's binary_logloss: 0.161854	valid's auc: 0.737622	valid's binary_logloss: 0.17094
[1990]	train's auc: 0.793215	train's binary_logloss: 0.16185	valid's auc: 0.737625	valid's binary_logloss: 0.170939
[1991]	train's auc: 0.793235	train's binary_logloss: 0.161845	valid's auc: 0.737624	valid's binary_logloss: 0.170939
[1992]	train's auc: 0.793274	train's binary_logloss: 0.161839	valid's auc: 0.737619	valid's binary_logloss: 0.17094
[1993]	train's auc: 0.793295	train's binary_logloss: 0.161835	valid's auc: 0.737614	valid's binary_logloss: 0.17094
[1994]	train's auc: 0.793331	train's binary_logloss: 0.161831	valid's

[2057]	train's auc: 0.79492	train's binary_logloss: 0.16156	valid's auc: 0.737633	valid's binary_logloss: 0.170934
[2058]	train's auc: 0.794943	train's binary_logloss: 0.161557	valid's auc: 0.73763	valid's binary_logloss: 0.170934
[2059]	train's auc: 0.794965	train's binary_logloss: 0.161553	valid's auc: 0.737626	valid's binary_logloss: 0.170935
[2060]	train's auc: 0.794982	train's binary_logloss: 0.16155	valid's auc: 0.737626	valid's binary_logloss: 0.170935
[2061]	train's auc: 0.795005	train's binary_logloss: 0.161546	valid's auc: 0.737632	valid's binary_logloss: 0.170934
[2062]	train's auc: 0.795025	train's binary_logloss: 0.161542	valid's auc: 0.737635	valid's binary_logloss: 0.170934
[2063]	train's auc: 0.795045	train's binary_logloss: 0.161538	valid's auc: 0.737639	valid's binary_logloss: 0.170934
[2064]	train's auc: 0.79509	train's binary_logloss: 0.161526	valid's auc: 0.737705	valid's binary_logloss: 0.170926
[2065]	train's auc: 0.79514	train's binary_logloss: 0.161518	valid's 

[2128]	train's auc: 0.796576	train's binary_logloss: 0.161247	valid's auc: 0.737724	valid's binary_logloss: 0.17091
[2129]	train's auc: 0.796592	train's binary_logloss: 0.161242	valid's auc: 0.737723	valid's binary_logloss: 0.17091
[2130]	train's auc: 0.796625	train's binary_logloss: 0.161237	valid's auc: 0.737727	valid's binary_logloss: 0.17091
[2131]	train's auc: 0.796634	train's binary_logloss: 0.161234	valid's auc: 0.737728	valid's binary_logloss: 0.17091
[2132]	train's auc: 0.796653	train's binary_logloss: 0.16123	valid's auc: 0.737723	valid's binary_logloss: 0.17091
[2133]	train's auc: 0.796672	train's binary_logloss: 0.161226	valid's auc: 0.737724	valid's binary_logloss: 0.17091
[2134]	train's auc: 0.796697	train's binary_logloss: 0.161222	valid's auc: 0.737726	valid's binary_logloss: 0.17091
[2135]	train's auc: 0.796728	train's binary_logloss: 0.161216	valid's auc: 0.737728	valid's binary_logloss: 0.170911
[2136]	train's auc: 0.79676	train's binary_logloss: 0.161212	valid's auc

[2199]	train's auc: 0.79826	train's binary_logloss: 0.160933	valid's auc: 0.737745	valid's binary_logloss: 0.170898
[2200]	train's auc: 0.79828	train's binary_logloss: 0.160929	valid's auc: 0.73775	valid's binary_logloss: 0.170897
[2201]	train's auc: 0.798301	train's binary_logloss: 0.160926	valid's auc: 0.737747	valid's binary_logloss: 0.170897
[2202]	train's auc: 0.798326	train's binary_logloss: 0.160922	valid's auc: 0.737744	valid's binary_logloss: 0.170897
[2203]	train's auc: 0.798339	train's binary_logloss: 0.160919	valid's auc: 0.737746	valid's binary_logloss: 0.170896
[2204]	train's auc: 0.798355	train's binary_logloss: 0.160916	valid's auc: 0.737739	valid's binary_logloss: 0.170897
[2205]	train's auc: 0.798383	train's binary_logloss: 0.160912	valid's auc: 0.737752	valid's binary_logloss: 0.170895
[2206]	train's auc: 0.798407	train's binary_logloss: 0.160907	valid's auc: 0.737752	valid's binary_logloss: 0.170895
[2207]	train's auc: 0.798422	train's binary_logloss: 0.160903	valid

LGBMClassifier(n_estimators=10000, silent=False, subsample_freq=1)

In [11]:
feat_imp = pd.Series(clf.feature_importances_)
feat_imp = feat_imp[feat_imp > 0]

imp_col_lst = list(feat_imp.sort_values(ascending=False).index)

pd.Series(imp_col_lst).to_csv(data_prep_dir + 'feature_importance.csv',index=False)

print('There are {} features with importance > 0.'.format(len(imp_col_lst)))

There are 11762 features with importance > 0.


In [12]:
n = clf.best_iteration_
base_loss = clf.best_score_['valid']['binary_logloss']
base_auc = clf.best_score_['valid']['auc']
print('Base loss is', base_loss, 'and AUC is', base_auc)

Base loss is 0.17089956808501636 and AUC is 0.7377981587205131


In [13]:
clf = LGBMClassifier(boosting_type='gbdt',
                     num_leaves=31, max_depth=-1, 
                     learning_rate=0.1, n_estimators=n, 
                     subsample_for_bin=200000, objective=None,
                     class_weight=None, min_split_gain=0.0, 
                     min_child_weight=0.001,
                     min_child_samples=20, subsample=1.0, subsample_freq=1,
                     colsample_bytree=1.0,
                     reg_alpha=0.0, reg_lambda=0.0, random_state=None,
                     n_jobs=-1, silent=False)

In [14]:
def valid_loss(cols):
    
    print('Running...')
    
    t0 = time.time()
    # select subset of features
    clf.fit(x_train[:, cols], y_train)
    
    y_pred = clf.predict_proba(x_valid[:, cols])[:, 1]
    
    t1 = time.time()
    print(t1 - t0, "s")
    
    return roc_auc_score(y_valid, y_pred)

all_num = int(len(feat_imp) / 100) * 100
print('There are', all_num, 'features waiting to be calculated')

loss = []
break_num = 0

print('Begin to feature selection...')
for i in range(500, all_num, 100):
    loss.append(valid_loss(imp_col_lst[:i]))
    if loss[-1] > base_auc:
        best_num = i
        base_auc = loss[-1]
        break_num += 1
    print('前', i, '个特征的得分为', loss[-1], '而全量得分', base_auc)
    print('\n')
    if break_num == 2:
        
        break
        
print('筛选出来最佳特征个数为', best_num, '这下子训练速度终于可以大大提升了')

There are 11700 features waiting to be calculated
Begin to feature selection...
Running...
[LightGBM] [Info] Number of positive: 337465, number of negative: 6701375
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1156
[LightGBM] [Info] Number of data points in the train set: 7038840, number of used features: 500
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047943 -> initscore=-2.988606
[LightGBM] [Info] Start training from score -2.988606
1739.7665176391602 s
前 500 个特征的得分为 0.7358498342854578 而全量得分 0.7377981587205131


Running...
[LightGBM] [Info] Number of positive: 337465, number of negative: 6701375
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1356
[LightGBM] [Info] Number of data points in the train set: 7038840, number of used features: 600
[LightGBM] [Info] [binary:Boost

## 保存被选择特征集

In [15]:
train_part_x = pd.DataFrame()
evals_x = pd.DataFrame()

# 重新读取数据集, 避免保存错了
feat_fname_train = ['train_x_sparse_onehot.npz', 'train_x_sparse_seq_len.npz', 'train_x_sparse_cntv.npz']
feat_fname_valid = ['valid_x_sparse_onehot.npz', 'valid_x_sparse_seq_len.npz', 'valid_x_sparse_cntv.npz']
x_train = pd.DataFrame()
x_valid = pd.DataFrame()
for tr, vl in zip(feat_fname_train, feat_fname_valid):
    train_f = sparse.load_npz(data_prep_dir + tr).tocsr()
    valid_f = sparse.load_npz(data_prep_dir + vl).tocsr()
    
    x_train = sparse.hstack((x_train, train_f)).tocsc()
    x_valid = sparse.hstack((x_valid, valid_f)).tocsc()
    print(tr, train_f.shape)
    print(vl, valid_f.shape)
    
print('x_train:', x_train.shape)
print('x_valid:', x_valid.shape)

print('Loading & merging features completed!')

train_x_sparse_onehot.npz (7038840, 1526)
valid_x_sparse_onehot.npz (1759974, 1526)
train_x_sparse_seq_len.npz (7038840, 8)
valid_x_sparse_seq_len.npz (1759974, 8)
train_x_sparse_cntv.npz (7038840, 323142)
valid_x_sparse_cntv.npz (1759974, 323142)
x_train: (7038840, 324676)
x_valid: (1759974, 324676)
Loading & merging features completed!


In [17]:
col2save = imp_col_lst[: best_num]
x_train2save = x_train[:, col2save]
x_valid2save = x_valid[:, col2save]

sparse.save_npz(data_prep_dir + 'train_x_sparse_selection.npz', x_train2save)
sparse.save_npz(data_prep_dir + 'valid_x_sparse_selection.npz', x_valid2save)

print('train_x_sparse_selection.npz', x_train2save.shape)
print('valid_x_sparse_selection.npz', x_valid2save.shape)


train_x_sparse_selection.npz (7038840, 800)
valid_x_sparse_selection.npz (1759974, 800)


# 模型构建

In [26]:
clf = LGBMClassifier(boosting_type='gbdt', num_leaves=40, max_depth=-1, learning_rate=0.1, 
                n_estimators=10000, subsample_for_bin=200000, objective=None, 
                class_weight=None, min_split_gain=0.0, min_child_weight=0.001, 
                min_child_samples=20, subsample=0.7, subsample_freq=1, 
                colsample_bytree=0.7, 
                reg_alpha=6, reg_lambda=3,
                random_state=2018, n_jobs=-1, silent=False)

In [27]:
clf.fit(x_train2save, y_train, eval_set = [(x_train2save, y_train), (x_valid2save, y_valid)], 
        eval_names =['train','valid'],
        eval_metric='auc', early_stopping_rounds=50)

auc = int(clf.best_score_['valid']['auc']*1000000)
train_part_x = []
train_part_y = []

[LightGBM] [Info] Number of positive: 337465, number of negative: 6701375
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1756
[LightGBM] [Info] Number of data points in the train set: 7038840, number of used features: 800
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047943 -> initscore=-2.988606
[LightGBM] [Info] Start training from score -2.988606
[1]	train's auc: 0.641229	train's binary_logloss: 0.189886	valid's auc: 0.640498	valid's binary_logloss: 0.190082
Training until validation scores don't improve for 50 rounds
[2]	train's auc: 0.66071	train's binary_logloss: 0.188155	valid's auc: 0.659761	valid's binary_logloss: 0.18835
[3]	train's auc: 0.66179	train's binary_logloss: 0.186925	valid's auc: 0.660796	valid's binary_logloss: 0.187111
[4]	train's auc: 0.66314	train's binary_logloss: 0.185959	valid's auc: 0.661957	valid's binary_logloss: 0.186149
[5]	train's auc: 0.665551	t

[68]	train's auc: 0.714034	train's binary_logloss: 0.175509	valid's auc: 0.712814	valid's binary_logloss: 0.175812
[69]	train's auc: 0.714337	train's binary_logloss: 0.17546	valid's auc: 0.71313	valid's binary_logloss: 0.175761
[70]	train's auc: 0.714535	train's binary_logloss: 0.175419	valid's auc: 0.713293	valid's binary_logloss: 0.175723
[71]	train's auc: 0.714837	train's binary_logloss: 0.175371	valid's auc: 0.713602	valid's binary_logloss: 0.175674
[72]	train's auc: 0.715156	train's binary_logloss: 0.175324	valid's auc: 0.713886	valid's binary_logloss: 0.175632
[73]	train's auc: 0.715352	train's binary_logloss: 0.175281	valid's auc: 0.714078	valid's binary_logloss: 0.175592
[74]	train's auc: 0.715577	train's binary_logloss: 0.175242	valid's auc: 0.714337	valid's binary_logloss: 0.17555
[75]	train's auc: 0.715742	train's binary_logloss: 0.175195	valid's auc: 0.714506	valid's binary_logloss: 0.175504
[76]	train's auc: 0.715926	train's binary_logloss: 0.175155	valid's auc: 0.714661	v

[140]	train's auc: 0.726183	train's binary_logloss: 0.173346	valid's auc: 0.723904	valid's binary_logloss: 0.173811
[141]	train's auc: 0.726313	train's binary_logloss: 0.173326	valid's auc: 0.724011	valid's binary_logloss: 0.173795
[142]	train's auc: 0.726398	train's binary_logloss: 0.173305	valid's auc: 0.724082	valid's binary_logloss: 0.173777
[143]	train's auc: 0.726488	train's binary_logloss: 0.173289	valid's auc: 0.724174	valid's binary_logloss: 0.173762
[144]	train's auc: 0.726614	train's binary_logloss: 0.173271	valid's auc: 0.724279	valid's binary_logloss: 0.173747
[145]	train's auc: 0.726688	train's binary_logloss: 0.173255	valid's auc: 0.724336	valid's binary_logloss: 0.173734
[146]	train's auc: 0.72681	train's binary_logloss: 0.173234	valid's auc: 0.724478	valid's binary_logloss: 0.173713
[147]	train's auc: 0.726885	train's binary_logloss: 0.173218	valid's auc: 0.724544	valid's binary_logloss: 0.173699
[148]	train's auc: 0.726997	train's binary_logloss: 0.173201	valid's auc:

[211]	train's auc: 0.732173	train's binary_logloss: 0.172281	valid's auc: 0.729009	valid's binary_logloss: 0.172913
[212]	train's auc: 0.73234	train's binary_logloss: 0.172257	valid's auc: 0.72915	valid's binary_logloss: 0.172892
[213]	train's auc: 0.732401	train's binary_logloss: 0.172245	valid's auc: 0.729218	valid's binary_logloss: 0.172881
[214]	train's auc: 0.732513	train's binary_logloss: 0.17223	valid's auc: 0.729312	valid's binary_logloss: 0.172869
[215]	train's auc: 0.732667	train's binary_logloss: 0.172212	valid's auc: 0.729456	valid's binary_logloss: 0.172853
[216]	train's auc: 0.732763	train's binary_logloss: 0.172197	valid's auc: 0.729528	valid's binary_logloss: 0.172841
[217]	train's auc: 0.732819	train's binary_logloss: 0.172187	valid's auc: 0.729564	valid's binary_logloss: 0.172834
[218]	train's auc: 0.732921	train's binary_logloss: 0.172172	valid's auc: 0.729643	valid's binary_logloss: 0.172821
[219]	train's auc: 0.732985	train's binary_logloss: 0.172158	valid's auc: 0

[282]	train's auc: 0.736672	train's binary_logloss: 0.171521	valid's auc: 0.732414	valid's binary_logloss: 0.172329
[283]	train's auc: 0.736717	train's binary_logloss: 0.171514	valid's auc: 0.73244	valid's binary_logloss: 0.172323
[284]	train's auc: 0.736756	train's binary_logloss: 0.171507	valid's auc: 0.732462	valid's binary_logloss: 0.17232
[285]	train's auc: 0.736878	train's binary_logloss: 0.171491	valid's auc: 0.73257	valid's binary_logloss: 0.172306
[286]	train's auc: 0.736942	train's binary_logloss: 0.171482	valid's auc: 0.732607	valid's binary_logloss: 0.172298
[287]	train's auc: 0.736989	train's binary_logloss: 0.171473	valid's auc: 0.732643	valid's binary_logloss: 0.172292
[288]	train's auc: 0.737044	train's binary_logloss: 0.171466	valid's auc: 0.732669	valid's binary_logloss: 0.172288
[289]	train's auc: 0.737085	train's binary_logloss: 0.171459	valid's auc: 0.732712	valid's binary_logloss: 0.172282
[290]	train's auc: 0.737121	train's binary_logloss: 0.171452	valid's auc: 0

[353]	train's auc: 0.739982	train's binary_logloss: 0.170956	valid's auc: 0.734583	valid's binary_logloss: 0.171949
[354]	train's auc: 0.74002	train's binary_logloss: 0.170952	valid's auc: 0.734593	valid's binary_logloss: 0.171947
[355]	train's auc: 0.740058	train's binary_logloss: 0.170946	valid's auc: 0.734606	valid's binary_logloss: 0.171944
[356]	train's auc: 0.740087	train's binary_logloss: 0.170939	valid's auc: 0.734625	valid's binary_logloss: 0.17194
[357]	train's auc: 0.740122	train's binary_logloss: 0.170929	valid's auc: 0.734647	valid's binary_logloss: 0.171932
[358]	train's auc: 0.740149	train's binary_logloss: 0.170924	valid's auc: 0.734662	valid's binary_logloss: 0.17193
[359]	train's auc: 0.740183	train's binary_logloss: 0.170919	valid's auc: 0.734667	valid's binary_logloss: 0.171928
[360]	train's auc: 0.740207	train's binary_logloss: 0.170914	valid's auc: 0.73468	valid's binary_logloss: 0.171925
[361]	train's auc: 0.740238	train's binary_logloss: 0.170909	valid's auc: 0.

[424]	train's auc: 0.742495	train's binary_logloss: 0.170517	valid's auc: 0.735992	valid's binary_logloss: 0.171691
[425]	train's auc: 0.742512	train's binary_logloss: 0.170512	valid's auc: 0.736003	valid's binary_logloss: 0.171689
[426]	train's auc: 0.742544	train's binary_logloss: 0.170507	valid's auc: 0.736019	valid's binary_logloss: 0.171686
[427]	train's auc: 0.742565	train's binary_logloss: 0.1705	valid's auc: 0.736036	valid's binary_logloss: 0.171682
[428]	train's auc: 0.7426	train's binary_logloss: 0.170494	valid's auc: 0.736059	valid's binary_logloss: 0.171678
[429]	train's auc: 0.742628	train's binary_logloss: 0.170488	valid's auc: 0.736081	valid's binary_logloss: 0.171674
[430]	train's auc: 0.742653	train's binary_logloss: 0.170483	valid's auc: 0.736088	valid's binary_logloss: 0.171672
[431]	train's auc: 0.74268	train's binary_logloss: 0.170478	valid's auc: 0.736084	valid's binary_logloss: 0.171672
[432]	train's auc: 0.74271	train's binary_logloss: 0.170472	valid's auc: 0.73

[495]	train's auc: 0.744893	train's binary_logloss: 0.170102	valid's auc: 0.737292	valid's binary_logloss: 0.171462
[496]	train's auc: 0.744949	train's binary_logloss: 0.170094	valid's auc: 0.737323	valid's binary_logloss: 0.171457
[497]	train's auc: 0.744973	train's binary_logloss: 0.170089	valid's auc: 0.737333	valid's binary_logloss: 0.171456
[498]	train's auc: 0.745	train's binary_logloss: 0.170085	valid's auc: 0.737342	valid's binary_logloss: 0.171454
[499]	train's auc: 0.74503	train's binary_logloss: 0.17008	valid's auc: 0.73735	valid's binary_logloss: 0.171453
[500]	train's auc: 0.745056	train's binary_logloss: 0.170076	valid's auc: 0.737352	valid's binary_logloss: 0.171451
[501]	train's auc: 0.745091	train's binary_logloss: 0.170069	valid's auc: 0.737378	valid's binary_logloss: 0.171447
[502]	train's auc: 0.745115	train's binary_logloss: 0.170064	valid's auc: 0.737388	valid's binary_logloss: 0.171445
[503]	train's auc: 0.745135	train's binary_logloss: 0.17006	valid's auc: 0.737

[566]	train's auc: 0.746939	train's binary_logloss: 0.169745	valid's auc: 0.738161	valid's binary_logloss: 0.1713
[567]	train's auc: 0.74696	train's binary_logloss: 0.169741	valid's auc: 0.73817	valid's binary_logloss: 0.171299
[568]	train's auc: 0.746986	train's binary_logloss: 0.169737	valid's auc: 0.738179	valid's binary_logloss: 0.171297
[569]	train's auc: 0.747012	train's binary_logloss: 0.169733	valid's auc: 0.73819	valid's binary_logloss: 0.171295
[570]	train's auc: 0.747042	train's binary_logloss: 0.169729	valid's auc: 0.738197	valid's binary_logloss: 0.171294
[571]	train's auc: 0.747078	train's binary_logloss: 0.169723	valid's auc: 0.73821	valid's binary_logloss: 0.17129
[572]	train's auc: 0.747102	train's binary_logloss: 0.169719	valid's auc: 0.738215	valid's binary_logloss: 0.17129
[573]	train's auc: 0.747141	train's binary_logloss: 0.169711	valid's auc: 0.738245	valid's binary_logloss: 0.171284
[574]	train's auc: 0.747166	train's binary_logloss: 0.169707	valid's auc: 0.7382

[637]	train's auc: 0.748963	train's binary_logloss: 0.1694	valid's auc: 0.739044	valid's binary_logloss: 0.171136
[638]	train's auc: 0.748984	train's binary_logloss: 0.169397	valid's auc: 0.739044	valid's binary_logloss: 0.171135
[639]	train's auc: 0.74901	train's binary_logloss: 0.169391	valid's auc: 0.739065	valid's binary_logloss: 0.171132
[640]	train's auc: 0.749034	train's binary_logloss: 0.169388	valid's auc: 0.739064	valid's binary_logloss: 0.171132
[641]	train's auc: 0.749079	train's binary_logloss: 0.169381	valid's auc: 0.739083	valid's binary_logloss: 0.171128
[642]	train's auc: 0.749093	train's binary_logloss: 0.169377	valid's auc: 0.739082	valid's binary_logloss: 0.171128
[643]	train's auc: 0.749113	train's binary_logloss: 0.169373	valid's auc: 0.739088	valid's binary_logloss: 0.171127
[644]	train's auc: 0.749141	train's binary_logloss: 0.169369	valid's auc: 0.739101	valid's binary_logloss: 0.171124
[645]	train's auc: 0.749164	train's binary_logloss: 0.169366	valid's auc: 0

[708]	train's auc: 0.750823	train's binary_logloss: 0.169085	valid's auc: 0.739743	valid's binary_logloss: 0.171005
[709]	train's auc: 0.750846	train's binary_logloss: 0.169081	valid's auc: 0.739745	valid's binary_logloss: 0.171005
[710]	train's auc: 0.75088	train's binary_logloss: 0.169075	valid's auc: 0.739766	valid's binary_logloss: 0.171001
[711]	train's auc: 0.7509	train's binary_logloss: 0.169072	valid's auc: 0.739769	valid's binary_logloss: 0.171001
[712]	train's auc: 0.750924	train's binary_logloss: 0.169067	valid's auc: 0.739774	valid's binary_logloss: 0.171
[713]	train's auc: 0.750949	train's binary_logloss: 0.169064	valid's auc: 0.73978	valid's binary_logloss: 0.170999
[714]	train's auc: 0.750963	train's binary_logloss: 0.169061	valid's auc: 0.739782	valid's binary_logloss: 0.170999
[715]	train's auc: 0.750997	train's binary_logloss: 0.169053	valid's auc: 0.739787	valid's binary_logloss: 0.170996
[716]	train's auc: 0.751018	train's binary_logloss: 0.169049	valid's auc: 0.739

[779]	train's auc: 0.752469	train's binary_logloss: 0.168799	valid's auc: 0.740249	valid's binary_logloss: 0.170909
[780]	train's auc: 0.752514	train's binary_logloss: 0.168792	valid's auc: 0.740278	valid's binary_logloss: 0.170902
[781]	train's auc: 0.75254	train's binary_logloss: 0.168788	valid's auc: 0.740281	valid's binary_logloss: 0.170902
[782]	train's auc: 0.752564	train's binary_logloss: 0.168783	valid's auc: 0.740295	valid's binary_logloss: 0.1709
[783]	train's auc: 0.752583	train's binary_logloss: 0.16878	valid's auc: 0.740291	valid's binary_logloss: 0.1709
[784]	train's auc: 0.752618	train's binary_logloss: 0.168774	valid's auc: 0.74031	valid's binary_logloss: 0.170897
[785]	train's auc: 0.752646	train's binary_logloss: 0.168768	valid's auc: 0.740324	valid's binary_logloss: 0.170894
[786]	train's auc: 0.752667	train's binary_logloss: 0.168765	valid's auc: 0.74033	valid's binary_logloss: 0.170893
[787]	train's auc: 0.752684	train's binary_logloss: 0.168761	valid's auc: 0.7403

[850]	train's auc: 0.754153	train's binary_logloss: 0.168516	valid's auc: 0.740758	valid's binary_logloss: 0.170811
[851]	train's auc: 0.75417	train's binary_logloss: 0.168513	valid's auc: 0.740762	valid's binary_logloss: 0.17081
[852]	train's auc: 0.754191	train's binary_logloss: 0.168509	valid's auc: 0.740764	valid's binary_logloss: 0.170809
[853]	train's auc: 0.754213	train's binary_logloss: 0.168506	valid's auc: 0.740763	valid's binary_logloss: 0.170809
[854]	train's auc: 0.754241	train's binary_logloss: 0.168502	valid's auc: 0.740776	valid's binary_logloss: 0.170808
[855]	train's auc: 0.754264	train's binary_logloss: 0.168498	valid's auc: 0.740779	valid's binary_logloss: 0.170807
[856]	train's auc: 0.754283	train's binary_logloss: 0.168494	valid's auc: 0.740779	valid's binary_logloss: 0.170807
[857]	train's auc: 0.75432	train's binary_logloss: 0.168489	valid's auc: 0.740795	valid's binary_logloss: 0.170804
[858]	train's auc: 0.754332	train's binary_logloss: 0.168486	valid's auc: 0

[921]	train's auc: 0.755581	train's binary_logloss: 0.168269	valid's auc: 0.741133	valid's binary_logloss: 0.170744
[922]	train's auc: 0.755602	train's binary_logloss: 0.168265	valid's auc: 0.741144	valid's binary_logloss: 0.170742
[923]	train's auc: 0.755619	train's binary_logloss: 0.168261	valid's auc: 0.741153	valid's binary_logloss: 0.170741
[924]	train's auc: 0.75564	train's binary_logloss: 0.168258	valid's auc: 0.741165	valid's binary_logloss: 0.170739
[925]	train's auc: 0.755667	train's binary_logloss: 0.168254	valid's auc: 0.741172	valid's binary_logloss: 0.170739
[926]	train's auc: 0.755687	train's binary_logloss: 0.16825	valid's auc: 0.741178	valid's binary_logloss: 0.170737
[927]	train's auc: 0.755702	train's binary_logloss: 0.168248	valid's auc: 0.741179	valid's binary_logloss: 0.170737
[928]	train's auc: 0.755721	train's binary_logloss: 0.168246	valid's auc: 0.741179	valid's binary_logloss: 0.170737
[929]	train's auc: 0.755751	train's binary_logloss: 0.168241	valid's auc: 

[992]	train's auc: 0.757041	train's binary_logloss: 0.168018	valid's auc: 0.741523	valid's binary_logloss: 0.170672
[993]	train's auc: 0.757057	train's binary_logloss: 0.168014	valid's auc: 0.741528	valid's binary_logloss: 0.170671
[994]	train's auc: 0.757076	train's binary_logloss: 0.168011	valid's auc: 0.741535	valid's binary_logloss: 0.17067
[995]	train's auc: 0.757103	train's binary_logloss: 0.168008	valid's auc: 0.741544	valid's binary_logloss: 0.170669
[996]	train's auc: 0.757124	train's binary_logloss: 0.168003	valid's auc: 0.741547	valid's binary_logloss: 0.170668
[997]	train's auc: 0.757159	train's binary_logloss: 0.167999	valid's auc: 0.741562	valid's binary_logloss: 0.170666
[998]	train's auc: 0.757188	train's binary_logloss: 0.167994	valid's auc: 0.741567	valid's binary_logloss: 0.170664
[999]	train's auc: 0.757207	train's binary_logloss: 0.16799	valid's auc: 0.741576	valid's binary_logloss: 0.170662
[1000]	train's auc: 0.757222	train's binary_logloss: 0.167988	valid's auc:

[1063]	train's auc: 0.758443	train's binary_logloss: 0.167769	valid's auc: 0.741955	valid's binary_logloss: 0.170596
[1064]	train's auc: 0.758465	train's binary_logloss: 0.167765	valid's auc: 0.741964	valid's binary_logloss: 0.170595
[1065]	train's auc: 0.758483	train's binary_logloss: 0.16776	valid's auc: 0.741974	valid's binary_logloss: 0.170593
[1066]	train's auc: 0.758505	train's binary_logloss: 0.167757	valid's auc: 0.741974	valid's binary_logloss: 0.170592
[1067]	train's auc: 0.758525	train's binary_logloss: 0.167754	valid's auc: 0.741968	valid's binary_logloss: 0.170592
[1068]	train's auc: 0.758544	train's binary_logloss: 0.16775	valid's auc: 0.741952	valid's binary_logloss: 0.170593
[1069]	train's auc: 0.758565	train's binary_logloss: 0.167747	valid's auc: 0.741951	valid's binary_logloss: 0.170593
[1070]	train's auc: 0.758586	train's binary_logloss: 0.167743	valid's auc: 0.741954	valid's binary_logloss: 0.170593
[1071]	train's auc: 0.758601	train's binary_logloss: 0.16774	valid

[1134]	train's auc: 0.759823	train's binary_logloss: 0.167529	valid's auc: 0.74215	valid's binary_logloss: 0.170547
[1135]	train's auc: 0.759843	train's binary_logloss: 0.167525	valid's auc: 0.742149	valid's binary_logloss: 0.170547
[1136]	train's auc: 0.759862	train's binary_logloss: 0.167522	valid's auc: 0.742158	valid's binary_logloss: 0.170545
[1137]	train's auc: 0.759882	train's binary_logloss: 0.167518	valid's auc: 0.742168	valid's binary_logloss: 0.170543
[1138]	train's auc: 0.759917	train's binary_logloss: 0.167512	valid's auc: 0.742191	valid's binary_logloss: 0.170539
[1139]	train's auc: 0.759936	train's binary_logloss: 0.167509	valid's auc: 0.7422	valid's binary_logloss: 0.170538
[1140]	train's auc: 0.759957	train's binary_logloss: 0.167506	valid's auc: 0.742202	valid's binary_logloss: 0.170538
[1141]	train's auc: 0.759977	train's binary_logloss: 0.167503	valid's auc: 0.742201	valid's binary_logloss: 0.170538
[1142]	train's auc: 0.759999	train's binary_logloss: 0.1675	valid's

[1205]	train's auc: 0.761133	train's binary_logloss: 0.167303	valid's auc: 0.742352	valid's binary_logloss: 0.170502
[1206]	train's auc: 0.761155	train's binary_logloss: 0.1673	valid's auc: 0.742353	valid's binary_logloss: 0.170501
[1207]	train's auc: 0.761174	train's binary_logloss: 0.167296	valid's auc: 0.74235	valid's binary_logloss: 0.170501
[1208]	train's auc: 0.761192	train's binary_logloss: 0.167293	valid's auc: 0.742353	valid's binary_logloss: 0.1705
[1209]	train's auc: 0.761212	train's binary_logloss: 0.16729	valid's auc: 0.742356	valid's binary_logloss: 0.1705
[1210]	train's auc: 0.761226	train's binary_logloss: 0.167288	valid's auc: 0.74236	valid's binary_logloss: 0.170499
[1211]	train's auc: 0.761245	train's binary_logloss: 0.167285	valid's auc: 0.74237	valid's binary_logloss: 0.170497
[1212]	train's auc: 0.761261	train's binary_logloss: 0.167282	valid's auc: 0.742373	valid's binary_logloss: 0.170497
[1213]	train's auc: 0.761275	train's binary_logloss: 0.167278	valid's auc:

[1276]	train's auc: 0.762399	train's binary_logloss: 0.167081	valid's auc: 0.742562	valid's binary_logloss: 0.170452
[1277]	train's auc: 0.762416	train's binary_logloss: 0.167079	valid's auc: 0.742567	valid's binary_logloss: 0.170451
[1278]	train's auc: 0.762437	train's binary_logloss: 0.167076	valid's auc: 0.742571	valid's binary_logloss: 0.170451
[1279]	train's auc: 0.762455	train's binary_logloss: 0.167072	valid's auc: 0.742571	valid's binary_logloss: 0.170451
[1280]	train's auc: 0.76247	train's binary_logloss: 0.16707	valid's auc: 0.742573	valid's binary_logloss: 0.170451
[1281]	train's auc: 0.762491	train's binary_logloss: 0.167067	valid's auc: 0.742586	valid's binary_logloss: 0.17045
[1282]	train's auc: 0.762509	train's binary_logloss: 0.167064	valid's auc: 0.742587	valid's binary_logloss: 0.17045
[1283]	train's auc: 0.762529	train's binary_logloss: 0.167061	valid's auc: 0.742582	valid's binary_logloss: 0.17045
[1284]	train's auc: 0.762549	train's binary_logloss: 0.167057	valid's

[1347]	train's auc: 0.763624	train's binary_logloss: 0.166866	valid's auc: 0.742811	valid's binary_logloss: 0.17041
[1348]	train's auc: 0.763644	train's binary_logloss: 0.166863	valid's auc: 0.742812	valid's binary_logloss: 0.17041
[1349]	train's auc: 0.763662	train's binary_logloss: 0.16686	valid's auc: 0.742817	valid's binary_logloss: 0.170409
[1350]	train's auc: 0.763681	train's binary_logloss: 0.166857	valid's auc: 0.742818	valid's binary_logloss: 0.170409
[1351]	train's auc: 0.763698	train's binary_logloss: 0.166854	valid's auc: 0.742818	valid's binary_logloss: 0.170409
[1352]	train's auc: 0.76372	train's binary_logloss: 0.166851	valid's auc: 0.742816	valid's binary_logloss: 0.170409
[1353]	train's auc: 0.763743	train's binary_logloss: 0.166848	valid's auc: 0.742809	valid's binary_logloss: 0.17041
[1354]	train's auc: 0.763757	train's binary_logloss: 0.166845	valid's auc: 0.742812	valid's binary_logloss: 0.170409
[1355]	train's auc: 0.763771	train's binary_logloss: 0.166842	valid's

[1418]	train's auc: 0.764856	train's binary_logloss: 0.166656	valid's auc: 0.742928	valid's binary_logloss: 0.170386
[1419]	train's auc: 0.764871	train's binary_logloss: 0.166654	valid's auc: 0.742924	valid's binary_logloss: 0.170387
[1420]	train's auc: 0.764886	train's binary_logloss: 0.166651	valid's auc: 0.742925	valid's binary_logloss: 0.170386
[1421]	train's auc: 0.764902	train's binary_logloss: 0.166647	valid's auc: 0.74293	valid's binary_logloss: 0.170385
[1422]	train's auc: 0.764923	train's binary_logloss: 0.166644	valid's auc: 0.742933	valid's binary_logloss: 0.170385
[1423]	train's auc: 0.764948	train's binary_logloss: 0.166641	valid's auc: 0.742926	valid's binary_logloss: 0.170386
[1424]	train's auc: 0.764959	train's binary_logloss: 0.166638	valid's auc: 0.742928	valid's binary_logloss: 0.170385
[1425]	train's auc: 0.764977	train's binary_logloss: 0.166633	valid's auc: 0.742934	valid's binary_logloss: 0.170384
[1426]	train's auc: 0.76499	train's binary_logloss: 0.16663	valid

[1489]	train's auc: 0.76607	train's binary_logloss: 0.166444	valid's auc: 0.743091	valid's binary_logloss: 0.170351
[1490]	train's auc: 0.766091	train's binary_logloss: 0.166441	valid's auc: 0.7431	valid's binary_logloss: 0.170351
[1491]	train's auc: 0.766106	train's binary_logloss: 0.166439	valid's auc: 0.743102	valid's binary_logloss: 0.170351
[1492]	train's auc: 0.766125	train's binary_logloss: 0.166436	valid's auc: 0.743102	valid's binary_logloss: 0.170351
[1493]	train's auc: 0.766145	train's binary_logloss: 0.166433	valid's auc: 0.743095	valid's binary_logloss: 0.170352
[1494]	train's auc: 0.766161	train's binary_logloss: 0.166431	valid's auc: 0.743102	valid's binary_logloss: 0.170351
[1495]	train's auc: 0.76617	train's binary_logloss: 0.166428	valid's auc: 0.743102	valid's binary_logloss: 0.170351
[1496]	train's auc: 0.766187	train's binary_logloss: 0.166425	valid's auc: 0.7431	valid's binary_logloss: 0.170351
[1497]	train's auc: 0.766214	train's binary_logloss: 0.166421	valid's 

[1560]	train's auc: 0.767253	train's binary_logloss: 0.16623	valid's auc: 0.74325	valid's binary_logloss: 0.170317
[1561]	train's auc: 0.767268	train's binary_logloss: 0.166227	valid's auc: 0.743254	valid's binary_logloss: 0.170316
[1562]	train's auc: 0.767289	train's binary_logloss: 0.166224	valid's auc: 0.743258	valid's binary_logloss: 0.170315
[1563]	train's auc: 0.767303	train's binary_logloss: 0.166222	valid's auc: 0.743256	valid's binary_logloss: 0.170316
[1564]	train's auc: 0.767317	train's binary_logloss: 0.16622	valid's auc: 0.743257	valid's binary_logloss: 0.170315
[1565]	train's auc: 0.767331	train's binary_logloss: 0.166217	valid's auc: 0.743249	valid's binary_logloss: 0.170316
[1566]	train's auc: 0.767352	train's binary_logloss: 0.166214	valid's auc: 0.743246	valid's binary_logloss: 0.170316
[1567]	train's auc: 0.767373	train's binary_logloss: 0.16621	valid's auc: 0.743245	valid's binary_logloss: 0.170316
[1568]	train's auc: 0.76739	train's binary_logloss: 0.166208	valid's

[1631]	train's auc: 0.768435	train's binary_logloss: 0.166031	valid's auc: 0.743344	valid's binary_logloss: 0.170296
[1632]	train's auc: 0.768453	train's binary_logloss: 0.166028	valid's auc: 0.743349	valid's binary_logloss: 0.170295
[1633]	train's auc: 0.76847	train's binary_logloss: 0.166024	valid's auc: 0.743342	valid's binary_logloss: 0.170296
[1634]	train's auc: 0.768488	train's binary_logloss: 0.166021	valid's auc: 0.743339	valid's binary_logloss: 0.170296
[1635]	train's auc: 0.768512	train's binary_logloss: 0.166018	valid's auc: 0.743342	valid's binary_logloss: 0.170296
[1636]	train's auc: 0.768528	train's binary_logloss: 0.166014	valid's auc: 0.743347	valid's binary_logloss: 0.170296
[1637]	train's auc: 0.768549	train's binary_logloss: 0.166011	valid's auc: 0.743347	valid's binary_logloss: 0.170296
[1638]	train's auc: 0.768564	train's binary_logloss: 0.166008	valid's auc: 0.743345	valid's binary_logloss: 0.170296
[1639]	train's auc: 0.76858	train's binary_logloss: 0.166005	vali

[1702]	train's auc: 0.769563	train's binary_logloss: 0.165831	valid's auc: 0.743472	valid's binary_logloss: 0.170272
[1703]	train's auc: 0.76958	train's binary_logloss: 0.165828	valid's auc: 0.743476	valid's binary_logloss: 0.170272
[1704]	train's auc: 0.7696	train's binary_logloss: 0.165825	valid's auc: 0.743478	valid's binary_logloss: 0.170272
[1705]	train's auc: 0.769624	train's binary_logloss: 0.165822	valid's auc: 0.743477	valid's binary_logloss: 0.170271
[1706]	train's auc: 0.769637	train's binary_logloss: 0.165819	valid's auc: 0.743477	valid's binary_logloss: 0.170271
[1707]	train's auc: 0.769652	train's binary_logloss: 0.165816	valid's auc: 0.743484	valid's binary_logloss: 0.17027
[1708]	train's auc: 0.769664	train's binary_logloss: 0.165815	valid's auc: 0.743478	valid's binary_logloss: 0.170271
[1709]	train's auc: 0.769682	train's binary_logloss: 0.165812	valid's auc: 0.743474	valid's binary_logloss: 0.170271
[1710]	train's auc: 0.769693	train's binary_logloss: 0.165809	valid'

[1773]	train's auc: 0.770709	train's binary_logloss: 0.16563	valid's auc: 0.743558	valid's binary_logloss: 0.170253
[1774]	train's auc: 0.770731	train's binary_logloss: 0.165627	valid's auc: 0.74355	valid's binary_logloss: 0.170254
[1775]	train's auc: 0.770742	train's binary_logloss: 0.165625	valid's auc: 0.743548	valid's binary_logloss: 0.170254
[1776]	train's auc: 0.770754	train's binary_logloss: 0.165623	valid's auc: 0.743542	valid's binary_logloss: 0.170255
[1777]	train's auc: 0.770771	train's binary_logloss: 0.16562	valid's auc: 0.743552	valid's binary_logloss: 0.170254
[1778]	train's auc: 0.770793	train's binary_logloss: 0.165617	valid's auc: 0.743551	valid's binary_logloss: 0.170254
[1779]	train's auc: 0.770808	train's binary_logloss: 0.165614	valid's auc: 0.743553	valid's binary_logloss: 0.170254
[1780]	train's auc: 0.770828	train's binary_logloss: 0.165611	valid's auc: 0.743555	valid's binary_logloss: 0.170254
[1781]	train's auc: 0.770838	train's binary_logloss: 0.165608	valid

[1844]	train's auc: 0.771813	train's binary_logloss: 0.165438	valid's auc: 0.743682	valid's binary_logloss: 0.170231
[1845]	train's auc: 0.771819	train's binary_logloss: 0.165435	valid's auc: 0.743685	valid's binary_logloss: 0.17023
[1846]	train's auc: 0.771832	train's binary_logloss: 0.165433	valid's auc: 0.743688	valid's binary_logloss: 0.17023
[1847]	train's auc: 0.771842	train's binary_logloss: 0.165431	valid's auc: 0.743688	valid's binary_logloss: 0.17023
[1848]	train's auc: 0.771854	train's binary_logloss: 0.165428	valid's auc: 0.743697	valid's binary_logloss: 0.170229
[1849]	train's auc: 0.771869	train's binary_logloss: 0.165425	valid's auc: 0.743696	valid's binary_logloss: 0.170229
[1850]	train's auc: 0.771888	train's binary_logloss: 0.165422	valid's auc: 0.743697	valid's binary_logloss: 0.170229
[1851]	train's auc: 0.771902	train's binary_logloss: 0.16542	valid's auc: 0.743696	valid's binary_logloss: 0.170229
[1852]	train's auc: 0.771917	train's binary_logloss: 0.165418	valid'

[1915]	train's auc: 0.772879	train's binary_logloss: 0.165249	valid's auc: 0.743736	valid's binary_logloss: 0.170219
[1916]	train's auc: 0.772896	train's binary_logloss: 0.165246	valid's auc: 0.743737	valid's binary_logloss: 0.170218
[1917]	train's auc: 0.772909	train's binary_logloss: 0.165244	valid's auc: 0.743733	valid's binary_logloss: 0.170219
[1918]	train's auc: 0.772923	train's binary_logloss: 0.16524	valid's auc: 0.743735	valid's binary_logloss: 0.170218
[1919]	train's auc: 0.77295	train's binary_logloss: 0.165237	valid's auc: 0.743741	valid's binary_logloss: 0.170217
[1920]	train's auc: 0.772966	train's binary_logloss: 0.165235	valid's auc: 0.743741	valid's binary_logloss: 0.170217
[1921]	train's auc: 0.772978	train's binary_logloss: 0.165232	valid's auc: 0.743741	valid's binary_logloss: 0.170217
[1922]	train's auc: 0.772987	train's binary_logloss: 0.16523	valid's auc: 0.743738	valid's binary_logloss: 0.170217
[1923]	train's auc: 0.773002	train's binary_logloss: 0.165227	valid

[1986]	train's auc: 0.773895	train's binary_logloss: 0.165058	valid's auc: 0.743801	valid's binary_logloss: 0.170204
[1987]	train's auc: 0.773905	train's binary_logloss: 0.165055	valid's auc: 0.743801	valid's binary_logloss: 0.170203
[1988]	train's auc: 0.773915	train's binary_logloss: 0.165053	valid's auc: 0.743799	valid's binary_logloss: 0.170203
[1989]	train's auc: 0.773939	train's binary_logloss: 0.16505	valid's auc: 0.74379	valid's binary_logloss: 0.170204
[1990]	train's auc: 0.773945	train's binary_logloss: 0.165047	valid's auc: 0.743791	valid's binary_logloss: 0.170204
[1991]	train's auc: 0.773966	train's binary_logloss: 0.165043	valid's auc: 0.743794	valid's binary_logloss: 0.170203
[1992]	train's auc: 0.773983	train's binary_logloss: 0.16504	valid's auc: 0.743795	valid's binary_logloss: 0.170203
[1993]	train's auc: 0.773999	train's binary_logloss: 0.165038	valid's auc: 0.743798	valid's binary_logloss: 0.170203
[1994]	train's auc: 0.774012	train's binary_logloss: 0.165035	valid

[2057]	train's auc: 0.77491	train's binary_logloss: 0.164867	valid's auc: 0.743801	valid's binary_logloss: 0.170196
[2058]	train's auc: 0.774924	train's binary_logloss: 0.164864	valid's auc: 0.743797	valid's binary_logloss: 0.170197
[2059]	train's auc: 0.774933	train's binary_logloss: 0.164862	valid's auc: 0.7438	valid's binary_logloss: 0.170196
[2060]	train's auc: 0.77495	train's binary_logloss: 0.164859	valid's auc: 0.743797	valid's binary_logloss: 0.170197
[2061]	train's auc: 0.774967	train's binary_logloss: 0.164857	valid's auc: 0.743799	valid's binary_logloss: 0.170197
[2062]	train's auc: 0.774984	train's binary_logloss: 0.164854	valid's auc: 0.743803	valid's binary_logloss: 0.170197
[2063]	train's auc: 0.774995	train's binary_logloss: 0.164852	valid's auc: 0.743803	valid's binary_logloss: 0.170197
[2064]	train's auc: 0.775002	train's binary_logloss: 0.164849	valid's auc: 0.743805	valid's binary_logloss: 0.170196
[2065]	train's auc: 0.775021	train's binary_logloss: 0.164847	valid'

[2128]	train's auc: 0.775937	train's binary_logloss: 0.164684	valid's auc: 0.743805	valid's binary_logloss: 0.170191
[2129]	train's auc: 0.77595	train's binary_logloss: 0.164681	valid's auc: 0.743805	valid's binary_logloss: 0.17019
[2130]	train's auc: 0.775965	train's binary_logloss: 0.164678	valid's auc: 0.743807	valid's binary_logloss: 0.17019
[2131]	train's auc: 0.775984	train's binary_logloss: 0.164675	valid's auc: 0.743808	valid's binary_logloss: 0.17019
[2132]	train's auc: 0.776001	train's binary_logloss: 0.164673	valid's auc: 0.743807	valid's binary_logloss: 0.17019
[2133]	train's auc: 0.77602	train's binary_logloss: 0.16467	valid's auc: 0.743807	valid's binary_logloss: 0.170189
[2134]	train's auc: 0.776035	train's binary_logloss: 0.164667	valid's auc: 0.743801	valid's binary_logloss: 0.17019
[2135]	train's auc: 0.77605	train's binary_logloss: 0.164665	valid's auc: 0.743804	valid's binary_logloss: 0.17019
[2136]	train's auc: 0.776073	train's binary_logloss: 0.164661	valid's auc:

In [32]:
from sklearn.metrics import roc_auc_score, log_loss
print('===================Test Set Performance===================')
y_valid_pred = clf.predict_proba(x_valid2save, num_iteration = clf.best_iteration_)[:,1]
print(pd.Series(y_valid_pred).describe())

# round(pd.Series(evals_ypre),6).to_csv('data_preprocessing/evals_ypre_'+str(i+1)+'.csv',index=False)
print('Test AUC', roc_auc_score(y_valid, y_valid_pred))
print('Test Logloss', log_loss(y_valid, y_valid_pred))

count    1.759974e+06
mean     4.796217e-02
std      5.839436e-02
min      2.186377e-04
25%      2.071137e-02
50%      3.481173e-02
75%      5.510371e-02
max      9.953930e-01
dtype: float64
Test AUC 0.7438440879170812
Test Logloss 0.17018985844814913


# 组合ID特征

In [20]:
label_feat_lst = ['aid', 'advertiserId', 'campaignId', 'creativeId',
                  'creativeSize', 'adCategoryId', 'productId', 'productType', 'age',
                  'gender','education', 'consumptionAbility', 'LBS',
                  'os', 'carrier', 'house']

In [22]:
df_data = df_data[label_feat_lst]

for feature in tqdm(label_feat_lst):
    try:
        df_data[feature] = LabelEncoder().fit_transform(df_data[feature].apply(int))
    except:
        df_data[feature] = LabelEncoder().fit_transform(df_data[feature])


100%|██████████| 16/16 [00:56<00:00,  3.51s/it]


In [23]:
train_part_x_sparse = pd.DataFrame()
evals_x_sparse = pd.DataFrame()

feat_len = len(label_feat_lst)
enc = OneHotEncoder()
num = 0
for i in range(feat_len):
    for j in range(feat_len - i - 1):
        s = time.time()
        # 握手问题, 每一个feat都要和其他feat计算一次
        print(label_feat_lst[i], label_feat_lst[i + j + 1])
#         se = df_data[label_feat_lst[i]] * 100000 + df_data[label_feat_lst[i + j + 1]] * 1
#         enc.fit(se.values.reshape(-1, 1))
        
#         se = data.loc[train_part_index][col[i]]*100000+data.loc[train_part_index][col[i+j+1]]*1
#         arr =enc.transform(se.values.reshape(-1, 1))
#         train_part_x_sparse = sparse.hstack((train_part_x_sparse, arr))

        
#         se = data.loc[evals_index][col[i]]*100000+data.loc[evals_index][col[i+j+1]]*1
#         arr = enc.transform(se.values.reshape(-1, 1))
#         evals_x_sparse = sparse.hstack((evals_x_sparse,arr))

#         num+=1
#         arr = []
#         print(num,col[i],col[i+j+1],int(time.time()-s),"s")
#         if num % 12==0:
#             k = num//12
#             ##存写组合稀疏矩阵
#             print("Saving...")
#             print(k)
#             print('train_part_x...')
#             sparse.save_npz('data_preprocessing/train_part_x_sparse_two_'+str(k)+'.npz',train_part_x_sparse)
#             print('evals_x...')
#             sparse.save_npz('data_preprocessing/evals_x_sparse_two_'+str(k)+'.npz',evals_x_sparse)
#             print('test1_x...')
#             sparse.save_npz('data_preprocessing/test1_x_sparse_two_'+str(k)+'.npz',test1_x_sparse)
#             print('test2_x...')
#             sparse.save_npz('data_preprocessing/test2_x_sparse_two_'+str(k)+'.npz',test2_x_sparse)
#             print('Over')
#             train_part_x_sparse=pd.DataFrame()
#             evals_x_sparse=pd.DataFrame()
#             test1_x_sparse=pd.DataFrame()
#             test2_x_sparse=pd.DataFrame()

aid advertiserId
aid campaignId
aid creativeId
aid creativeSize
aid adCategoryId
aid productId
aid productType
aid age
aid gender
aid education
aid consumptionAbility
aid LBS
aid os
aid carrier
aid house
advertiserId campaignId
advertiserId creativeId
advertiserId creativeSize
advertiserId adCategoryId
advertiserId productId
advertiserId productType
advertiserId age
advertiserId gender
advertiserId education
advertiserId consumptionAbility
advertiserId LBS
advertiserId os
advertiserId carrier
advertiserId house
campaignId creativeId
campaignId creativeSize
campaignId adCategoryId
campaignId productId
campaignId productType
campaignId age
campaignId gender
campaignId education
campaignId consumptionAbility
campaignId LBS
campaignId os
campaignId carrier
campaignId house
creativeId creativeSize
creativeId adCategoryId
creativeId productId
creativeId productType
creativeId age
creativeId gender
creativeId education
creativeId consumptionAbility
creativeId LBS
creativeId os
creativeId carr

# 模型构建

In [14]:
def model_evaluation(y_train, y_train_pred, y_test, y_test_pred, model_name=''):
    """

    :param y_train:
    :type y_train:
    :param y_train_pred:
    :type y_train_pred:
    :param y_test:
    :type y_test:
    :param y_test_pred:
    :type y_test_pred:
    :param model_name:
    :type model_name:
    :return:
    :rtype:
    """
    print("{} Train AUC: {}, logloss: {}".format(model_name, roc_auc_score(y_train, y_train_pred), log_loss(y_train, y_train_pred)))
    # print("{} Train confusion matrix: {}".format(model_name, confusion_matrix(y_train, y_train_pred)))

    print("{} Test AUC: {}, logloss: {}".format(model_name, roc_auc_score(y_test, y_test_pred), log_loss(y_test, y_test_pred)))
    # print("{} Test confusion matrix: {}".format(model_name, confusion_matrix(y_test, y_test_pred)))

## Baseline版本
- 只使用离散特征，进行onehot编码
- 线性模型：LogisticRegression

In [13]:
print(x_train.shape, x_valid.shape)
# baseline版本
turned_param = {
        'penalty': 'l2',
        'C': 10,
        'solver': 'lbfgs',
        'tol': 1e-4,
        'max_iter': 10000
    }

clf = LogisticRegression(random_state=1, **turned_param)
clf.fit(x_train, y_train)

y_train_pred = clf.predict(x_train)
y_test_pred = clf.predict(x_valid)

model_evaluation(y_train, y_train_pred, y_valid, y_test_pred, model_name='LogisticRegression')


(7038840, 1526) (1759974, 1526)


NameError: name 'model_evaluation' is not defined

In [15]:
# 基于相似度


LogisticRegression Train AUC: 0.5000014816351326, logloss: 1.6558969428673103
LogisticRegression Test AUC: 0.5, logloss: 1.6581997519647336


In [15]:
for i, j in enumerate([1,1,1]):
    print(i, j)

0 1
1 1
2 1
