In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline

In [2]:
from sklearn import linear_model
from sklearn import svm
from sklearn import metrics
from sklearn import cluster
from sklearn import naive_bayes
from sklearn import neural_network
from sklearn import ensemble
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import lightgbm as lgb

import zipfile
import os

# 数据

In [14]:
df = pd.read_table('../out/train_visual_10.txt', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0,0.089277,0.210253,630,442,620,1151,806,894,967,628,551,1017
1,0,0.089277,0.210253,2009,2045,1942,1931,1917,1878,1194,2046,1860,1980
2,1,0.089277,0.210253,449,369,1768,344,806,1039,1745,1960,1942,939
3,0,0.089277,0.210253,1771,469,1454,1074,1481,1805,723,1340,703,1330
4,0,0.089277,0.210253,1431,1728,1581,1825,1852,1699,1186,1422,1114,1608


In [13]:
df_weight = pd.read_table('../out/train_weight.txt', names=['w'])
df_weight.head()

Unnamed: 0,w
0,0.1
1,0.1
2,0.4
3,0.1
4,0.1


In [15]:
df = pd.concat([df, df_weight], axis=1)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,w
0,0,0.089277,0.210253,630,442,620,1151,806,894,967,628,551,1017,0.1
1,0,0.089277,0.210253,2009,2045,1942,1931,1917,1878,1194,2046,1860,1980,0.1
2,1,0.089277,0.210253,449,369,1768,344,806,1039,1745,1960,1942,939,0.4
3,0,0.089277,0.210253,1771,469,1454,1074,1481,1805,723,1340,703,1330,0.1
4,0,0.089277,0.210253,1431,1728,1581,1825,1852,1699,1186,1422,1114,1608,0.1


In [16]:
# 拆分训练集、验证集
df_train,df_valid = train_test_split(df, test_size = 0.2, random_state=21)

In [17]:
print(df_train.shape, df_valid.shape)

(16683475, 14) (4170869, 14)


In [21]:
# 训练集
X_train = df_train.drop(columns=[0, 'w']).values
y_train = df_train[0].values
w_train = df_train['w'].values

In [22]:
# 验证集
X_valid = df_valid.drop(columns=[0, 'w']).values
y_valid = df_valid[0].values
w_valid = df_valid['w'].values

In [31]:
# 构造LGB数据集
ds_train = lgb.Dataset(X_train, label=y_train, weight=w_train, categorical_feature=[2,3,4,5,6,7,8,9,10,11], free_raw_data=False)
ds_valid = lgb.Dataset(X_valid, label=y_valid, weight=w_valid, categorical_feature=[2,3,4,5,6,7,8,9,10,11], reference=ds_train, free_raw_data=False)

# 调参

In [32]:
# 设置初始参数：不含交叉验证参数
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
}

In [33]:
# 交叉验证(调参)
min_merror = float('Inf')
best_params = {}

In [None]:
# 准确率
for num_leaves in range(20,200,5):
    for max_depth in range(3,8,1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth

        cv_results = lgb.cv(params, ds_train, metrics=['binary_error'], early_stopping_rounds=10, verbose_eval=True)
        mean_merror = pd.Series(cv_results['binary_error-mean']).min()
        boost_rounds = pd.Series(cv_results['binary_error-mean']).argmin()
            
        if mean_merror < min_merror:
            min_merror = mean_merror
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth
            
params['num_leaves'] = best_params['num_leaves']
params['max_depth'] = best_params['max_depth']



[1]	cv_agg's binary_error: 0.344499 + 0.0013503
[2]	cv_agg's binary_error: 0.343449 + 0.000244708
[3]	cv_agg's binary_error: 0.343449 + 0.000244708
[4]	cv_agg's binary_error: 0.34221 + 0.000243084
[5]	cv_agg's binary_error: 0.341723 + 0.000352326
[6]	cv_agg's binary_error: 0.341825 + 4.27373e-05
[7]	cv_agg's binary_error: 0.341525 + 0.000212094
[8]	cv_agg's binary_error: 0.341323 + 0.000407272
[9]	cv_agg's binary_error: 0.341063 + 0.000131333
[10]	cv_agg's binary_error: 0.340759 + 0.000218406
[11]	cv_agg's binary_error: 0.340519 + 0.000206603
[12]	cv_agg's binary_error: 0.340469 + 0.000189659
[13]	cv_agg's binary_error: 0.340389 + 0.000174832
[14]	cv_agg's binary_error: 0.340251 + 0.0002036
[15]	cv_agg's binary_error: 0.34015 + 0.000208965
[16]	cv_agg's binary_error: 0.340072 + 0.000221483
[17]	cv_agg's binary_error: 0.340012 + 0.000204492
[18]	cv_agg's binary_error: 0.33994 + 0.000185735
[19]	cv_agg's binary_error: 0.339852 + 0.000176857
[20]	cv_agg's binary_error: 0.33979 + 0.0001576

  if __name__ == '__main__':


[1]	cv_agg's binary_error: 0.343091 + 0.000632022
[2]	cv_agg's binary_error: 0.342561 + 0.000253993
[3]	cv_agg's binary_error: 0.34141 + 0.00013702
[4]	cv_agg's binary_error: 0.340937 + 0.000363835
[5]	cv_agg's binary_error: 0.340376 + 0.00019064
[6]	cv_agg's binary_error: 0.340103 + 0.000330361
[7]	cv_agg's binary_error: 0.340134 + 0.000334913
[8]	cv_agg's binary_error: 0.339914 + 0.000136906
[9]	cv_agg's binary_error: 0.339768 + 0.000165484
[10]	cv_agg's binary_error: 0.339676 + 0.000156997
[11]	cv_agg's binary_error: 0.339595 + 0.000134283
[12]	cv_agg's binary_error: 0.339528 + 0.000155528
[13]	cv_agg's binary_error: 0.33947 + 0.000130711
[14]	cv_agg's binary_error: 0.339364 + 0.00016413
[15]	cv_agg's binary_error: 0.339268 + 0.000129563
[16]	cv_agg's binary_error: 0.339186 + 0.000144043
[17]	cv_agg's binary_error: 0.339095 + 0.000146957
[18]	cv_agg's binary_error: 0.338966 + 0.000165786
[19]	cv_agg's binary_error: 0.338863 + 0.000176947
[20]	cv_agg's binary_error: 0.33875 + 0.00016

[63]	cv_agg's binary_error: 0.333677 + 7.52419e-05
[64]	cv_agg's binary_error: 0.333605 + 9.3051e-05
[65]	cv_agg's binary_error: 0.333513 + 8.6523e-05
[66]	cv_agg's binary_error: 0.333446 + 9.84735e-05
[67]	cv_agg's binary_error: 0.333378 + 9.26077e-05
[68]	cv_agg's binary_error: 0.333313 + 8.78408e-05
[69]	cv_agg's binary_error: 0.333224 + 8.38608e-05
[70]	cv_agg's binary_error: 0.33316 + 8.85872e-05
[71]	cv_agg's binary_error: 0.333084 + 9.25505e-05
[72]	cv_agg's binary_error: 0.333007 + 7.97039e-05
[73]	cv_agg's binary_error: 0.332948 + 8.91904e-05
[74]	cv_agg's binary_error: 0.332875 + 9.20823e-05
[75]	cv_agg's binary_error: 0.332792 + 9.66734e-05
[76]	cv_agg's binary_error: 0.33272 + 8.59921e-05
[77]	cv_agg's binary_error: 0.332635 + 9.156e-05
[78]	cv_agg's binary_error: 0.332558 + 0.000131867
[79]	cv_agg's binary_error: 0.332486 + 0.000119712
[80]	cv_agg's binary_error: 0.332422 + 0.000118006
[81]	cv_agg's binary_error: 0.33236 + 0.000142041
[82]	cv_agg's binary_error: 0.332282 +

[25]	cv_agg's binary_error: 0.337314 + 0.000116875
[26]	cv_agg's binary_error: 0.337165 + 0.000121531
[27]	cv_agg's binary_error: 0.337059 + 0.000108116
[28]	cv_agg's binary_error: 0.336924 + 0.000149827
[29]	cv_agg's binary_error: 0.336759 + 0.000100029
[30]	cv_agg's binary_error: 0.336744 + 0.000163489
[31]	cv_agg's binary_error: 0.336554 + 8.00638e-05
[32]	cv_agg's binary_error: 0.336413 + 0.000113431
[33]	cv_agg's binary_error: 0.336283 + 0.0001019
[34]	cv_agg's binary_error: 0.336153 + 9.70521e-05
[35]	cv_agg's binary_error: 0.336012 + 9.49663e-05
[36]	cv_agg's binary_error: 0.335899 + 0.000100283
[37]	cv_agg's binary_error: 0.33573 + 9.85594e-05
[38]	cv_agg's binary_error: 0.335614 + 0.000139635
[39]	cv_agg's binary_error: 0.335467 + 0.000134881
[40]	cv_agg's binary_error: 0.335317 + 0.000146764
[41]	cv_agg's binary_error: 0.335167 + 0.00011507
[42]	cv_agg's binary_error: 0.335055 + 0.000155296
[43]	cv_agg's binary_error: 0.334934 + 0.000149062
[44]	cv_agg's binary_error: 0.33478

[87]	cv_agg's binary_error: 0.334885 + 0.000156782
[88]	cv_agg's binary_error: 0.334847 + 0.000177287
[89]	cv_agg's binary_error: 0.334796 + 0.000176928
[90]	cv_agg's binary_error: 0.334739 + 0.000167214
[91]	cv_agg's binary_error: 0.334717 + 0.000151362
[92]	cv_agg's binary_error: 0.334681 + 0.000148762
[93]	cv_agg's binary_error: 0.334632 + 0.000144889
[94]	cv_agg's binary_error: 0.334577 + 0.000134859
[95]	cv_agg's binary_error: 0.334543 + 0.000142134
[96]	cv_agg's binary_error: 0.334496 + 0.000149959
[97]	cv_agg's binary_error: 0.334455 + 0.000143608
[98]	cv_agg's binary_error: 0.334408 + 0.000155498
[99]	cv_agg's binary_error: 0.334347 + 0.000154349
[100]	cv_agg's binary_error: 0.334317 + 0.000165987
[1]	cv_agg's binary_error: 0.343091 + 0.000632022
[2]	cv_agg's binary_error: 0.342561 + 0.000253993
[3]	cv_agg's binary_error: 0.34141 + 0.00013702
[4]	cv_agg's binary_error: 0.340937 + 0.000363835
[5]	cv_agg's binary_error: 0.340376 + 0.00019064
[6]	cv_agg's binary_error: 0.340103 + 

# 训练

In [31]:
!../bin/lightgbm \
    config=../conf/train.conf \
    data=../out/df_train.txt \
    valid=../out/df_test.txt \
    input_model=../out/model_visual_10_4.txt \
    output_model=../out/model_visual_10_5.txt \

[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Using column number 0 as label
[LightGBM] [Info] Loading weights...
[LightGBM] [Info] Loading weights...
[LightGBM] [Info] Finished loading data in 1234.791463 seconds
[LightGBM] [Info] Number of positive: 3650353, number of negative: 14349647
[LightGBM] [Info] Total Bins 18258
[LightGBM] [Info] Number of data: 18000000, number of used features: 12
[LightGBM] [Info] Finished initializing training
[LightGBM] [Info] Started training...
[LightGBM] [Info] Iteration:1, training auc : 0.769272
[LightGBM] [Info] Iteration:1, valid_1 auc : 0.747031
[LightGBM] [Info] 9.427032 seconds elapsed, finished iteration 1
[LightGBM] [Info] Iteration:2, training auc : 0.769324
[LightGBM] [Info] Iteration:2, valid_1 auc : 0.747049
[LightGBM] [Info] 16.853592 seconds elapsed, finished iteration 2
[LightGBM] [Info] Iteration:3, training auc : 0.769375
[LightGBM] [Info] Iteration:3, valid_1 auc : 0.747068
[LightGBM] [Info] 24.443923 seconds elap

[LightGBM] [Info] Iteration:44, valid_1 auc : 0.747669
[LightGBM] [Info] 311.771470 seconds elapsed, finished iteration 44
[LightGBM] [Info] Iteration:45, training auc : 0.771352
[LightGBM] [Info] Iteration:45, valid_1 auc : 0.747674
[LightGBM] [Info] 318.621219 seconds elapsed, finished iteration 45
[LightGBM] [Info] Iteration:46, training auc : 0.771402
[LightGBM] [Info] Iteration:46, valid_1 auc : 0.747692
[LightGBM] [Info] 325.790192 seconds elapsed, finished iteration 46
[LightGBM] [Info] Iteration:47, training auc : 0.771445
[LightGBM] [Info] Iteration:47, valid_1 auc : 0.747688
[LightGBM] [Info] 332.612404 seconds elapsed, finished iteration 47
[LightGBM] [Info] Iteration:48, training auc : 0.771494
[LightGBM] [Info] Iteration:48, valid_1 auc : 0.747702
[LightGBM] [Info] 339.576703 seconds elapsed, finished iteration 48
[LightGBM] [Info] Iteration:49, training auc : 0.771543
[LightGBM] [Info] Iteration:49, valid_1 auc : 0.747709
[LightGBM] [Info] 346.965141 seconds elapsed, fini

[LightGBM] [Info] Iteration:90, valid_1 auc : 0.748289
[LightGBM] [Info] 631.440195 seconds elapsed, finished iteration 90
[LightGBM] [Info] Iteration:91, training auc : 0.773358
[LightGBM] [Info] Iteration:91, valid_1 auc : 0.748302
[LightGBM] [Info] 639.204678 seconds elapsed, finished iteration 91
[LightGBM] [Info] Iteration:92, training auc : 0.7734
[LightGBM] [Info] Iteration:92, valid_1 auc : 0.748311
[LightGBM] [Info] 645.026729 seconds elapsed, finished iteration 92
[LightGBM] [Info] Iteration:93, training auc : 0.773439
[LightGBM] [Info] Iteration:93, valid_1 auc : 0.748322
[LightGBM] [Info] 651.658712 seconds elapsed, finished iteration 93
[LightGBM] [Info] Iteration:94, training auc : 0.773482
[LightGBM] [Info] Iteration:94, valid_1 auc : 0.748333
[LightGBM] [Info] 658.763999 seconds elapsed, finished iteration 94
[LightGBM] [Info] Iteration:95, training auc : 0.773522
[LightGBM] [Info] Iteration:95, valid_1 auc : 0.748345
[LightGBM] [Info] 665.509241 seconds elapsed, finish

[LightGBM] [Info] Iteration:136, training auc : 0.775193
[LightGBM] [Info] Iteration:136, valid_1 auc : 0.748828
[LightGBM] [Info] 950.849880 seconds elapsed, finished iteration 136
[LightGBM] [Info] Iteration:137, training auc : 0.775235
[LightGBM] [Info] Iteration:137, valid_1 auc : 0.748842
[LightGBM] [Info] 958.363038 seconds elapsed, finished iteration 137
[LightGBM] [Info] Iteration:138, training auc : 0.77527
[LightGBM] [Info] Iteration:138, valid_1 auc : 0.748847
[LightGBM] [Info] 965.115267 seconds elapsed, finished iteration 138
[LightGBM] [Info] Iteration:139, training auc : 0.775309
[LightGBM] [Info] Iteration:139, valid_1 auc : 0.748856
[LightGBM] [Info] 972.060554 seconds elapsed, finished iteration 139
[LightGBM] [Info] Iteration:140, training auc : 0.775343
[LightGBM] [Info] Iteration:140, valid_1 auc : 0.748865
[LightGBM] [Info] 979.518791 seconds elapsed, finished iteration 140
[LightGBM] [Info] Iteration:141, training auc : 0.775388
[LightGBM] [Info] Iteration:141, v

[LightGBM] [Info] Iteration:181, training auc : 0.77695
[LightGBM] [Info] Iteration:181, valid_1 auc : 0.749298
[LightGBM] [Info] 1280.445068 seconds elapsed, finished iteration 181
[LightGBM] [Info] Iteration:182, training auc : 0.776987
[LightGBM] [Info] Iteration:182, valid_1 auc : 0.74931
[LightGBM] [Info] 1288.933113 seconds elapsed, finished iteration 182
[LightGBM] [Info] Iteration:183, training auc : 0.777023
[LightGBM] [Info] Iteration:183, valid_1 auc : 0.74932
[LightGBM] [Info] 1296.336562 seconds elapsed, finished iteration 183
[LightGBM] [Info] Iteration:184, training auc : 0.777058
[LightGBM] [Info] Iteration:184, valid_1 auc : 0.749327
[LightGBM] [Info] 1302.824091 seconds elapsed, finished iteration 184
[LightGBM] [Info] Iteration:185, training auc : 0.777093
[LightGBM] [Info] Iteration:185, valid_1 auc : 0.749337
[LightGBM] [Info] 1310.132791 seconds elapsed, finished iteration 185
[LightGBM] [Info] Iteration:186, training auc : 0.777127
[LightGBM] [Info] Iteration:186

[LightGBM] [Info] Iteration:226, training auc : 0.778556
[LightGBM] [Info] Iteration:226, valid_1 auc : 0.749665
[LightGBM] [Info] 1615.102384 seconds elapsed, finished iteration 226
[LightGBM] [Info] Iteration:227, training auc : 0.778592
[LightGBM] [Info] Iteration:227, valid_1 auc : 0.749674
[LightGBM] [Info] 1622.466219 seconds elapsed, finished iteration 227
[LightGBM] [Info] Iteration:228, training auc : 0.778627
[LightGBM] [Info] Iteration:228, valid_1 auc : 0.749683
[LightGBM] [Info] 1631.556831 seconds elapsed, finished iteration 228
[LightGBM] [Info] Iteration:229, training auc : 0.77866
[LightGBM] [Info] Iteration:229, valid_1 auc : 0.749691
[LightGBM] [Info] 1641.033457 seconds elapsed, finished iteration 229
[LightGBM] [Info] Iteration:230, training auc : 0.778696
[LightGBM] [Info] Iteration:230, valid_1 auc : 0.749694
[LightGBM] [Info] 1648.546926 seconds elapsed, finished iteration 230
[LightGBM] [Info] Iteration:231, training auc : 0.778732
[LightGBM] [Info] Iteration:2

[LightGBM] [Info] Iteration:271, training auc : 0.780059
[LightGBM] [Info] Iteration:271, valid_1 auc : 0.750015
[LightGBM] [Info] 1977.189288 seconds elapsed, finished iteration 271
[LightGBM] [Info] Iteration:272, training auc : 0.780091
[LightGBM] [Info] Iteration:272, valid_1 auc : 0.750019
[LightGBM] [Info] 1985.341205 seconds elapsed, finished iteration 272
[LightGBM] [Info] Iteration:273, training auc : 0.780121
[LightGBM] [Info] Iteration:273, valid_1 auc : 0.750025
[LightGBM] [Info] 1993.387289 seconds elapsed, finished iteration 273
[LightGBM] [Info] Iteration:274, training auc : 0.780154
[LightGBM] [Info] Iteration:274, valid_1 auc : 0.750032
[LightGBM] [Info] 2003.938104 seconds elapsed, finished iteration 274
[LightGBM] [Info] Iteration:275, training auc : 0.780188
[LightGBM] [Info] Iteration:275, valid_1 auc : 0.75004
[LightGBM] [Info] 2013.452696 seconds elapsed, finished iteration 275
[LightGBM] [Info] Iteration:276, training auc : 0.780223
[LightGBM] [Info] Iteration:2

[LightGBM] [Info] Iteration:316, training auc : 0.781455
[LightGBM] [Info] Iteration:316, valid_1 auc : 0.750305
[LightGBM] [Info] 2458.771565 seconds elapsed, finished iteration 316
[LightGBM] [Info] Iteration:317, training auc : 0.78149
[LightGBM] [Info] Iteration:317, valid_1 auc : 0.750299
[LightGBM] [Info] 2465.952519 seconds elapsed, finished iteration 317
[LightGBM] [Info] Iteration:318, training auc : 0.781518
[LightGBM] [Info] Iteration:318, valid_1 auc : 0.750303
[LightGBM] [Info] 2473.923851 seconds elapsed, finished iteration 318
[LightGBM] [Info] Iteration:319, training auc : 0.781547
[LightGBM] [Info] Iteration:319, valid_1 auc : 0.750308
[LightGBM] [Info] 2484.104309 seconds elapsed, finished iteration 319
[LightGBM] [Info] Iteration:320, training auc : 0.781575
[LightGBM] [Info] Iteration:320, valid_1 auc : 0.750313
[LightGBM] [Info] 2493.707779 seconds elapsed, finished iteration 320
[LightGBM] [Info] Iteration:321, training auc : 0.781608
[LightGBM] [Info] Iteration:3

[LightGBM] [Info] Iteration:361, training auc : 0.78281
[LightGBM] [Info] Iteration:361, valid_1 auc : 0.750515
[LightGBM] [Info] 2817.788730 seconds elapsed, finished iteration 361
[LightGBM] [Info] Iteration:362, training auc : 0.782839
[LightGBM] [Info] Iteration:362, valid_1 auc : 0.75052
[LightGBM] [Info] 2826.057741 seconds elapsed, finished iteration 362
[LightGBM] [Info] Iteration:363, training auc : 0.782868
[LightGBM] [Info] Iteration:363, valid_1 auc : 0.750526
[LightGBM] [Info] 2833.833501 seconds elapsed, finished iteration 363
[LightGBM] [Info] Iteration:364, training auc : 0.782896
[LightGBM] [Info] Iteration:364, valid_1 auc : 0.750533
[LightGBM] [Info] 2841.418627 seconds elapsed, finished iteration 364
[LightGBM] [Info] Iteration:365, training auc : 0.782924
[LightGBM] [Info] Iteration:365, valid_1 auc : 0.750541
[LightGBM] [Info] 2848.478286 seconds elapsed, finished iteration 365
[LightGBM] [Info] Iteration:366, training auc : 0.782954
[LightGBM] [Info] Iteration:36

[LightGBM] [Info] Iteration:406, training auc : 0.784085
[LightGBM] [Info] Iteration:406, valid_1 auc : 0.750725
[LightGBM] [Info] 3184.617015 seconds elapsed, finished iteration 406
[LightGBM] [Info] Iteration:407, training auc : 0.784111
[LightGBM] [Info] Iteration:407, valid_1 auc : 0.750729
[LightGBM] [Info] 3192.558202 seconds elapsed, finished iteration 407
[LightGBM] [Info] Iteration:408, training auc : 0.784136
[LightGBM] [Info] Iteration:408, valid_1 auc : 0.750734
[LightGBM] [Info] 3200.386316 seconds elapsed, finished iteration 408
[LightGBM] [Info] Iteration:409, training auc : 0.784165
[LightGBM] [Info] Iteration:409, valid_1 auc : 0.750743
[LightGBM] [Info] 3208.235350 seconds elapsed, finished iteration 409
[LightGBM] [Info] Iteration:410, training auc : 0.784194
[LightGBM] [Info] Iteration:410, valid_1 auc : 0.750749
[LightGBM] [Info] 3216.005880 seconds elapsed, finished iteration 410
[LightGBM] [Info] Iteration:411, training auc : 0.784221
[LightGBM] [Info] Iteration:

[LightGBM] [Info] Iteration:451, training auc : 0.785266
[LightGBM] [Info] Iteration:451, valid_1 auc : 0.750914
[LightGBM] [Info] 3545.683299 seconds elapsed, finished iteration 451
[LightGBM] [Info] Iteration:452, training auc : 0.785291
[LightGBM] [Info] Iteration:452, valid_1 auc : 0.750918
[LightGBM] [Info] 3553.826396 seconds elapsed, finished iteration 452
[LightGBM] [Info] Iteration:453, training auc : 0.785317
[LightGBM] [Info] Iteration:453, valid_1 auc : 0.750922
[LightGBM] [Info] 3561.930321 seconds elapsed, finished iteration 453
[LightGBM] [Info] Iteration:454, training auc : 0.785343
[LightGBM] [Info] Iteration:454, valid_1 auc : 0.750924
[LightGBM] [Info] 3570.522562 seconds elapsed, finished iteration 454
[LightGBM] [Info] Iteration:455, training auc : 0.785367
[LightGBM] [Info] Iteration:455, valid_1 auc : 0.750929
[LightGBM] [Info] 3577.153273 seconds elapsed, finished iteration 455
[LightGBM] [Info] Iteration:456, training auc : 0.785393
[LightGBM] [Info] Iteration:

[LightGBM] [Info] Iteration:496, training auc : 0.786391
[LightGBM] [Info] Iteration:496, valid_1 auc : 0.751067
[LightGBM] [Info] 3924.167877 seconds elapsed, finished iteration 496
[LightGBM] [Info] Iteration:497, training auc : 0.786416
[LightGBM] [Info] Iteration:497, valid_1 auc : 0.751069
[LightGBM] [Info] 3931.337915 seconds elapsed, finished iteration 497
[LightGBM] [Info] Iteration:498, training auc : 0.78644
[LightGBM] [Info] Iteration:498, valid_1 auc : 0.751073
[LightGBM] [Info] 3941.531075 seconds elapsed, finished iteration 498
[LightGBM] [Info] Iteration:499, training auc : 0.786462
[LightGBM] [Info] Iteration:499, valid_1 auc : 0.751077
[LightGBM] [Info] 3950.421331 seconds elapsed, finished iteration 499
[LightGBM] [Info] Iteration:500, training auc : 0.786485
[LightGBM] [Info] Iteration:500, valid_1 auc : 0.751079
[LightGBM] [Info] 3958.028876 seconds elapsed, finished iteration 500
[LightGBM] [Info] Iteration:501, training auc : 0.786509
[LightGBM] [Info] Iteration:5

[LightGBM] [Info] Iteration:541, training auc : 0.787486
[LightGBM] [Info] Iteration:541, valid_1 auc : 0.751218
[LightGBM] [Info] 4288.572800 seconds elapsed, finished iteration 541
[LightGBM] [Info] Iteration:542, training auc : 0.787509
[LightGBM] [Info] Iteration:542, valid_1 auc : 0.751223
[LightGBM] [Info] 4295.024548 seconds elapsed, finished iteration 542
[LightGBM] [Info] Iteration:543, training auc : 0.787533
[LightGBM] [Info] Iteration:543, valid_1 auc : 0.751225
[LightGBM] [Info] 4302.680034 seconds elapsed, finished iteration 543
[LightGBM] [Info] Iteration:544, training auc : 0.787554
[LightGBM] [Info] Iteration:544, valid_1 auc : 0.751226
[LightGBM] [Info] 4309.906525 seconds elapsed, finished iteration 544
[LightGBM] [Info] Iteration:545, training auc : 0.787578
[LightGBM] [Info] Iteration:545, valid_1 auc : 0.75123
[LightGBM] [Info] 4317.778646 seconds elapsed, finished iteration 545
[LightGBM] [Info] Iteration:546, training auc : 0.787603
[LightGBM] [Info] Iteration:5

[LightGBM] [Info] Iteration:586, training auc : 0.788528
[LightGBM] [Info] Iteration:586, valid_1 auc : 0.751334
[LightGBM] [Info] 4677.029419 seconds elapsed, finished iteration 586
[LightGBM] [Info] Iteration:587, training auc : 0.788548
[LightGBM] [Info] Iteration:587, valid_1 auc : 0.751336
[LightGBM] [Info] 4685.616561 seconds elapsed, finished iteration 587
[LightGBM] [Info] Iteration:588, training auc : 0.788575
[LightGBM] [Info] Iteration:588, valid_1 auc : 0.751338
[LightGBM] [Info] 4694.597843 seconds elapsed, finished iteration 588
[LightGBM] [Info] Iteration:589, training auc : 0.788597
[LightGBM] [Info] Iteration:589, valid_1 auc : 0.751338
[LightGBM] [Info] 4702.739050 seconds elapsed, finished iteration 589
[LightGBM] [Info] Iteration:590, training auc : 0.788624
[LightGBM] [Info] Iteration:590, valid_1 auc : 0.751338
[LightGBM] [Info] 4710.283696 seconds elapsed, finished iteration 590
[LightGBM] [Info] Iteration:591, training auc : 0.788648
[LightGBM] [Info] Iteration:

[LightGBM] [Info] Iteration:631, training auc : 0.789517
[LightGBM] [Info] Iteration:631, valid_1 auc : 0.751458
[LightGBM] [Info] 5054.277028 seconds elapsed, finished iteration 631
[LightGBM] [Info] Iteration:632, training auc : 0.789539
[LightGBM] [Info] Iteration:632, valid_1 auc : 0.751463
[LightGBM] [Info] 5063.171252 seconds elapsed, finished iteration 632
[LightGBM] [Info] Iteration:633, training auc : 0.789558
[LightGBM] [Info] Iteration:633, valid_1 auc : 0.751466
[LightGBM] [Info] 5072.122383 seconds elapsed, finished iteration 633
[LightGBM] [Info] Iteration:634, training auc : 0.789578
[LightGBM] [Info] Iteration:634, valid_1 auc : 0.751466
[LightGBM] [Info] 5081.374468 seconds elapsed, finished iteration 634
[LightGBM] [Info] Iteration:635, training auc : 0.789599
[LightGBM] [Info] Iteration:635, valid_1 auc : 0.751468
[LightGBM] [Info] 5089.153044 seconds elapsed, finished iteration 635
[LightGBM] [Info] Iteration:636, training auc : 0.789622
[LightGBM] [Info] Iteration:

[LightGBM] [Info] Iteration:676, training auc : 0.790475
[LightGBM] [Info] Iteration:676, valid_1 auc : 0.751536
[LightGBM] [Info] 5421.617425 seconds elapsed, finished iteration 676
[LightGBM] [Info] Iteration:677, training auc : 0.790496
[LightGBM] [Info] Iteration:677, valid_1 auc : 0.751537
[LightGBM] [Info] 5429.185715 seconds elapsed, finished iteration 677
[LightGBM] [Info] Iteration:678, training auc : 0.790517
[LightGBM] [Info] Iteration:678, valid_1 auc : 0.751536
[LightGBM] [Info] 5440.480796 seconds elapsed, finished iteration 678
[LightGBM] [Info] Iteration:679, training auc : 0.790537
[LightGBM] [Info] Iteration:679, valid_1 auc : 0.751537
[LightGBM] [Info] 5449.218685 seconds elapsed, finished iteration 679
[LightGBM] [Info] Iteration:680, training auc : 0.790558
[LightGBM] [Info] Iteration:680, valid_1 auc : 0.751537
[LightGBM] [Info] 5456.621745 seconds elapsed, finished iteration 680
[LightGBM] [Info] Iteration:681, training auc : 0.790577
[LightGBM] [Info] Iteration:

[LightGBM] [Info] Iteration:721, training auc : 0.791407
[LightGBM] [Info] Iteration:721, valid_1 auc : 0.751603
[LightGBM] [Info] 5793.641077 seconds elapsed, finished iteration 721
[LightGBM] [Info] Iteration:722, training auc : 0.791428
[LightGBM] [Info] Iteration:722, valid_1 auc : 0.751604
[LightGBM] [Info] 5801.199795 seconds elapsed, finished iteration 722
[LightGBM] [Info] Iteration:723, training auc : 0.791447
[LightGBM] [Info] Iteration:723, valid_1 auc : 0.751606
[LightGBM] [Info] 5809.281411 seconds elapsed, finished iteration 723
[LightGBM] [Info] Iteration:724, training auc : 0.791467
[LightGBM] [Info] Iteration:724, valid_1 auc : 0.751606
[LightGBM] [Info] 5816.522378 seconds elapsed, finished iteration 724
[LightGBM] [Info] Iteration:725, training auc : 0.791487
[LightGBM] [Info] Iteration:725, valid_1 auc : 0.751606
[LightGBM] [Info] 5823.514296 seconds elapsed, finished iteration 725
[LightGBM] [Info] Iteration:726, training auc : 0.791512
[LightGBM] [Info] Iteration:

[LightGBM] [Info] Iteration:766, training auc : 0.792305
[LightGBM] [Info] Iteration:766, valid_1 auc : 0.751667
[LightGBM] [Info] 6161.256499 seconds elapsed, finished iteration 766
[LightGBM] [Info] Iteration:767, training auc : 0.792325
[LightGBM] [Info] Iteration:767, valid_1 auc : 0.751673
[LightGBM] [Info] 6168.894277 seconds elapsed, finished iteration 767
[LightGBM] [Info] Iteration:768, training auc : 0.792343
[LightGBM] [Info] Iteration:768, valid_1 auc : 0.751675
[LightGBM] [Info] 6176.682286 seconds elapsed, finished iteration 768
[LightGBM] [Info] Iteration:769, training auc : 0.792364
[LightGBM] [Info] Iteration:769, valid_1 auc : 0.751678
[LightGBM] [Info] 6184.709105 seconds elapsed, finished iteration 769
[LightGBM] [Info] Iteration:770, training auc : 0.792393
[LightGBM] [Info] Iteration:770, valid_1 auc : 0.751673
[LightGBM] [Info] 6192.511285 seconds elapsed, finished iteration 770
[LightGBM] [Info] Iteration:771, training auc : 0.792413
[LightGBM] [Info] Iteration:

[LightGBM] [Info] Iteration:811, training auc : 0.793177
[LightGBM] [Info] Iteration:811, valid_1 auc : 0.751715
[LightGBM] [Info] 6526.957178 seconds elapsed, finished iteration 811
[LightGBM] [Info] Iteration:812, training auc : 0.793195
[LightGBM] [Info] Iteration:812, valid_1 auc : 0.751717
[LightGBM] [Info] 6540.152795 seconds elapsed, finished iteration 812
[LightGBM] [Info] Iteration:813, training auc : 0.793214
[LightGBM] [Info] Iteration:813, valid_1 auc : 0.751719
[LightGBM] [Info] 6548.512044 seconds elapsed, finished iteration 813
[LightGBM] [Info] Iteration:814, training auc : 0.793231
[LightGBM] [Info] Iteration:814, valid_1 auc : 0.751717
[LightGBM] [Info] 6555.946566 seconds elapsed, finished iteration 814
[LightGBM] [Info] Iteration:815, training auc : 0.793249
[LightGBM] [Info] Iteration:815, valid_1 auc : 0.751719
[LightGBM] [Info] 6565.716829 seconds elapsed, finished iteration 815
[LightGBM] [Info] Iteration:816, training auc : 0.793269
[LightGBM] [Info] Iteration:

[LightGBM] [Info] Iteration:856, training auc : 0.794014
[LightGBM] [Info] Iteration:856, valid_1 auc : 0.751748
[LightGBM] [Info] 6916.219043 seconds elapsed, finished iteration 856
[LightGBM] [Info] Iteration:857, training auc : 0.794032
[LightGBM] [Info] Iteration:857, valid_1 auc : 0.75175
[LightGBM] [Info] 6924.672725 seconds elapsed, finished iteration 857
[LightGBM] [Info] Iteration:858, training auc : 0.794052
[LightGBM] [Info] Iteration:858, valid_1 auc : 0.751751
[LightGBM] [Info] 6932.873153 seconds elapsed, finished iteration 858
[LightGBM] [Info] Iteration:859, training auc : 0.794069
[LightGBM] [Info] Iteration:859, valid_1 auc : 0.751751
[LightGBM] [Info] 6940.582089 seconds elapsed, finished iteration 859
[LightGBM] [Info] Iteration:860, training auc : 0.794087
[LightGBM] [Info] Iteration:860, valid_1 auc : 0.751752
[LightGBM] [Info] 6947.934382 seconds elapsed, finished iteration 860
[LightGBM] [Info] Iteration:861, training auc : 0.794106
[LightGBM] [Info] Iteration:8

[LightGBM] [Info] Iteration:901, training auc : 0.794835
[LightGBM] [Info] Iteration:901, valid_1 auc : 0.75178
[LightGBM] [Info] 7240.930659 seconds elapsed, finished iteration 901
[LightGBM] [Info] Iteration:902, training auc : 0.794852
[LightGBM] [Info] Iteration:902, valid_1 auc : 0.751781
[LightGBM] [Info] 7248.019646 seconds elapsed, finished iteration 902
[LightGBM] [Info] Iteration:903, training auc : 0.79487
[LightGBM] [Info] Iteration:903, valid_1 auc : 0.751782
[LightGBM] [Info] 7254.628485 seconds elapsed, finished iteration 903
[LightGBM] [Info] Iteration:904, training auc : 0.794888
[LightGBM] [Info] Iteration:904, valid_1 auc : 0.751782
[LightGBM] [Info] 7261.622035 seconds elapsed, finished iteration 904
[LightGBM] [Info] Iteration:905, training auc : 0.794916
[LightGBM] [Info] Iteration:905, valid_1 auc : 0.751782
[LightGBM] [Info] 7268.857277 seconds elapsed, finished iteration 905
[LightGBM] [Info] Iteration:906, training auc : 0.794935
[LightGBM] [Info] Iteration:90

[LightGBM] [Info] Iteration:946, training auc : 0.795632
[LightGBM] [Info] Iteration:946, valid_1 auc : 0.751806
[LightGBM] [Info] 7551.750907 seconds elapsed, finished iteration 946
[LightGBM] [Info] Iteration:947, training auc : 0.79565
[LightGBM] [Info] Iteration:947, valid_1 auc : 0.751807
[LightGBM] [Info] 7558.707985 seconds elapsed, finished iteration 947
[LightGBM] [Info] Iteration:948, training auc : 0.795666
[LightGBM] [Info] Iteration:948, valid_1 auc : 0.751806
[LightGBM] [Info] 7565.774326 seconds elapsed, finished iteration 948
[LightGBM] [Info] Iteration:949, training auc : 0.795685
[LightGBM] [Info] Iteration:949, valid_1 auc : 0.751806
[LightGBM] [Info] 7572.639636 seconds elapsed, finished iteration 949
[LightGBM] [Info] Iteration:950, training auc : 0.795701
[LightGBM] [Info] Iteration:950, valid_1 auc : 0.751806
[LightGBM] [Info] 7579.300609 seconds elapsed, finished iteration 950
[LightGBM] [Info] Iteration:951, training auc : 0.795719
[LightGBM] [Info] Iteration:9

# 预测

In [5]:
df_test = pd.read_table('../out/df_test.txt', header=None)

In [6]:
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1,0.017588,0.28655,1872,1416,2001,628,1386,1545,2047,1343,1279,718
1,1,0.017588,0.28655,1895,945,423,1083,63,517,115,1942,275,2013
2,0,0.017588,0.28655,1914,1507,1895,1931,1856,780,1896,162,1666,75
3,0,0.017588,0.28655,1910,523,435,2034,855,576,1906,1898,924,1879
4,1,0.017588,0.28655,1062,818,2003,982,1044,1976,461,385,1991,1029


In [7]:
y_test = df_test[0].values

In [8]:
X_text = df_test.drop(columns=[0]).values

In [9]:
gbm = lgb.Booster(model_file='../out/model_visual_10.txt')

In [16]:
y_pred = gbm.predict(X_text, num_iteration=gbm.best_iteration)

In [11]:
y_pred

array([0.85433567, 0.84927492, 0.7893812 , ..., 0.70627589, 0.64974054,
       0.67789185])

In [14]:
gbm2 = lgb.Booster(model_file='../out/model_visual_10_2.txt')

In [17]:
y_pred_2 = gbm2.predict(X_text, num_iteration=gbm2.best_iteration)

In [18]:
y_pred_2

array([0.85555492, 0.8513872 , 0.7221916 , ..., 0.70893405, 0.68678881,
       0.68706441])

In [21]:
gbm3 = lgb.Booster(model_file='../out/model_visual_10_3.txt')
y_pred_3 = gbm3.predict(X_text, num_iteration=gbm3.best_iteration)
y_pred_3

array([0.83305734, 0.85407556, 0.66562197, ..., 0.6898479 , 0.70175338,
       0.71310363])

In [26]:
gbm4 = lgb.Booster(model_file='../out/model_visual_10_4.txt')
y_pred_4 = gbm4.predict(X_text, num_iteration=gbm4.best_iteration)
y_pred_4

array([0.83481518, 0.85307171, 0.64960886, ..., 0.68612105, 0.69890358,
       0.72659973])

In [32]:
gbm5 = lgb.Booster(model_file='../out/model_visual_10_5.txt')
y_pred_5 = gbm5.predict(X_text, num_iteration=gbm5.best_iteration)
y_pred_5

array([0.79921745, 0.86483877, 0.32247196, ..., 0.67142446, 0.72715468,
       0.78045557])

# 评分

In [12]:
# 迭代100次
metrics.roc_auc_score(y_test, y_pred)

0.7265064399530845

In [19]:
# 迭代200次
metrics.roc_auc_score(y_test, y_pred_2)

0.7348343165167756

In [22]:
# 迭代300次
metrics.roc_auc_score(y_test, y_pred_3)

0.7378997051458095

In [27]:
# 迭代400次
metrics.roc_auc_score(y_test, y_pred_4)

0.7394468903186304

In [33]:
# 迭代1400次
metrics.roc_auc_score(y_test, y_pred_5)

0.7419415035329826

# 提交

In [3]:
gbm = lgb.Booster(model_file='../out/model_visual_100.txt')

In [4]:
gbm.feature_importance()

array([ 125,  766,  753,  662,  663,  687,  735,  657,  705,  713,  671,
        708,  706,  763,  759,  785,  780,  845,  792,  800,  896,  871,
        902,  899,  838,  874,  925,  870,  860,  897,  918,  957,  930,
       1011,  941,  998,  995, 1020,  999, 1023,  968, 1059,  994,  975,
       1040, 1091, 1052, 1021, 1031, 1051, 1087, 1146, 1055, 1100, 1077,
       1174, 1094, 1100, 1112, 1128, 1078, 1212, 1120, 1104, 1116, 1131,
       1172, 1181, 1167, 1151, 1186, 1278, 1219, 1156, 1174, 1165, 1212,
       1246, 1181, 1223, 1165, 1114, 1175, 1178, 1163, 1177, 1236, 1178,
       1261, 1215, 1213, 1271, 1214, 1275, 1135, 1296, 1228, 1272, 1295,
       1205, 1266, 1209])

In [5]:
df_test = pd.read_table('../out/test_visual_100.txt', header=None)

In [6]:
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,100,101
0,0.013731,0.337079,644,839,1809,799,714,1726,1715,1964,...,1508,1062,1780,1225,530,311,1867,1557,423,377
1,0.013731,0.337079,2017,1976,1532,598,2022,2041,1158,1980,...,11,871,296,228,672,1723,307,1801,1203,1517
2,0.013731,0.337079,926,1878,2007,462,12,871,1964,796,...,1122,1884,979,454,514,184,1824,603,744,654
3,0.013731,0.337079,1781,1878,1910,1503,1906,1526,2029,1965,...,1918,1713,2047,1964,1074,1246,1545,1833,1357,1424
4,0.013731,0.337079,993,800,2034,2006,1910,1727,1742,282,...,23,1512,1901,682,226,707,926,1084,1222,1906


In [7]:
y_pred = gbm.predict(df_test.values, gbm.best_iteration)

In [8]:
y_pred

array([0.78549442, 0.84435209, 0.58405326, ..., 0.55213722, 0.44129657,
       0.44072578])

In [9]:
df_result = pd.read_table(
    '../out/test_interaction_ext.txt', 
    names=['user_id', 'photo_id', 'time', 'duration_time', 'activity', 'mean']
)

In [10]:
df_result.head()

Unnamed: 0,user_id,photo_id,time,duration_time,activity,mean
0,29999,8154819,761158905921,17,0.013731,0.337079
1,29999,8374672,761163438550,9,0.013731,0.337079
2,29999,7987126,761143659968,11,0.013731,0.337079
3,29999,7912672,761159000400,17,0.013731,0.337079
4,29999,9062638,761163738888,11,0.013731,0.337079


In [11]:
df_result.columns.insert(6, 'click').values

array(['user_id', 'photo_id', 'time', 'duration_time', 'activity', 'mean',
       'click'], dtype=object)

In [12]:
df_result = df_result.reindex(columns=df_result.columns.insert(6, 'click').values)

In [13]:
df_result['click'] = y_pred

In [14]:
df_result.head()

Unnamed: 0,user_id,photo_id,time,duration_time,activity,mean,click
0,29999,8154819,761158905921,17,0.013731,0.337079,0.785494
1,29999,8374672,761163438550,9,0.013731,0.337079,0.844352
2,29999,7987126,761143659968,11,0.013731,0.337079,0.584053
3,29999,7912672,761159000400,17,0.013731,0.337079,0.810503
4,29999,9062638,761163738888,11,0.013731,0.337079,0.705586


In [15]:
# 保留6位小数：
df_result['click'] = df_result['click'].apply(lambda x: round(x, 6))

In [16]:
df_result.head()

Unnamed: 0,user_id,photo_id,time,duration_time,activity,mean,click
0,29999,8154819,761158905921,17,0.013731,0.337079,0.785494
1,29999,8374672,761163438550,9,0.013731,0.337079,0.844352
2,29999,7987126,761143659968,11,0.013731,0.337079,0.584053
3,29999,7912672,761159000400,17,0.013731,0.337079,0.810503
4,29999,9062638,761163738888,11,0.013731,0.337079,0.705586


In [49]:
df_result[['user_id', 'photo_id', 'click']].to_csv(
    '../submit/pred_visual_100_w.txt', index=False, header=False, sep='\t', float_format='%.6f'
)