In [2]:
from tqdm import tqdm
import os
import numpy as np
import time
import random
import pandas as pd
from scipy import sparse
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")

In [3]:
data_org_dir = 'data/data_tencent/'
data_prep_dir = 'data/preprocess/'
# data_prep_dir = 'data/sample/'
sample_test = False

In [4]:
if sample_test:
    df_data = pd.read_csv(data_prep_dir + 'train_feat_merge_mini.csv')
else:
    df_data = pd.read_csv(data_prep_dir + 'train_feat_merge.csv')

train_idx_lst = list(df_data[df_data['n_parts'] != 1].index)
valid_idx_lst = list(df_data[df_data['n_parts'] == 1].index)

# 单个特征

In [8]:
# construct x matrix
x_train = sparse.load_npz(data_prep_dir + 'train_x_sparse_selection.npz')
x_valid = sparse.load_npz(data_prep_dir + 'valid_x_sparse_selection.npz')
# train_x_sparse_cross_selection.npz
# valid_x_sparse_cross_selection.npz


print('x_train:', x_train.shape)
print('x_valid:', x_valid.shape)

# construct y vector
y_train = np.array(df_data[df_data['n_parts'] != 1]['label'])
y_valid = np.array(df_data[df_data['n_parts'] == 1]['label'])

print('y_train:', y_train.shape)
print('y_valid:', y_valid.shape)

x_train: (7038840, 800)
x_valid: (1759974, 800)
y_train: (7038840,)
y_valid: (1759974,)


## LogisticRegression

In [16]:
# 模型定义
clf = LogisticRegression(random_state=1, penalty='l2', C=10, solver='lbfgs',
                         tol=1e-4, max_iter=10000, verbose=2)
clf.fit(x_train, y_train)
 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 46.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 46.7min finished


LogisticRegression(C=10, max_iter=10000, random_state=1, verbose=2)

In [17]:
y_valid_pred = clf.predict(x_valid)

print('===================Test Set Performance===================')
print(pd.Series(y_valid_pred).describe())
print('Test AUC', roc_auc_score(y_valid, y_valid_pred))
print('Test Logloss', log_loss(y_valid, y_valid_pred))

count    1.759974e+06
mean     1.259110e-03
std      3.546160e-02
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
dtype: float64
Test AUC 0.5060704749217427
Test Logloss 1.6591814964471123


In [None]:
y_valid_pred = []

## LightGBM

In [9]:
# 模型定义
clf = LGBMClassifier(boosting_type='gbdt', num_leaves=40, max_depth=-1, learning_rate=0.1, 
                n_estimators=10000, subsample_for_bin=200000, objective=None, 
                class_weight=None, min_split_gain=0.0, min_child_weight=0.001, 
                min_child_samples=20, subsample=0.7, subsample_freq=1, 
                colsample_bytree=0.7, 
                reg_alpha=6, reg_lambda=3,
                random_state=2018, n_jobs=-1, silent=False)

In [11]:
# 模型训练
clf.fit(x_train, y_train, eval_set = [(x_train, y_train), (x_valid, y_valid)], 
        eval_names =['train','valid'],
        eval_metric='auc', early_stopping_rounds=50)

[LightGBM] [Info] Number of positive: 337465, number of negative: 6701375
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1756
[LightGBM] [Info] Number of data points in the train set: 7038840, number of used features: 800
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047943 -> initscore=-2.988606
[LightGBM] [Info] Start training from score -2.988606
[1]	train's auc: 0.641229	train's binary_logloss: 0.189886	valid's auc: 0.640498	valid's binary_logloss: 0.190082
Training until validation scores don't improve for 50 rounds
[2]	train's auc: 0.66071	train's binary_logloss: 0.188155	valid's auc: 0.659761	valid's binary_logloss: 0.18835
[3]	train's auc: 0.66179	train's binary_logloss: 0.186925	valid's auc: 0.660796	valid's binary_logloss: 0.187111
[4]	train's auc: 0.66314	train's binary_logloss: 0.185959	valid's auc: 0.661957	valid's binary_logloss: 0.186149
[5]	train's auc: 0.665551	t

[68]	train's auc: 0.714034	train's binary_logloss: 0.175509	valid's auc: 0.712814	valid's binary_logloss: 0.175812
[69]	train's auc: 0.714337	train's binary_logloss: 0.17546	valid's auc: 0.71313	valid's binary_logloss: 0.175761
[70]	train's auc: 0.714535	train's binary_logloss: 0.175419	valid's auc: 0.713293	valid's binary_logloss: 0.175723
[71]	train's auc: 0.714837	train's binary_logloss: 0.175371	valid's auc: 0.713602	valid's binary_logloss: 0.175674
[72]	train's auc: 0.715156	train's binary_logloss: 0.175324	valid's auc: 0.713886	valid's binary_logloss: 0.175632
[73]	train's auc: 0.715352	train's binary_logloss: 0.175281	valid's auc: 0.714078	valid's binary_logloss: 0.175592
[74]	train's auc: 0.715577	train's binary_logloss: 0.175242	valid's auc: 0.714337	valid's binary_logloss: 0.17555
[75]	train's auc: 0.715742	train's binary_logloss: 0.175195	valid's auc: 0.714506	valid's binary_logloss: 0.175504
[76]	train's auc: 0.715926	train's binary_logloss: 0.175155	valid's auc: 0.714661	v

[140]	train's auc: 0.726183	train's binary_logloss: 0.173346	valid's auc: 0.723904	valid's binary_logloss: 0.173811
[141]	train's auc: 0.726313	train's binary_logloss: 0.173326	valid's auc: 0.724011	valid's binary_logloss: 0.173795
[142]	train's auc: 0.726398	train's binary_logloss: 0.173305	valid's auc: 0.724082	valid's binary_logloss: 0.173777
[143]	train's auc: 0.726488	train's binary_logloss: 0.173289	valid's auc: 0.724174	valid's binary_logloss: 0.173762
[144]	train's auc: 0.726614	train's binary_logloss: 0.173271	valid's auc: 0.724279	valid's binary_logloss: 0.173747
[145]	train's auc: 0.726688	train's binary_logloss: 0.173255	valid's auc: 0.724336	valid's binary_logloss: 0.173734
[146]	train's auc: 0.72681	train's binary_logloss: 0.173234	valid's auc: 0.724478	valid's binary_logloss: 0.173713
[147]	train's auc: 0.726885	train's binary_logloss: 0.173218	valid's auc: 0.724544	valid's binary_logloss: 0.173699
[148]	train's auc: 0.726997	train's binary_logloss: 0.173201	valid's auc:

[211]	train's auc: 0.732173	train's binary_logloss: 0.172281	valid's auc: 0.729009	valid's binary_logloss: 0.172913
[212]	train's auc: 0.73234	train's binary_logloss: 0.172257	valid's auc: 0.72915	valid's binary_logloss: 0.172892
[213]	train's auc: 0.732401	train's binary_logloss: 0.172245	valid's auc: 0.729218	valid's binary_logloss: 0.172881
[214]	train's auc: 0.732513	train's binary_logloss: 0.17223	valid's auc: 0.729312	valid's binary_logloss: 0.172869
[215]	train's auc: 0.732667	train's binary_logloss: 0.172212	valid's auc: 0.729456	valid's binary_logloss: 0.172853
[216]	train's auc: 0.732763	train's binary_logloss: 0.172197	valid's auc: 0.729528	valid's binary_logloss: 0.172841
[217]	train's auc: 0.732819	train's binary_logloss: 0.172187	valid's auc: 0.729564	valid's binary_logloss: 0.172834
[218]	train's auc: 0.732921	train's binary_logloss: 0.172172	valid's auc: 0.729643	valid's binary_logloss: 0.172821
[219]	train's auc: 0.732985	train's binary_logloss: 0.172158	valid's auc: 0

[282]	train's auc: 0.736672	train's binary_logloss: 0.171521	valid's auc: 0.732414	valid's binary_logloss: 0.172329
[283]	train's auc: 0.736717	train's binary_logloss: 0.171514	valid's auc: 0.73244	valid's binary_logloss: 0.172323
[284]	train's auc: 0.736756	train's binary_logloss: 0.171507	valid's auc: 0.732462	valid's binary_logloss: 0.17232
[285]	train's auc: 0.736878	train's binary_logloss: 0.171491	valid's auc: 0.73257	valid's binary_logloss: 0.172306
[286]	train's auc: 0.736942	train's binary_logloss: 0.171482	valid's auc: 0.732607	valid's binary_logloss: 0.172298
[287]	train's auc: 0.736989	train's binary_logloss: 0.171473	valid's auc: 0.732643	valid's binary_logloss: 0.172292
[288]	train's auc: 0.737044	train's binary_logloss: 0.171466	valid's auc: 0.732669	valid's binary_logloss: 0.172288
[289]	train's auc: 0.737085	train's binary_logloss: 0.171459	valid's auc: 0.732712	valid's binary_logloss: 0.172282
[290]	train's auc: 0.737121	train's binary_logloss: 0.171452	valid's auc: 0

[353]	train's auc: 0.739982	train's binary_logloss: 0.170956	valid's auc: 0.734583	valid's binary_logloss: 0.171949
[354]	train's auc: 0.74002	train's binary_logloss: 0.170952	valid's auc: 0.734593	valid's binary_logloss: 0.171947
[355]	train's auc: 0.740058	train's binary_logloss: 0.170946	valid's auc: 0.734606	valid's binary_logloss: 0.171944
[356]	train's auc: 0.740087	train's binary_logloss: 0.170939	valid's auc: 0.734625	valid's binary_logloss: 0.17194
[357]	train's auc: 0.740122	train's binary_logloss: 0.170929	valid's auc: 0.734647	valid's binary_logloss: 0.171932
[358]	train's auc: 0.740149	train's binary_logloss: 0.170924	valid's auc: 0.734662	valid's binary_logloss: 0.17193
[359]	train's auc: 0.740183	train's binary_logloss: 0.170919	valid's auc: 0.734667	valid's binary_logloss: 0.171928
[360]	train's auc: 0.740207	train's binary_logloss: 0.170914	valid's auc: 0.73468	valid's binary_logloss: 0.171925
[361]	train's auc: 0.740238	train's binary_logloss: 0.170909	valid's auc: 0.

[424]	train's auc: 0.742495	train's binary_logloss: 0.170517	valid's auc: 0.735992	valid's binary_logloss: 0.171691
[425]	train's auc: 0.742512	train's binary_logloss: 0.170512	valid's auc: 0.736003	valid's binary_logloss: 0.171689
[426]	train's auc: 0.742544	train's binary_logloss: 0.170507	valid's auc: 0.736019	valid's binary_logloss: 0.171686
[427]	train's auc: 0.742565	train's binary_logloss: 0.1705	valid's auc: 0.736036	valid's binary_logloss: 0.171682
[428]	train's auc: 0.7426	train's binary_logloss: 0.170494	valid's auc: 0.736059	valid's binary_logloss: 0.171678
[429]	train's auc: 0.742628	train's binary_logloss: 0.170488	valid's auc: 0.736081	valid's binary_logloss: 0.171674
[430]	train's auc: 0.742653	train's binary_logloss: 0.170483	valid's auc: 0.736088	valid's binary_logloss: 0.171672
[431]	train's auc: 0.74268	train's binary_logloss: 0.170478	valid's auc: 0.736084	valid's binary_logloss: 0.171672
[432]	train's auc: 0.74271	train's binary_logloss: 0.170472	valid's auc: 0.73

[495]	train's auc: 0.744893	train's binary_logloss: 0.170102	valid's auc: 0.737292	valid's binary_logloss: 0.171462
[496]	train's auc: 0.744949	train's binary_logloss: 0.170094	valid's auc: 0.737323	valid's binary_logloss: 0.171457
[497]	train's auc: 0.744973	train's binary_logloss: 0.170089	valid's auc: 0.737333	valid's binary_logloss: 0.171456
[498]	train's auc: 0.745	train's binary_logloss: 0.170085	valid's auc: 0.737342	valid's binary_logloss: 0.171454
[499]	train's auc: 0.74503	train's binary_logloss: 0.17008	valid's auc: 0.73735	valid's binary_logloss: 0.171453
[500]	train's auc: 0.745056	train's binary_logloss: 0.170076	valid's auc: 0.737352	valid's binary_logloss: 0.171451
[501]	train's auc: 0.745091	train's binary_logloss: 0.170069	valid's auc: 0.737378	valid's binary_logloss: 0.171447
[502]	train's auc: 0.745115	train's binary_logloss: 0.170064	valid's auc: 0.737388	valid's binary_logloss: 0.171445
[503]	train's auc: 0.745135	train's binary_logloss: 0.17006	valid's auc: 0.737

[566]	train's auc: 0.746939	train's binary_logloss: 0.169745	valid's auc: 0.738161	valid's binary_logloss: 0.1713
[567]	train's auc: 0.74696	train's binary_logloss: 0.169741	valid's auc: 0.73817	valid's binary_logloss: 0.171299
[568]	train's auc: 0.746986	train's binary_logloss: 0.169737	valid's auc: 0.738179	valid's binary_logloss: 0.171297
[569]	train's auc: 0.747012	train's binary_logloss: 0.169733	valid's auc: 0.73819	valid's binary_logloss: 0.171295
[570]	train's auc: 0.747042	train's binary_logloss: 0.169729	valid's auc: 0.738197	valid's binary_logloss: 0.171294
[571]	train's auc: 0.747078	train's binary_logloss: 0.169723	valid's auc: 0.73821	valid's binary_logloss: 0.17129
[572]	train's auc: 0.747102	train's binary_logloss: 0.169719	valid's auc: 0.738215	valid's binary_logloss: 0.17129
[573]	train's auc: 0.747141	train's binary_logloss: 0.169711	valid's auc: 0.738245	valid's binary_logloss: 0.171284
[574]	train's auc: 0.747166	train's binary_logloss: 0.169707	valid's auc: 0.7382

[637]	train's auc: 0.748963	train's binary_logloss: 0.1694	valid's auc: 0.739044	valid's binary_logloss: 0.171136
[638]	train's auc: 0.748984	train's binary_logloss: 0.169397	valid's auc: 0.739044	valid's binary_logloss: 0.171135
[639]	train's auc: 0.74901	train's binary_logloss: 0.169391	valid's auc: 0.739065	valid's binary_logloss: 0.171132
[640]	train's auc: 0.749034	train's binary_logloss: 0.169388	valid's auc: 0.739064	valid's binary_logloss: 0.171132
[641]	train's auc: 0.749079	train's binary_logloss: 0.169381	valid's auc: 0.739083	valid's binary_logloss: 0.171128
[642]	train's auc: 0.749093	train's binary_logloss: 0.169377	valid's auc: 0.739082	valid's binary_logloss: 0.171128
[643]	train's auc: 0.749113	train's binary_logloss: 0.169373	valid's auc: 0.739088	valid's binary_logloss: 0.171127
[644]	train's auc: 0.749141	train's binary_logloss: 0.169369	valid's auc: 0.739101	valid's binary_logloss: 0.171124
[645]	train's auc: 0.749164	train's binary_logloss: 0.169366	valid's auc: 0

[708]	train's auc: 0.750823	train's binary_logloss: 0.169085	valid's auc: 0.739743	valid's binary_logloss: 0.171005
[709]	train's auc: 0.750846	train's binary_logloss: 0.169081	valid's auc: 0.739745	valid's binary_logloss: 0.171005
[710]	train's auc: 0.75088	train's binary_logloss: 0.169075	valid's auc: 0.739766	valid's binary_logloss: 0.171001
[711]	train's auc: 0.7509	train's binary_logloss: 0.169072	valid's auc: 0.739769	valid's binary_logloss: 0.171001
[712]	train's auc: 0.750924	train's binary_logloss: 0.169067	valid's auc: 0.739774	valid's binary_logloss: 0.171
[713]	train's auc: 0.750949	train's binary_logloss: 0.169064	valid's auc: 0.73978	valid's binary_logloss: 0.170999
[714]	train's auc: 0.750963	train's binary_logloss: 0.169061	valid's auc: 0.739782	valid's binary_logloss: 0.170999
[715]	train's auc: 0.750997	train's binary_logloss: 0.169053	valid's auc: 0.739787	valid's binary_logloss: 0.170996
[716]	train's auc: 0.751018	train's binary_logloss: 0.169049	valid's auc: 0.739

[779]	train's auc: 0.752469	train's binary_logloss: 0.168799	valid's auc: 0.740249	valid's binary_logloss: 0.170909
[780]	train's auc: 0.752514	train's binary_logloss: 0.168792	valid's auc: 0.740278	valid's binary_logloss: 0.170902
[781]	train's auc: 0.75254	train's binary_logloss: 0.168788	valid's auc: 0.740281	valid's binary_logloss: 0.170902
[782]	train's auc: 0.752564	train's binary_logloss: 0.168783	valid's auc: 0.740295	valid's binary_logloss: 0.1709
[783]	train's auc: 0.752583	train's binary_logloss: 0.16878	valid's auc: 0.740291	valid's binary_logloss: 0.1709
[784]	train's auc: 0.752618	train's binary_logloss: 0.168774	valid's auc: 0.74031	valid's binary_logloss: 0.170897
[785]	train's auc: 0.752646	train's binary_logloss: 0.168768	valid's auc: 0.740324	valid's binary_logloss: 0.170894
[786]	train's auc: 0.752667	train's binary_logloss: 0.168765	valid's auc: 0.74033	valid's binary_logloss: 0.170893
[787]	train's auc: 0.752684	train's binary_logloss: 0.168761	valid's auc: 0.7403

[850]	train's auc: 0.754153	train's binary_logloss: 0.168516	valid's auc: 0.740758	valid's binary_logloss: 0.170811
[851]	train's auc: 0.75417	train's binary_logloss: 0.168513	valid's auc: 0.740762	valid's binary_logloss: 0.17081
[852]	train's auc: 0.754191	train's binary_logloss: 0.168509	valid's auc: 0.740764	valid's binary_logloss: 0.170809
[853]	train's auc: 0.754213	train's binary_logloss: 0.168506	valid's auc: 0.740763	valid's binary_logloss: 0.170809
[854]	train's auc: 0.754241	train's binary_logloss: 0.168502	valid's auc: 0.740776	valid's binary_logloss: 0.170808
[855]	train's auc: 0.754264	train's binary_logloss: 0.168498	valid's auc: 0.740779	valid's binary_logloss: 0.170807
[856]	train's auc: 0.754283	train's binary_logloss: 0.168494	valid's auc: 0.740779	valid's binary_logloss: 0.170807
[857]	train's auc: 0.75432	train's binary_logloss: 0.168489	valid's auc: 0.740795	valid's binary_logloss: 0.170804
[858]	train's auc: 0.754332	train's binary_logloss: 0.168486	valid's auc: 0

[921]	train's auc: 0.755581	train's binary_logloss: 0.168269	valid's auc: 0.741133	valid's binary_logloss: 0.170744
[922]	train's auc: 0.755602	train's binary_logloss: 0.168265	valid's auc: 0.741144	valid's binary_logloss: 0.170742
[923]	train's auc: 0.755619	train's binary_logloss: 0.168261	valid's auc: 0.741153	valid's binary_logloss: 0.170741
[924]	train's auc: 0.75564	train's binary_logloss: 0.168258	valid's auc: 0.741165	valid's binary_logloss: 0.170739
[925]	train's auc: 0.755667	train's binary_logloss: 0.168254	valid's auc: 0.741172	valid's binary_logloss: 0.170739
[926]	train's auc: 0.755687	train's binary_logloss: 0.16825	valid's auc: 0.741178	valid's binary_logloss: 0.170737
[927]	train's auc: 0.755702	train's binary_logloss: 0.168248	valid's auc: 0.741179	valid's binary_logloss: 0.170737
[928]	train's auc: 0.755721	train's binary_logloss: 0.168246	valid's auc: 0.741179	valid's binary_logloss: 0.170737
[929]	train's auc: 0.755751	train's binary_logloss: 0.168241	valid's auc: 

[992]	train's auc: 0.757041	train's binary_logloss: 0.168018	valid's auc: 0.741523	valid's binary_logloss: 0.170672
[993]	train's auc: 0.757057	train's binary_logloss: 0.168014	valid's auc: 0.741528	valid's binary_logloss: 0.170671
[994]	train's auc: 0.757076	train's binary_logloss: 0.168011	valid's auc: 0.741535	valid's binary_logloss: 0.17067
[995]	train's auc: 0.757103	train's binary_logloss: 0.168008	valid's auc: 0.741544	valid's binary_logloss: 0.170669
[996]	train's auc: 0.757124	train's binary_logloss: 0.168003	valid's auc: 0.741547	valid's binary_logloss: 0.170668
[997]	train's auc: 0.757159	train's binary_logloss: 0.167999	valid's auc: 0.741562	valid's binary_logloss: 0.170666
[998]	train's auc: 0.757188	train's binary_logloss: 0.167994	valid's auc: 0.741567	valid's binary_logloss: 0.170664
[999]	train's auc: 0.757207	train's binary_logloss: 0.16799	valid's auc: 0.741576	valid's binary_logloss: 0.170662
[1000]	train's auc: 0.757222	train's binary_logloss: 0.167988	valid's auc:

[1063]	train's auc: 0.758443	train's binary_logloss: 0.167769	valid's auc: 0.741955	valid's binary_logloss: 0.170596
[1064]	train's auc: 0.758465	train's binary_logloss: 0.167765	valid's auc: 0.741964	valid's binary_logloss: 0.170595
[1065]	train's auc: 0.758483	train's binary_logloss: 0.16776	valid's auc: 0.741974	valid's binary_logloss: 0.170593
[1066]	train's auc: 0.758505	train's binary_logloss: 0.167757	valid's auc: 0.741974	valid's binary_logloss: 0.170592
[1067]	train's auc: 0.758525	train's binary_logloss: 0.167754	valid's auc: 0.741968	valid's binary_logloss: 0.170592
[1068]	train's auc: 0.758544	train's binary_logloss: 0.16775	valid's auc: 0.741952	valid's binary_logloss: 0.170593
[1069]	train's auc: 0.758565	train's binary_logloss: 0.167747	valid's auc: 0.741951	valid's binary_logloss: 0.170593
[1070]	train's auc: 0.758586	train's binary_logloss: 0.167743	valid's auc: 0.741954	valid's binary_logloss: 0.170593
[1071]	train's auc: 0.758601	train's binary_logloss: 0.16774	valid

[1134]	train's auc: 0.759823	train's binary_logloss: 0.167529	valid's auc: 0.74215	valid's binary_logloss: 0.170547
[1135]	train's auc: 0.759843	train's binary_logloss: 0.167525	valid's auc: 0.742149	valid's binary_logloss: 0.170547
[1136]	train's auc: 0.759862	train's binary_logloss: 0.167522	valid's auc: 0.742158	valid's binary_logloss: 0.170545
[1137]	train's auc: 0.759882	train's binary_logloss: 0.167518	valid's auc: 0.742168	valid's binary_logloss: 0.170543
[1138]	train's auc: 0.759917	train's binary_logloss: 0.167512	valid's auc: 0.742191	valid's binary_logloss: 0.170539
[1139]	train's auc: 0.759936	train's binary_logloss: 0.167509	valid's auc: 0.7422	valid's binary_logloss: 0.170538
[1140]	train's auc: 0.759957	train's binary_logloss: 0.167506	valid's auc: 0.742202	valid's binary_logloss: 0.170538
[1141]	train's auc: 0.759977	train's binary_logloss: 0.167503	valid's auc: 0.742201	valid's binary_logloss: 0.170538
[1142]	train's auc: 0.759999	train's binary_logloss: 0.1675	valid's

[1205]	train's auc: 0.761133	train's binary_logloss: 0.167303	valid's auc: 0.742352	valid's binary_logloss: 0.170502
[1206]	train's auc: 0.761155	train's binary_logloss: 0.1673	valid's auc: 0.742353	valid's binary_logloss: 0.170501
[1207]	train's auc: 0.761174	train's binary_logloss: 0.167296	valid's auc: 0.74235	valid's binary_logloss: 0.170501
[1208]	train's auc: 0.761192	train's binary_logloss: 0.167293	valid's auc: 0.742353	valid's binary_logloss: 0.1705
[1209]	train's auc: 0.761212	train's binary_logloss: 0.16729	valid's auc: 0.742356	valid's binary_logloss: 0.1705
[1210]	train's auc: 0.761226	train's binary_logloss: 0.167288	valid's auc: 0.74236	valid's binary_logloss: 0.170499
[1211]	train's auc: 0.761245	train's binary_logloss: 0.167285	valid's auc: 0.74237	valid's binary_logloss: 0.170497
[1212]	train's auc: 0.761261	train's binary_logloss: 0.167282	valid's auc: 0.742373	valid's binary_logloss: 0.170497
[1213]	train's auc: 0.761275	train's binary_logloss: 0.167278	valid's auc:

[1276]	train's auc: 0.762399	train's binary_logloss: 0.167081	valid's auc: 0.742562	valid's binary_logloss: 0.170452
[1277]	train's auc: 0.762416	train's binary_logloss: 0.167079	valid's auc: 0.742567	valid's binary_logloss: 0.170451
[1278]	train's auc: 0.762437	train's binary_logloss: 0.167076	valid's auc: 0.742571	valid's binary_logloss: 0.170451
[1279]	train's auc: 0.762455	train's binary_logloss: 0.167072	valid's auc: 0.742571	valid's binary_logloss: 0.170451
[1280]	train's auc: 0.76247	train's binary_logloss: 0.16707	valid's auc: 0.742573	valid's binary_logloss: 0.170451
[1281]	train's auc: 0.762491	train's binary_logloss: 0.167067	valid's auc: 0.742586	valid's binary_logloss: 0.17045
[1282]	train's auc: 0.762509	train's binary_logloss: 0.167064	valid's auc: 0.742587	valid's binary_logloss: 0.17045
[1283]	train's auc: 0.762529	train's binary_logloss: 0.167061	valid's auc: 0.742582	valid's binary_logloss: 0.17045
[1284]	train's auc: 0.762549	train's binary_logloss: 0.167057	valid's

[1347]	train's auc: 0.763624	train's binary_logloss: 0.166866	valid's auc: 0.742811	valid's binary_logloss: 0.17041
[1348]	train's auc: 0.763644	train's binary_logloss: 0.166863	valid's auc: 0.742812	valid's binary_logloss: 0.17041
[1349]	train's auc: 0.763662	train's binary_logloss: 0.16686	valid's auc: 0.742817	valid's binary_logloss: 0.170409
[1350]	train's auc: 0.763681	train's binary_logloss: 0.166857	valid's auc: 0.742818	valid's binary_logloss: 0.170409
[1351]	train's auc: 0.763698	train's binary_logloss: 0.166854	valid's auc: 0.742818	valid's binary_logloss: 0.170409
[1352]	train's auc: 0.76372	train's binary_logloss: 0.166851	valid's auc: 0.742816	valid's binary_logloss: 0.170409
[1353]	train's auc: 0.763743	train's binary_logloss: 0.166848	valid's auc: 0.742809	valid's binary_logloss: 0.17041
[1354]	train's auc: 0.763757	train's binary_logloss: 0.166845	valid's auc: 0.742812	valid's binary_logloss: 0.170409
[1355]	train's auc: 0.763771	train's binary_logloss: 0.166842	valid's

[1418]	train's auc: 0.764856	train's binary_logloss: 0.166656	valid's auc: 0.742928	valid's binary_logloss: 0.170386
[1419]	train's auc: 0.764871	train's binary_logloss: 0.166654	valid's auc: 0.742924	valid's binary_logloss: 0.170387
[1420]	train's auc: 0.764886	train's binary_logloss: 0.166651	valid's auc: 0.742925	valid's binary_logloss: 0.170386
[1421]	train's auc: 0.764902	train's binary_logloss: 0.166647	valid's auc: 0.74293	valid's binary_logloss: 0.170385
[1422]	train's auc: 0.764923	train's binary_logloss: 0.166644	valid's auc: 0.742933	valid's binary_logloss: 0.170385
[1423]	train's auc: 0.764948	train's binary_logloss: 0.166641	valid's auc: 0.742926	valid's binary_logloss: 0.170386
[1424]	train's auc: 0.764959	train's binary_logloss: 0.166638	valid's auc: 0.742928	valid's binary_logloss: 0.170385
[1425]	train's auc: 0.764977	train's binary_logloss: 0.166633	valid's auc: 0.742934	valid's binary_logloss: 0.170384
[1426]	train's auc: 0.76499	train's binary_logloss: 0.16663	valid

[1489]	train's auc: 0.76607	train's binary_logloss: 0.166444	valid's auc: 0.743091	valid's binary_logloss: 0.170351
[1490]	train's auc: 0.766091	train's binary_logloss: 0.166441	valid's auc: 0.7431	valid's binary_logloss: 0.170351
[1491]	train's auc: 0.766106	train's binary_logloss: 0.166439	valid's auc: 0.743102	valid's binary_logloss: 0.170351
[1492]	train's auc: 0.766125	train's binary_logloss: 0.166436	valid's auc: 0.743102	valid's binary_logloss: 0.170351
[1493]	train's auc: 0.766145	train's binary_logloss: 0.166433	valid's auc: 0.743095	valid's binary_logloss: 0.170352
[1494]	train's auc: 0.766161	train's binary_logloss: 0.166431	valid's auc: 0.743102	valid's binary_logloss: 0.170351
[1495]	train's auc: 0.76617	train's binary_logloss: 0.166428	valid's auc: 0.743102	valid's binary_logloss: 0.170351
[1496]	train's auc: 0.766187	train's binary_logloss: 0.166425	valid's auc: 0.7431	valid's binary_logloss: 0.170351
[1497]	train's auc: 0.766214	train's binary_logloss: 0.166421	valid's 

[1560]	train's auc: 0.767253	train's binary_logloss: 0.16623	valid's auc: 0.74325	valid's binary_logloss: 0.170317
[1561]	train's auc: 0.767268	train's binary_logloss: 0.166227	valid's auc: 0.743254	valid's binary_logloss: 0.170316
[1562]	train's auc: 0.767289	train's binary_logloss: 0.166224	valid's auc: 0.743258	valid's binary_logloss: 0.170315
[1563]	train's auc: 0.767303	train's binary_logloss: 0.166222	valid's auc: 0.743256	valid's binary_logloss: 0.170316
[1564]	train's auc: 0.767317	train's binary_logloss: 0.16622	valid's auc: 0.743257	valid's binary_logloss: 0.170315
[1565]	train's auc: 0.767331	train's binary_logloss: 0.166217	valid's auc: 0.743249	valid's binary_logloss: 0.170316
[1566]	train's auc: 0.767352	train's binary_logloss: 0.166214	valid's auc: 0.743246	valid's binary_logloss: 0.170316
[1567]	train's auc: 0.767373	train's binary_logloss: 0.16621	valid's auc: 0.743245	valid's binary_logloss: 0.170316
[1568]	train's auc: 0.76739	train's binary_logloss: 0.166208	valid's

[1631]	train's auc: 0.768435	train's binary_logloss: 0.166031	valid's auc: 0.743344	valid's binary_logloss: 0.170296
[1632]	train's auc: 0.768453	train's binary_logloss: 0.166028	valid's auc: 0.743349	valid's binary_logloss: 0.170295
[1633]	train's auc: 0.76847	train's binary_logloss: 0.166024	valid's auc: 0.743342	valid's binary_logloss: 0.170296
[1634]	train's auc: 0.768488	train's binary_logloss: 0.166021	valid's auc: 0.743339	valid's binary_logloss: 0.170296
[1635]	train's auc: 0.768512	train's binary_logloss: 0.166018	valid's auc: 0.743342	valid's binary_logloss: 0.170296
[1636]	train's auc: 0.768528	train's binary_logloss: 0.166014	valid's auc: 0.743347	valid's binary_logloss: 0.170296
[1637]	train's auc: 0.768549	train's binary_logloss: 0.166011	valid's auc: 0.743347	valid's binary_logloss: 0.170296
[1638]	train's auc: 0.768564	train's binary_logloss: 0.166008	valid's auc: 0.743345	valid's binary_logloss: 0.170296
[1639]	train's auc: 0.76858	train's binary_logloss: 0.166005	vali

[1702]	train's auc: 0.769563	train's binary_logloss: 0.165831	valid's auc: 0.743472	valid's binary_logloss: 0.170272
[1703]	train's auc: 0.76958	train's binary_logloss: 0.165828	valid's auc: 0.743476	valid's binary_logloss: 0.170272
[1704]	train's auc: 0.7696	train's binary_logloss: 0.165825	valid's auc: 0.743478	valid's binary_logloss: 0.170272
[1705]	train's auc: 0.769624	train's binary_logloss: 0.165822	valid's auc: 0.743477	valid's binary_logloss: 0.170271
[1706]	train's auc: 0.769637	train's binary_logloss: 0.165819	valid's auc: 0.743477	valid's binary_logloss: 0.170271
[1707]	train's auc: 0.769652	train's binary_logloss: 0.165816	valid's auc: 0.743484	valid's binary_logloss: 0.17027
[1708]	train's auc: 0.769664	train's binary_logloss: 0.165815	valid's auc: 0.743478	valid's binary_logloss: 0.170271
[1709]	train's auc: 0.769682	train's binary_logloss: 0.165812	valid's auc: 0.743474	valid's binary_logloss: 0.170271
[1710]	train's auc: 0.769693	train's binary_logloss: 0.165809	valid'

[1773]	train's auc: 0.770709	train's binary_logloss: 0.16563	valid's auc: 0.743558	valid's binary_logloss: 0.170253
[1774]	train's auc: 0.770731	train's binary_logloss: 0.165627	valid's auc: 0.74355	valid's binary_logloss: 0.170254
[1775]	train's auc: 0.770742	train's binary_logloss: 0.165625	valid's auc: 0.743548	valid's binary_logloss: 0.170254
[1776]	train's auc: 0.770754	train's binary_logloss: 0.165623	valid's auc: 0.743542	valid's binary_logloss: 0.170255
[1777]	train's auc: 0.770771	train's binary_logloss: 0.16562	valid's auc: 0.743552	valid's binary_logloss: 0.170254
[1778]	train's auc: 0.770793	train's binary_logloss: 0.165617	valid's auc: 0.743551	valid's binary_logloss: 0.170254
[1779]	train's auc: 0.770808	train's binary_logloss: 0.165614	valid's auc: 0.743553	valid's binary_logloss: 0.170254
[1780]	train's auc: 0.770828	train's binary_logloss: 0.165611	valid's auc: 0.743555	valid's binary_logloss: 0.170254
[1781]	train's auc: 0.770838	train's binary_logloss: 0.165608	valid

[1844]	train's auc: 0.771813	train's binary_logloss: 0.165438	valid's auc: 0.743682	valid's binary_logloss: 0.170231
[1845]	train's auc: 0.771819	train's binary_logloss: 0.165435	valid's auc: 0.743685	valid's binary_logloss: 0.17023
[1846]	train's auc: 0.771832	train's binary_logloss: 0.165433	valid's auc: 0.743688	valid's binary_logloss: 0.17023
[1847]	train's auc: 0.771842	train's binary_logloss: 0.165431	valid's auc: 0.743688	valid's binary_logloss: 0.17023
[1848]	train's auc: 0.771854	train's binary_logloss: 0.165428	valid's auc: 0.743697	valid's binary_logloss: 0.170229
[1849]	train's auc: 0.771869	train's binary_logloss: 0.165425	valid's auc: 0.743696	valid's binary_logloss: 0.170229
[1850]	train's auc: 0.771888	train's binary_logloss: 0.165422	valid's auc: 0.743697	valid's binary_logloss: 0.170229
[1851]	train's auc: 0.771902	train's binary_logloss: 0.16542	valid's auc: 0.743696	valid's binary_logloss: 0.170229
[1852]	train's auc: 0.771917	train's binary_logloss: 0.165418	valid'

[1915]	train's auc: 0.772879	train's binary_logloss: 0.165249	valid's auc: 0.743736	valid's binary_logloss: 0.170219
[1916]	train's auc: 0.772896	train's binary_logloss: 0.165246	valid's auc: 0.743737	valid's binary_logloss: 0.170218
[1917]	train's auc: 0.772909	train's binary_logloss: 0.165244	valid's auc: 0.743733	valid's binary_logloss: 0.170219
[1918]	train's auc: 0.772923	train's binary_logloss: 0.16524	valid's auc: 0.743735	valid's binary_logloss: 0.170218
[1919]	train's auc: 0.77295	train's binary_logloss: 0.165237	valid's auc: 0.743741	valid's binary_logloss: 0.170217
[1920]	train's auc: 0.772966	train's binary_logloss: 0.165235	valid's auc: 0.743741	valid's binary_logloss: 0.170217
[1921]	train's auc: 0.772978	train's binary_logloss: 0.165232	valid's auc: 0.743741	valid's binary_logloss: 0.170217
[1922]	train's auc: 0.772987	train's binary_logloss: 0.16523	valid's auc: 0.743738	valid's binary_logloss: 0.170217
[1923]	train's auc: 0.773002	train's binary_logloss: 0.165227	valid

[1986]	train's auc: 0.773895	train's binary_logloss: 0.165058	valid's auc: 0.743801	valid's binary_logloss: 0.170204
[1987]	train's auc: 0.773905	train's binary_logloss: 0.165055	valid's auc: 0.743801	valid's binary_logloss: 0.170203
[1988]	train's auc: 0.773915	train's binary_logloss: 0.165053	valid's auc: 0.743799	valid's binary_logloss: 0.170203
[1989]	train's auc: 0.773939	train's binary_logloss: 0.16505	valid's auc: 0.74379	valid's binary_logloss: 0.170204
[1990]	train's auc: 0.773945	train's binary_logloss: 0.165047	valid's auc: 0.743791	valid's binary_logloss: 0.170204
[1991]	train's auc: 0.773966	train's binary_logloss: 0.165043	valid's auc: 0.743794	valid's binary_logloss: 0.170203
[1992]	train's auc: 0.773983	train's binary_logloss: 0.16504	valid's auc: 0.743795	valid's binary_logloss: 0.170203
[1993]	train's auc: 0.773999	train's binary_logloss: 0.165038	valid's auc: 0.743798	valid's binary_logloss: 0.170203
[1994]	train's auc: 0.774012	train's binary_logloss: 0.165035	valid

[2057]	train's auc: 0.77491	train's binary_logloss: 0.164867	valid's auc: 0.743801	valid's binary_logloss: 0.170196
[2058]	train's auc: 0.774924	train's binary_logloss: 0.164864	valid's auc: 0.743797	valid's binary_logloss: 0.170197
[2059]	train's auc: 0.774933	train's binary_logloss: 0.164862	valid's auc: 0.7438	valid's binary_logloss: 0.170196
[2060]	train's auc: 0.77495	train's binary_logloss: 0.164859	valid's auc: 0.743797	valid's binary_logloss: 0.170197
[2061]	train's auc: 0.774967	train's binary_logloss: 0.164857	valid's auc: 0.743799	valid's binary_logloss: 0.170197
[2062]	train's auc: 0.774984	train's binary_logloss: 0.164854	valid's auc: 0.743803	valid's binary_logloss: 0.170197
[2063]	train's auc: 0.774995	train's binary_logloss: 0.164852	valid's auc: 0.743803	valid's binary_logloss: 0.170197
[2064]	train's auc: 0.775002	train's binary_logloss: 0.164849	valid's auc: 0.743805	valid's binary_logloss: 0.170196
[2065]	train's auc: 0.775021	train's binary_logloss: 0.164847	valid'

[2128]	train's auc: 0.775937	train's binary_logloss: 0.164684	valid's auc: 0.743805	valid's binary_logloss: 0.170191
[2129]	train's auc: 0.77595	train's binary_logloss: 0.164681	valid's auc: 0.743805	valid's binary_logloss: 0.17019
[2130]	train's auc: 0.775965	train's binary_logloss: 0.164678	valid's auc: 0.743807	valid's binary_logloss: 0.17019
[2131]	train's auc: 0.775984	train's binary_logloss: 0.164675	valid's auc: 0.743808	valid's binary_logloss: 0.17019
[2132]	train's auc: 0.776001	train's binary_logloss: 0.164673	valid's auc: 0.743807	valid's binary_logloss: 0.17019
[2133]	train's auc: 0.77602	train's binary_logloss: 0.16467	valid's auc: 0.743807	valid's binary_logloss: 0.170189
[2134]	train's auc: 0.776035	train's binary_logloss: 0.164667	valid's auc: 0.743801	valid's binary_logloss: 0.17019
[2135]	train's auc: 0.77605	train's binary_logloss: 0.164665	valid's auc: 0.743804	valid's binary_logloss: 0.17019
[2136]	train's auc: 0.776073	train's binary_logloss: 0.164661	valid's auc:

LGBMClassifier(colsample_bytree=0.7, n_estimators=10000, num_leaves=40,
               random_state=2018, reg_alpha=6, reg_lambda=3, silent=False,
               subsample=0.7, subsample_freq=1)

In [14]:
print('===================Test Set Performance===================')
y_valid_pred = clf.predict_proba(x_valid, num_iteration = clf.best_iteration_)[:,1]
print(pd.Series(y_valid_pred).describe())
print('Test AUC', roc_auc_score(y_valid, y_valid_pred))
print('Test Logloss', log_loss(y_valid, y_valid_pred))

count    1.759974e+06
mean     4.796217e-02
std      5.839436e-02
min      2.186377e-04
25%      2.071137e-02
50%      3.481173e-02
75%      5.510371e-02
max      9.953930e-01
dtype: float64
Test AUC 0.7438440879170812
Test Logloss 0.17018985844814913


# 加入交叉特征

In [5]:
# construct x matrix
x_train = sparse.load_npz(data_prep_dir + 'train_x_sparse_selection.npz')
x_valid = sparse.load_npz(data_prep_dir + 'valid_x_sparse_selection.npz')
# train_x_sparse_cross_selection.npz
# valid_x_sparse_cross_selection.npz
x_train_cross = sparse.load_npz(data_prep_dir + 'train_x_sparse_cross_selection.npz')
x_valid_cross = sparse.load_npz(data_prep_dir + 'valid_x_sparse_cross_selection.npz')

x_train = sparse.hstack((x_train, x_train_cross)).tocsc()
x_valid = sparse.hstack((x_valid, x_valid_cross)).tocsc()

print('x_train:', x_train.shape)
print('x_valid:', x_valid.shape)

x_train: (7038840, 9300)
x_valid: (1759974, 9300)


In [6]:
del x_train_cross, x_valid_cross

In [7]:
# construct y vector
y_train = np.array(df_data[df_data['n_parts'] != 1]['label'])
y_valid = np.array(df_data[df_data['n_parts'] == 1]['label'])

print('y_train:', y_train.shape)
print('y_valid:', y_valid.shape)

y_train: (7038840,)
y_valid: (1759974,)


## LightGBM

In [24]:
# 模型定义
clf = LGBMClassifier(boosting_type='gbdt', num_leaves=40, max_depth=-1, learning_rate=0.1, 
                n_estimators=10000, subsample_for_bin=200000, objective=None, 
                class_weight=None, min_split_gain=0.0, min_child_weight=0.001, 
                min_child_samples=20, subsample=0.7, subsample_freq=1, 
                colsample_bytree=0.7, 
                reg_alpha=6, reg_lambda=3,
                random_state=2018, n_jobs=-1, silent=False)

In [25]:
# 模型训练
clf.fit(x_train, y_train, eval_set = [(x_train, y_train), (x_valid, y_valid)], 
        eval_names =['train','valid'],
        eval_metric='auc', early_stopping_rounds=50)

[LightGBM] [Info] Number of positive: 337465, number of negative: 6701375
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18638
[LightGBM] [Info] Number of data points in the train set: 7038840, number of used features: 9241
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047943 -> initscore=-2.988606
[LightGBM] [Info] Start training from score -2.988606
[1]	train's auc: 0.643552	train's binary_logloss: 0.189725	valid's auc: 0.642099	valid's binary_logloss: 0.189903
Training until validation scores don't improve for 50 rounds
[2]	train's auc: 0.654497	train's binary_logloss: 0.188154	valid's auc: 0.653247	valid's binary_logloss: 0.188321
[3]	train's auc: 0.663561	train's binary_logloss: 0.186991	valid's auc: 0.661819	valid's binary_logloss: 0.187186
[4]	train's auc: 0.668731	train's binary_logloss: 0.185969	valid's auc: 0.667142	valid's binary_logloss: 0.186171
[5]	train's auc: 0.67

[68]	train's auc: 0.717159	train's binary_logloss: 0.175005	valid's auc: 0.715649	valid's binary_logloss: 0.175351
[69]	train's auc: 0.717514	train's binary_logloss: 0.174953	valid's auc: 0.716007	valid's binary_logloss: 0.1753
[70]	train's auc: 0.717738	train's binary_logloss: 0.17491	valid's auc: 0.716222	valid's binary_logloss: 0.175259
[71]	train's auc: 0.718187	train's binary_logloss: 0.174854	valid's auc: 0.716647	valid's binary_logloss: 0.175206
[72]	train's auc: 0.718467	train's binary_logloss: 0.174807	valid's auc: 0.716886	valid's binary_logloss: 0.175164
[73]	train's auc: 0.718832	train's binary_logloss: 0.174756	valid's auc: 0.717238	valid's binary_logloss: 0.175115
[74]	train's auc: 0.719127	train's binary_logloss: 0.17471	valid's auc: 0.717488	valid's binary_logloss: 0.175075
[75]	train's auc: 0.719357	train's binary_logloss: 0.174671	valid's auc: 0.717691	valid's binary_logloss: 0.175038
[76]	train's auc: 0.719607	train's binary_logloss: 0.174627	valid's auc: 0.717915	va

[140]	train's auc: 0.730597	train's binary_logloss: 0.172718	valid's auc: 0.728065	valid's binary_logloss: 0.17324
[141]	train's auc: 0.730717	train's binary_logloss: 0.172694	valid's auc: 0.728167	valid's binary_logloss: 0.17322
[142]	train's auc: 0.730809	train's binary_logloss: 0.172671	valid's auc: 0.728237	valid's binary_logloss: 0.1732
[143]	train's auc: 0.730935	train's binary_logloss: 0.172653	valid's auc: 0.728359	valid's binary_logloss: 0.173183
[144]	train's auc: 0.731031	train's binary_logloss: 0.172634	valid's auc: 0.72845	valid's binary_logloss: 0.173164
[145]	train's auc: 0.73114	train's binary_logloss: 0.172613	valid's auc: 0.728547	valid's binary_logloss: 0.173145
[146]	train's auc: 0.731256	train's binary_logloss: 0.172594	valid's auc: 0.728627	valid's binary_logloss: 0.173129
[147]	train's auc: 0.731362	train's binary_logloss: 0.172574	valid's auc: 0.728711	valid's binary_logloss: 0.173113
[148]	train's auc: 0.731461	train's binary_logloss: 0.172557	valid's auc: 0.72

[211]	train's auc: 0.736957	train's binary_logloss: 0.171584	valid's auc: 0.73332	valid's binary_logloss: 0.172291
[212]	train's auc: 0.737026	train's binary_logloss: 0.17157	valid's auc: 0.733371	valid's binary_logloss: 0.172281
[213]	train's auc: 0.737092	train's binary_logloss: 0.171558	valid's auc: 0.733434	valid's binary_logloss: 0.17227
[214]	train's auc: 0.737169	train's binary_logloss: 0.171545	valid's auc: 0.733503	valid's binary_logloss: 0.172258
[215]	train's auc: 0.737236	train's binary_logloss: 0.171534	valid's auc: 0.733559	valid's binary_logloss: 0.17225
[216]	train's auc: 0.737287	train's binary_logloss: 0.171524	valid's auc: 0.733608	valid's binary_logloss: 0.172241
[217]	train's auc: 0.737344	train's binary_logloss: 0.171515	valid's auc: 0.733643	valid's binary_logloss: 0.172234
[218]	train's auc: 0.737419	train's binary_logloss: 0.171501	valid's auc: 0.733686	valid's binary_logloss: 0.172225
[219]	train's auc: 0.737511	train's binary_logloss: 0.171485	valid's auc: 0.

[282]	train's auc: 0.741097	train's binary_logloss: 0.170849	valid's auc: 0.736238	valid's binary_logloss: 0.171761
[283]	train's auc: 0.741159	train's binary_logloss: 0.17084	valid's auc: 0.736284	valid's binary_logloss: 0.171754
[284]	train's auc: 0.741206	train's binary_logloss: 0.170833	valid's auc: 0.736309	valid's binary_logloss: 0.171749
[285]	train's auc: 0.741284	train's binary_logloss: 0.170821	valid's auc: 0.736358	valid's binary_logloss: 0.171741
[286]	train's auc: 0.741332	train's binary_logloss: 0.170812	valid's auc: 0.736386	valid's binary_logloss: 0.171735
[287]	train's auc: 0.741382	train's binary_logloss: 0.170804	valid's auc: 0.736414	valid's binary_logloss: 0.17173
[288]	train's auc: 0.741439	train's binary_logloss: 0.170795	valid's auc: 0.736449	valid's binary_logloss: 0.171726
[289]	train's auc: 0.741476	train's binary_logloss: 0.170788	valid's auc: 0.736475	valid's binary_logloss: 0.171721
[290]	train's auc: 0.741517	train's binary_logloss: 0.170779	valid's auc: 

[353]	train's auc: 0.74436	train's binary_logloss: 0.170282	valid's auc: 0.738263	valid's binary_logloss: 0.171398
[354]	train's auc: 0.744399	train's binary_logloss: 0.170274	valid's auc: 0.738279	valid's binary_logloss: 0.171395
[355]	train's auc: 0.744444	train's binary_logloss: 0.170268	valid's auc: 0.738295	valid's binary_logloss: 0.171392
[356]	train's auc: 0.744487	train's binary_logloss: 0.17026	valid's auc: 0.738327	valid's binary_logloss: 0.171387
[357]	train's auc: 0.744521	train's binary_logloss: 0.170252	valid's auc: 0.738345	valid's binary_logloss: 0.171382
[358]	train's auc: 0.74456	train's binary_logloss: 0.170246	valid's auc: 0.738373	valid's binary_logloss: 0.171378
[359]	train's auc: 0.744593	train's binary_logloss: 0.170239	valid's auc: 0.73839	valid's binary_logloss: 0.171374
[360]	train's auc: 0.744624	train's binary_logloss: 0.170233	valid's auc: 0.738392	valid's binary_logloss: 0.171372
[361]	train's auc: 0.744664	train's binary_logloss: 0.170227	valid's auc: 0.

[424]	train's auc: 0.746846	train's binary_logloss: 0.169841	valid's auc: 0.73953	valid's binary_logloss: 0.171157
[425]	train's auc: 0.746868	train's binary_logloss: 0.169836	valid's auc: 0.739539	valid's binary_logloss: 0.171155
[426]	train's auc: 0.746905	train's binary_logloss: 0.16983	valid's auc: 0.739552	valid's binary_logloss: 0.171153
[427]	train's auc: 0.746935	train's binary_logloss: 0.169825	valid's auc: 0.739571	valid's binary_logloss: 0.17115
[428]	train's auc: 0.74696	train's binary_logloss: 0.16982	valid's auc: 0.73958	valid's binary_logloss: 0.171147
[429]	train's auc: 0.746987	train's binary_logloss: 0.169816	valid's auc: 0.739586	valid's binary_logloss: 0.171145
[430]	train's auc: 0.747008	train's binary_logloss: 0.16981	valid's auc: 0.739594	valid's binary_logloss: 0.171142
[431]	train's auc: 0.747043	train's binary_logloss: 0.169805	valid's auc: 0.739611	valid's binary_logloss: 0.17114
[432]	train's auc: 0.747085	train's binary_logloss: 0.169798	valid's auc: 0.7396

[495]	train's auc: 0.749061	train's binary_logloss: 0.169464	valid's auc: 0.74049	valid's binary_logloss: 0.170982
[496]	train's auc: 0.749092	train's binary_logloss: 0.16946	valid's auc: 0.74049	valid's binary_logloss: 0.170982
[497]	train's auc: 0.749133	train's binary_logloss: 0.169453	valid's auc: 0.740519	valid's binary_logloss: 0.170976
[498]	train's auc: 0.749158	train's binary_logloss: 0.169448	valid's auc: 0.740532	valid's binary_logloss: 0.170974
[499]	train's auc: 0.749187	train's binary_logloss: 0.169444	valid's auc: 0.740545	valid's binary_logloss: 0.170972
[500]	train's auc: 0.749214	train's binary_logloss: 0.16944	valid's auc: 0.740552	valid's binary_logloss: 0.170971
[501]	train's auc: 0.749247	train's binary_logloss: 0.169435	valid's auc: 0.740561	valid's binary_logloss: 0.170969
[502]	train's auc: 0.749272	train's binary_logloss: 0.169431	valid's auc: 0.740566	valid's binary_logloss: 0.170968
[503]	train's auc: 0.749297	train's binary_logloss: 0.169426	valid's auc: 0.

[566]	train's auc: 0.751066	train's binary_logloss: 0.16912	valid's auc: 0.741242	valid's binary_logloss: 0.170841
[567]	train's auc: 0.751085	train's binary_logloss: 0.169116	valid's auc: 0.741248	valid's binary_logloss: 0.170839
[568]	train's auc: 0.751112	train's binary_logloss: 0.169112	valid's auc: 0.741247	valid's binary_logloss: 0.170839
[569]	train's auc: 0.751129	train's binary_logloss: 0.169108	valid's auc: 0.74125	valid's binary_logloss: 0.170838
[570]	train's auc: 0.751151	train's binary_logloss: 0.169104	valid's auc: 0.741251	valid's binary_logloss: 0.170837
[571]	train's auc: 0.751169	train's binary_logloss: 0.1691	valid's auc: 0.74125	valid's binary_logloss: 0.170835
[572]	train's auc: 0.751198	train's binary_logloss: 0.169095	valid's auc: 0.741254	valid's binary_logloss: 0.170834
[573]	train's auc: 0.751222	train's binary_logloss: 0.169091	valid's auc: 0.741263	valid's binary_logloss: 0.170832
[574]	train's auc: 0.751251	train's binary_logloss: 0.169086	valid's auc: 0.7

[637]	train's auc: 0.752894	train's binary_logloss: 0.168805	valid's auc: 0.741731	valid's binary_logloss: 0.170741
[638]	train's auc: 0.75292	train's binary_logloss: 0.168801	valid's auc: 0.741737	valid's binary_logloss: 0.170739
[639]	train's auc: 0.752945	train's binary_logloss: 0.168797	valid's auc: 0.741744	valid's binary_logloss: 0.170738
[640]	train's auc: 0.75298	train's binary_logloss: 0.16879	valid's auc: 0.741769	valid's binary_logloss: 0.170735
[641]	train's auc: 0.753006	train's binary_logloss: 0.168787	valid's auc: 0.741774	valid's binary_logloss: 0.170734
[642]	train's auc: 0.753029	train's binary_logloss: 0.168782	valid's auc: 0.74178	valid's binary_logloss: 0.170733
[643]	train's auc: 0.753052	train's binary_logloss: 0.168779	valid's auc: 0.741786	valid's binary_logloss: 0.170731
[644]	train's auc: 0.753076	train's binary_logloss: 0.168775	valid's auc: 0.74179	valid's binary_logloss: 0.17073
[645]	train's auc: 0.753094	train's binary_logloss: 0.168772	valid's auc: 0.74

[708]	train's auc: 0.754655	train's binary_logloss: 0.16851	valid's auc: 0.742205	valid's binary_logloss: 0.170656
[709]	train's auc: 0.754669	train's binary_logloss: 0.168507	valid's auc: 0.742203	valid's binary_logloss: 0.170656
[710]	train's auc: 0.754693	train's binary_logloss: 0.168503	valid's auc: 0.742201	valid's binary_logloss: 0.170656
[711]	train's auc: 0.754717	train's binary_logloss: 0.1685	valid's auc: 0.742202	valid's binary_logloss: 0.170655
[712]	train's auc: 0.754746	train's binary_logloss: 0.168495	valid's auc: 0.742212	valid's binary_logloss: 0.170654
[713]	train's auc: 0.75478	train's binary_logloss: 0.16849	valid's auc: 0.742218	valid's binary_logloss: 0.170653
[714]	train's auc: 0.7548	train's binary_logloss: 0.168486	valid's auc: 0.742219	valid's binary_logloss: 0.170652
[715]	train's auc: 0.754824	train's binary_logloss: 0.168479	valid's auc: 0.74223	valid's binary_logloss: 0.170649
[716]	train's auc: 0.754852	train's binary_logloss: 0.168475	valid's auc: 0.7422

[779]	train's auc: 0.756271	train's binary_logloss: 0.168228	valid's auc: 0.742594	valid's binary_logloss: 0.170574
[780]	train's auc: 0.75629	train's binary_logloss: 0.168225	valid's auc: 0.742593	valid's binary_logloss: 0.170574
[781]	train's auc: 0.756306	train's binary_logloss: 0.168222	valid's auc: 0.742599	valid's binary_logloss: 0.170573
[782]	train's auc: 0.756331	train's binary_logloss: 0.168217	valid's auc: 0.742605	valid's binary_logloss: 0.170572
[783]	train's auc: 0.756369	train's binary_logloss: 0.168212	valid's auc: 0.742619	valid's binary_logloss: 0.170569
[784]	train's auc: 0.756388	train's binary_logloss: 0.168209	valid's auc: 0.742616	valid's binary_logloss: 0.170569
[785]	train's auc: 0.756404	train's binary_logloss: 0.168206	valid's auc: 0.742619	valid's binary_logloss: 0.170569
[786]	train's auc: 0.756431	train's binary_logloss: 0.168201	valid's auc: 0.742626	valid's binary_logloss: 0.170567
[787]	train's auc: 0.75645	train's binary_logloss: 0.168198	valid's auc: 

[850]	train's auc: 0.75776	train's binary_logloss: 0.167973	valid's auc: 0.742908	valid's binary_logloss: 0.170512
[851]	train's auc: 0.757781	train's binary_logloss: 0.167969	valid's auc: 0.742913	valid's binary_logloss: 0.170511
[852]	train's auc: 0.757801	train's binary_logloss: 0.167966	valid's auc: 0.742908	valid's binary_logloss: 0.170512
[853]	train's auc: 0.757823	train's binary_logloss: 0.167961	valid's auc: 0.74291	valid's binary_logloss: 0.170511
[854]	train's auc: 0.757842	train's binary_logloss: 0.167958	valid's auc: 0.742914	valid's binary_logloss: 0.170509
[855]	train's auc: 0.757862	train's binary_logloss: 0.167954	valid's auc: 0.742924	valid's binary_logloss: 0.170507
[856]	train's auc: 0.757886	train's binary_logloss: 0.167949	valid's auc: 0.742922	valid's binary_logloss: 0.170507
[857]	train's auc: 0.757904	train's binary_logloss: 0.167946	valid's auc: 0.742924	valid's binary_logloss: 0.170506
[858]	train's auc: 0.757922	train's binary_logloss: 0.167943	valid's auc: 

[921]	train's auc: 0.7593	train's binary_logloss: 0.167708	valid's auc: 0.743254	valid's binary_logloss: 0.170447
[922]	train's auc: 0.759315	train's binary_logloss: 0.167705	valid's auc: 0.743255	valid's binary_logloss: 0.170447
[923]	train's auc: 0.759339	train's binary_logloss: 0.167702	valid's auc: 0.743262	valid's binary_logloss: 0.170446
[924]	train's auc: 0.759357	train's binary_logloss: 0.167699	valid's auc: 0.743262	valid's binary_logloss: 0.170446
[925]	train's auc: 0.759387	train's binary_logloss: 0.167694	valid's auc: 0.743267	valid's binary_logloss: 0.170446
[926]	train's auc: 0.759406	train's binary_logloss: 0.167691	valid's auc: 0.743267	valid's binary_logloss: 0.170445
[927]	train's auc: 0.759428	train's binary_logloss: 0.167688	valid's auc: 0.743273	valid's binary_logloss: 0.170445
[928]	train's auc: 0.759447	train's binary_logloss: 0.167685	valid's auc: 0.743274	valid's binary_logloss: 0.170444
[929]	train's auc: 0.759463	train's binary_logloss: 0.167682	valid's auc: 

[992]	train's auc: 0.760763	train's binary_logloss: 0.167452	valid's auc: 0.743449	valid's binary_logloss: 0.170403
[993]	train's auc: 0.760786	train's binary_logloss: 0.167448	valid's auc: 0.743456	valid's binary_logloss: 0.170401
[994]	train's auc: 0.760803	train's binary_logloss: 0.167445	valid's auc: 0.743451	valid's binary_logloss: 0.170402
[995]	train's auc: 0.76082	train's binary_logloss: 0.167442	valid's auc: 0.743457	valid's binary_logloss: 0.170402
[996]	train's auc: 0.760834	train's binary_logloss: 0.167439	valid's auc: 0.743461	valid's binary_logloss: 0.170401
[997]	train's auc: 0.76085	train's binary_logloss: 0.167436	valid's auc: 0.743459	valid's binary_logloss: 0.170401
[998]	train's auc: 0.760871	train's binary_logloss: 0.167433	valid's auc: 0.743458	valid's binary_logloss: 0.170401
[999]	train's auc: 0.760895	train's binary_logloss: 0.16743	valid's auc: 0.743455	valid's binary_logloss: 0.170402
[1000]	train's auc: 0.760919	train's binary_logloss: 0.167426	valid's auc: 

[1063]	train's auc: 0.762089	train's binary_logloss: 0.167211	valid's auc: 0.743635	valid's binary_logloss: 0.170361
[1064]	train's auc: 0.762108	train's binary_logloss: 0.167208	valid's auc: 0.743631	valid's binary_logloss: 0.170361
[1065]	train's auc: 0.762125	train's binary_logloss: 0.167203	valid's auc: 0.743641	valid's binary_logloss: 0.170358
[1066]	train's auc: 0.762149	train's binary_logloss: 0.1672	valid's auc: 0.743646	valid's binary_logloss: 0.170358
[1067]	train's auc: 0.762168	train's binary_logloss: 0.167196	valid's auc: 0.743639	valid's binary_logloss: 0.170359
[1068]	train's auc: 0.762182	train's binary_logloss: 0.167194	valid's auc: 0.743637	valid's binary_logloss: 0.170359
[1069]	train's auc: 0.762193	train's binary_logloss: 0.167191	valid's auc: 0.743636	valid's binary_logloss: 0.170359
[1070]	train's auc: 0.762207	train's binary_logloss: 0.167189	valid's auc: 0.743638	valid's binary_logloss: 0.170358
[1071]	train's auc: 0.762226	train's binary_logloss: 0.167185	vali

[1134]	train's auc: 0.763391	train's binary_logloss: 0.166977	valid's auc: 0.743827	valid's binary_logloss: 0.170315
[1135]	train's auc: 0.76341	train's binary_logloss: 0.166973	valid's auc: 0.74383	valid's binary_logloss: 0.170314
[1136]	train's auc: 0.763427	train's binary_logloss: 0.16697	valid's auc: 0.743833	valid's binary_logloss: 0.170314
[1137]	train's auc: 0.763446	train's binary_logloss: 0.166967	valid's auc: 0.743829	valid's binary_logloss: 0.170314
[1138]	train's auc: 0.763461	train's binary_logloss: 0.166962	valid's auc: 0.743836	valid's binary_logloss: 0.170312
[1139]	train's auc: 0.763476	train's binary_logloss: 0.166959	valid's auc: 0.743841	valid's binary_logloss: 0.170311
[1140]	train's auc: 0.763495	train's binary_logloss: 0.166956	valid's auc: 0.743849	valid's binary_logloss: 0.170309
[1141]	train's auc: 0.763517	train's binary_logloss: 0.166952	valid's auc: 0.743855	valid's binary_logloss: 0.170308
[1142]	train's auc: 0.763537	train's binary_logloss: 0.166949	valid

[1205]	train's auc: 0.764697	train's binary_logloss: 0.16674	valid's auc: 0.744046	valid's binary_logloss: 0.170269
[1206]	train's auc: 0.764722	train's binary_logloss: 0.166737	valid's auc: 0.744043	valid's binary_logloss: 0.170268
[1207]	train's auc: 0.764737	train's binary_logloss: 0.166734	valid's auc: 0.744039	valid's binary_logloss: 0.170269
[1208]	train's auc: 0.764753	train's binary_logloss: 0.16673	valid's auc: 0.744042	valid's binary_logloss: 0.170268
[1209]	train's auc: 0.764768	train's binary_logloss: 0.166727	valid's auc: 0.744034	valid's binary_logloss: 0.170269
[1210]	train's auc: 0.764793	train's binary_logloss: 0.166723	valid's auc: 0.744037	valid's binary_logloss: 0.170269
[1211]	train's auc: 0.764811	train's binary_logloss: 0.16672	valid's auc: 0.744042	valid's binary_logloss: 0.170269
[1212]	train's auc: 0.76483	train's binary_logloss: 0.166717	valid's auc: 0.744047	valid's binary_logloss: 0.170267
[1213]	train's auc: 0.764842	train's binary_logloss: 0.166715	valid'

[1276]	train's auc: 0.765983	train's binary_logloss: 0.166516	valid's auc: 0.744174	valid's binary_logloss: 0.17024
[1277]	train's auc: 0.766	train's binary_logloss: 0.166512	valid's auc: 0.744172	valid's binary_logloss: 0.170239
[1278]	train's auc: 0.766013	train's binary_logloss: 0.16651	valid's auc: 0.744177	valid's binary_logloss: 0.170239
[1279]	train's auc: 0.766036	train's binary_logloss: 0.166506	valid's auc: 0.744175	valid's binary_logloss: 0.170239
[1280]	train's auc: 0.766051	train's binary_logloss: 0.166502	valid's auc: 0.744187	valid's binary_logloss: 0.170235
[1281]	train's auc: 0.766065	train's binary_logloss: 0.166499	valid's auc: 0.744184	valid's binary_logloss: 0.170235
[1282]	train's auc: 0.76608	train's binary_logloss: 0.166496	valid's auc: 0.74418	valid's binary_logloss: 0.170235
[1283]	train's auc: 0.766093	train's binary_logloss: 0.166493	valid's auc: 0.744174	valid's binary_logloss: 0.170236
[1284]	train's auc: 0.766113	train's binary_logloss: 0.16649	valid's au

[1347]	train's auc: 0.767213	train's binary_logloss: 0.166295	valid's auc: 0.744245	valid's binary_logloss: 0.170217
[1348]	train's auc: 0.767239	train's binary_logloss: 0.166291	valid's auc: 0.744241	valid's binary_logloss: 0.170217
[1349]	train's auc: 0.767248	train's binary_logloss: 0.166288	valid's auc: 0.744235	valid's binary_logloss: 0.170218
[1350]	train's auc: 0.767272	train's binary_logloss: 0.166284	valid's auc: 0.744243	valid's binary_logloss: 0.170217
[1351]	train's auc: 0.76729	train's binary_logloss: 0.166282	valid's auc: 0.744246	valid's binary_logloss: 0.170217
[1352]	train's auc: 0.767305	train's binary_logloss: 0.166279	valid's auc: 0.744247	valid's binary_logloss: 0.170217
[1353]	train's auc: 0.767322	train's binary_logloss: 0.166276	valid's auc: 0.744252	valid's binary_logloss: 0.170217
[1354]	train's auc: 0.767337	train's binary_logloss: 0.166274	valid's auc: 0.744249	valid's binary_logloss: 0.170217
[1355]	train's auc: 0.767358	train's binary_logloss: 0.166271	val

LGBMClassifier(colsample_bytree=0.7, n_estimators=10000, num_leaves=40,
               random_state=2018, reg_alpha=6, reg_lambda=3, silent=False,
               subsample=0.7, subsample_freq=1)

In [26]:
print('===================Test Set Performance===================')
y_valid_pred = clf.predict_proba(x_valid, num_iteration = clf.best_iteration_)[:,1]

print(pd.Series(y_valid_pred).describe())
print('Test AUC', roc_auc_score(y_valid, y_valid_pred))
print('Test Logloss', log_loss(y_valid, y_valid_pred))

count    1.759974e+06
mean     4.798085e-02
std      5.781655e-02
min      2.774193e-04
25%      2.066312e-02
50%      3.498627e-02
75%      5.554607e-02
max      9.938129e-01
dtype: float64
Test AUC 0.7442824433402417
Test Logloss 0.17021078701055584


## LogisticRegression

In [8]:
# 模型定义与训练
clf = LogisticRegression(random_state=1, penalty='l2', C=10, solver='lbfgs',
                         tol=1e-4, max_iter=10000, verbose=2)
clf.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 365.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 365.9min finished


LogisticRegression(C=10, max_iter=10000, random_state=1, verbose=2)

In [10]:
y_valid_pred = clf.predict(x_valid)

print('===================Test Set Performance===================')
print(pd.Series(y_valid_pred).describe())
print('Test AUC', roc_auc_score(y_valid, y_valid_pred))
print('Test Logloss', log_loss(y_valid, y_valid_pred))

count    1.759974e+06
mean     1.844345e-03
std      4.290623e-02
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
dtype: float64
Test AUC 0.5106611993494088
Test Logloss 1.6484665776988547
