In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the whole CSV into memory; the trailing expression displays (rows, columns).
csv_path = './data/random_data.csv'
data = pd.read_csv(csv_path)
data.shape

(4930161, 17)

In [3]:
# Preview the first rows to sanity-check the parsed columns and their values.
data.head()

Unnamed: 0,label,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,I1,I2
0,0,4110010518477440242,1896312013845627886,6717084795003240529,1080695977093114212,4111614179005464985,8798042721559307282,5895132605950582744,6084180829892317120,7697383445739955609,4884422954281974949,8336763477901001868,6242285785126424193,7900538574498788796,134740763003655426,128.0,2996580352
1,0,8549798872825571966,1896312013845627886,5931133767701417382,1080695977093114212,6220860133300853649,8798042721559307282,7906599642295772887,8074429082947521887,2016335991759856420,8321822893668424639,6374970459031672275,2638981520939704579,1276775102817461451,7804158046819701007,0.0,3960238080
2,0,8549798872825571966,1896312013845627886,5931133767701417382,1080695977093114212,6220860133300853649,8798042721559307282,5895132605950582744,6084180829892317120,7697383445739955609,4884422954281974949,8336763477901001868,6242285785126424193,7900538574498788796,134740763003655426,128.0,2996584448
3,1,4110010518477440242,1896312013845627886,6738940029233988884,1080695977093114212,4111614179005464985,4661809481046063301,4860738004335111677,9151208213879479282,287166139677673559,3794272402836699782,4828615104526241983,6674460745934457691,1276775102817461451,134740763003655426,102.0,5837828096
4,0,754307468715170673,1896312013845627886,4744813893973488598,1080695977093114212,4111614179005464985,8798042721559307282,3365621371127355288,8074429082947521887,8049740328582698059,9094510792016618155,1867504036779671592,3042470253253863991,1276775102817461451,134740763003655426,92.0,3950256128


In [4]:
cols = data.columns.values
# Columns whose name starts with "C" are the categorical features.
# str.startswith (unlike indexing f[0]) does not fail on an empty column label.
CATEGORICAL_COLS = [f for f in cols if f.startswith("C")]
# Missing categories share one placeholder value so they form a single class.
# NOTE(review): 'unkown' looks like a typo for 'unknown'; it is left unchanged
# because the literal can end up as a key in the label-encoding dictionaries.
data[CATEGORICAL_COLS] = data[CATEGORICAL_COLS].fillna('unkown')

CATEGORICAL_COLS

['C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14']

In [5]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder

def encodeColumns(sdf, colnames):
    """Label-encode the given columns and return the encoded frame plus the mappings.

    Every column in ``colnames`` is cast to ``str`` and fitted with its own
    ``LabelEncoder``, so the codes of different columns are independent.

    Parameters
    ----------
    sdf : pandas.DataFrame
        Source frame. It is left untouched; the encoding is applied to a copy.
    colnames : iterable of str
        Names of the columns to encode.

    Returns
    -------
    df : pandas.DataFrame
        Copy of ``sdf`` in which each column of ``colnames`` holds integer codes.
    labelEncoderDict : dict
        ``{column name: {original value as str: integer code}}``.
    """
    # Deep copy on purpose: assigning into an alias (or a shallow copy) can write
    # through to the caller's frame, which made the original function mutate its input.
    df = sdf.copy()
    labelEncoderDict = {}
    for col in colnames:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        # classes_ is ordered by code, so the position of a class is its code.
        labelEncoderDict[col] = {cls: code for code, cls in enumerate(le.classes_)}

    return df, labelEncoderDict

# Replace the categorical columns with integer codes; le_dict maps
# column -> {original value as str: code}.
# NOTE: `data` is rebound to the encoded frame, so running this cell a second time
# would encode the already-encoded codes (as strings) and change the mapping.
data, le_dict = encodeColumns(data, CATEGORICAL_COLS)

In [6]:
# Sequential (unshuffled) hold-out split: the first N_TRAIN rows form the training set.
# Positional slicing with .iloc selects exactly N_TRAIN rows even when the index
# is not the default 0..n-1 range, which label-based .loc slicing silently assumed.
N_TRAIN = 3500000
df_train = data.iloc[:N_TRAIN]

y_train = df_train["label"]
X_train = df_train.drop(columns=["label"])

X_train.shape, y_train.shape

((3500000, 16), (3500000,))

In [7]:
# Every row after the training rows is held out. The boundary is taken from
# df_train itself so the two splits cannot overlap or leave a gap, and the
# slice is positional so it does not depend on the index labels.
df_test = data.iloc[len(df_train):]
y_test = df_test["label"]
X_test = df_test.drop(columns=["label"])

# The two splits must partition the full dataset.
assert len(df_train) + len(df_test) == len(data)

X_test.shape, y_test.shape

((1430161, 16), (1430161,))

In [8]:
import lightgbm
# Wrap the training split in a LightGBM dataset. free_raw_data=False keeps the
# raw pandas data attached to the Dataset after construction.
lgb_train = lightgbm.Dataset(data=X_train, label=y_train,
                             categorical_feature=CATEGORICAL_COLS,
                             free_raw_data=False)

# The evaluation dataset references the training dataset so both use the same
# feature binning, as the LightGBM documentation asks for validation data.
# NOTE(review): this "validation" set is built from the test split, so the
# per-iteration metrics below are computed on the same rows used for final scoring.
lgb_valid = lightgbm.Dataset(data=X_test, label=y_test,
                             reference=lgb_train,
                             categorical_feature=CATEGORICAL_COLS,
                             free_raw_data=False)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # A list (not a set) keeps the metric order identical between runs.
    'metric': ['xentropy', 'binary_logloss', 'auc'],
    'num_leaves': 200,        # must stay below 2^max_depth
    'max_depth': 9,
    'learning_rate': 0.06,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'min_data_in_leaf': 300,
    'max_bin': 1000,
    'lambda_l1': 1,           # L1 regularization
#     'lambda_l2': 0.001,     # L2 regularization
    'is_unbalance': True,     # the training labels are imbalanced
    'bagging_freq': 5,        # perform bagging every k iterations
    'verbose': 1,             # <0: fatal only, =0: errors (warnings), >0: info
    'num_iterations': 300     # number of boosting rounds
    }

# Train and report the metrics on lgb_valid during boosting.
gbm = lightgbm.train(params, lgb_train, valid_sets=[lgb_valid])



[LightGBM] [Info] Number of positive: 419054, number of negative: 3080946
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 3500000.000000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6020
[LightGBM] [Info] Number of data points in the train set: 3500000, number of used features: 16




[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 1430161.000000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.119730 -> initscore=-1.994992
[LightGBM] [Info] Start training from score -1.994992
[1]	valid_0's cross_entropy: 0.311487	valid_0's binary_logloss: 0.311487	valid_0's auc: 0.98567
[2]	valid_0's cross_entropy: 0.278331	valid_0's binary_logloss: 0.278331	valid_0's auc: 0.987702
[3]	valid_0's cross_entropy: 0.252793	valid_0's binary_logloss: 0.252793	valid_0's auc: 0.99116
[4]	valid_0's cross_entropy: 0.232263	valid_0's binary_logloss: 0.232263	valid_0's auc: 0.994879
[5]	valid_0's cross_entropy: 0.214937	valid_0's binary_logloss: 0.214937	valid_0's auc: 0.995389
[95]	valid_0's cross_entropy: 0.0415385	valid_0's binary_logloss: 0.0415385	valid_0's auc: 0.997669
[96]	valid_0's cross_entropy: 0.0414831	valid_0's binary_logloss: 0.0414831	valid_0's auc: 0.997673
[97]	valid_0's cross_entro

In [9]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score
# Score the held-out rows with the trained booster and report the ROC AUC.
y_pred = gbm.predict(X_test)
auc_value = roc_auc_score(y_test, y_pred)
print('auc值：', auc_value)

auc值： 0.9978716600868143


In [10]:
# Feature names: every original column except the target. This is assumed to match
# the column order of X_train, which was built by dropping only "label".
fea_cols = [c for c in cols if c != 'label']

# Gain-based importance per feature, printed from least to most important.
importance_table = pd.DataFrame({
    'column': fea_cols,
    'importance': gbm.feature_importance(importance_type='gain'),
})
print(importance_table.sort_values(by='importance'))

   column    importance
3      C4  1.575813e+04
12    C13  1.763607e+04
13    C14  1.030285e+05
1      C2  1.157638e+05
15     I2  3.027119e+05
9     C10  3.054032e+05
14     I1  6.080994e+05
0      C1  6.874883e+05
4      C5  7.385032e+05
7      C8  9.332413e+05
5      C6  1.295243e+06
6      C7  1.839049e+06
2      C3  5.463211e+06
11    C12  8.249795e+06
8      C9  1.335486e+07
10    C11  2.042035e+07
