In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
import matplotlib
import matplotlib.pyplot as plt
import lightgbm as lgb
import operator
import time

In [7]:
# 1.读取文件
train = pd.read_csv("data/train.csv")
train_target = pd.read_csv('data/train_target.csv')
train = train.merge(train_target, on='id')
test = pd.read_csv("data/test.csv")
print(train.shape)
print(test.shape)
# 2.合并数据
test['y'] = -1
data = pd.concat([train, test],sort=False, axis=0)
print(train.shape)
print(test.shape)
print(data.shape)

(56411, 105)
(27785, 104)
(56411, 105)
(27785, 105)
(84196, 106)


In [13]:
# 简单数据描述
stats = []
for col in train.columns:
    stats.append((col, train[col].nunique(), train[col].isnull().sum() * 100 / train.shape[0],
                  train[col].value_counts(normalize=True, dropna=False).values[0] * 100, train[col].dtype))

stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'Percentage of missing values',
                                        'Percentage of values in the biggest category', 'type'])
stats_df.sort_values('Unique_values', ascending=False)[:30]

Unnamed: 0,Feature,Unique_values,Percentage of missing values,Percentage of values in the biggest category,type
0,id,56411,0.0,0.001773,int64
91,certBalidStop,6224,0.0,0.333268,float64
92,bankCard,4769,23.679779,23.679779,float64
90,certValidBegin,4620,0.0,0.186134,int64
1,certId,3696,0.0,0.716172,int64
5,dist,3237,0.0,1.024623,int64
94,residentAddr,2694,0.0,54.218149,int64
8,lmt,1108,0.0,2.802645,float64
93,ethnic,46,0.0,92.226693,int64
4,age,37,0.0,6.906454,int64


In [12]:
stats = []
for col in test.columns:
    stats.append((col, test[col].nunique(), test[col].isnull().sum() * 100 / test.shape[0],
                  test[col].value_counts(normalize=True, dropna=False).values[0] * 100, test[col].dtype))

stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'Percentage of missing values',
                                        'Percentage of values in the biggest category', 'type'])
stats_df.sort_values('Unique_values', ascending=False)[:30]

Unnamed: 0,Feature,Unique_values,Percentage of missing values,Percentage of values in the biggest category,type
0,id,27785,0.0,0.003599,int64
91,certBalidStop,5504,0.0,0.399496,float64
90,certValidBegin,4094,0.0,0.197949,int64
92,bankCard,3607,24.452042,24.452042,float64
1,certId,3361,0.0,0.755803,int64
5,dist,2857,0.0,0.906964,int64
94,residentAddr,2391,0.0,53.503689,int64
8,lmt,946,0.0,2.695699,float64
93,ethnic,45,0.0,92.485154,int64
4,age,36,0.0,6.928199,int64


In [24]:
# 特征工程
# 根据 unique values确定

no_feas=['id','target']+['certId','bankCard','dist','residentAddr']
data['certPeriod']=data['certBalidStop']-data['certValidBegin']
numerical_features = ['certBalidStop','certValidBegin','lmt','age','certPeriod']
categorical_features=[fea for fea in data.columns if fea not in numerical_features+no_feas]

In [31]:
features=[fea for fea in data.columns if fea not in no_feas]

In [34]:
train = data.loc[data['y']!=-1,:] # train set
test = data.loc[data['y']==-1,:]  # test set
y = train['target'].values.astype(int)
X = train[features].values
print("X shape:",X.shape)
print("y shape:",y.shape)
test_data = test[features].values
print("test shape",test_data.shape)

X shape: (56411, 101)
y shape: (56411,)
test shape (27785, 101)


In [37]:
# 训练
# 采取分层采样
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

print("start：********************************")
start = time.time()

N = 5 
skf = StratifiedKFold(n_splits=N,shuffle=True,random_state=2018)

auc_cv = []
pred_cv = []
for k,(train_in,test_in) in enumerate(skf.split(X,y)):
    X_train,X_test,y_train,y_test = X[train_in],X[test_in],\
                                    y[train_in],y[test_in]
    
    # 数据结构
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

    # 设置参数
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'auc'},
        'max_depth': 4,
        'min_child_weight': 6,
        'num_leaves': 16,
        'learning_rate': 0.02,# 0.05
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        #'lambda_l1':0.25,
        #'lambda_l2':0.5,
        #'scale_pos_weight':10.0/1.0, #14309.0 / 691.0, #不设置
        #'num_threads':4,
    }
    print('................Start training..........................')
    # train
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=2000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=100,
                    verbose_eval=100)

    print('................Start predict .........................')
    # 预测
    y_pred = gbm.predict(X_test,num_iteration=gbm.best_iteration)
    # 评估
    tmp_auc = roc_auc_score(y_test,y_pred)
    auc_cv.append(tmp_auc)
    print("valid auc:",tmp_auc)
    # test
    pred = gbm.predict(test_data, num_iteration = gbm.best_iteration)
    pred_cv.append(pred) 
# K交叉验证的平均分数 
print('the cv information:')
print(auc_cv)
print('cv mean score',np.mean(auc_cv))

end = time.time()
print("......................run with time: ",(end - start) / 60.0 )
print("over:*********************************")

# 10.5折交叉验证结果均值融合，保存文件
mean_auc = np.mean(auc_cv)
print("mean auc:",mean_auc)
filepath = 'result/lgb_'+ str(mean_auc)+'.csv' # 线下平均分数

# 转为array
res =  np.array(pred_cv)
print("总的结果：",res.shape)
# 最后结果平均，mean
r = res.mean(axis = 0)
print('result shape:',r.shape)
result = DataFrame()
result['id'] = test['id']
result['target'] = r
result.to_csv(filepath,index=False,sep=",")

start：********************************
................Start training..........................
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.671588
[200]	valid_0's auc: 0.685211
[300]	valid_0's auc: 0.673787
Early stopping, best iteration is:
[210]	valid_0's auc: 0.686126
................Start predict .........................
valid auc: 0.6861257978621752
................Start training..........................
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.679968
[200]	valid_0's auc: 0.68975
[300]	valid_0's auc: 0.690091
Early stopping, best iteration is:
[247]	valid_0's auc: 0.692619
................Start predict .........................
valid auc: 0.6926188777330411
................Start training..........................
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.726314
[200]	valid_0's auc: 0.721901
Early stopping, best iteration is:
[102]	valid_0's auc: 0.72806