In [1]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides warnings raised by pandas / scikit-learn /
# LightGBM in later cells; consider narrowing the filter.
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np

# Preprocessing and cross-validation utilities.
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
# Gradient-boosting library and the evaluation metric.
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

In [2]:
# Read the two CSV files and stack them row-wise so that later feature
# engineering is applied to both at once.
# NOTE(review): the later split relies on rows from the test file having a
# null 'y' after concatenation -- confirm the test file carries no labels.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_set.csv', 'data/test_set.csv')
)
data = pd.concat(objs=[train, test])

In [3]:
# Model inputs: every training column except the row identifier and target.
feature = train.columns.tolist()
feature.remove('ID')
feature.remove('y')

# Columns stored as strings are treated as categorical.
object_columns = train.columns[train.dtypes == 'object'].tolist()

# Remaining inputs are numeric. Filter in place of a set difference so the
# result keeps the original column order: iteration order of a set of strings
# can change from one interpreter run to the next (hash randomisation), which
# made the printed list and the describe() table reorder between runs.
num_columns = [col for col in feature if col not in object_columns]

print (object_columns)
print (num_columns)

['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
['previous', 'day', 'duration', 'campaign', 'balance', 'age', 'pdays']


In [4]:
# Summary statistics of the numeric training columns.
train.loc[:, num_columns].describe()

Unnamed: 0,previous,day,duration,campaign,balance,age,pdays
count,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0
mean,0.591737,15.835289,257.732393,2.77205,1357.555082,40.935379,40.248766
std,2.568313,8.31948,256.975151,3.136097,2999.822811,10.634289,100.213541
min,0.0,1.0,0.0,1.0,-8019.0,18.0,-1.0
25%,0.0,8.0,103.0,1.0,73.0,33.0,-1.0
50%,0.0,16.0,181.0,2.0,448.0,39.0,-1.0
75%,0.0,21.0,317.0,3.0,1435.0,48.0,-1.0
max,275.0,31.0,3881.0,55.0,102127.0,95.0,854.0


In [5]:
# One-hot encode all categorical columns in a single call instead of
# re-concatenating the whole frame once per column and dropping in place.
# The encoded source columns are removed and the indicator columns are
# appended after the remaining columns, in the order of `object_columns`.
# The prefix keeps the trailing underscore of the original code so the
# generated names (e.g. 'month__may') stay the same.
data = pd.get_dummies(
    data,
    columns=object_columns,
    prefix=[col + '_' for col in object_columns]
)

In [6]:
# Rows with a label are the training rows; rows whose 'y' is null are the
# rows to predict.
has_label = data['y'].notnull()
labelled_rows = data[has_label]
unlabelled_rows = data[~has_label]

y_train = labelled_rows['y']
# Keep the identifiers of the rows to predict for the submission file.
result = pd.DataFrame({'ID': unlabelled_rows['ID']})

# Non-inplace drop returns new, independent frames. The original dropped
# columns in place on slices of `data`, i.e. chained mutation whose warning
# was only hidden by the global warnings filter.
X_train = labelled_rows.drop(['ID', 'y'], axis=1)
X_test = unlabelled_rows.drop(['ID', 'y'], axis=1)

In [7]:
# Standardise the numeric columns. The scaler is fitted on the training rows
# only and the same fitted scaler is then applied to the test rows.
scaler = StandardScaler()
X_train[num_columns] = scaler.fit_transform(X_train[num_columns])
X_test[num_columns] = scaler.transform(X_test[num_columns])

In [8]:
# LightGBM training parameters.
param = {
    'task': 'train',
    'boosting_type': 'gbdt',  # boosting type
    'objective': 'binary',  # objective function
    'metric': {'auc'},  # evaluation metric
    'learning_rate': 0.01,
    'is_unbalance': True,
    'verbose': 0  # <0: fatal only, =0: errors (warnings), >0: info
}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# Upper bound on boosting rounds; training of a fold stops earlier when the
# validation score has not improved for `early_stopping_rounds` rounds.
num_round = 10000

# Out-of-fold predictions for the training rows (each row is predicted by the
# one model that did not train on it).
oof = np.zeros(len(y_train))
# Test prediction, accumulated as the average over the fold models.
result['pred'] = 0.0
# Per-fold importance tables, concatenated once after the loop instead of
# growing a DataFrame with pd.concat on every iteration.
fold_importances = []

for fold_, (tra_idx, val_idx) in enumerate(folds.split(X_train, y_train.values)):
    print("fold {}".format(fold_))
    tra_data = lgb.Dataset(X_train.iloc[tra_idx], label=y_train.iloc[tra_idx])
    val_data = lgb.Dataset(X_train.iloc[val_idx], label=y_train.iloc[val_idx])

    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
    # arguments of the LightGBM version this notebook was run with; newer
    # releases may expect callbacks instead -- confirm before upgrading.
    clf = lgb.train(param, tra_data, num_round, valid_sets = [tra_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(X_train.iloc[val_idx], num_iteration=clf.best_iteration)

    fold_importances.append(pd.DataFrame(
        {
            "Feature": X_train.columns.tolist(),
            "importance": clf.feature_importance(),
            "fold": fold_,
        },
        columns=["Feature", "importance", "fold"]
    ))
    result['pred'] += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

fold 0
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.934089	valid_1's auc: 0.924288
[200]	training's auc: 0.940457	valid_1's auc: 0.927855
[300]	training's auc: 0.946738	valid_1's auc: 0.930949
[400]	training's auc: 0.951844	valid_1's auc: 0.933069
[500]	training's auc: 0.95704	valid_1's auc: 0.934328
[600]	training's auc: 0.961078	valid_1's auc: 0.934898
[700]	training's auc: 0.964678	valid_1's auc: 0.935267
[800]	training's auc: 0.967585	valid_1's auc: 0.935324
[900]	training's auc: 0.970262	valid_1's auc: 0.935215
Early stopping, best iteration is:
[794]	training's auc: 0.967449	valid_1's auc: 0.935361
fold 1
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.933076	valid_1's auc: 0.927413
[200]	training's auc: 0.940125	valid_1's auc: 0.929871
[300]	training's auc: 0.945906	valid_1's auc: 0.932717
[400]	training's auc: 0.951116	valid_1's auc: 0.934809
[500]	training's auc: 0.956472	valid_1's auc: 0.936481
[

In [9]:
# AUC of the out-of-fold predictions over the whole training set.
# roc_auc_score is already imported in the notebook's first cell, so the
# duplicate mid-notebook import is dropped.
roc_auc_score(y_train, oof)

0.9352705678381494

In [10]:
# Show the first rows of the submission, then write it without the index
# column.
submission_preview = result.head()
print (submission_preview)
result.to_csv(path_or_buf='data/submission.csv', index=False)

      ID      pred
0  25318  0.161086
1  25319  0.025287
2  25320  0.013778
3  25321  0.901041
4  25322  0.111295


In [11]:
# Average each feature's importance over the folds, rank from most to least
# important and display the top 20.
mean_importance = feature_importance_df.groupby("Feature")[["importance"]].mean()
feature_importance_sort = mean_importance.sort_values(by="importance", ascending=False)
feature_importance_sort.head(20)

Unnamed: 0_level_0,importance
Feature,Unnamed: 1_level_1
duration,3824.4
day,2751.0
balance,2599.2
age,2149.8
pdays,1495.0
campaign,847.0
month__may,581.6
contact__unknown,536.0
poutcome__success,532.6
month__mar,464.8
