In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

In [2]:
train = pd.read_csv('data/train_set.csv')
test = pd.read_csv('data/test_set.csv')
data = pd.concat([train,test])

In [3]:
feature = train.columns.tolist()
feature.remove('ID')
feature.remove('y')
object_columns = train.columns[train.dtypes == 'object'].tolist()
num_columns = list(set(feature) - set(object_columns))
print (object_columns)
print (num_columns)

['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
['balance', 'duration', 'age', 'campaign', 'previous', 'day', 'pdays']


In [4]:
train[num_columns].describe()

Unnamed: 0,balance,duration,age,campaign,previous,day,pdays
count,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0
mean,1357.555082,257.732393,40.935379,2.77205,0.591737,15.835289,40.248766
std,2999.822811,256.975151,10.634289,3.136097,2.568313,8.31948,100.213541
min,-8019.0,0.0,18.0,1.0,0.0,1.0,-1.0
25%,73.0,103.0,33.0,1.0,0.0,8.0,-1.0
50%,448.0,181.0,39.0,2.0,0.0,16.0,-1.0
75%,1435.0,317.0,48.0,3.0,0.0,21.0,-1.0
max,102127.0,3881.0,95.0,55.0,275.0,31.0,854.0


In [5]:
for col in object_columns:
    data = pd.concat([data, pd.get_dummies(data[col], prefix=col+'_')], axis=1)
    data.drop(col, axis=1, inplace=True)

In [6]:
X_train = data[data['y'].notnull()]
X_test = data[data['y'].isnull()]

y_train = X_train['y']
X_train.drop(['ID', 'y'], axis=1, inplace=True)
result = pd.DataFrame({'ID': X_test['ID']})
X_test.drop(['ID', 'y'], axis=1, inplace=True)

In [7]:
scaler = StandardScaler()
scaler.fit(X_train[num_columns])
X_train[num_columns] = scaler.transform(X_train[num_columns])
X_test[num_columns] = scaler.transform(X_test[num_columns])

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
# 创建成lgb特征的数据集格式
lgb_train = lgb.Dataset(X_train, y_train) # 将数据保存到LightGBM二进制文件将使加载更快
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)  # 创建验证数据
 
# 将参数写成字典下形式
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'auc'},
    'learning_rate': 0.01,
    'is_unbalance': True,
    'random_state': 0,
    'verbose': 0
}
 
print('Start training...')
clf = lgb.cv(params,
             metrics=['auc'],
             lgb_train,
             num_boost_round=10000,
             nfold=5,
             early_stopping_rounds=100，
             verbose_eval=True)
 
print('Start predicting...')
# 预测数据集
y_pred = clf.predict(X_val, num_iteration=clf.best_iteration) #如果在训练期间启用了早期停止，可以通过best_iteration方式从最佳迭代中获得预测
# 评估模型
print(roc_auc_score(y_val, y_pred))

Start training...
Start predicting...


AttributeError: 'dict' object has no attribute 'predict'

In [None]:
y_test = clf.predict(X_test)
result = pd.DataFrame({'ID': X_test['ID'], 'pred': y_test})
print (result.head())
result.to_csv('data/submission.csv', index=False)

In [None]:
# param = {
#     'task': 'train',
#     'boosting_type': 'gbdt',  # 设置提升类型
#     'objective': 'binary', # 目标函数
#     'metric': {'auc'},  # 评估函数
#     'learning_rate': 0.01,
#     'is_unbalance': True,
#     'verbose': 0 # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
# }
# folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# oof = np.zeros(len(y_train))
# result['pred'] = 0
# feature_importance_df = pd.DataFrame()

# # .iloc[tra_idx]
# # .iloc[val_idx]
# for fold_, (tra_idx, val_idx) in enumerate(folds.split(X_train, y_train.values)):
#     print("fold {}".format(fold_))
#     tra_data = lgb.Dataset(X_train.iloc[tra_idx], label=y_train.iloc[tra_idx])#, categorical_feature=categorical_feats)
#     val_data = lgb.Dataset(X_train.iloc[val_idx], label=y_train.iloc[val_idx])#, categorical_feature=categorical_feats)

#     num_round = 10000
#     clf = lgb.train(param, tra_data, num_round, valid_sets = [tra_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
#     oof[val_idx] = clf.predict(X_train.iloc[val_idx], num_iteration=clf.best_iteration)
#     fold_importance_df = pd.DataFrame()
#     fold_importance_df["Feature"] = X_train.columns.tolist()
#     fold_importance_df["importance"] = clf.feature_importance()
#     fold_importance_df["fold"] = fold_
#     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
#     result['pred'] += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

In [None]:
# from sklearn.metrics import roc_auc_score
# roc_auc_score(y_train, oof)

In [None]:
# print (result.head())
# result.to_csv('data/submission.csv', index=False)

In [None]:
# feature_importance_sort = feature_importance_df[["Feature", "importance"]].groupby("Feature").mean()\
#                             .sort_values(by="importance", ascending=False)
# feature_importance_sort[:20]