In [1]:
import pandas as pd
# Root directory holding the competition data and derived artifacts.
PATH = '/home/kai/data/kaggle/homecredit/'

# Pickled object indexed by column name; values are compared against 0.8 below
# (presumably per-column missing ratios -- verify against the producer).
train_missing = pd.read_pickle(PATH + 'feature_selection/missing.pkl')

# Index labels whose value exceeds 0.8; these are added to the ignore list
# in a later cell.
ig = train_missing[train_missing.gt(0.8)].index.values.tolist()

In [3]:
import sys
import time

class Logger(object):
    """Minimal file-like object that appends everything written to a log file.

    Intended to be assigned to ``sys.stdout`` so that ``print`` output is
    captured on disk. The stream that was active at construction time is kept
    in ``self.terminal`` but is not written to by this class.

    Parameters
    ----------
    logtofile : bool
        When False, ``write`` discards the message and no file is created.
    logfilename : str
        Prefix of the log file; the file is named
        ``"<logfilename>_<unix-timestamp>.log"``.
    """

    def __init__(self, logtofile=True, logfilename='log'):
        self.terminal = sys.stdout
        self.logfile = "{}_{}.log".format(logfilename, int(time.time()))
        self.logtofile = logtofile

    def write(self, message):
        """Append ``message`` to the log file (no-op when logging is disabled)."""
        if not self.logtofile:
            return
        # Open per call so the file is always flushed and closed, even if the
        # write itself raises.
        with open(self.logfile, "a") as log:
            log.write(message)

    def flush(self):
        """Nothing to flush: every ``write`` closes the file. Present so the
        object satisfies the stream interface expected of ``sys.stdout``."""
        pass
# Redirect everything printed from here on into "logfilelog_<timestamp>.log".
sys.stdout = Logger(logtofile=True, logfilename='logfilelog')

In [4]:
import pandas as pd
import numpy as np
import warnings
import time
warnings.filterwarnings("ignore")
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score

PATH = '/home/kai/data/kaggle/homecredit/'

# Load the pre-built train and test feature frames.
train, test = (pd.read_pickle(PATH + fname)
               for fname in ('train_bo2.pkl', 'test_bo2.pkl'))

# Columns never used as model inputs: identifiers/label plus the columns
# collected in `ig` by the first cell.
ignored_col = ['ORGANIZATION_TYPE', 'TARGET', 'SK_ID_CURR', *ig]
feats = [col for col in train.columns if col not in ignored_col]

train.shape

(307511, 2678)

In [5]:
# Columns passed to LightGBM as categorical features, minus anything on the
# ignore list. Each name appears once ('NAME_TYPE_SUITE' was previously listed
# twice); FLAG_DOCUMENT_2 .. FLAG_DOCUMENT_21 are generated instead of typed out.
categorical_feats = [
    x for x in (
        ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE',
         'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
         'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
        + ['FLAG_DOCUMENT_{}'.format(i) for i in range(2, 22)]
    )
    if x not in ignored_col
]

In [None]:
warnings.simplefilter(action='ignore', category=FutureWarning)
PATH = '/home/kai/data/kaggle/homecredit/'
print('begin cv')

# Split off the label and keep an untouched copy of the test frame so that
# SK_ID_CURR is still available when the submission frame is built.
# NOTE: `train` / `test` are rebound to feature-only frames below, so this
# cell cannot be re-run without reloading the data first.
target = train['TARGET']
test_df = test.copy()
ignore_cols = ['ORGANIZATION_TYPE', 'TARGET', 'SK_ID_CURR', 'DAYS_BIRTH']
features = [x for x in train.columns if x not in ignore_cols]
train = train[features]
test = test[features]

from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
import lightgbm as lgbm

# free_raw_data=False keeps the raw frame attached so the same Dataset can be
# used for both the CV run and the final fit.
lgbm_train = lgbm.Dataset(data=train,
                          label=target,
                          categorical_feature=categorical_feats,
                          free_raw_data=False)


# Hyper-parameters (the fractional literals look like raw optimiser output;
# integer-valued ones are rounded before use).
lgbm_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_iteration': 10000,
    'num_threads': 8,

    'num_leaves': int(round(39.2552)),
    'feature_fraction': 0.1063,
    'bagging_fraction': 0.8098,
    'max_depth': int(round(9.5599)),
    'lambda_l1': 0.2999,
    'lambda_l2': 0.2738,
    'min_split_gain': 0.0411,
    'min_child_weight': 49.2974,
    # A round count must be an integer, like num_leaves / max_depth above.
    'early_stopping_rounds': int(round(152.2727)),
    'scale_pos_weight': 1.5677}

# 5-fold CV with early stopping to find the number of boosting rounds.
cv_results = lgbm.cv(train_set=lgbm_train,
                     params=lgbm_params,
                     nfold=5,
                     verbose_eval=100,
                     metrics=['auc'])

# np.argmax returns a 0-based position; the number of rounds that produced
# the best mean AUC is that position + 1.
optimum_boost_rounds = int(np.argmax(cv_results['auc-mean'])) + 1
print('Optimum boost rounds = {}'.format(optimum_boost_rounds))
print('Best CV result = {}'.format(np.max(cv_results['auc-mean'])))

# The final fit has no validation set, so early stopping cannot apply, and
# `num_iteration` in params is an alias that would override num_boost_round.
# Strip both so the model is trained for exactly the CV-selected rounds.
final_params = {key: value for key, value in lgbm_params.items()
                if key not in ('num_iteration', 'early_stopping_rounds')}

clf = lgbm.train(train_set=lgbm_train,
                 params=final_params,
                 num_boost_round=optimum_boost_rounds,
                 verbose_eval=100)

# Predict on test set and create submission
y_pred = clf.predict(test)
out_df = pd.DataFrame({'SK_ID_CURR': test_df['SK_ID_CURR'], 'TARGET': y_pred})
print((out_df['TARGET']>0.5).sum(),'more than half')


In [None]:
import matplotlib.pyplot as plt
# Histogram of the predicted TARGET values in the submission frame.
out_df.hist(column=['TARGET'])
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Side-by-side top-30 feature importances: split count (left) and gain (right).
fig, (ax, ax1) = plt.subplots(1, 2, figsize=[15, 7])
for panel, kind, heading in ((ax, 'split', 'Importance by splits'),
                             (ax1, 'gain', 'Importance by gain')):
    lgbm.plot_importance(clf, ax=panel, max_num_features=30, importance_type=kind)
    # Set after plotting so it replaces the title plot_importance puts on the axes.
    panel.set_title(heading)
plt.tight_layout()
plt.show()

In [None]:
# Rank features by gain importance (highest first) and persist the ranking.
# Result: list of (rank, (feature_name, importance)) tuples.
y = sorted(zip(clf.feature_name(), clf.feature_importance('gain')),
           key=lambda pair: pair[1], reverse=True)
y = list(enumerate(y))
# `y` is a plain list and has no .to_pickle method; use the pandas
# module-level helper, which pickles any Python object.
pd.to_pickle(y, PATH + 'feature_importance_gain.pkl')

In [None]:
# Rank features by split-count importance (highest first) and persist the ranking.
# Result: list of (rank, (feature_name, importance)) tuples.
x = sorted(zip(clf.feature_name(), clf.feature_importance('split')),
           key=lambda pair: pair[1], reverse=True)
x = list(enumerate(x))
# `x` is a plain list and has no .to_pickle method; use the pandas
# module-level helper, which pickles any Python object.
pd.to_pickle(x, PATH + 'feature_importance_split.pkl')