In [1]:
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# featuretools for automated feature engineering
import featuretools as ft

# matplotlit and seaborn for visualizations
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 22
import seaborn as sns

# Suppress warnings from pandas
import warnings
warnings.filterwarnings('ignore')

# modeling 
import lightgbm as lgb

# utilities
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
# memory management
import gc

import os



In [2]:
os.chdir("D:\data\Jupyter Notebook\kaggle\Home-Credit-Default-Risk")

### 前処理済データ読み込み

In [3]:
# Read in data
train = pd.read_csv('m_train.csv')
test = pd.read_csv('m_test.csv')

In [4]:
train_1 = train.drop('TARGET',axis=1)

In [5]:
train_labels= train['TARGET']

### 別の方法でimputation scaleしてみる

In [6]:
train_names = list(train.columns)
train_1_names = list(train_1.columns)

In [7]:
from sklearn.preprocessing import MinMaxScaler, Imputer
# Impute the domainnomial features
imputer = Imputer(strategy = 'median')

train_imp = imputer.fit_transform(train_1)
test_imp = imputer.transform(test)

# Scale the domainnomial features
#scaler = MinMaxScaler(feature_range = (0, 1))

#train_imp = scaler.fit_transform(train_1)
#test_imp = scaler.fit_transform(test)

In [8]:
train_imp = pd.DataFrame(data=train_imp,columns = train_1_names)

In [9]:
test_imp = pd.DataFrame(data=test_imp,columns = train_1_names)

In [10]:
train_imp['TARGET'] = train_labels

In [21]:
# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
y_train = train_labels
train_imp = train_imp.drop(['TARGET'], axis=1)
x_train = train_imp.values # Creates an array of the train data
x_test = test_imp.values # Creats an array of the test data

In [35]:
y_train.describe()

count    307511.000000
mean          0.080729
std           0.272419
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: TARGET, dtype: float64

In [39]:
# Kfoldを作成
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0
NFOLDS = 5
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)
oof_train = np.zeros((ntrain,))
oof_test = np.zeros((ntest,))
oof_test_skf = np.empty((NFOLDS, ntest))
rf = RandomForestClassifier(n_estimators = 200,
                            random_state = 50,
                            verbose = 1, 
                            n_jobs = -1)
for i, (train_index, test_index) in enumerate(kf):
    print(i)
    print(train_index)
    print(test_index)
    x_tr = x_train[train_index]
    y_tr = y_train[train_index]
    x_te = x_train[test_index]
    rf.fit(x_tr, y_tr)
    oof_train[test_index] = rf.predict(x_te)
    oof_test_skf[i, :] = rf.predict(x_test)

oof_test[:] = oof_test_skf.mean(axis=0)

0
[ 61503  61504  61505 ... 307508 307509 307510]
[    0     1     2 ... 61500 61501 61502]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   53.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  4.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    1.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    1.6s finished


1
[     0      1      2 ... 307508 307509 307510]
[ 61503  61504  61505 ... 123002 123003 123004]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   54.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  4.2min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.9s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    1.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    1.7s finished


2
[     0      1      2 ... 307508 307509 307510]
[123005 123006 123007 ... 184504 184505 184506]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   53.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  4.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.9s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    2.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    1.6s finished


3
[     0      1      2 ... 307508 307509 307510]
[184507 184508 184509 ... 246006 246007 246008]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   54.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  4.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    2.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    1.6s finished


4
[     0      1      2 ... 246006 246007 246008]
[246009 246010 246011 ... 307508 307509 307510]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   54.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  4.2min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    1.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    1.6s finished


In [45]:
x_train_first_rf = pd.DataFrame({'rf': oof_train.ravel()})
x_test_first_rf = pd.DataFrame({'rf': oof_test.ravel()})

In [44]:
# Kfoldを作成
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0
NFOLDS = 5
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)
oof_train = np.zeros((ntrain,))
oof_test = np.zeros((ntest,))
oof_test_skf = np.empty((NFOLDS, ntest))
et = ExtraTreesClassifier(n_estimators = 300,
                            random_state = 50,
                            max_depth = 8,
                            min_samples_leaf = 2,
                            verbose = 1, 
                            n_jobs = -1)
for i, (train_index, test_index) in enumerate(kf):
    print(i)
    print(train_index)
    print(test_index)
    x_tr = x_train[train_index]
    y_tr = y_train[train_index]
    x_te = x_train[test_index]
    et.fit(x_tr, y_tr)
    oof_train[test_index] = et.predict(x_te)
    oof_test_skf[i, :] = et.predict(x_test)

oof_test[:] = oof_test_skf.mean(axis=0)

0
[ 61503  61504  61505 ... 307508 307509 307510]
[    0     1     2 ... 61500 61501 61502]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.7min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    1.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.4s finished


1
[     0      1      2 ... 307508 307509 307510]
[ 61503  61504  61505 ... 123002 123003 123004]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.7min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.4s finished


2
[     0      1      2 ... 307508 307509 307510]
[123005 123006 123007 ... 184504 184505 184506]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.7min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.4s finished


3
[     0      1      2 ... 307508 307509 307510]
[184507 184508 184509 ... 246006 246007 246008]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.7min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.4s finished


4
[     0      1      2 ... 246006 246007 246008]
[246009 246010 246011 ... 307508 307509 307510]


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.7min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    0.4s finished


In [46]:
x_train_first_et = pd.DataFrame({'et': oof_train.ravel()})
x_test_first_et = pd.DataFrame({'et': oof_test.ravel()})

In [50]:
x_train_first_et.describe()

Unnamed: 0,et
count,307511.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


### 欠損値埋めされたデータを作る

In [None]:
# 無限大をnullに置き換え
train.replace([np.inf, -np.inf], np.nan)
test.replace([np.inf, -np.inf], np.nan)

In [None]:
train_imp = train.fillna(train.median())
test_imp = test.fillna(train.median())

print('Training data shape: ', train_imp.shape)
print('Testing data shape: ', test_imp.shape)

In [None]:
#列の欠損値を調べる関数
def missing_values_table(df):
    #欠損値合計
    mis_val = df.isnull().sum()
    #欠損値割合
    mis_val_percent = 100 * df.isnull().sum()/len(df)
    #結果のテーブルを作成
    mis_val_table = pd.concat([mis_val,mis_val_percent],axis=1)
    
    #列名をリネーム
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0:'Missing Values',1:'% of Total Values'})
    
    #降順にソート
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1]!=0].sort_values(
    '% of Total Values',ascending=False).round(1)
    
    #サマリーを表示
    print("Your selected dataframe has "+str(df.shape[1])+" columns.\n"
         "There are "+str(mis_val_table_ren_columns.shape[0])+
         " columns that have missing values.")
    
    #欠損情報のデータフレームを返す
    return mis_val_table_ren_columns

In [None]:
#欠損統計
missing_values = missing_values_table(train_imp)
missing_values.head(20)

### Learning

In [None]:
# Kfoldを作成
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0
NFOLDS = 5
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# SKlearnHelperのクラス
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)
        
    def train(self,x_train,y_train):
        self.clf.train(x_train, y_train)
    
    def predict(self, x):
        return self.clf.predict(x)
    
    def predict_proba(self, x):
        return self.clf.predict_proba(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self, x,y):
        print(self.clf.fit(x,y).feature_importances_)

In [None]:
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        print(i)
        print(train_index)
        print(test_index)
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        #print(x_tr)
        #print(y_tr)
        clf.fit(x_tr, y_tr)
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 1
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [None]:
# Create 5 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

In [None]:
# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
y_train = train_imp['TARGET'].ravel()
train_imp = train_imp.drop(['TARGET'], axis=1)
x_train = train_imp.values # Creates an array of the train data
x_test = test_imp.values # Creats an array of the test data

In [None]:
print('Training data shape:',x_train.shape)
print('Testing data shape:',x_test.shape)

In [None]:
# Create our OOF train and test predictions. These base results will be used as new features
#et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
#print("Extra Trees is complete")
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
print("Random Forest is complete")
#ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost
#print("AdaBoost is complete")
#gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost
#print("Gradient Boost is complete")
#svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier
#print("SVC is complete")
print("Training is complete")

In [None]:
x_train_first = pd.DataFrame( {'et': et_oof_train.ravel(),
     'rf': rf_oof_train.ravel()})
x_train_first.head()

In [None]:
x_train_first.describe()

In [None]:
rf_oof_train.mean()

In [None]:
#x_train_first = np.concatenate(( et_oof_train, rf_oof_train, svc_oof_train), axis=1)
#x_test_first = np.concatenate(( et_oof_test, rf_oof_test, svc_oof_test), axis=1)

x_train_first = pd.concat([pd.DataFrame(et_oof_train,columns = ['et']),pd.DataFrame(rf_oof_train,columns = ['rf'])],axis=1)
x_test_first = pd.concat([pd.DataFrame(et_oof_test,columns = ['et']),pd.DataFrame(rf_oof_test,columns = ['rf'])],axis=1)

#x_train_first = pd.DataFrame({'et': et_oof_train, 'rf': rf_oof_train})
#x_test_first = pd.DataFrame({'et': et_oof_test, 'rf': rf_oof_test})

print('Training data shape:',x_train_first.shape)
print('Testing data shape:',x_test_first.shape)

In [None]:
# first levelのpredice(0,1)をトレーニング、テストデータにくっつける
x_train_second = pd.concat([train,x_train_first],axis=1)
x_test_second = pd.concat([test,x_test_first],axis=1)
# TARGET列を再びくっつける
x_train_second['TARGET'] = y_train

In [None]:
#first levelの出力だけを、次の入力とする場合
x_train_first['TARGET'] = y_train

In [None]:
from sklearn.model_selection import KFold

In [None]:
def model(features, test_features, encoding = 'ohe', n_folds = 5):
    
    """Train and test a light gradient boosting model using
    cross validation. 
    
    Parameters
    --------
        features (pd.DataFrame): 
            dataframe of training features to use 
            for training a model. Must include the TARGET column.
        test_features (pd.DataFrame): 
            dataframe of testing features to use
            for making predictions with the model. 
        encoding (str, default = 'ohe'): 
            method for encoding categorical variables. Either 'ohe' for one-hot encoding or 'le' for integer label encoding
            n_folds (int, default = 5): number of folds to use for cross validation
        
    Return
    --------
        submission (pd.DataFrame): 
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model.
        feature_importances (pd.DataFrame): 
            dataframe with the feature importances from the model.
        valid_metrics (pd.DataFrame): 
            dataframe with training and validation metrics (ROC AUC) for each fold and overall.
        
    """
    
    # Extract the ids
    train_ids = features['SK_ID_CURR']
    test_ids = test_features['SK_ID_CURR']
    
    # Extract the labels for training
    labels = features['TARGET']
    
    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])
    
    
    # One Hot Encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        
        # Align the dataframes by the columns
        features, test_features = features.align(test_features, join = 'inner', axis = 1)
        
        # No categorical indices to record
        cat_indices = 'auto'
    
    # Integer label encoding
    elif encoding == 'le':
        
        # Create a label encoder
        label_encoder = LabelEncoder()
        
        # List for storing categorical indices
        cat_indices = []
        
        # Iterate through each column
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                # Map the categorical features to integers
                features[col] = label_encoder.fit_transform(np.array(features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,)))

                # Record the categorical indices
                cat_indices.append(i)
    
    # Catch error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")
        
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    
    # Extract feature names
    feature_names = list(features.columns)
    
    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)
    
    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = False, random_state = 50)
    
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])
    
    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []
    
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):
        
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
        # Create the model
        model = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary', boosting_type='goss',
                                   class_weight = 'balanced', learning_rate = 0.05, 
                                   reg_alpha = 0.1, reg_lambda = 0.1, n_jobs = -1, random_state = 50)
        
        # Train the model
        model.fit(train_features, train_labels, eval_metric = 'auc',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                  early_stopping_rounds = 100, verbose = 200)
        
        # Record the best iteration
        best_iteration = model.best_iteration_
        
        # Record the feature importances
        feature_importance_values += model.feature_importances_ / k_fold.n_splits
        
        # Make predictions
        test_predictions += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        
        # Record the out of fold predictions
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
        
        # Record the best score
        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']
        
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        
        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()
        
    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})
    
    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    
    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)
    
    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    
    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores}) 
    
    return submission, feature_importances, metrics

In [None]:
submission, feature_importances, metrics = model(train,test)

In [None]:
metrics

In [None]:
submission.to_csv('submission.csv', index = False)

In [None]:
def plot_feature_importances(df):
    """
    Plot importances returned by a model. This can work with any measure of
    feature importance provided that higher importance is better. 
    
    Args:
        df (dataframe): feature importances. Must have the features in a column
        called `features` and the importances in a column called `importance
        
    Returns:
        shows a plot of the 15 most importance features
        
        df (dataframe): feature importances sorted by importance (highest to lowest) 
        with a column for normalized importance
        """
    
    # Sort features according to importance
    df = df.sort_values('importance', ascending = False).reset_index()
    
    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()

    # Make a horizontal bar chart of feature importances
    plt.figure(figsize = (10, 6))
    ax = plt.subplot()
    
    # Need to reverse the index to plot most important on top
    ax.barh(list(reversed(list(df.index[:15]))), 
            df['importance_normalized'].head(15), 
            align = 'center', edgecolor = 'k')
    
    # Set the yticks and labels
    ax.set_yticks(list(reversed(list(df.index[:15]))))
    ax.set_yticklabels(df['feature'].head(15))
    
    # Plot labeling
    plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
    plt.show()
    
    return df

In [None]:
feature_importances_sorted = plot_feature_importances(feature_importances)

### トレーニング・テストデータの出力

In [None]:
train.to_csv('m_train.csv', index = False)
test.to_csv('m_test.csv', index = False)

In [None]:
train_imp.to_csv('m_train_imp.csv', index = False)
test_imp.to_csv('m_test_imp.csv', index = False)

### アンサンブル

In [None]:
submission

In [None]:
submission_ensemble = submission.copy().drop(['TARGET'],axis=1)
submission_ensemble['TARGET'] = (submission['TARGET'] + x_train_first['et']+ x_train_first['rf'])/3
submission_ensemble['SK_ID_CURR']
submission_ensemble.to_csv('submission_ensemble.csv', index = False)