In [1]:
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# featuretools for automated feature engineering
import featuretools as ft

# matplotlib and seaborn for visualizations
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 22
import seaborn as sns

# Suppress warnings from pandas
import warnings
warnings.filterwarnings('ignore')

# modeling 
import lightgbm as lgb

# utilities
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
# memory management
import gc

import os



In [2]:
# Work from the local project folder so the relative CSV paths used below resolve.
# Raw string: the Windows path contains backslashes that must not be parsed as
# escape sequences (e.g. "\d", "\k" are invalid escapes and warn on newer Pythons).
os.chdir(r"D:\data\Jupyter Notebook\kaggle\Home-Credit-Default-Risk")

### 前処理済データ読み込み

In [3]:
# Load the preprocessed training and testing tables (train first, then test)
train, test = (pd.read_csv(csv_name) for csv_name in ('m_train.csv', 'm_test.csv'))

### 欠損値埋めされたデータを作る

In [4]:
# Replace +/- infinity with NaN so that the imputation step treats them as missing.
# DataFrame.replace returns a new frame (it is not in-place), so the result
# has to be assigned back for the replacement to take effect.
train = train.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

# Preview the cleaned test table
test.head(10)

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,...,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Transport: type 3,FONDKAPREMONT_MODE_not specified,FONDKAPREMONT_MODE_reg oper account,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",SK_ID_CURR
0,0,135000.0,568800.0,20560.5,0.018850,-19241,-2329,-5170.0,-812,,...,0,0,0,0,0,1,0,0,1,100001
1,0,99000.0,222768.0,17370.0,0.035792,-18064,-4469,-9118.0,-1623,,...,1,0,0,0,0,0,0,0,0,100005
2,0,202500.0,663264.0,69777.0,0.019101,-20038,-4458,-2175.0,-3503,5.0,...,0,0,1,0,0,0,0,0,0,100013
3,2,315000.0,1575000.0,49018.5,0.026392,-13976,-1866,-2000.0,-4208,,...,0,0,0,0,1,1,0,1,0,100028
4,1,180000.0,625500.0,32067.0,0.010032,-13040,-2191,-4000.0,-4262,16.0,...,0,0,0,0,0,0,0,0,0,100038
5,0,270000.0,959688.0,34600.5,0.025164,-18604,-12009,-6116.0,-2027,10.0,...,0,0,0,1,0,1,0,0,0,100042
6,2,180000.0,499221.0,22117.5,0.022800,-16685,-2580,-10125.0,-241,3.0,...,0,0,0,0,0,0,0,0,0,100057
7,0,166500.0,180000.0,14220.0,0.005144,-9516,-1387,-5063.0,-2055,,...,1,0,0,0,0,0,0,0,0,100065
8,0,315000.0,364896.0,28957.5,0.046220,-12744,-1013,-1686.0,-3171,,...,0,0,0,0,0,1,0,0,1,100066
9,1,162000.0,45000.0,5337.0,0.018634,-10395,-2625,-8124.0,-3041,5.0,...,0,1,0,0,0,0,0,0,0,100067


In [5]:
# Fill missing values with the per-column medians of the TRAINING data.
# The medians are computed once and reused for the test set, so the test
# data is imputed with the same statistics the model sees during training.
train_medians = train.median()
train_imp = train.fillna(train_medians)
test_imp = test.fillna(train_medians)

print('Training data shape: ', train_imp.shape)
print('Testing data shape: ', test_imp.shape)

Training data shape:  (307511, 543)
Testing data shape:  (48744, 542)


In [6]:
# Helper that summarises the missing values of every column
def missing_values_table(df):
    """Summarise missing values per column of ``df``.

    Prints how many columns ``df`` has and how many of them contain
    missing values, then returns the per-column summary.

    Parameters
    --------
        df (pd.DataFrame):
            dataframe to inspect.

    Return
    --------
        pd.DataFrame:
            one row per column that has at least one missing value, with the
            columns 'Missing Values' (count) and '% of Total Values'
            (percentage of rows, rounded to one decimal), sorted by the
            percentage in descending order.
    """
    # Count of nulls per column and the same figure as a percentage of rows
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)

    # Put both series side by side and give the columns readable names
    report = pd.concat([null_counts, null_percent], axis=1)
    report.columns = ['Missing Values', '% of Total Values']

    # Keep only columns that actually have missing values, highest share first
    has_missing = report['% of Total Values'] != 0
    report = report.loc[has_missing]
    report = report.sort_values('% of Total Values', ascending=False)
    report = report.round(1)

    # Print a short summary
    n_columns = str(df.shape[1])
    n_affected = str(report.shape[0])
    print("Your selected dataframe has " + n_columns + " columns.\n"
          "There are " + n_affected + " columns that have missing values.")

    # Return the dataframe with the missing-value information
    return report

In [7]:
# Missing-value statistics of the imputed training data (top 20 columns)
missing_values = missing_values_table(train_imp)
missing_values.head(20)

Your selected dataframe has 543 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


### Learning

In [8]:
# Create the K-fold object shared by the first-level (base) models
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0
NFOLDS = 5
# NOTE(review): KFold(n, n_folds=...) is the signature of the KFold imported
# from sklearn.cross_validation at the top of the notebook; the resulting
# object is iterated directly to obtain (train_index, test_index) pairs.
# Presumably random_state has no effect here because shuffle is not set — confirm.
kf = KFold(ntrain, n_folds = NFOLDS, random_state=SEED)
# Helper class that gives every scikit-learn style estimator the same interface
class SklearnHelper(object):
    """Uniform wrapper around an estimator class.

    Parameters
    --------
        clf (class):
            estimator class; it is instantiated as ``clf(**params)`` and must
            accept a ``random_state`` keyword.
        seed (int, default = 0):
            value passed to the estimator as ``random_state``.
        params (dict or None, default = None):
            keyword arguments for the estimator. The dict is copied, so the
            caller's object is never modified; ``None`` means no extra
            arguments.
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy the parameters: writing random_state into the caller's dict
        # would leak state between helpers, and params=None must not crash.
        params = dict(params) if params else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the given training data."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's ``predict`` output for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Fit on ``(x, y)`` and print the resulting ``feature_importances_``."""
        print(self.clf.fit(x, y).feature_importances_)

In [9]:
def get_oof(clf, x_train, y_train, x_test, folds=None):
    """Compute out-of-fold (OOF) predictions for one base model.

    For every fold the model is trained on the fold's training rows and then
    predicts (a) the held-out rows of ``x_train`` and (b) all of ``x_test``.
    The test predictions of all folds are averaged.

    Parameters
    --------
        clf:
            object exposing ``train(x, y)`` and ``predict(x)``.
        x_train (np.ndarray):
            training feature matrix, one row per sample.
        y_train (np.ndarray):
            training targets, aligned with ``x_train``.
        x_test (np.ndarray):
            test feature matrix.
        folds (default = None):
            either an iterable of ``(train_index, test_index)`` pairs or a
            splitter object with a ``split(X)`` method. When omitted, the
            notebook-level ``kf`` object is used.

    Return
    --------
        oof_train (np.ndarray):
            shape ``(n_train, 1)``; prediction for each training row made by
            the model that did not see that row.
        oof_test (np.ndarray):
            shape ``(n_test, 1)``; fold-averaged prediction for each test row.

    Raises
    --------
        ValueError: if no folds are provided.
    """
    if folds is None:
        folds = kf  # fold definition created at notebook level
    # Accept both splitter objects (with .split) and plain iterables of index pairs
    if hasattr(folds, 'split'):
        folds = folds.split(x_train)
    folds = list(folds)
    if not folds:
        raise ValueError("At least one (train_index, test_index) fold is required")

    # Sizes come from the arguments so they can never disagree with the data
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]

    oof_train = np.zeros((n_train,))
    oof_test_skf = np.empty((len(folds), n_test))

    for i, (train_index, test_index) in enumerate(folds):
        clf.train(x_train[train_index], y_train[train_index])

        # Held-out predictions for this fold, plus this fold's test predictions
        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    # Average the per-fold test predictions
    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [10]:
# Hyper-parameters for the first-level (base) classifiers

# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    # warm_start must stay off: the same estimator object is re-fitted once per
    # fold, and with warm_start=True and an unchanged n_estimators the later
    # fits add no new trees, so every fold would reuse the first fold's forest.
    'warm_start': False,
    #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'verbose': 0
}

# Extra Trees parameters
et_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate': 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
    #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters
svc_params = {
    'kernel': 'linear',
    'C': 0.025
}

In [11]:
# Create 5 helper objects, one per base model, all seeded with SEED
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

In [12]:
# Create NumPy arrays of the target (TARGET), train features and test features to feed into our models
y_train = train_imp['TARGET'].values  # 1-D ndarray; avoids the deprecated Series.ravel()
train_imp = train_imp.drop(['TARGET'], axis=1)
x_train = train_imp.values  # array of the training features (TARGET removed)
x_test = test_imp.values  # array of the test features

In [13]:
# Show the shapes of the two feature matrices built above
print('Training data shape:',x_train.shape)
print('Testing data shape:',x_test.shape)

Training data shape: (307511, 542)
Testing data shape: (48744, 542)


In [None]:
# Create our OOF train and test predictions. These base results will be used as new features
# for the second-level model. AdaBoost and Gradient Boosting are currently disabled.
# NOTE(review): the recorded output stops after "Random Forest is complete", so the
# SVC step had presumably not finished when the notebook was saved — confirm its runtime.
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
print("Extra Trees is complete")
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
print("Random Forest is complete")
#ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost
#print("AdaBoost is complete")
#gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost
#print("Gradient Boost is complete")
svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier
print("SVC is complete")
print("Training is complete")

Extra Trees is complete
Random Forest is complete


In [None]:
# Collect the first-level predictions into dataframes, one column per base model.
# The OOF arrays have shape (n, 1), so they are flattened to 1-D columns, and
# SK_ID_CURR is carried along so the predictions can be joined back onto the
# feature tables by id (rows are in the same order as train_imp / test_imp).
x_train_first = pd.DataFrame({
    'SK_ID_CURR': train_imp['SK_ID_CURR'].values,
    'et': et_oof_train.ravel(),
    'rf': rf_oof_train.ravel(),
    'svc': svc_oof_train.ravel(),
})
x_test_first = pd.DataFrame({
    'SK_ID_CURR': test_imp['SK_ID_CURR'].values,
    'et': et_oof_test.ravel(),
    'rf': rf_oof_test.ravel(),
    'svc': svc_oof_test.ravel(),
})

print('Training data shape:',x_train_first.shape)
print('Testing data shape:',x_test_first.shape)

In [None]:
# Attach the first-level predictions (0/1) to the training and test data.
# NOTE(review): the merge key SK_ID_CURR has to exist in x_train_first / x_test_first.
x_train_second = train_imp.merge(x_train_first, on = 'SK_ID_CURR',how = 'left')
x_test_second = test_imp.merge(x_test_first, on = 'SK_ID_CURR',how = 'left')
# Re-attach the TARGET column
x_train_second['TARGET'] = y_train

In [None]:
# Write the first-level outputs to disk, then free their memory
x_train_first.to_csv('x_train_first.csv', index = False)
x_test_first.to_csv('x_test_first.csv', index = False)

# Clean up memory
gc.enable()
del x_train_first, x_test_first
gc.collect()

In [None]:
def model(features, test_features, encoding = 'ohe', n_folds = 5):

    """Train and test a light gradient boosting model using
    cross validation.

    Parameters
    --------
        features (pd.DataFrame):
            dataframe of training features to use
            for training a model. Must include the SK_ID_CURR and TARGET columns.
        test_features (pd.DataFrame):
            dataframe of testing features to use
            for making predictions with the model. Must include SK_ID_CURR.
        encoding (str, default = 'ohe'):
            method for encoding categorical variables. Either 'ohe' for
            one-hot encoding or 'le' for integer label encoding
        n_folds (int, default = 5):
            number of folds to use for cross validation

    Return
    --------
        submission (pd.DataFrame):
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model (average over the folds).
        feature_importances (pd.DataFrame):
            dataframe with the feature importances from the model
            (average over the folds).
        valid_metrics (pd.DataFrame):
            dataframe with training and validation metrics (ROC AUC) for each fold and overall.

    Raises
    --------
        ValueError: if `encoding` is neither 'ohe' nor 'le'.

    """
    # Imported locally: this function needs the splitter API (n_splits / split),
    # while the notebook-level KFold name comes from sklearn.cross_validation,
    # whose constructor does not accept n_splits.
    from sklearn.model_selection import KFold

    # Extract the test ids for the submission
    test_ids = test_features['SK_ID_CURR']

    # Extract the labels as a plain array so that the fold indices below are
    # always positional, whatever index the input dataframe carries
    labels = np.array(features['TARGET'])

    # Remove the ids and target
    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])

    # One Hot Encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)

        # Align the dataframes by the columns
        features, test_features = features.align(test_features, join = 'inner', axis = 1)

        # No categorical indices to record
        cat_indices = 'auto'

    # Integer label encoding
    elif encoding == 'le':

        # Create a label encoder (re-fitted for every categorical column)
        label_encoder = LabelEncoder()

        # List for storing categorical indices
        cat_indices = []

        # Iterate through each column
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                # Map the categorical features to integers
                features[col] = label_encoder.fit_transform(np.array(features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,)))

                # Record the categorical indices
                cat_indices.append(i)

    # Catch error if label encoding scheme is not valid
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Extract feature names
    feature_names = list(features.columns)

    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)

    # Create the kfold object. Folds are contiguous (no shuffling); random_state
    # is not passed because it has no effect without shuffling and newer
    # scikit-learn versions reject that combination.
    k_fold = KFold(n_splits = n_folds, shuffle = False)

    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))

    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])

    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Create the model (named clf so it does not shadow this function's name)
        clf = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary', boosting_type='goss',
                                 class_weight = 'balanced', learning_rate = 0.05,
                                 reg_alpha = 0.1, reg_lambda = 0.1, n_jobs = -1, random_state = 50)

        # Train the model with early stopping on the validation AUC.
        # NOTE(review): early_stopping_rounds / verbose as fit() keywords depend on
        # the installed LightGBM version — confirm against the environment in use.
        clf.fit(train_features, train_labels, eval_metric = 'auc',
                eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                early_stopping_rounds = 100, verbose = 200)

        # Record the best iteration
        best_iteration = clf.best_iteration_

        # Record the feature importances (running average over the folds)
        feature_importance_values += clf.feature_importances_ / k_fold.n_splits

        # Make predictions (running average over the folds)
        test_predictions += clf.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Record the best score
        valid_scores.append(clf.best_score_['valid']['auc'])
        train_scores.append(clf.best_score_['train']['auc'])

        # Clean up memory
        gc.enable()
        del clf, train_features, valid_features
        gc.collect()

    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})

    # Make the feature importance dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

    # Overall validation score, computed on the out of fold predictions
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds)) + ['overall']

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return submission, feature_importances, metrics

In [None]:
# Train the second-level LightGBM model on the features plus first-level predictions
submission, feature_importances, metrics = model(x_train_second,x_test_second)

In [None]:
# Display the metrics dataframe returned by model()
metrics

In [None]:
# Write the submission dataframe returned by model() to disk
submission.to_csv('stacking_first_submission.csv', index = False)

### トレーニング・テストデータの出力

In [None]:
# Write the train/test tables back to disk.
# NOTE(review): 'm_train.csv' / 'm_test.csv' are the same file names that are read
# at the top of this notebook, so this overwrites the input files — confirm this is intended.
train.to_csv('m_train.csv', index = False)
test.to_csv('m_test.csv', index = False)

In [None]:
# Write the median-imputed train/test tables to disk.
# NOTE(review): TARGET was dropped from train_imp when the model arrays were built,
# so the saved training file has no TARGET column — confirm this is intended.
train_imp.to_csv('m_train_imp.csv', index = False)
test_imp.to_csv('m_test_imp.csv', index = False)