In [1]:
import os
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('./train/train.csv')
test = pd.read_csv('./test/test.csv')

train.drop(['Name', 'Description', 'PetID', 'RescuerID'], axis=1, inplace=True)
test.drop(['Name', 'Description', 'PetID', 'RescuerID'], axis=1, inplace=True)

target = train['AdoptionSpeed']
train.drop(['AdoptionSpeed'], axis=1, inplace=True)

In [3]:
#modeling
from subprocess import check_output
from sklearn.svm import SVC
from sklearn import svm, neighbors
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, VotingClassifier, AdaBoostClassifier,
GradientBoostingClassifier,ExtraTreesClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import xgboost as xbg

In [4]:
ntrain  = train.shape[0]
ntest  = test.shape[0]

SEED = 1
NFOLDS = 5
kf = KFold(n_splits=NFOLDS)

In [5]:
class SklearnHelper(object):
    def __init__(self, clf, seed = 0, params=None): # self, 모델, 시드, 파라미터
        params["random_state"] = seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
    
    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self, x, y):
        return self.clf.fit(x, y)
        
    def feature_importances(self, x, y):
        importance = []
        for i in self.clf.fit(x, y).feature_importances_:
            importance.append(i)
        return importance

In [6]:
def get_oof(clf, X, y, X_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(X)):
        print('\nFold {}'.format(i))
        x_tr = X.iloc[train_index]
        y_tr = y[train_index]
        x_te = X.iloc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(X_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [7]:
#Parameter 설정
#Random Forest
rf_params = {
    "n_jobs": -1,
    "n_estimators": 500,
    "warm_start": True,
    #"max_features":0.2,
    "max_depth":6,
    "min_samples_leaf": 2, 
    "max_features": "sqrt",
    "verbose":0
}

#Extra Trees
et_params = {
    "n_jobs": -1,
    "n_estimators": 500,
    #"max_features":0.5,
    "max_depth":8,
    "min_samples_leaf": 2, 
    "verbose":0
}

#AdaBoost
ada_params = {
    "n_estimators" : 500,
    "learning_rate" : 0.75
}

#Gradient Boosting
gb_params = {
    "n_estimators":500,
    #"max_features" : 0.2
    "max_depth" : 5,
    "min_samples_leaf" : 2,
    "verbose" : 0
}

In [8]:
rf = SklearnHelper(clf = RandomForestClassifier, seed = SEED, params = rf_params)
et = SklearnHelper(clf = ExtraTreesClassifier, seed = SEED, params = et_params)
ada = SklearnHelper(clf = AdaBoostClassifier, seed = SEED, params = ada_params)
gb = SklearnHelper(clf = GradientBoostingClassifier, seed = SEED, params = gb_params)

In [9]:
#First Level Prediction - OOF train and test
print ("Generating OOFs")

et_oof_train, et_oof_test = get_oof(et, train, target, test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,train, target, test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, train, target, test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb,train, target, test) # Gradient Boost
# svc_oof_train, svc_oof_test = get_oof(svc,train, target, test) # Support Vector Classifier

print("Training is complete")

Generating OOFs

Fold 0

Fold 1

Fold 2

Fold 3

Fold 4

Fold 0

Fold 1

Fold 2

Fold 3

Fold 4

Fold 0

Fold 1

Fold 2

Fold 3

Fold 4

Fold 0

Fold 1

Fold 2

Fold 3

Fold 4
Training is complete


In [10]:
rf_feature = rf.feature_importances(train,target)
et_feature = et.feature_importances(train, target)
ada_feature = ada.feature_importances(train, target)
gb_feature = gb.feature_importances(train,target)

In [11]:
cols = train.columns.values
feature_df = pd.DataFrame({
    "features":cols,
    'Random Forest feature importances': rf_feature,
     'Extra Trees  feature importances': et_feature,
      'AdaBoost feature importances': ada_feature,
    'Gradient Boost feature importances': gb_feature
})

In [13]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [14]:
# Scatter plot 
trace = go.Scatter(
    y = feature_df['Random Forest feature importances'].values,
    x = feature_df['features'].values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 25,
#       size= feature_dataframe['AdaBoost feature importances'].values,
        #color = np.random.randn(500), #set color equal to a variable
        color = feature_df['Random Forest feature importances'].values,
        colorscale='Portland',
        showscale=True
    ),
    text = feature_df['features'].values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Random Forest Feature Importance',
    hovermode= 'closest',
#     xaxis= dict(
#         title= 'Pop',
#         ticklen= 5,
#         zeroline= False,
#         gridwidth= 2,
#     ),
    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')

# Scatter plot 
trace = go.Scatter(
    y = feature_df['Extra Trees  feature importances'].values,
    x = feature_df['features'].values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 25,
#       size= feature_dataframe['AdaBoost feature importances'].values,
        #color = np.random.randn(500), #set color equal to a variable
        color = feature_df['Extra Trees  feature importances'].values,
        colorscale='Portland',
        showscale=True
    ),
    text = feature_df['features'].values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Extra Trees Feature Importance',
    hovermode= 'closest',
#     xaxis= dict(
#         title= 'Pop',
#         ticklen= 5,
#         zeroline= False,
#         gridwidth= 2,
#     ),
    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')

# Scatter plot 
trace = go.Scatter(
    y = feature_df['AdaBoost feature importances'].values,
    x = feature_df['features'].values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 25,
#       size= feature_dataframe['AdaBoost feature importances'].values,
        #color = np.random.randn(500), #set color equal to a variable
        color = feature_df['AdaBoost feature importances'].values,
        colorscale='Portland',
        showscale=True
    ),
    text = feature_df['features'].values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'AdaBoost Feature Importance',
    hovermode= 'closest',
#     xaxis= dict(
#         title= 'Pop',
#         ticklen= 5,
#         zeroline= False,
#         gridwidth= 2,
#     ),
    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')

# Scatter plot 
trace = go.Scatter(
    y = feature_df['Gradient Boost feature importances'].values,
    x = feature_df['features'].values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 25,
#       size= feature_dataframe['AdaBoost feature importances'].values,
        #color = np.random.randn(500), #set color equal to a variable
        color = feature_df['Gradient Boost feature importances'].values,
        colorscale='Portland',
        showscale=True
    ),
    text = feature_df['features'].values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Gradient Boosting Feature Importance',
    hovermode= 'closest',
#     xaxis= dict(
#         title= 'Pop',
#         ticklen= 5,
#         zeroline= False,
#         gridwidth= 2,
#     ),
    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter2010')

In [15]:
feature_df["mean"] = feature_df.mean(axis = 1)
feature_df.head()

Unnamed: 0,features,Random Forest feature importances,Extra Trees feature importances,AdaBoost feature importances,Gradient Boost feature importances,mean
0,Type,0.027244,0.057686,0.004,0.009446,0.024594
1,Age,0.264222,0.067114,0.184,0.183061,0.174599
2,Breed1,0.153234,0.102187,0.27,0.146381,0.167951
3,Breed2,0.032249,0.032907,0.042,0.064408,0.042891
4,Gender,0.021501,0.036053,0.012,0.028218,0.024443


In [16]:
data =[
    go.Bar(
    x =feature_df["features"].values,
    y = feature_df["mean"].values
    )
]

layout = go.Layout(
    title = "Feature Importance-Mean",
    yaxis =dict(
        title = "Importance", 
    )
)

fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename = "BAR")

In [17]:
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train[:,0],
     'ExtraTrees': et_oof_train[:,0],
     'AdaBoost': ada_oof_train[:,0],
      'GradientBoost': gb_oof_train[:,0]
    })
base_predictions_train.head(10)

Unnamed: 0,RandomForest,ExtraTrees,AdaBoost,GradientBoost
0,2.0,2.0,1.0,2.0
1,1.0,4.0,1.0,1.0
2,3.0,2.0,2.0,3.0
3,4.0,4.0,3.0,4.0
4,2.0,2.0,2.0,1.0
5,2.0,2.0,2.0,2.0
6,4.0,4.0,4.0,4.0
7,2.0,2.0,3.0,3.0
8,2.0,2.0,2.0,2.0
9,4.0,4.0,4.0,4.0


In [18]:
# Train에 ADD하기
# train = np.concatenate((train, et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis = 1)
# test = np.concatenate((test, et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)
train = np.concatenate((train, et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train), axis = 1)
test = np.concatenate((test, et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test), axis=1)

In [20]:
train

array([[  2.,   3., 299., ...,   2.,   1.,   2.],
       [  2.,   1., 265., ...,   1.,   1.,   1.],
       [  1.,   1., 307., ...,   3.,   2.,   3.],
       ...,
       [  2.,   2., 265., ...,   4.,   1.,   4.],
       [  2.,   9., 266., ...,   4.,   4.,   2.],
       [  1.,   1., 307., ...,   2.,   2.,   3.]])