# Classification (with class labels)
***

# Import Packages

In [1]:
# for creating dataset
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# general import
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

# this package
from stack import StackModel, StackMaster

# models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

# Create dataset

In [2]:
# Load iris; replace the integer targets with their species names so the
# models are trained on string class labels.
iris = load_iris()
X = iris.data
y = [iris.target_names[label] for label in iris.target]

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
# As usual in a machine learning task, this yields X_train, y_train and X_test
# (plus y_test, used later for scoring).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Convert the features to pandas DataFrames and the labels to pandas Series
X_train, X_test = (
    pd.DataFrame(part, columns=iris.feature_names) for part in (X_train, X_test)
)
y_train, y_test = (
    pd.Series(part, name='species') for part in (y_train, y_test)
)

# Fit Stage 1 models

In [3]:
# Stage-1 learners. The first three are given an explicit subset of feature
# columns through x_names; the two tree ensembles are given no x_names.
model_1 = [
    StackModel(
        regression=False,
        model_name='LinearDiscriminantAnalysis',
        x_names=['sepal length (cm)', 'sepal width (cm)'],
        model=LinearDiscriminantAnalysis(),
    ),
    StackModel(
        regression=False,
        model_name='GaussianNB',
        x_names=['sepal length (cm)', 'sepal width (cm)'],
        model=GaussianNB(),
    ),
    StackModel(
        regression=False,
        model_name='SVC',
        x_names=['petal length (cm)'],
        model=SVC(),
    ),
    StackModel(
        regression=False,
        model_name='RandomForestClassifier',
        model=RandomForestClassifier(
            n_estimators=100, max_depth=3, random_state=0, n_jobs=-1
        ),
    ),
    StackModel(
        regression=False,
        model_name='ExtraTreesClassifier',
        model=ExtraTreesClassifier(
            n_estimators=100, max_depth=3, random_state=0, n_jobs=-1
        ),
    ),
]

# Wrap the stage-1 models in a StackMaster and fit them on the training data
master = StackMaster(models=model_1)
master.fit(X_train, y_train, refit=True)

2019-09-22 22:56:10,996 - stack - INFO - LinearDiscriminantAnalysis start fit
2019-09-22 22:56:11,057 - stack - INFO - LinearDiscriminantAnalysis end fit
2019-09-22 22:56:11,061 - stack - INFO - LinearDiscriminantAnalysis save fit pkl
2019-09-22 22:56:11,062 - stack - INFO - GaussianNB start fit
2019-09-22 22:56:11,172 - stack - INFO - GaussianNB end fit
2019-09-22 22:56:11,187 - stack - INFO - GaussianNB save fit pkl
2019-09-22 22:56:11,189 - stack - INFO - SVC start fit
2019-09-22 22:56:11,263 - stack - INFO - SVC end fit
2019-09-22 22:56:11,267 - stack - INFO - SVC save fit pkl
2019-09-22 22:56:11,268 - stack - INFO - RandomForestClassifier start fit
2019-09-22 22:56:12,867 - stack - INFO - RandomForestClassifier end fit
2019-09-22 22:56:12,903 - stack - INFO - RandomForestClassifier save fit pkl
2019-09-22 22:56:12,905 - stack - INFO - ExtraTreesClassifier start fit
2019-09-22 22:56:14,215 - stack - INFO - ExtraTreesClassifier end fit
2019-09-22 22:56:14,249 - stack - INFO - ExtraT

In [4]:
# Preview the first rows of the stage-1 predictions for the training data
# (one column per stage-1 model)
master.S_train.head(n=5)

Unnamed: 0,LinearDiscriminantAnalysis,GaussianNB,SVC,RandomForestClassifier,ExtraTreesClassifier
0,virginica,virginica,virginica,virginica,virginica
1,versicolor,setosa,versicolor,versicolor,versicolor
2,setosa,setosa,setosa,setosa,setosa
3,versicolor,versicolor,versicolor,virginica,virginica
4,virginica,virginica,virginica,virginica,virginica


# Fit Stage 2 model

In [5]:
# Stage-2 learner: an XGBoost classifier stacked on the stage-1 predictions
model_2 = StackModel(
    regression=False,
    model_name='XGBClassifier',
    model=XGBClassifier(
        n_estimators=100, max_depth=3, random_state=0, n_jobs=-1
    ),
)

# The stage-1 predictions are class-label strings, so one-hot encode them
# (dropping the first level of each column) before fitting stage 2.
model_2.fit(
    pd.get_dummies(master.S_train, drop_first=True),
    y_train,
    refit=True,
)

2019-09-22 22:56:14,332 - stack - INFO - XGBClassifier start fit
2019-09-22 22:56:14,618 - stack - INFO - XGBClassifier end fit
2019-09-22 22:56:14,623 - stack - INFO - XGBClassifier save fit pkl


In [6]:
# Preview the first rows of the stage-2 predictions for the training data
model_2.S_train.head(n=5)

0     virginica
1    versicolor
2        setosa
3     virginica
4     virginica
Name: XGBClassifier, dtype: object

# Predict test data

In [7]:
# Stage 1: predict the test set with every fitted stage-1 model
master.predict(X_test, repredict=True)

# Stage 2: one-hot encode the stage-1 test predictions with exactly the same
# column layout that was used when fitting model_2.
# Encoding the test predictions on their own with drop_first=True is fragile:
# if a stage-1 model never predicts some class on the test split, the dummy
# columns (and which level gets dropped) differ from the training layout.
# So encode every level present in the test predictions, then align to the
# training dummy columns, filling levels that are absent in test with 0.
train_dummy_columns = pd.get_dummies(master.S_train, drop_first=True).columns
S_test_dummies = pd.get_dummies(master.S_test).reindex(
    columns=train_dummy_columns, fill_value=0
)
model_2.predict(S_test_dummies, repredict=True)

2019-09-22 22:56:14,654 - stack - INFO - LinearDiscriminantAnalysis start predict
2019-09-22 22:56:14,670 - stack - INFO - LinearDiscriminantAnalysis end predict
2019-09-22 22:56:14,692 - stack - INFO - LinearDiscriminantAnalysis save pred pkl
2019-09-22 22:56:14,697 - stack - INFO - GaussianNB start predict
2019-09-22 22:56:14,726 - stack - INFO - GaussianNB end predict
2019-09-22 22:56:14,729 - stack - INFO - GaussianNB save pred pkl
2019-09-22 22:56:14,737 - stack - INFO - SVC start predict
2019-09-22 22:56:14,786 - stack - INFO - SVC end predict
2019-09-22 22:56:14,795 - stack - INFO - SVC save pred pkl
2019-09-22 22:56:14,805 - stack - INFO - RandomForestClassifier start predict
2019-09-22 22:56:15,369 - stack - INFO - RandomForestClassifier end predict
2019-09-22 22:56:15,372 - stack - INFO - RandomForestClassifier save pred pkl
2019-09-22 22:56:15,373 - stack - INFO - ExtraTreesClassifier start predict
2019-09-22 22:56:15,912 - stack - INFO - ExtraTreesClassifier end predict
201

In [8]:
# Preview the first rows of the stage-1 predictions for the test data
# (one column per stage-1 model)
master.S_test.head(n=5)

Unnamed: 0,LinearDiscriminantAnalysis,GaussianNB,SVC,RandomForestClassifier,ExtraTreesClassifier
0,versicolor,versicolor,virginica,virginica,virginica
1,versicolor,versicolor,versicolor,versicolor,versicolor
2,setosa,setosa,setosa,setosa,setosa
3,virginica,virginica,virginica,virginica,virginica
4,setosa,setosa,setosa,setosa,setosa


In [9]:
# Preview the first rows of the stage-2 predictions for the test data
model_2.S_test.head(n=5)

0     virginica
1    versicolor
2        setosa
3     virginica
4        setosa
Name: XGBClassifier, dtype: object

In [10]:
# Test-set score of every model, as returned by each model's evaluate(y_test)
# (described in the original notebook as accuracy_score).
for stage_1_name in (
    'LinearDiscriminantAnalysis',
    'GaussianNB',
    'SVC',
    'RandomForestClassifier',
    'ExtraTreesClassifier',
):
    print('Stage 1 : ' + stage_1_name)
    print('                    {:.4f}'.format(master.models[stage_1_name].evaluate(y_test)))

print('Stage 2 : XGBClassifier')
print('                    {:.4f}'.format(model_2.evaluate(y_test)))

Stage 1 : LinearDiscriminantAnalysis
                    0.7333
Stage 1 : GaussianNB
                    0.7333
Stage 1 : SVC
                    0.9667
Stage 1 : RandomForestClassifier
                    0.9667
Stage 1 : ExtraTreesClassifier
                    0.9667
Stage 2 : XGBClassifier
                    1.0000
