# Classification (with class labels)
***

# Import Packages

In [1]:
# for creating dataset
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# general import
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

# this package
from stack import StackModel, StackMaster

# models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

# Create dataset

In [2]:
# Load iris and replace the integer targets with their species names
iris = load_iris()
X = iris.data
y = [iris.target_names[label] for label in iris.target]

# Hold out 20% of the rows as the test set (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Wrap the splits in pandas objects: named feature columns for X, a named Series for y
feature_columns = iris.feature_names
X_train, X_test = (
    pd.DataFrame(part, columns=feature_columns) for part in (X_train, X_test)
)
y_train, y_test = (
    pd.Series(part, name='species') for part in (y_train, y_test)
)

# Fit stage 1 models

In [3]:
# Settings shared by several stage-1 models; each model receives its own copy
sepal_features = ['sepal length (cm)', 'sepal width (cm)']
tree_params = {'random_state': 0, 'n_jobs': -1, 'n_estimators': 100, 'max_depth': 3}

# Stage-1 models: two on the sepal features, one on petal length only,
# and two tree ensembles that are given no explicit x_names
models_1 = [
    StackModel(
        model_name='LinearDiscriminantAnalysis',
        model=LinearDiscriminantAnalysis,
        x_names=list(sepal_features),
        merge_method='mode',
    ),
    StackModel(
        model_name='GaussianNB',
        model=GaussianNB,
        x_names=list(sepal_features),
        merge_method='mode',
    ),
    StackModel(
        model_name='SVC',
        model=SVC,
        x_names=['petal length (cm)'],
        merge_method='mode',
    ),
    StackModel(
        model_name='RandomForestClassifier',
        model=RandomForestClassifier,
        params=dict(tree_params),
        merge_method='mode',
    ),
    StackModel(
        model_name='ExtraTreesClassifier',
        model=ExtraTreesClassifier,
        params=dict(tree_params),
        merge_method='mode',
    ),
]

# Fit every stage-1 model on the training split (refit=True as in the original run)
master = StackMaster(models=models_1)
master.fit(X_train, y_train, refit=True)

2019-09-21 18:14:17,935 - stack - INFO - LinearDiscriminantAnalysis start fit
2019-09-21 18:14:17,991 - stack - INFO - LinearDiscriminantAnalysis end fit
2019-09-21 18:14:18,000 - stack - INFO - LinearDiscriminantAnalysis save fit pkl
2019-09-21 18:14:18,003 - stack - INFO - GaussianNB start fit
2019-09-21 18:14:18,092 - stack - INFO - GaussianNB end fit
2019-09-21 18:14:18,119 - stack - INFO - GaussianNB save fit pkl
2019-09-21 18:14:18,120 - stack - INFO - SVC start fit
2019-09-21 18:14:18,198 - stack - INFO - SVC end fit
2019-09-21 18:14:18,202 - stack - INFO - SVC save fit pkl
2019-09-21 18:14:18,204 - stack - INFO - RandomForestClassifier start fit
2019-09-21 18:14:19,865 - stack - INFO - RandomForestClassifier end fit
2019-09-21 18:14:19,902 - stack - INFO - RandomForestClassifier save fit pkl
2019-09-21 18:14:19,903 - stack - INFO - ExtraTreesClassifier start fit
2019-09-21 18:14:21,322 - stack - INFO - ExtraTreesClassifier end fit
2019-09-21 18:14:21,356 - stack - INFO - ExtraT

In [4]:
# Preview the stage-1 predictions on the training data (one column per model)
master.train_pred.head(n=5)

Unnamed: 0,LinearDiscriminantAnalysis,GaussianNB,SVC,RandomForestClassifier,ExtraTreesClassifier
0,virginica,virginica,virginica,virginica,virginica
1,versicolor,setosa,versicolor,versicolor,versicolor
2,setosa,setosa,setosa,setosa,setosa
3,versicolor,versicolor,versicolor,virginica,virginica
4,virginica,virginica,virginica,virginica,virginica


# Fit stage 2 model

In [5]:
# Stage 2: a single XGBoost classifier stacked on top of the stage-1 predictions
xgb_params = {'random_state': 0, 'n_jobs': -1, 'n_estimators': 100, 'max_depth': 3}
model_2 = StackModel(
    model_name='XGBClassifier',
    model=XGBClassifier,
    params=xgb_params,
    merge_method='mode',
)

# The stage-1 predictions are class-label strings, so one-hot encode them
# (dropping the first level of each column) before fitting
stage2_train_features = pd.get_dummies(master.train_pred, drop_first=True)
model_2.fit(stage2_train_features, y_train, refit=True)

2019-09-21 18:14:21,468 - stack - INFO - XGBClassifier start fit
2019-09-21 18:14:21,686 - stack - INFO - XGBClassifier end fit
2019-09-21 18:14:21,700 - stack - INFO - XGBClassifier save fit pkl


In [6]:
# Preview the stage-2 (XGBClassifier) predictions on the training data
model_2.train_pred.head(n=5)

0     virginica
1    versicolor
2        setosa
3     virginica
4     virginica
Name: XGBClassifier, dtype: object

# Predict test data

In [7]:
# Stage 1: predict the test split with every fitted stage-1 model
master.predict(X_test, repredict=True)

# Stage 2 must see exactly the dummy columns it was fitted on. Encoding the test
# predictions on their own with drop_first=True is fragile: if a stage-1 model
# never predicts some class on the test split, a different level is dropped (or a
# column goes missing) and the feature matrix no longer matches the fitted one.
# Instead, encode every level and align to the training layout: columns absent
# from the test encoding are filled with 0, and levels never seen during
# training are discarded.
stage2_columns = pd.get_dummies(master.train_pred, drop_first=True).columns
stage2_test_features = pd.get_dummies(master.test_pred).reindex(
    columns=stage2_columns, fill_value=0
)
model_2.predict(stage2_test_features, repredict=True)

2019-09-21 18:14:21,763 - stack - INFO - LinearDiscriminantAnalysis start predict
2019-09-21 18:14:21,799 - stack - INFO - LinearDiscriminantAnalysis end predict
2019-09-21 18:14:21,828 - stack - INFO - LinearDiscriminantAnalysis save pred pkl
2019-09-21 18:14:21,850 - stack - INFO - GaussianNB start predict
2019-09-21 18:14:21,865 - stack - INFO - GaussianNB end predict
2019-09-21 18:14:21,874 - stack - INFO - GaussianNB save pred pkl
2019-09-21 18:14:21,878 - stack - INFO - SVC start predict
2019-09-21 18:14:21,917 - stack - INFO - SVC end predict
2019-09-21 18:14:21,919 - stack - INFO - SVC save pred pkl
2019-09-21 18:14:21,920 - stack - INFO - RandomForestClassifier start predict
2019-09-21 18:14:22,482 - stack - INFO - RandomForestClassifier end predict
2019-09-21 18:14:22,485 - stack - INFO - RandomForestClassifier save pred pkl
2019-09-21 18:14:22,486 - stack - INFO - ExtraTreesClassifier start predict
2019-09-21 18:14:23,056 - stack - INFO - ExtraTreesClassifier end predict
201

In [8]:
# Preview the stage-1 predictions on the test data (one column per model)
master.test_pred.head(n=5)

Unnamed: 0,LinearDiscriminantAnalysis,GaussianNB,SVC,RandomForestClassifier,ExtraTreesClassifier
0,versicolor,versicolor,virginica,virginica,virginica
1,versicolor,versicolor,versicolor,versicolor,versicolor
2,setosa,setosa,setosa,setosa,setosa
3,virginica,virginica,virginica,virginica,virginica
4,setosa,setosa,setosa,setosa,setosa


In [9]:
# Preview the stage-2 (XGBClassifier) predictions on the test data
model_2.test_pred.head(n=5)

0     virginica
1    versicolor
2        setosa
3     virginica
4        setosa
Name: XGBClassifier, dtype: object

In [10]:
# Accuracy on the test data for each stage-1 model and for the stage-2 model.
# (The metric is accuracy_score, not MAE: the targets are class labels.)
# Stage-1 entries follow the column order of master.test_pred, so any model
# added to stage 1 is reported without editing this cell.
stage_results = [
    ('Stage 1', model_name, master.test_pred[model_name])
    for model_name in master.test_pred.columns
]
stage_results.append(('Stage 2', 'XGBClassifier', model_2.test_pred))

for stage, model_name, predictions in stage_results:
    print('{} : {}'.format(stage, model_name))
    print('                    {:.4f}'.format(accuracy_score(y_test, predictions)))

Stage 1 : LinearDiscriminantAnalysis
                    0.7333
Stage 1 : GaussianNB
                    0.7333
Stage 1 : SVC
                    0.9667
Stage 1 : RandomForestClassifier
                    0.9667
Stage 1 : ExtraTreesClassifier
                    0.9667
Stage 2 : XGBClassifier
                    1.0000
