# Classification (with class probabilities)
***

# Import Packages

In [1]:
# for creating dataset
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# general import
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss

# this package
from stack import StackModel, StackMaster

# models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

# Create dataset

In [2]:
# Load iris and swap the integer class codes for their species names,
# so the targets are human-readable labels
iris = load_iris()
X = iris.data
y = [iris.target_names[code] for code in iris.target]

# Hold out 20% of the samples as the test set; the fixed random_state
# keeps the split identical between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Convert to pandas objects: the features become DataFrames carrying the
# iris column names, the labels become Series named 'species'
X_train, X_test = (
    pd.DataFrame(part, columns=iris.feature_names) for part in (X_train, X_test)
)
y_train, y_test = (
    pd.Series(part, name='species') for part in (y_train, y_test)
)

# Fit stage 1 models

In [3]:
# Stage 1: five base classifiers, each wrapped in a StackModel.
# Every wrapper sets predict_proba=True; the resulting master.S_train has one
# probability column per (model, class) pair, e.g. 'SVC_setosa'.
# x_names presumably restricts a model to the listed feature columns, and
# models without x_names presumably use all columns — confirm against StackModel.
model_1 = [
    StackModel(
        model_name='LinearDiscriminantAnalysis',
        model=LinearDiscriminantAnalysis(),
        x_names=['sepal length (cm)', 'sepal width (cm)'],
        predict_proba=True),
    StackModel(
        model_name='GaussianNB',
        model=GaussianNB(),
        x_names=['sepal length (cm)', 'sepal width (cm)'],
        predict_proba=True),
    StackModel(
        model_name='SVC',
        # probability=True makes SVC run an internal cross-validation that
        # shuffles the data; seed it like the other estimators so the
        # SVC_* probabilities are reproducible on a fresh kernel run.
        model=SVC(probability=True, random_state=0),
        x_names=['petal length (cm)'],
        predict_proba=True),
    StackModel(
        model_name='RandomForestClassifier',
        model=RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=100, max_depth=3),
        predict_proba=True),
    StackModel(
        model_name='ExtraTreesClassifier',
        model=ExtraTreesClassifier(random_state=0, n_jobs=-1, n_estimators=100, max_depth=3),
        predict_proba=True)
]

# Fit all stage-1 models through a single StackMaster.
# refit=True presumably forces a fresh fit instead of reusing a previously
# saved one (the log reports "save fit pkl" per model) — confirm against StackMaster.
master = StackMaster(models=model_1)
master.fit(X_train, y_train, refit=True)

2019-09-22 23:12:04,255 - stack - INFO - LinearDiscriminantAnalysis start fit
2019-09-22 23:12:04,320 - stack - INFO - LinearDiscriminantAnalysis end fit
2019-09-22 23:12:04,329 - stack - INFO - LinearDiscriminantAnalysis save fit pkl
2019-09-22 23:12:04,331 - stack - INFO - GaussianNB start fit
2019-09-22 23:12:04,425 - stack - INFO - GaussianNB end fit
2019-09-22 23:12:04,428 - stack - INFO - GaussianNB save fit pkl
2019-09-22 23:12:04,429 - stack - INFO - SVC start fit
2019-09-22 23:12:04,560 - stack - INFO - SVC end fit
2019-09-22 23:12:04,563 - stack - INFO - SVC save fit pkl
2019-09-22 23:12:04,564 - stack - INFO - RandomForestClassifier start fit
2019-09-22 23:12:05,982 - stack - INFO - RandomForestClassifier end fit
2019-09-22 23:12:06,037 - stack - INFO - RandomForestClassifier save fit pkl
2019-09-22 23:12:06,040 - stack - INFO - ExtraTreesClassifier start fit
2019-09-22 23:12:07,934 - stack - INFO - ExtraTreesClassifier end fit
2019-09-22 23:12:08,023 - stack - INFO - ExtraT

In [4]:
# Preview the first rows of the stage-1 output for the training data:
# one class-probability column per (model, class) pair, e.g. 'SVC_setosa'
master.S_train.head()

Unnamed: 0,LinearDiscriminantAnalysis_setosa,LinearDiscriminantAnalysis_versicolor,LinearDiscriminantAnalysis_virginica,GaussianNB_setosa,GaussianNB_versicolor,GaussianNB_virginica,SVC_setosa,SVC_versicolor,SVC_virginica,RandomForestClassifier_setosa,RandomForestClassifier_versicolor,RandomForestClassifier_virginica,ExtraTreesClassifier_setosa,ExtraTreesClassifier_versicolor,ExtraTreesClassifier_virginica
0,0.001035,0.398692,0.600273,0.000621,0.321633,0.677747,0.014828,0.033913,0.951258,0.0,0.017629,0.982371,0.001552,0.234305,0.764143
1,0.444009,0.502168,0.053823,0.436394,0.40994,0.153666,0.015871,0.9042,0.079929,0.05,0.866057,0.083943,0.057082,0.664547,0.278371
2,0.999134,0.000774,9.2e-05,0.966245,0.012834,0.020921,0.948853,0.027434,0.023713,1.0,0.0,0.0,0.946644,0.047512,0.005844
3,0.000455,0.674418,0.325127,0.001285,0.555101,0.443614,0.01375,0.55841,0.427841,0.000435,0.219801,0.779764,0.018933,0.374967,0.6061
4,4.3e-05,0.365783,0.634174,0.000206,0.455418,0.544376,0.015196,0.032225,0.952579,0.0,0.015863,0.984137,0.001364,0.129174,0.869463


# Fit stage 2 model

In [5]:
# Stage 2: a single XGBoost meta-model stacked on top of the stage-1 outputs
model_2 = StackModel(
    model=XGBClassifier(n_estimators=100, max_depth=3, n_jobs=-1, random_state=0),
    model_name='XGBClassifier',
    predict_proba=True,
)

# Train the meta-model on the stage-1 train predictions (master.S_train)
# against the true species labels
model_2.fit(master.S_train, y_train, refit=True)

2019-09-22 23:12:08,121 - stack - INFO - XGBClassifier start fit
2019-09-22 23:12:08,491 - stack - INFO - XGBClassifier end fit
2019-09-22 23:12:08,496 - stack - INFO - XGBClassifier save fit pkl


In [6]:
# Preview the stage-2 (XGBClassifier) class probabilities for the training data
model_2.S_train.head()

Unnamed: 0,XGBClassifier_setosa,XGBClassifier_versicolor,XGBClassifier_virginica
0,0.002214,0.002227,0.995559
1,0.003512,0.993354,0.003135
2,0.990761,0.00492,0.004319
3,0.00367,0.992111,0.004219
4,0.002214,0.002227,0.995559


# Predict test data

In [7]:
# Stage 1 first: score X_test with every base model.
# repredict=True presumably forces fresh predictions instead of reusing
# previously saved ones (the log reports "save pred pkl") — TODO confirm.
master.predict(X_test, repredict=True)
# Stage 2 reads master.S_test, which the call above is expected to populate,
# so these two lines must stay in this order.
model_2.predict(master.S_test, repredict=True)

2019-09-22 23:12:08,583 - stack - INFO - LinearDiscriminantAnalysis start predict
2019-09-22 23:12:08,621 - stack - INFO - LinearDiscriminantAnalysis end predict
2019-09-22 23:12:08,623 - stack - INFO - LinearDiscriminantAnalysis save pred pkl
2019-09-22 23:12:08,625 - stack - INFO - GaussianNB start predict
2019-09-22 23:12:08,685 - stack - INFO - GaussianNB end predict
2019-09-22 23:12:08,696 - stack - INFO - GaussianNB save pred pkl
2019-09-22 23:12:08,697 - stack - INFO - SVC start predict
2019-09-22 23:12:08,762 - stack - INFO - SVC end predict
2019-09-22 23:12:08,765 - stack - INFO - SVC save pred pkl
2019-09-22 23:12:08,767 - stack - INFO - RandomForestClassifier start predict
2019-09-22 23:12:09,343 - stack - INFO - RandomForestClassifier end predict
2019-09-22 23:12:09,346 - stack - INFO - RandomForestClassifier save pred pkl
2019-09-22 23:12:09,349 - stack - INFO - ExtraTreesClassifier start predict
2019-09-22 23:12:09,882 - stack - INFO - ExtraTreesClassifier end predict
201

In [8]:
# Preview the first rows of the stage-1 output for the test data
# (same column layout as master.S_train: one column per model and class)
master.S_test.head()

Unnamed: 0,LinearDiscriminantAnalysis_setosa,LinearDiscriminantAnalysis_versicolor,LinearDiscriminantAnalysis_virginica,GaussianNB_setosa,GaussianNB_versicolor,GaussianNB_virginica,SVC_setosa,SVC_versicolor,SVC_virginica,RandomForestClassifier_setosa,RandomForestClassifier_versicolor,RandomForestClassifier_virginica,ExtraTreesClassifier_setosa,ExtraTreesClassifier_versicolor,ExtraTreesClassifier_virginica
0,0.003663562,0.751172,0.2451643,0.02548262,0.733363,0.241154,0.013536,0.133241,0.853223,0.0,0.094794,0.905206,0.009703,0.186974,0.803322
1,1.552051e-06,0.56276,0.437238,0.001319613,0.856047,0.142634,0.017176,0.965036,0.017788,0.001051,0.947587,0.051362,0.061603,0.822203,0.116194
2,0.9999851,1.4e-05,7.849685e-07,0.9957541,0.000737,0.003509,0.948531,0.026367,0.025102,0.971564,0.028257,0.000179,0.965996,0.030511,0.003494
3,2.534461e-08,0.040878,0.9591224,5.764813e-09,0.038775,0.961225,0.019833,0.064476,0.915692,0.0,0.00907,0.99093,0.002365,0.142461,0.855174
4,0.998417,0.001547,3.581474e-05,0.966143,0.02319,0.010667,0.947124,0.0273,0.025576,1.0,0.0,0.0,0.942916,0.048192,0.008892


In [9]:
# Preview the stage-2 (XGBClassifier) class probabilities for the test data
model_2.S_test.head()

Unnamed: 0,XGBClassifier_setosa,XGBClassifier_versicolor,XGBClassifier_virginica
0,0.00351,0.004947,0.991543
1,0.004578,0.990516,0.004906
2,0.990839,0.004236,0.004925
3,0.003666,0.003915,0.992419
4,0.990895,0.004236,0.004869


In [10]:
# log_loss for test data
# Each stage-1 model is looked up by name in master.models and scored against
# y_test; a loop replaces the five copy-pasted print pairs and prints exactly
# the same text. evaluate() presumably scores the test predictions stored by
# the earlier predict() calls — confirm against StackModel.
stage_1_names = [
    'LinearDiscriminantAnalysis',
    'GaussianNB',
    'SVC',
    'RandomForestClassifier',
    'ExtraTreesClassifier',
]
for name in stage_1_names:
    print('Stage 1 : {}'.format(name))
    print('                    {:.4f}'.format(master.models[name].evaluate(y_test)))

# The stage-2 meta-model is a standalone StackModel, so it is scored directly
print('Stage 2 : XGBClassifier')
print('                    {:.4f}'.format(model_2.evaluate(y_test)))

Stage 1 : LinearDiscriminantAnalysis
                    0.5020
Stage 1 : GaussianNB
                    0.5234
Stage 1 : SVC
                    0.1558
Stage 1 : RandomForestClassifier
                    0.0822
Stage 1 : ExtraTreesClassifier
                    0.2404
Stage 2 : XGBClassifier
                    0.0467
