# Classification (with class probabilities)
***

# Import Packages

In [1]:
# for creating dataset
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# general import
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss

# this package
from stack import StackModel, StackMaster

# models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

# Create dataset

In [2]:
# Load iris; map the integer targets to their species names so the labels are strings
iris = load_iris()
X = iris.data
y = list(iris.target_names[iris.target])

# Hold out 20% of the rows as the test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Convert the splits to pandas: features -> DataFrame with named columns,
# labels -> Series named 'species'
X_train, X_test = (
    pd.DataFrame(features, columns=iris.feature_names) for features in (X_train, X_test)
)
y_train, y_test = (
    pd.Series(labels, name='species') for labels in (y_train, y_test)
)

# Fit Stage 1 models

In [3]:
# Feature subset and hyper-parameters that are repeated across stage-1 models.
# Each model receives its own copy so no list/dict object is shared between them.
sepal_features = ['sepal length (cm)', 'sepal width (cm)']
tree_params = {'random_state': 0, 'n_jobs': -1, 'n_estimators': 100, 'max_depth': 3}

# One keyword set per stage-1 model; models without `x_names` are given no
# explicit feature subset, exactly as in a direct StackModel(...) call.
stage_1_specs = [
    dict(model_name='LinearDiscriminantAnalysis',
         model=LinearDiscriminantAnalysis,
         x_names=list(sepal_features)),
    dict(model_name='GaussianNB',
         model=GaussianNB,
         x_names=list(sepal_features)),
    dict(model_name='SVC',
         model=SVC,
         x_names=['petal length (cm)'],
         params={'probability': True}),  # SVC only exposes predict_proba when probability=True
    dict(model_name='RandomForestClassifier',
         model=RandomForestClassifier,
         params=dict(tree_params)),
    dict(model_name='ExtraTreesClassifier',
         model=ExtraTreesClassifier,
         params=dict(tree_params)),
]

# Every stage-1 model outputs class probabilities
models_1 = [StackModel(predict_proba=True, **spec) for spec in stage_1_specs]

# Fit all stage-1 models through a single StackMaster
master = StackMaster(models=models_1)
master.fit(X_train, y_train, refit=True)

2019-09-21 18:13:41,825 - stack - INFO - LinearDiscriminantAnalysis start fit
2019-09-21 18:13:41,925 - stack - INFO - LinearDiscriminantAnalysis end fit
2019-09-21 18:13:41,930 - stack - INFO - LinearDiscriminantAnalysis save fit pkl
2019-09-21 18:13:41,931 - stack - INFO - GaussianNB start fit
2019-09-21 18:13:42,006 - stack - INFO - GaussianNB end fit
2019-09-21 18:13:42,011 - stack - INFO - GaussianNB save fit pkl
2019-09-21 18:13:42,015 - stack - INFO - SVC start fit
2019-09-21 18:13:42,139 - stack - INFO - SVC end fit
2019-09-21 18:13:42,143 - stack - INFO - SVC save fit pkl
2019-09-21 18:13:42,143 - stack - INFO - RandomForestClassifier start fit
2019-09-21 18:13:44,225 - stack - INFO - RandomForestClassifier end fit
2019-09-21 18:13:44,274 - stack - INFO - RandomForestClassifier save fit pkl
2019-09-21 18:13:44,277 - stack - INFO - ExtraTreesClassifier start fit
2019-09-21 18:13:45,666 - stack - INFO - ExtraTreesClassifier end fit
2019-09-21 18:13:45,705 - stack - INFO - ExtraT

In [4]:
# Preview the first rows of the stage-1 predictions on the training data.
# The frame has one probability column per (model, class) pair, named
# '<model_name>_<class>'.
master.train_pred.head()

Unnamed: 0,LinearDiscriminantAnalysis_setosa,LinearDiscriminantAnalysis_versicolor,LinearDiscriminantAnalysis_virginica,GaussianNB_setosa,GaussianNB_versicolor,GaussianNB_virginica,SVC_setosa,SVC_versicolor,SVC_virginica,RandomForestClassifier_setosa,RandomForestClassifier_versicolor,RandomForestClassifier_virginica,ExtraTreesClassifier_setosa,ExtraTreesClassifier_versicolor,ExtraTreesClassifier_virginica
0,0.001035,0.398692,0.600273,0.000621,0.321633,0.677747,0.01511,0.03736,0.94753,0.0,0.017629,0.982371,0.001552,0.234305,0.764143
1,0.444009,0.502168,0.053823,0.436394,0.40994,0.153666,0.015953,0.902358,0.08169,0.05,0.866057,0.083943,0.057082,0.664547,0.278371
2,0.999134,0.000774,9.2e-05,0.966245,0.012834,0.020921,0.947637,0.026973,0.02539,1.0,0.0,0.0,0.946644,0.047512,0.005844
3,0.000455,0.674418,0.325127,0.001285,0.555101,0.443614,0.013847,0.566022,0.420131,0.000435,0.219801,0.779764,0.018933,0.374967,0.6061
4,4.3e-05,0.365783,0.634174,0.000206,0.455418,0.544376,0.015482,0.035536,0.948982,0.0,0.015863,0.984137,0.001364,0.129174,0.869463


# Fit Stage 2 model

In [5]:
# Hyper-parameters for the stage-2 (meta) learner
xgb_params = dict(random_state=0, n_jobs=-1, n_estimators=100, max_depth=3)

# Stage-2 model: an XGBoost classifier that outputs class probabilities
model_2 = StackModel(
    model_name='XGBClassifier',
    model=XGBClassifier,
    params=xgb_params,
    predict_proba=True,
)

# Train it on the stage-1 predictions of the training data
model_2.fit(master.train_pred, y_train, refit=True)

2019-09-21 18:13:45,787 - stack - INFO - XGBClassifier start fit
2019-09-21 18:13:46,117 - stack - INFO - XGBClassifier end fit
2019-09-21 18:13:46,122 - stack - INFO - XGBClassifier save fit pkl


In [6]:
# Preview the first rows of the stage-2 predictions on the training data:
# one probability column per class, named 'XGBClassifier_<class>'.
model_2.train_pred.head()

Unnamed: 0,XGBClassifier_setosa,XGBClassifier_versicolor,XGBClassifier_virginica
0,0.002139,0.002119,0.995742
1,0.003463,0.99348,0.003057
2,0.990743,0.004646,0.004611
3,0.003863,0.991694,0.004442
4,0.002139,0.002119,0.995742


# Predict test data

In [7]:
# Stage 1: predict class probabilities for the test data with every stage-1 model
master.predict(X_test, repredict=True)

# Stage 2: feed the stage-1 test predictions to the meta model.
# The frame is passed as-is, mirroring how the model was fitted on
# `master.train_pred`; the columns are already numeric probabilities, so no
# dummy encoding is needed (and `drop_first` could desynchronise the test
# columns from the training columns if a column ever were categorical).
model_2.predict(master.test_pred, repredict=True)

2019-09-21 18:13:46,176 - stack - INFO - LinearDiscriminantAnalysis start predict
2019-09-21 18:13:46,210 - stack - INFO - LinearDiscriminantAnalysis end predict
2019-09-21 18:13:46,212 - stack - INFO - LinearDiscriminantAnalysis save pred pkl
2019-09-21 18:13:46,213 - stack - INFO - GaussianNB start predict
2019-09-21 18:13:46,260 - stack - INFO - GaussianNB end predict
2019-09-21 18:13:46,263 - stack - INFO - GaussianNB save pred pkl
2019-09-21 18:13:46,268 - stack - INFO - SVC start predict
2019-09-21 18:13:46,319 - stack - INFO - SVC end predict
2019-09-21 18:13:46,324 - stack - INFO - SVC save pred pkl
2019-09-21 18:13:46,325 - stack - INFO - RandomForestClassifier start predict
2019-09-21 18:13:46,875 - stack - INFO - RandomForestClassifier end predict
2019-09-21 18:13:46,878 - stack - INFO - RandomForestClassifier save pred pkl
2019-09-21 18:13:46,879 - stack - INFO - ExtraTreesClassifier start predict
2019-09-21 18:13:47,414 - stack - INFO - ExtraTreesClassifier end predict
201

In [8]:
# Preview the first rows of the stage-1 predictions on the test data.
# The frame has one probability column per (model, class) pair, named
# '<model_name>_<class>'.
master.test_pred.head()

Unnamed: 0,LinearDiscriminantAnalysis_setosa,LinearDiscriminantAnalysis_versicolor,LinearDiscriminantAnalysis_virginica,GaussianNB_setosa,GaussianNB_versicolor,GaussianNB_virginica,SVC_setosa,SVC_versicolor,SVC_virginica,RandomForestClassifier_setosa,RandomForestClassifier_versicolor,RandomForestClassifier_virginica,ExtraTreesClassifier_setosa,ExtraTreesClassifier_versicolor,ExtraTreesClassifier_virginica
0,0.003663562,0.751172,0.2451643,0.02548262,0.733363,0.241154,0.013592,0.138711,0.847697,0.0,0.094794,0.905206,0.009703,0.186974,0.803322
1,1.552051e-06,0.56276,0.437238,0.001319613,0.856047,0.142634,0.017314,0.963385,0.019301,0.001051,0.947587,0.051362,0.061603,0.822203,0.116194
2,0.9999851,1.4e-05,7.849685e-07,0.9957541,0.000737,0.003509,0.948372,0.026241,0.025386,0.971564,0.028257,0.000179,0.965996,0.030511,0.003494
3,2.534461e-08,0.040878,0.9591224,5.764813e-09,0.038775,0.961225,0.020198,0.068364,0.911439,0.0,0.00907,0.99093,0.002365,0.142461,0.855174
4,0.998417,0.001547,3.581474e-05,0.966143,0.02319,0.010667,0.946957,0.027167,0.025876,1.0,0.0,0.0,0.942916,0.048192,0.008892


In [9]:
# Preview the first rows of the stage-2 predictions on the test data:
# one probability column per class, named 'XGBClassifier_<class>'.
model_2.test_pred.head()

Unnamed: 0,XGBClassifier_setosa,XGBClassifier_versicolor,XGBClassifier_virginica
0,0.003433,0.004442,0.992125
1,0.004774,0.990382,0.004844
2,0.990865,0.004198,0.004938
3,0.003715,0.00422,0.992065
4,0.990865,0.004198,0.004938


In [10]:
# Log loss (lower is better) on the test data for each stage-1 model and for
# the stage-2 stacked model.
# log_loss expects the probability columns in sorted label order; the
# prediction columns (setosa, versicolor, virginica) already are.
stage_1_names = [
    'LinearDiscriminantAnalysis',
    'GaussianNB',
    'SVC',
    'RandomForestClassifier',
    'ExtraTreesClassifier',
]
score_line = '                    {:.4f}'

for stage_1_name in stage_1_names:
    # Columns are named '<model_name>_<class>'; matching on the trailing '_'
    # keeps one model name from matching another that merely shares its prefix.
    is_model_col = master.test_pred.columns.str.startswith(stage_1_name + '_')
    print('Stage 1 : {}'.format(stage_1_name))
    print(score_line.format(log_loss(y_test, master.test_pred.loc[:, is_model_col])))

print('Stage 2 : XGBClassifier')
print(score_line.format(log_loss(y_test, model_2.test_pred)))

Stage 1 : LinearDiscriminantAnalysis
                    0.5020
Stage 1 : GaussianNB
                    0.5234
Stage 1 : SVC
                    0.1581
Stage 1 : RandomForestClassifier
                    0.0822
Stage 1 : ExtraTreesClassifier
                    0.2404
Stage 2 : XGBClassifier
                    0.0454
