# Classification (with probabilities) + Detailed workflow
***

# Import

In [1]:
from glob import glob
import re 
import numpy as np
np.random.seed(0) # ensure reproducibility
np.set_printoptions(suppress = True)
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import log_loss
# Models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# NN
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
# Stacking
from vecstack import stacking

Using Theano backend.


# Prepare data

In [2]:
# Number of target classes; reused later when counting probability columns
n_classes = 3

# Synthetic dataset: 500 examples, 5 features, 3 classes
X, y = make_classification(
    n_samples=500,
    n_features=5,
    n_informative=3,
    n_redundant=1,
    n_classes=n_classes,
    flip_y=0,
    random_state=0,
)

# Hold out 20% of the rows as the test set.
# As usual in a machine learning task we end up with X_train, y_train and X_test
# (y_test is kept too, so the final prediction can be scored).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

print('Train shape:', X_train.shape)
print('Test shape: ', X_test.shape)

Train shape: (400, 5)
Test shape:  (100, 5)


# Initialize 1st level models

In [3]:
def build_keras_model_1():
    """Build and compile a small feed-forward softmax classifier.

    The network has one hidden ``Dense`` layer of 64 ReLU units followed by a
    ``Dense`` softmax layer with ``n_classes`` outputs. The input width is
    read from the notebook-level ``X_train`` and the output width from the
    notebook-level ``n_classes``.

    Returns:
        The compiled ``Sequential`` model.
    """
    hidden = Dense(64,
                   input_dim=X_train.shape[1],
                   kernel_initializer='normal',
                   activation='relu')
    output = Dense(n_classes,
                   kernel_initializer='normal',
                   activation='softmax')

    # Layers are added in list order: hidden first, then the softmax output
    network = Sequential([hidden, output])
    network.compile(optimizer='rmsprop',
                    loss='categorical_crossentropy',
                    metrics=['categorical_accuracy'])
    return network

# Caution! Every model and parameter value below is purely
# demonstrational and must not be read as a recommendation.
models_1 = [
    # Naive Bayes with default settings
    GaussianNB(),

    # Linear model
    LogisticRegression(random_state=0),

    # Tree ensembles, all limited to 100 trees of depth 3
    ExtraTreesClassifier(
        n_estimators=100, max_depth=3,
        random_state=0, n_jobs=-1,
    ),
    RandomForestClassifier(
        n_estimators=100, max_depth=3,
        random_state=0, n_jobs=-1,
    ),

    # Gradient boosting, same size limits plus a 0.1 learning rate
    XGBClassifier(
        n_estimators=100, max_depth=3, learning_rate=0.1,
        random_state=0, n_jobs=-1,
    ),
    LGBMClassifier(
        n_estimators=100, max_depth=3, learning_rate=0.1,
        random_state=0, n_jobs=-1,
    ),

    # Small neural network defined by build_keras_model_1
    KerasClassifier(
        build_fn=build_keras_model_1,
        epochs=2, batch_size=32, verbose=0,
    ),
]

# Perform stacking

In [4]:
# Run stacking for the 1st level models; returns the stacked features for
# the train set and for the test set.
S_train_1, S_test_1 = stacking(
    # list of models
    models_1,
    # data
    X_train, y_train, X_test,
    # classification task (if you need regression - set to True)
    regression=False,
    # mode: oof for train set, fit on full train and predict test set once
    mode='oof_pred',
    # predict probabilities (if you need class labels - set to False)
    needs_proba=True,
    # save result and log in current dir (to disable saving - set to None)
    save_dir='.',
    # metric: callable
    metric=log_loss,
    # number of folds
    n_folds=5,
    # stratified split for folds
    stratified=True,
    # shuffle the data
    shuffle=True,
    # ensure reproducibility
    random_state=0,
    # print all info
    verbose=2,
)

task:       [classification]
n_classes:  [3]
metric:     [log_loss]
mode:       [oof_pred]
n_models:   [7]

model 0:    [GaussianNB]
    fold 0: [0.65551778]
    fold 1: [0.42335961]
    fold 2: [0.38132309]
    fold 3: [0.57180128]
    fold 4: [0.30426116]
    ----
    MEAN:   [0.46725258] + [0.12825825]
    FULL:   [0.46842847]

    Fitting on full train set...

model 1:    [LogisticRegression]
    fold 0: [0.59821304]
    fold 1: [0.54202039]
    fold 2: [0.55194968]
    fold 3: [0.46887313]
    fold 4: [0.44995007]
    ----
    MEAN:   [0.52220126] + [0.05499033]
    FULL:   [0.52280210]

    Fitting on full train set...

model 2:    [ExtraTreesClassifier]
    fold 0: [0.79961086]
    fold 1: [0.75093790]
    fold 2: [0.77930597]
    fold 3: [0.76984042]
    fold 4: [0.75288684]
    ----
    MEAN:   [0.77051640] + [0.01799067]
    FULL:   [0.77062834]

    Fitting on full train set...

model 3:    [RandomForestClassifier]
    fold 0: [0.61575788]
    fold 1: [0.40598536]
    fold 2

# Look at the result

In [5]:
# With probabilities, every model contributes one column per class
n_models_1 = len(models_1)
n_expected_columns = n_classes * n_models_1

print('We have %d classes and %d models so in resulting arrays '
      'we expect to see %d columns.' % (n_classes, n_models_1, n_expected_columns))
print('S_train_1 shape:', S_train_1.shape)
print('S_test_1 shape: ', S_test_1.shape)

We have 3 classes and 7 models so in resulting arrays we expect to see 21 columns.
S_train_1 shape: (400, 21)
S_test_1 shape:  (100, 21)


In [6]:
# Peek at the first two rows of the stacked train features returned by stacking()
S_train_1[:2]

array([[ 0.00118767,  0.02222581,  0.97658652,  0.06963771,  0.05618856,
         0.87417373,  0.21796766,  0.21491663,  0.56711571,  0.03741131,
         0.08982228,  0.87276641,  0.00182469,  0.00537052,  0.99280483,
         0.00040829,  0.00281319,  0.99677852,  0.3023589 ,  0.26565766,
         0.43198347],
       [ 0.96030684,  0.03969316,  0.        ,  0.75245808,  0.24720408,
         0.00033784,  0.5615216 ,  0.26871071,  0.16976769,  0.85696824,
         0.12811857,  0.01491319,  0.9877857 ,  0.01111581,  0.00109853,
         0.99732125,  0.00258249,  0.00009626,  0.38591456,  0.31510866,
         0.29897675]])

In [7]:
# Peek at the first two rows of the stacked test features returned by stacking()
S_test_1[:2]

array([[ 0.38824186,  0.37434678,  0.23741136,  0.35886342,  0.38701687,
         0.2541197 ,  0.31662764,  0.29004533,  0.39332704,  0.27655712,
         0.55408115,  0.16936173,  0.58901626,  0.3923738 ,  0.01860998,
         0.60639131,  0.3588515 ,  0.03475718,  0.33691144,  0.31800038,
         0.34508815],
       [ 0.32313599,  0.67239959,  0.00446442,  0.32348396,  0.54466285,
         0.13185319,  0.31999925,  0.36345201,  0.31654874,  0.10054021,
         0.81354061,  0.08591918,  0.02955116,  0.95850134,  0.01194747,
         0.03609523,  0.90174785,  0.06215692,  0.3260029 ,  0.37157273,
         0.3024244 ]])

# Our arrays and log were saved in current dir

In [8]:
# List the array files in the current dir (where stacking was told to save).
# Judging by the names printed below, file names start with a zero-padded
# timestamp, so sorted order is chronological and the LAST entry is the file
# written by the stacking call we have just run. Taking the first entry would
# pick up a stale file whenever the directory still holds results from an
# earlier run of this notebook.
names = sorted(glob('*.npy'))
if not names:
    raise FileNotFoundError(
        "No '*.npy' files found in the current directory; "
        "run stacking with save_dir='.' first.")
npy_1_name = names[-1] # for later use

print('Arrays:')
for name in names:
    print(name)

# Same selection rule for the log files
names = sorted(glob('*.log.txt'))
if not names:
    raise FileNotFoundError(
        "No '*.log.txt' files found in the current directory; "
        "run stacking with save_dir='.' first.")
log_1_name = names[-1] # for later use

print('\nLogs:')
for name in names:
    print(name)

Arrays:
[2018.02.01].[15.41.41].305268.0eadc0.npy

Logs:
[2018.02.01].[15.41.41].305268.0eadc0.log.txt


# Initialize some other 1st level model(s)

As we continue to work on the problem we create many other models.  
Let's say we want to try a more powerful neural network.

In [9]:
def build_keras_model_2():
    """Build and compile a larger feed-forward softmax classifier.

    The network has two hidden ``Dense`` ReLU layers (256 units, then 64
    units) followed by a ``Dense`` softmax layer with ``n_classes`` outputs.
    The input width is read from the notebook-level ``X_train`` and the
    output width from the notebook-level ``n_classes``.

    Returns:
        The compiled ``Sequential`` model.
    """
    hidden_wide = Dense(256,
                        input_dim=X_train.shape[1],
                        kernel_initializer='normal',
                        activation='relu')
    hidden_narrow = Dense(64,
                          kernel_initializer='normal',
                          activation='relu')
    output = Dense(n_classes,
                   kernel_initializer='normal',
                   activation='softmax')

    # Layers are added in list order: 256 -> 64 -> softmax output
    network = Sequential([hidden_wide, hidden_narrow, output])
    network.compile(optimizer='rmsprop',
                    loss='categorical_crossentropy',
                    metrics=['categorical_accuracy'])
    return network

# Caution! Every model and parameter value below is purely
# demonstrational and must not be read as a recommendation.
models_2 = [
    # Larger neural network defined by build_keras_model_2
    KerasClassifier(
        build_fn=build_keras_model_2,
        epochs=5, batch_size=32, verbose=0,
    ),
]

# Perform stacking again

In [10]:
# Run stacking for the additional 1st level model(s), using the same
# data and settings as the first stacking call.
S_train_2, S_test_2 = stacking(
    # list of models
    models_2,
    # data
    X_train, y_train, X_test,
    # classification task (if you need regression - set to True)
    regression=False,
    # mode: oof for train set, fit on full train and predict test set once
    mode='oof_pred',
    # predict probabilities (if you need class labels - set to False)
    needs_proba=True,
    # save result and log in current dir (to disable saving - set to None)
    save_dir='.',
    # metric: callable
    metric=log_loss,
    # number of folds
    n_folds=5,
    # stratified split for folds
    stratified=True,
    # shuffle the data
    shuffle=True,
    # ensure reproducibility
    random_state=0,
    # print all info
    verbose=2,
)

task:       [classification]
n_classes:  [3]
metric:     [log_loss]
mode:       [oof_pred]
n_models:   [1]

model 0:    [KerasClassifier]
    fold 0: [0.54741578]
    fold 1: [0.42787166]
    fold 2: [0.40649939]
    fold 3: [0.45298407]
    fold 4: [0.39133918]
    ----
    MEAN:   [0.44522202] + [0.05515007]
    FULL:   [0.44570354]

    Fitting on full train set...

Result was saved to [./[2018.02.01].[15.42.30].250441.2c4c1b.npy]


# New arrays and log were saved too

In [11]:
# Print every saved array file, then every saved log file, each group
# sorted by name and introduced by its own header line.
for header, pattern in (('Arrays:', '*.npy'), ('\nLogs:', '*.log.txt')):
    names = sorted(glob(pattern))
    print(header)
    for name in names:
        print(name)

Arrays:
[2018.02.01].[15.41.41].305268.0eadc0.npy
[2018.02.01].[15.42.30].250441.2c4c1b.npy

Logs:
[2018.02.01].[15.41.41].305268.0eadc0.log.txt
[2018.02.01].[15.42.30].250441.2c4c1b.log.txt


# Time to collect results

After several (many) days of building, optimizing, and testing models we have a lot of files with saved out-of-fold (OOF) predictions.  
At this point we can load and use the OOF predictions of a specific model, or all the OOF predictions we have.

# Find specific model

We can open logs and find the model of interest.  
We can do it programmatically or just open logs in editor.  
Name of the `.log.txt` file is the same as the name of corresponding `.npy` file (except extension).  
To find the columns containing the OOF predictions of a specific model we use the model index from the log:
* if we predicted class labels - the model occupies a single column whose index is just the model index
* if we predicted probabilities - the model occupies one column per class; the first of these columns has index equal to the model index multiplied by the number of classes

In [12]:
print("Let's open this log: %s" % log_1_name)
with open(log_1_name) as f:
    lines = list(f)

# Lines that introduce a model start with "model <index>"
model_header = re.compile(r'^model [0-9]+')

print("Let's look what models did we build in those session.\n")
for line in filter(model_header.search, lines):
    print(line)

Let's open this log: [2018.02.01].[15.41.41].305268.0eadc0.log.txt
Let's look what models did we build in those session.

model 0:    [GaussianNB]

model 1:    [LogisticRegression]

model 2:    [ExtraTreesClassifier]

model 3:    [RandomForestClassifier]

model 4:    [XGBClassifier]

model 5:    [LGBMClassifier]

model 6:    [KerasClassifier]



# Load specific model OOF

Let's say we are interested in `LGBMClassifier`.  
We found out that it has index 5.  
Then we load the target `.npy` file and, because we predicted probabilities, we need 3 columns: from index 15 (5 \* 3) up to, but not including, index 18 (5 \* 3 + 3), i.e. the slice `15:18`.

In [13]:
print("Let's load this .npy file: %s" % npy_1_name)
# The file stores two arrays with different numbers of rows (train and test),
# so it is an object array. NumPy >= 1.16.3 refuses to load object arrays
# unless allow_pickle=True is passed explicitly.
# NOTE: only load files you created yourself this way - unpickling untrusted
# data can execute arbitrary code.
S = np.load(npy_1_name, allow_pickle=True)

# With probabilities each model occupies n_classes consecutive columns,
# starting at model_index * n_classes.
lgbm_index = 5 # index of LGBMClassifier taken from the log
first_col = lgbm_index * n_classes
last_col = first_col + n_classes # exclusive end of the slice

S_train_lgbm = S[0][:, first_col:last_col]
S_test_lgbm = S[1][:, first_col:last_col]

Let's load this .npy file: [2018.02.01].[15.41.41].305268.0eadc0.npy


In [14]:
# First five rows of the LGBMClassifier columns sliced from the train array
S_train_lgbm[:5]

array([[ 0.00040829,  0.00281319,  0.99677852],
       [ 0.99732125,  0.00258249,  0.00009626],
       [ 0.98322854,  0.01610955,  0.00066191],
       [ 0.00107737,  0.99633895,  0.00258368],
       [ 0.97101719,  0.02843959,  0.00054321]])

In [15]:
# First five rows of the LGBMClassifier columns sliced from the test array
S_test_lgbm[:5]

array([[ 0.60639131,  0.3588515 ,  0.03475718],
       [ 0.03609523,  0.90174785,  0.06215692],
       [ 0.08650007,  0.89717473,  0.0163252 ],
       [ 0.00068572,  0.98858075,  0.01073353],
       [ 0.00122693,  0.99814513,  0.00062793]])

# Compute score of specific model

In [16]:
# Score the LGBM train-set (OOF) probabilities against the true train labels
lgbm_oof_loss = log_loss(y_train, S_train_lgbm)
print('LGBMCLassifier log loss: %.8f' % lgbm_oof_loss)

LGBMCLassifier log loss: 0.41430248


# Load ALL OOF

***Note:*** If you load OOF from scratch, don't forget to load `y_train` from initial dataset too.

In [17]:
# Every model from both stacking sessions contributes one column per class
n_models_total = len(models_1) + len(models_2)
n_columns_total = n_classes * n_models_total

print('We have %d classes and %d models TOTAL so in resulting arrays '
      'we expect to see %d columns.' % (n_classes, n_models_total, n_columns_total))

We have 3 classes and 8 models TOTAL so in resulting arrays we expect to see 24 columns.


In [18]:
# Start each list with a zero-column array so the result keeps the right
# number of rows (and zero columns) even when no files are found.
train_parts = [np.zeros((X_train.shape[0], 0))]
test_parts = [np.zeros((X_test.shape[0], 0))]

# Load results.
# NOTE: every '*.npy' file in the current dir is loaded, including files left
# over from earlier runs of this notebook.
for name in sorted(glob('*.npy')):
    print('Loading: %s' % name)
    # Each file stores two arrays with different numbers of rows (train and
    # test), i.e. an object array; NumPy >= 1.16.3 needs allow_pickle=True to
    # load it. Only load files you created yourself this way.
    S = np.load(name, allow_pickle=True)
    train_parts.append(S[0])
    test_parts.append(S[1])

# Concatenate column-wise once, instead of re-copying a growing array on
# every loop iteration.
S_train_all = np.hstack(train_parts)
S_test_all = np.hstack(test_parts)

print('\nS_train_all shape:', S_train_all.shape)
print('S_test_all shape: ', S_test_all.shape)

Loading: [2018.02.01].[15.41.41].305268.0eadc0.npy
Loading: [2018.02.01].[15.42.30].250441.2c4c1b.npy

S_train_all shape: (400, 24)
S_test_all shape:  (100, 24)


# Apply 2nd level model

In [19]:
# Initialize the 2nd level model and fit it on the stacked train features
# (fit's return value is what gets bound to `model`, as before)
model = XGBClassifier(
    n_estimators=100, max_depth=3, learning_rate=0.1,
    random_state=0, n_jobs=-1,
).fit(S_train_all, y_train)

# Predict class probabilities for the stacked test features
y_pred = model.predict_proba(S_test_all)

# Final prediction score
final_score = log_loss(y_test, y_pred)
print('Final prediction score: %.8f' % final_score)

Final prediction score: 0.38636334
