# Classification (with probabilities) + Detailed workflow
***

# Import

In [1]:
import warnings
warnings.filterwarnings('ignore', message='The y_prob values do not sum to one')
warnings.filterwarnings('ignore', message='Skipping variable loading for optimizer')
warnings.filterwarnings('ignore', message='X does not have valid feature names')
import re
from glob import glob
import numpy as np
np.random.seed(0) # ensure reproducibility
np.set_printoptions(suppress=True)
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import log_loss
# Models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# NN
import tensorflow as tf
# Data is small so do NOT use GPU for simplicity
tf.config.set_visible_devices([], 'GPU')
# Suppress Python level warnings from tensorflow
tf.get_logger().setLevel('ERROR') 
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Input
from scikeras.wrappers import KerasClassifier, KerasRegressor
# Stacking
from vecstack import stacking

2025-09-06 15:55:39.761844: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-06 15:55:39.792129: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-06 15:55:40.527567: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
W0000 00:00:1757163340.650380   23114 gpu_device.cc:2342] Cannot dlopen some GPU libraries.

# Prepare data

In [2]:
n_classes = 3

# Create data: 500 example, 5 feature, 3 classes
X, y = make_classification(n_samples=500, n_features=5, 
                           n_informative=3, n_redundant=1, 
                           n_classes=n_classes, flip_y=0, 
                           random_state=0)

# Make train/test split
# As usual in machine learning task we have X_train, y_train, and X_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print('Train shape:', X_train.shape)
print('Test shape: ', X_test.shape)

Train shape: (400, 5)
Test shape:  (100, 5)


# Initialize 1st level models

In [3]:
def build_keras_model_1():
    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],)))
    model.add(Dense(64,
                    kernel_initializer='normal', 
                    activation='relu'))
    model.add(Dense(n_classes, 
                    kernel_initializer='normal', 
                    activation='softmax'))
    model.compile(optimizer='rmsprop', 
                  loss='sparse_categorical_crossentropy', 
                  metrics=['categorical_accuracy'])
    return model

# Caution! All models and parameter values are just 
# demonstrational and shouldn't be considered as recommended.
models_1 = [ 
    GaussianNB(),
    
    LogisticRegression(random_state=0),
    
    ExtraTreesClassifier(random_state=0, n_jobs=-1, 
                         n_estimators=100, max_depth=3),
                         
    RandomForestClassifier(random_state=0, n_jobs=-1, 
                           n_estimators=100, max_depth=3),
        
    XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, 
                  n_estimators=100, max_depth=3),
                  
    LGBMClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, 
                   n_estimators=100, max_depth=3, verbose=-1),

    KerasClassifier(model=build_keras_model_1(), epochs=2, 
                    batch_size=32, verbose=0)
]

# Perform stacking

In [4]:
S_train_1, S_test_1 = stacking(models_1,                   # list of models
                               X_train, y_train, X_test,   # data
                               regression=False,           # classification task (if you need 
                                                           #     regression - set to True)
                               mode='oof_pred',            # mode: oof for train set, fit on full 
                                                           #     train and predict test set once
                               needs_proba=True,           # predict probabilities (if you need 
                                                           #     class labels - set to False) 
                               save_dir='.',               # save result and log in current dir 
                                                           #     (to disable saving - set to None)
                               metric=log_loss,            # metric: callable
                               n_folds=5,                  # number of folds
                               stratified=True,            # stratified split for folds
                               shuffle=True,               # shuffle the data
                               random_state=0,             # ensure reproducibility
                               verbose=2)                  # print all info

task:         [classification]
n_classes:    [3]
metric:       [log_loss]
mode:         [oof_pred]
n_models:     [7]

model  0:     [GaussianNB]
    fold  0:  [0.57030626]
    fold  1:  [0.28256165]
    fold  2:  [0.35609357]
    fold  3:  [0.57833219]
    fold  4:  [0.60933411]
    ----
    MEAN:     [0.47932556] + [0.13332980]
    FULL:     [0.47932556]

    Fitting on full train set...

model  1:     [LogisticRegression]
    fold  0:  [0.58074650]
    fold  1:  [0.27626266]
    fold  2:  [0.40797434]
    fold  3:  [0.48192230]
    fold  4:  [0.73155308]
    ----
    MEAN:     [0.49569178] + [0.15420632]
    FULL:     [0.49569178]

    Fitting on full train set...

model  2:     [ExtraTreesClassifier]
    fold  0:  [0.80523428]
    fold  1:  [0.72214703]
    fold  2:  [0.72851161]
    fold  3:  [0.78859646]
    fold  4:  [0.82512546]
    ----
    MEAN:     [0.77392297] + [0.04137715]
    FULL:     [0.77392297]

    Fitting on full train set...

model  3:     [RandomForestClassifier]


# Look at the result

In [5]:
print('We have %d classes and %d models so in resulting arrays \
we expect to see %d columns.' % (n_classes, len(models_1), n_classes * len(models_1)))
print('S_train_1 shape:', S_train_1.shape)
print('S_test_1 shape: ', S_test_1.shape)

We have 3 classes and 7 models so in resulting arrays we expect to see 21 columns.
S_train_1 shape: (400, 21)
S_test_1 shape:  (100, 21)


In [6]:
S_train_1[:2]

array([[0.00083161, 0.01828022, 0.98088816, 0.01605728, 0.02724415,
        0.95669857, 0.22361998, 0.20286901, 0.57351102, 0.0365039 ,
        0.10297998, 0.86051611, 0.00304293, 0.01768309, 0.97927397,
        0.00074538, 0.00395689, 0.99529773, 0.32723224, 0.28824005,
        0.38452768],
       [0.95026182, 0.04973818, 0.        , 0.89084281, 0.10909541,
        0.00006178, 0.55396899, 0.27694952, 0.16908149, 0.85772772,
        0.13275189, 0.00952038, 0.9821493 , 0.01676223, 0.00108847,
        0.99746299, 0.00249197, 0.00004503, 0.39017811, 0.32593027,
        0.28389156]])

In [7]:
S_test_1[:2]

array([[0.38824186, 0.37434678, 0.23741136, 0.307547  , 0.52253329,
        0.16991971, 0.31662764, 0.29004533, 0.39332704, 0.27655712,
        0.55408115, 0.16936173, 0.58477622, 0.38952848, 0.02569526,
        0.72381327, 0.24995384, 0.02623288, 0.30865759, 0.34028247,
        0.35106   ],
       [0.32313599, 0.67239959, 0.00446442, 0.28539557, 0.62320346,
        0.09140096, 0.31999925, 0.36345201, 0.31654874, 0.10054021,
        0.81354061, 0.08591918, 0.02924692, 0.95591789, 0.01483521,
        0.01189152, 0.94377175, 0.04433673, 0.27213532, 0.52776176,
        0.20010303]])

# Our arrays and log were saved in current dir

In [8]:
names = sorted(glob('*.npy'))
npy_1_name = names[0] # for later use

print('Arrays:')
for name in names:
    print(name)

names = sorted(glob('*.log.txt'))
log_1_name = names[0] # for later use

print('\nLogs:')
for name in names:
    print(name)

Arrays:
[2025.09.06].[15.55.45].032643.7f0aac.npy

Logs:
[2025.09.06].[15.55.45].032643.7f0aac.log.txt


# Initialize some other 1st level model(s)

As we continue to work on the problem we create many other models.  
Let's say we want to try more powerful neural network.

In [9]:
def build_keras_model_2():
    model = Sequential()
    model.add(Input(shape=(X_train.shape[1],)))
    model.add(Dense(256,
                    kernel_initializer='normal', 
                    activation='relu'))
    model.add(Dense(64, 
                    kernel_initializer='normal', 
                    activation='relu'))
    model.add(Dense(n_classes, 
                    kernel_initializer='normal', 
                    activation='softmax'))
    model.compile(optimizer='rmsprop', 
                  loss='sparse_categorical_crossentropy', 
                  metrics=['categorical_accuracy'])
    return model

# Caution! All models and parameter values are just 
# demonstrational and shouldn't be considered as recommended.
models_2 = [        
    KerasClassifier(model=build_keras_model_2(), epochs=5, 
                    batch_size=32, verbose=0)
]

# Perform stacking again

In [10]:
S_train_2, S_test_2 = stacking(models_2,                   # list of models
                               X_train, y_train, X_test,   # data
                               regression=False,           # classification task (if you need 
                                                           #     regression - set to True)
                               mode='oof_pred',            # mode: oof for train set, fit on full 
                                                           #     train and predict test set once
                               needs_proba=True,           # predict probabilities (if you need 
                                                           #     class labels - set to False) 
                               save_dir='.',               # save result and log in current dir 
                                                           #     (to disable saving - set to None)
                               metric=log_loss,            # metric: callable
                               n_folds=5,                  # number of folds
                               stratified=True,            # stratified split for folds
                               shuffle=True,               # shuffle the data
                               random_state=0,             # ensure reproducibility
                               verbose=2)                  # print all info

task:         [classification]
n_classes:    [3]
metric:       [log_loss]
mode:         [oof_pred]
n_models:     [1]

model  0:     [KerasClassifier]
    fold  0:  [0.51636764]
    fold  1:  [0.22029691]
    fold  2:  [0.28535020]
    fold  3:  [0.39152662]
    fold  4:  [0.43074365]
    ----
    MEAN:     [0.36885700] + [0.10502070]
    FULL:     [0.36885700]

    Fitting on full train set...

Result was saved to [./[2025.09.06].[15.55.47].511377.c4aa2f.npy]


# New arrays and log were saved too

In [11]:
names = sorted(glob('*.npy'))

print('Arrays:')
for name in names:
    print(name)
    
names = sorted(glob('*.log.txt'))

print('\nLogs:')
for name in names:
    print(name)

Arrays:
[2025.09.06].[15.55.45].032643.7f0aac.npy
[2025.09.06].[15.55.47].511377.c4aa2f.npy

Logs:
[2025.09.06].[15.55.45].032643.7f0aac.log.txt
[2025.09.06].[15.55.47].511377.c4aa2f.log.txt


# Time to collect results

After several (many) days of building, optimizing, and testing models we have a lot of files with saved OOF.  
At this point we can load and use OOF of specific model or all OOF we have.

# Find specific model

We can open logs and find the model of interest.  
We can do it programmatically or just open logs in editor.  
Name of the `.log.txt` file is the same as the name of corresponding `.npy` file (except extension).  
To find columns containing OOF of specific model we use model index from log:
* if we predicted class labels - corresponding column index is just model index
* if we predicted probabilities - corresponding column index is model index multiplied by number of classes

In [12]:
print("Let's open this log: %s" % log_1_name)
with open(log_1_name) as f:
    lines = f.readlines()

print("Let's look what models did we build in those session.\n")
for line in lines:
    if re.search(r'^model [0-9]+', line):
        print(line)

Let's open this log: [2025.09.06].[15.55.45].032643.7f0aac.log.txt
Let's look what models did we build in those session.



# Load specific model OOF

Let's say we are interested in `LGBMClassifier`.  
We found out that it has index 5.  
Then we load target `.npy` file and because of probabilities we need 3 columns from 15 (5 \* 3) to 18 (5 \* 3 + 3)

In [13]:
print("Let's load this .npy file: %s" % npy_1_name)
S = np.load(npy_1_name, allow_pickle=True)
S_train_lgbm = S[0][:, 15:18]
S_test_lgbm = S[1][:, 15:18]

Let's load this .npy file: [2025.09.06].[15.55.45].032643.7f0aac.npy


In [14]:
S_train_lgbm[:5]

array([[0.00074538, 0.00395689, 0.99529773],
       [0.99746299, 0.00249197, 0.00004503],
       [0.99599212, 0.00369756, 0.00031032],
       [0.00109104, 0.99659281, 0.00231615],
       [0.98498265, 0.01489686, 0.00012049]])

In [15]:
S_test_lgbm[:5]

array([[0.72381327, 0.24995384, 0.02623288],
       [0.01189152, 0.94377175, 0.04433673],
       [0.0896902 , 0.90432675, 0.00598305],
       [0.00034138, 0.99091816, 0.00874045],
       [0.0001139 , 0.99955083, 0.00033527]])

# Compute score of specific model

In [16]:
print('LGBMCLassifier log loss: %.8f' % log_loss(y_train, S_train_lgbm))

LGBMCLassifier log loss: 0.40351483


# Load ALL OOF

***Note:*** If you load OOF from scratch, don't forget to load `y_train` from initial dataset too.

In [17]:
print('We have %d classes and %d models TOTAL so in resulting arrays \
we expect to see %d columns.' % (n_classes, len(models_1) + len(models_2), 
                                 n_classes * (len(models_1) + len(models_2))))

We have 3 classes and 8 models TOTAL so in resulting arrays we expect to see 24 columns.


In [18]:
# Create empty arrays
S_train_all = np.zeros((X_train.shape[0], 0))
S_test_all = np.zeros((X_test.shape[0], 0))

# Load results
for name in sorted(glob('*.npy')):
    print('Loading: %s' % name)
    S = np.load(name, allow_pickle=True)
    S_train_all = np.c_[S_train_all, S[0]]
    S_test_all = np.c_[S_test_all, S[1]]
    
print('\nS_train_all shape:', S_train_all.shape)
print('S_test_all shape: ', S_test_all.shape)

Loading: [2025.09.06].[15.55.45].032643.7f0aac.npy
Loading: [2025.09.06].[15.55.47].511377.c4aa2f.npy

S_train_all shape: (400, 24)
S_test_all shape:  (100, 24)


# Apply 2nd level model

In [19]:
# Initialize 2nd level model
model = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, 
                      n_estimators=100, max_depth=3)
    
# Fit 2nd level model
model = model.fit(S_train_all, y_train)

# Predict
y_pred = model.predict_proba(S_test_all)

# Final prediction score
print('Final prediction score: %.8f' % log_loss(y_test, y_pred))

Final prediction score: 0.37246788
