# Description

We will test the custom_transformer here.

In [1]:
# Auto-reload all the modules. Great way to make writing and testing your modules much easier.
%load_ext autoreload
%autoreload 2

# Import and Init

In [2]:
import h2o
from h2o.automl import H2OAutoML

import os
import pickle
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [4]:
from feature_engine.wrappers import SklearnTransformerWrapper

In [5]:
from h2o_remake import H2O_Remake as hr

In [6]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,3 days 1 hour 51 mins
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,"1 year, 3 months and 30 days !!!"
H2O cluster name:,H2O_from_python_unknownUser_i1hfgl
H2O cluster total nodes:,1
H2O cluster free memory:,5.859 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


# Load Data

In [7]:
# Load the data
data_df = pd.read_csv('./data/winequality-red.csv', sep=";")

In [8]:
# Split the data into training and test sets. (0.75, 0.25) split.
train, test = train_test_split(data_df)

In [9]:
train_x = train.drop(["quality"], axis=1)
test_x = test.drop(["quality"], axis=1)
train_y = train[["quality"]]
test_y = test[["quality"]]

In [10]:
train_x.sample(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
597,11.9,0.58,0.58,1.9,0.071,5.0,18.0,0.998,3.09,0.63,10.0
859,6.8,0.56,0.22,1.8,0.074,15.0,24.0,0.99438,3.4,0.82,11.2
1524,6.0,0.42,0.19,2.0,0.075,22.0,47.0,0.99522,3.39,0.78,10.0


In [11]:
train_y.sample(3)

Unnamed: 0,quality
1344,5
1026,6
142,6


# Apply Feature Engine

In [12]:
train_x.sample(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
84,6.3,0.3,0.48,1.8,0.069,18.0,61.0,0.9959,3.44,0.78,10.3
1137,10.4,0.52,0.45,2.0,0.08,6.0,13.0,0.99774,3.22,0.76,11.4
789,8.6,0.63,0.17,2.9,0.099,21.0,119.0,0.998,3.09,0.52,9.3


In [13]:
train_x.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')

In [14]:
# set up the wrapper
cols = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']
scaler = SklearnTransformerWrapper(transformer = StandardScaler(),
                                    variables = cols)

#
scaled_train_x  = scaler.fit_transform(train_x)


In [15]:
scaled_train_x.sample(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1348,-0.66231,0.706868,-1.266041,-0.523692,-0.204618,-0.836654,-1.030379,-0.485026,0.212468,-1.527557,-0.862706
729,-1.121931,1.873392,-1.266041,0.472985,-0.357326,1.050115,0.336635,-0.955017,1.950275,-0.956111,2.169383
1593,-0.892121,0.512448,-1.008091,-0.4525,-0.422773,1.144453,-0.257719,-0.139286,0.727374,0.92966,-0.862706


# Train with H2O

## Select x, y Cols

In [16]:
list(scaled_train_x.columns )

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

In [17]:
x_cols = \
['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
]

y_col = 'quality'

## Define Column Types 

In [18]:
temp_hdf = h2o.H2OFrame( scaled_train_x.sample(5))
temp_hdf.types

Parse progress: |█████████████████████████████████████████████████████████| 100%


{'fixed acidity': 'real',
 'volatile acidity': 'real',
 'citric acid': 'real',
 'residual sugar': 'real',
 'chlorides': 'real',
 'free sulfur dioxide': 'real',
 'total sulfur dioxide': 'real',
 'density': 'real',
 'pH': 'real',
 'sulphates': 'real',
 'alcohol': 'real'}

In [19]:
# Note
# - quality is enum
h2o_columns = \
{'quality': 'enum'}



In [20]:
train_xy = scaled_train_x.copy()
train_xy[y_col] = train_y
train_hdf = h2o.H2OFrame(train_xy, column_types=h2o_columns)


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [21]:
train_hdf

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
-0.260142,0.679094,-1.36922,-0.0253533,-0.204618,0.106731,-0.257719,-0.252732,0.212468,0.472503,1.22185,7
1.40598,-1.32066,0.745968,-0.666074,-0.073725,0.295408,-0.584613,-0.711918,-0.173711,0.415359,1.41136,6
-0.489953,-0.0430399,-0.595371,-0.4525,-0.0519096,-0.364961,-0.346872,0.017378,0.469921,-0.213232,-0.862706,7
-0.202689,-0.598527,-0.543781,-0.381309,-0.313695,-0.930992,-0.852072,0.233466,-0.109348,-0.270376,-1.14696,6
-0.719763,-1.04292,0.0237081,-0.381309,-0.117356,1.42747,0.633812,0.0930088,0.920463,-1.01326,-0.957459,5
0.88891,0.0680576,0.900737,-0.167736,-0.357326,0.0123922,-0.198283,1.25988,0.534284,-0.213232,-0.957459,5
0.946363,-0.209686,-0.389012,6.0259,-0.357326,-0.270623,0.0988937,2.55641,-0.946069,0.18678,-1.14696,6
-0.375047,-1.48731,-0.0794718,-0.381309,-0.77182,0.295408,0.901271,-0.900995,-1.01043,0.758226,0.463833,6
-0.375047,0.40135,-1.42081,0.0458379,-0.706374,-0.836654,-1.00066,-0.204112,0.469921,-0.556099,0.36908,5
0.601647,0.484673,0.0237081,0.472985,-0.0082786,0.201069,0.752683,1.8001,0.0193787,-0.727533,-0.6732,5




## Train Model

In [22]:
aml = H2OAutoML(max_models = 100,
                max_runtime_secs = 30,  
                nfolds  = 5, 
                balance_classes = True,
                exclude_algos = [ 'DeepLearning'],
               )

In [23]:
%%time
aml.train(x = x_cols, 
          y = y_col, 
          training_frame = train_hdf)

AutoML progress: |████████████████████████████████████████████████████████| 100%
CPU times: user 3.61 s, sys: 148 ms, total: 3.76 s
Wall time: 43 s


In [24]:
aml.leaderboard


model_id,mean_per_class_error,logloss,rmse,mse
StackedEnsemble_BestOfFamily_AutoML_20210416_032447,0.689239,0.858701,0.534091,0.285254
GBM_1_AutoML_20210416_032447,0.691958,1.01904,0.531251,0.282228
StackedEnsemble_AllModels_AutoML_20210416_032447,0.692026,0.856574,0.533881,0.285029
GBM_2_AutoML_20210416_032447,0.694659,0.975532,0.536245,0.287559
XGBoost_1_AutoML_20210416_032447,0.69625,0.915821,0.555842,0.30896
DRF_1_AutoML_20210416_032447,0.704658,1.50289,0.520919,0.271357
XGBoost_3_AutoML_20210416_032447,0.711389,0.995363,0.600985,0.361183
XGBoost_2_AutoML_20210416_032447,0.717143,1.01589,0.609178,0.371098
GLM_1_AutoML_20210416_032447,0.7189,0.971545,0.572652,0.32793
GBM_3_AutoML_20210416_032447,0.729317,1.00956,0.578724,0.334922




# Wrap H2O Model As sklearn Transformer

In [25]:
h2o_model = aml.leader
h2o_model.model_id

'StackedEnsemble_BestOfFamily_AutoML_20210416_032447'

In [26]:
h2o_estimator = hr.make_custom_estimator(h2o_model.model_id)

In [27]:
h2o_estimator.predict(train_x)

Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,predict,3.0,4.0,5.0,6.0,7.0,8.0
0,7.9,0.650,0.01,2.5,0.078,17.0,38.0,0.99630,3.34,0.74,11.7,5.0,0.003234,0.019899,0.572795,0.386203,0.012686,0.005183
1,10.8,0.290,0.42,1.6,0.084,19.0,27.0,0.99545,3.28,0.73,11.9,6.0,0.010210,0.062968,0.391913,0.402777,0.092031,0.040102
2,7.5,0.520,0.16,1.9,0.085,12.0,35.0,0.99680,3.38,0.62,9.5,5.0,0.003117,0.019116,0.571739,0.389235,0.011788,0.005005
3,8.0,0.420,0.17,2.0,0.073,6.0,18.0,0.99720,3.29,0.61,9.2,5.0,0.008354,0.049530,0.446265,0.407295,0.065488,0.023069
4,7.1,0.340,0.28,2.0,0.082,31.0,68.0,0.99694,3.45,0.48,9.4,5.0,0.003166,0.018194,0.543350,0.417198,0.012929,0.005162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1194,5.6,0.915,0.00,2.1,0.041,17.0,78.0,0.99346,3.68,0.73,11.4,5.0,0.003240,0.020477,0.598904,0.359338,0.012902,0.005138
1195,7.6,0.780,0.00,1.7,0.076,33.0,45.0,0.99612,3.31,0.62,10.7,5.0,0.003180,0.018558,0.568420,0.391807,0.012932,0.005103
1196,9.0,0.785,0.24,1.7,0.078,10.0,21.0,0.99692,3.29,0.67,10.0,5.0,0.007822,0.051106,0.456651,0.409680,0.054229,0.020511
1197,9.6,0.600,0.50,2.3,0.079,28.0,71.0,0.99970,3.50,0.57,9.7,5.0,0.003033,0.020384,0.582462,0.378716,0.010498,0.004907


## Make Pipeline

In [28]:
steps=[ ('scaler', scaler),
        ('h2o_est', h2o_estimator),
      ]
steps

[('scaler',
  SklearnTransformerWrapper(transformer=StandardScaler(),
                            variables=['fixed acidity', 'volatile acidity',
                                       'citric acid', 'residual sugar',
                                       'chlorides', 'free sulfur dioxide',
                                       'total sulfur dioxide', 'density', 'pH',
                                       'sulphates', 'alcohol'])),
 ('h2o_est',
  Custom_Estimator(model_id='StackedEnsemble_BestOfFamily_AutoML_20210416_032447'))]

In [29]:
pipeline = Pipeline(steps)

In [30]:
pipeline.transform(test_x)

Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,predict,3.0,4.0,5.0,6.0,7.0,8.0
0,1.405984,-0.431881,0.281658,-0.025353,0.253507,0.389746,-0.257719,0.762881,-0.431164,0.301070,0.369080,5.0,0.008023,0.030268,0.501339,0.331988,0.108980,0.019402
1,-0.834668,-0.043040,-0.131062,0.045838,-0.139171,-0.553638,-0.287436,0.044389,0.984827,-0.898967,0.558586,5.0,0.007865,0.047239,0.447627,0.437824,0.046210,0.013235
2,-0.719763,0.790191,-1.420811,-0.167736,-0.095541,0.201069,-0.584613,0.492771,0.856100,-0.670388,-0.957459,5.0,0.003207,0.039739,0.826101,0.117432,0.008798,0.004724
3,-0.949574,-0.376332,-0.182652,-0.594883,-0.226433,0.201069,-0.376589,-1.063061,0.534284,-0.327521,0.179575,6.0,0.005579,0.016618,0.097656,0.812977,0.056234,0.010936
4,-0.547405,0.401350,-0.079472,-0.310118,-0.095541,0.106731,1.317319,-0.328362,-0.109348,-0.556099,-0.578448,5.0,0.002647,0.024864,0.729921,0.230246,0.007788,0.004534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,-1.007026,-0.043040,-1.214451,-0.238927,-0.400957,-0.742315,-0.941226,-0.630885,0.598647,-0.156087,-0.957459,6.0,0.007469,0.036324,0.279815,0.615592,0.049016,0.011785
396,-0.662310,0.706868,-1.266041,-0.523692,-0.204618,-0.836654,-1.030379,-0.485026,0.212468,-1.527557,-0.862706,5.0,0.003405,0.042885,0.848187,0.094316,0.006986,0.004220
397,0.601647,-1.598405,0.281658,-0.167736,-0.575481,3.408576,0.782401,0.590011,1.049190,1.386817,-0.199437,7.0,0.007718,0.024551,0.093077,0.219945,0.620760,0.033950
398,-0.317595,-0.542979,2.190486,-0.452500,8.216158,0.578423,0.604095,0.341510,-1.139159,3.558311,-0.957459,5.0,0.003490,0.035341,0.753957,0.189167,0.012327,0.005719


In [31]:
pipeline.predict(test_x)

Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,predict,3.0,4.0,5.0,6.0,7.0,8.0
0,1.405984,-0.431881,0.281658,-0.025353,0.253507,0.389746,-0.257719,0.762881,-0.431164,0.301070,0.369080,5.0,0.008023,0.030268,0.501339,0.331988,0.108980,0.019402
1,-0.834668,-0.043040,-0.131062,0.045838,-0.139171,-0.553638,-0.287436,0.044389,0.984827,-0.898967,0.558586,5.0,0.007865,0.047239,0.447627,0.437824,0.046210,0.013235
2,-0.719763,0.790191,-1.420811,-0.167736,-0.095541,0.201069,-0.584613,0.492771,0.856100,-0.670388,-0.957459,5.0,0.003207,0.039739,0.826101,0.117432,0.008798,0.004724
3,-0.949574,-0.376332,-0.182652,-0.594883,-0.226433,0.201069,-0.376589,-1.063061,0.534284,-0.327521,0.179575,6.0,0.005579,0.016618,0.097656,0.812977,0.056234,0.010936
4,-0.547405,0.401350,-0.079472,-0.310118,-0.095541,0.106731,1.317319,-0.328362,-0.109348,-0.556099,-0.578448,5.0,0.002647,0.024864,0.729921,0.230246,0.007788,0.004534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,-1.007026,-0.043040,-1.214451,-0.238927,-0.400957,-0.742315,-0.941226,-0.630885,0.598647,-0.156087,-0.957459,6.0,0.007469,0.036324,0.279815,0.615592,0.049016,0.011785
396,-0.662310,0.706868,-1.266041,-0.523692,-0.204618,-0.836654,-1.030379,-0.485026,0.212468,-1.527557,-0.862706,5.0,0.003405,0.042885,0.848187,0.094316,0.006986,0.004220
397,0.601647,-1.598405,0.281658,-0.167736,-0.575481,3.408576,0.782401,0.590011,1.049190,1.386817,-0.199437,7.0,0.007718,0.024551,0.093077,0.219945,0.620760,0.033950
398,-0.317595,-0.542979,2.190486,-0.452500,8.216158,0.578423,0.604095,0.341510,-1.139159,3.558311,-0.957459,5.0,0.003490,0.035341,0.753957,0.189167,0.012327,0.005719


In [32]:
pipeline.fit(train_x)

Pipeline(steps=[('scaler',
                 SklearnTransformerWrapper(transformer=StandardScaler(),
                                           variables=['fixed acidity',
                                                      'volatile acidity',
                                                      'citric acid',
                                                      'residual sugar',
                                                      'chlorides',
                                                      'free sulfur dioxide',
                                                      'total sulfur dioxide',
                                                      'density', 'pH',
                                                      'sulphates',
                                                      'alcohol'])),
                ('h2o_est',
                 Custom_Estimator(model_id='StackedEnsemble_BestOfFamily_AutoML_20210416_032447'))])

# Save the Pipeline as Pickle

In [33]:
with open('test_custom_estimator.pickle', 'wb') as f:
    pickle.dump(pipeline, f)

In [34]:
h2o.remove(aml.leader.model_id)

In [35]:
del scaler

# Restore Pipeline and Predict

In [36]:
with open('test_custom_estimator.pickle', 'rb') as f:
    pipeline2 = pickle.load(f)

In [37]:
pipeline2.predict(test_x)

Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,predict,3.0,4.0,5.0,6.0,7.0,8.0
0,1.405984,-0.431881,0.281658,-0.025353,0.253507,0.389746,-0.257719,0.762881,-0.431164,0.301070,0.369080,5.0,0.008023,0.030268,0.501339,0.331988,0.108980,0.019402
1,-0.834668,-0.043040,-0.131062,0.045838,-0.139171,-0.553638,-0.287436,0.044389,0.984827,-0.898967,0.558586,5.0,0.007865,0.047239,0.447627,0.437824,0.046210,0.013235
2,-0.719763,0.790191,-1.420811,-0.167736,-0.095541,0.201069,-0.584613,0.492771,0.856100,-0.670388,-0.957459,5.0,0.003207,0.039739,0.826101,0.117432,0.008798,0.004724
3,-0.949574,-0.376332,-0.182652,-0.594883,-0.226433,0.201069,-0.376589,-1.063061,0.534284,-0.327521,0.179575,6.0,0.005579,0.016618,0.097656,0.812977,0.056234,0.010936
4,-0.547405,0.401350,-0.079472,-0.310118,-0.095541,0.106731,1.317319,-0.328362,-0.109348,-0.556099,-0.578448,5.0,0.002647,0.024864,0.729921,0.230246,0.007788,0.004534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,-1.007026,-0.043040,-1.214451,-0.238927,-0.400957,-0.742315,-0.941226,-0.630885,0.598647,-0.156087,-0.957459,6.0,0.007469,0.036324,0.279815,0.615592,0.049016,0.011785
396,-0.662310,0.706868,-1.266041,-0.523692,-0.204618,-0.836654,-1.030379,-0.485026,0.212468,-1.527557,-0.862706,5.0,0.003405,0.042885,0.848187,0.094316,0.006986,0.004220
397,0.601647,-1.598405,0.281658,-0.167736,-0.575481,3.408576,0.782401,0.590011,1.049190,1.386817,-0.199437,7.0,0.007718,0.024551,0.093077,0.219945,0.620760,0.033950
398,-0.317595,-0.542979,2.190486,-0.452500,8.216158,0.578423,0.604095,0.341510,-1.139159,3.558311,-0.957459,5.0,0.003490,0.035341,0.753957,0.189167,0.012327,0.005719


# Completion

In [38]:
import datetime
print( datetime.datetime.now() )

2021-04-16 03:25:32.467212
