In [1]:
# !pip install pandas numpy tensorflow tensorflowjs

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import json


In [2]:
# Pipeline: "LookUp -> Normalization -> Prediction" -> TFJS
# "LookUp": call this a str_encoder "model" (strings -> target-encoded floats)
# "Normalization -> Prediction": the numeric model (numbers -> predicted 'Target')

In [3]:


class TargetEncoding(keras.layers.Layer):
    """Layer that replaces each input key with a precomputed per-key value.

    The key -> value mapping is held in a ``tf.lookup.StaticHashTable`` that is
    built by :meth:`adapt`. Keys that are not in the table yield the table's
    ``default_value``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)  # Boilerplate
        # Everything below is populated by adapt(); None marks "not adapted yet".
        self.feature_name = None
        self.keys = None
        self.target = None
        self.table = None

    def call(self, inputs):
        """Look up every element of ``inputs`` in the table built by adapt().

        Raises:
            RuntimeError: if adapt() has not been called yet.
        """
        if self.table is None:
            raise RuntimeError(
                "TargetEncoding.adapt() must be called before the layer is used."
            )
        return self.table.lookup(inputs)

    def compute_output_shape(self, input_shape):
        """The lookup is element-wise, so the shape is passed through unchanged."""
        return input_shape

    def adapt(self, feature, target,
              mean=None, var=None, default_value=-1):
        """Build the lookup table from parallel sequences of keys and values.

        Args:
            feature: the keys (e.g. a pandas Index of category labels). Its
                ``name`` attribute, when present, is stored as ``feature_name``.
            target: the value for each key, in the same order as ``feature``.
            mean: currently unused; accepted so existing callers keep working.
            var: currently unused; accepted so existing callers keep working.
            default_value: value returned for keys that are not in the table.
        """
        # Plain lists/arrays have no .name; do not fail on them.
        self.feature_name = getattr(feature, "name", None)
        self.keys = tf.constant(feature)
        self.target = tf.constant(target)
        self.table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(self.keys, self.target),
            default_value=default_value)

    def to_dict(self):
        """Return the fitted mapping as a plain ``{key: value}`` Python dict.

        String keys are decoded from UTF-8 bytes to ``str`` so the result can be
        serialised (e.g. to JSON) directly.

        Raises:
            RuntimeError: if adapt() has not been called yet.
        """
        if self.keys is None or self.target is None:
            raise RuntimeError(
                "TargetEncoding.adapt() must be called before to_dict()."
            )
        # Eager tensors are unhashable, so they cannot be used as dict keys;
        # convert to native Python values first.
        raw_keys = self.keys.numpy().tolist()
        values = self.target.numpy().tolist()
        keys = [k.decode("utf-8") if isinstance(k, bytes) else k for k in raw_keys]
        return dict(zip(keys, values))

In [4]:
# Synthetic demo data: 100 rows with a standard-normal target, one categorical
# feature with three classes, and one standard-normal numeric feature.
SEED = 42  # fixed so that Restart & Run All reproduces the same frame
rng = np.random.default_rng(SEED)

data = pd.DataFrame()
data['Target'] = rng.normal(0, 1, (100,))
data['Feature'] = ["Class A"] * 30 + ["Class B"] * 20 \
    + ["Class C"] * 50
data['NumFeature'] = rng.normal(0, 1, (100,))
data

Unnamed: 0,Target,Feature,NumFeature
0,0.266143,Class A,-0.098598
1,0.218707,Class A,0.598085
2,0.135895,Class A,0.529328
3,-0.979208,Class A,-0.032673
4,-1.120905,Class A,1.639194
...,...,...,...
95,0.188529,Class C,-0.088189
96,-0.069587,Class C,-1.193713
97,-0.854548,Class C,-0.127421
98,-0.388919,Class C,1.639334


In [5]:
# Per-class mean of the target: the index holds the class labels and the
# values are the numbers each label will be encoded as.
target_col = data['Target']
gr = target_col.groupby(data['Feature']).mean()

# Fit the encoder; unseen labels fall back to the overall target median.
targ_enc = TargetEncoding()
targ_enc.adapt(
    gr.index,
    gr.values,
    mean=target_col.mean(),
    var=target_col.var(),
    default_value=target_col.median(),
)

2022-05-22 21:51:15.085830: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-22 21:51:15.158111: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-22 21:51:15.158331: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-22 21:51:15.159237: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [6]:
# Model inputs as a dict of tensors (one entry per column); y is the target column.
features_df = data[["Feature", "NumFeature"]]
y = data['Target']
x = {col: tf.constant(features_df[col]) for col in features_df.columns}

In [7]:
# Encoder "model": its only job is to turn the string input into floats via
# the fitted target encoder; the numeric input is forwarded untouched.
feature_in = keras.Input(shape=(1,), dtype=tf.string, name="Feature")
num_feature_in = keras.Input(shape=(1,), dtype=tf.float32, name="NumFeature")

inputs = {'Feature': feature_in, 'NumFeature': num_feature_in}
inputs_str_enc = {'Feature': targ_enc(feature_in), "NumFeature": num_feature_in}

encoder = keras.Model(inputs=inputs, outputs=inputs_str_enc)

# Run the raw inputs through the encoder to get all-numeric features.
x_encoded = encoder.predict(x)



In [7]:
# Prediction model: takes only numeric inputs (the encoder's outputs).
# One float32 input per entry of the encoder's input dict, reusing its name.
num_inputs = {}
for key, tensor in inputs.items():
    num_inputs[key] = keras.Input((1,), dtype=tf.float32, name=tensor.name)

# Each input gets its own Normalization layer, adapted on the encoded data.
norm_inputs = []
for key, values in x_encoded.items():
    normalizer = keras.layers.Normalization()
    normalizer.adapt(values)
    normalized = normalizer(num_inputs[key])
    norm_inputs.append(normalized)
    print(key)

conc = keras.layers.Concatenate()(norm_inputs) # TODO: Check this!
dense = keras.layers.Dense(1)
outputs = {'Target': dense(conc)}
model = keras.Model(inputs=num_inputs, outputs=outputs)

# Smoke test: first five predictions of the (untrained) model.
model(x_encoded)['Target'][:5]

Feature
NumFeature


<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[1.080482 ],
       [1.0553811],
       [1.0507214],
       [1.1013936],
       [1.0881029]], dtype=float32)>

In [8]:
# Export both Keras models in SavedModel format, prediction model first.
for saved, export_dir in (
    (model, 'models/tf_model/prediction'),
    (encoder, 'models/tf_model/encoder'),
):
    tf.saved_model.save(saved, export_dir)



INFO:tensorflow:Assets written to: models/tf_model/prediction/assets


INFO:tensorflow:Assets written to: models/tf_model/prediction/assets


INFO:tensorflow:Assets written to: models/tf_model/encoder/assets


INFO:tensorflow:Assets written to: models/tf_model/encoder/assets


In [10]:
# Recover the string -> float mapping applied by the encoder by comparing its
# inputs with its outputs.
# agg maps feature name -> [{category: encoded value}, default value].
agg = {}
assert x.keys() == x_encoded.keys()
for sk, sv in x.items():
    if sv.dtype != tf.string:  # We know encoder only works on strings
        continue
    # Pair input and output by key: the assert above only guarantees the two
    # dicts have the same keys, not that they iterate in the same order.
    ev = x_encoded[sk]
    print(sk, '->', sk, 'encoded')
    raw = pd.Series(np.array(sv), name='In')
    encoded = np.asarray(ev)
    # Exactly one encoded value per input row; reshape (unlike squeeze) also
    # keeps a single-row batch one-dimensional.
    assert encoded.size == len(raw)
    encoded = pd.Series(encoded.reshape(-1), name='Out')
    mapping = pd.concat([raw, encoded], axis=1).groupby('In')['Out'].first().to_dict()
    mapping = {k.decode('utf-8'): v for k, v in mapping.items()}  # Remove *b*'...'
    print(mapping)
    # .item() yields a native Python scalar, so json can serialise it whatever
    # the table's numpy dtype is.
    # NOTE: targ_enc is the only encoder in this notebook; with several string
    # features each one would need its own encoder's default value here.
    agg[sk] = [mapping, targ_enc.table.default_value.numpy().item()]
    

Feature -> Feature encoded
{'Class A': 0.07614372724174703, 'Class B': 0.08330041774579192, 'Class C': 0.29269168665438516}


In [11]:
# Show the collected {feature name: [mapping, default value]} structure.
agg

{'Feature': [{'Class A': 0.07614372724174703,
   'Class B': 0.08330041774579192,
   'Class C': 0.29269168665438516},
  0.22657014915515206]}

In [12]:
# Convert the SavedModel written to models/tf_model/prediction/ into a TFJS
# graph model under models/tfjs_model/prediction/. Only the prediction model is
# passed to the converter here.
!tensorflowjs_converter \
    --input_format=tf_saved_model \
    --output_format=tfjs_graph_model \
    models/tf_model/prediction/ \
    models/tfjs_model/prediction/

2022-05-22 00:31:52.519772: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-22 00:31:52.549289: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-22 00:31:52.549468: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-22 00:31:52.549768: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [13]:
# Store the lookup tables in the TFJS model directory as preprocessing.json.
# (json is already imported in the first cell of the notebook.)
with open('models/tfjs_model/prediction/preprocessing.json', 'w') as f:
    json.dump(agg, f)

In [14]:
# Read the file back and print it to confirm what was written.
with open('models/tfjs_model/prediction/preprocessing.json') as f:
    contents = f.read()
print(contents)

{"Feature": [{"Class A": 0.07614372724174703, "Class B": 0.08330041774579192, "Class C": 0.29269168665438516}, 0.22657014915515206]}
