In [1]:
import os
import numpy as np
import tensorflow as tf
import struct

In [None]:
# Hyper-parameters, file paths and dtypes shared by the cells below.
args = {
    "gpu_num": 4,                    # number of available GPUs
    "iter_num": 10,                  # number of training iterations
    "slot_num": 3,                   # number of feature fields (slots) in the embedding layer
    "embed_vec_size": 16,            # dimension of each embedding vector
    "global_batch_size": 65536,      # global batch size across all GPUs
    "max_vocabulary_size": 30000,    # number of rows in the embedding table
    # [low, high) key range for each slot; one entry per slot
    "vocabulary_range_per_slot": [[0,10000],[10000,20000],[20000,30000]],
    "dense_model_path": "naive_dnn_dense.model",
    "embedding_table_path": "naive_dnn_sparse.model",
    # "saved_path": "naive_dnn_tf_saved_model",
    "np_key_type": np.int64,
    "np_vector_type": np.float32,
    "tf_key_type": tf.int64,
    "tf_vector_type": tf.float32,
}

# Restrict CUDA to device indices 0 .. gpu_num - 1.
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gpu_id) for gpu_id in range(args["gpu_num"]))

In [2]:
def generate_random_samples(num_samples, vocabulary_range_per_slot, key_dtype = args["np_key_type"]):
    """Draw random integer keys (one column per slot) and random binary labels.

    Args:
        num_samples: number of rows to generate.
        vocabulary_range_per_slot: sequence of ``[low, high)`` bounds, one per
            slot; keys of a slot are drawn uniformly from its range.
        key_dtype: NumPy integer dtype of the generated keys.

    Returns:
        A ``(keys, labels)`` tuple where ``keys`` has shape
        ``(num_samples, number_of_slots)`` and ``labels`` has shape
        ``(num_samples, 1)`` with values in ``{0, 1}``.
    """
    # One (num_samples, 1) column per slot, drawn in slot order.
    per_slot_columns = [
        np.random.randint(low=bounds[0],
                          high=bounds[1],
                          size=(num_samples, 1),
                          dtype=key_dtype)
        for bounds in vocabulary_range_per_slot
    ]
    # Put the columns side by side: (num_samples, number_of_slots).
    keys = np.concatenate(per_slot_columns, axis=1)
    labels = np.random.randint(low=0, high=2, size=(num_samples, 1))
    return keys, labels

# Slice the (keys, labels) arrays into fixed-size batches; this notebook uses a batch size of 65536.
def tf_dataset(keys, labels, batchsize):
    """Wrap ``keys`` and ``labels`` in a batched ``tf.data.Dataset``.

    Args:
        keys: per-sample inputs, sliced along the first axis.
        labels: per-sample targets, sliced along the first axis.
        batchsize: number of samples per batch.

    Returns:
        A dataset yielding ``(keys_batch, labels_batch)`` pairs. A trailing
        batch smaller than ``batchsize`` is dropped.
    """
    samples = tf.data.Dataset.from_tensor_slices((keys, labels))
    return samples.batch(batchsize, drop_remainder=True)

In [4]:
class TrainModel(tf.keras.models.Model):
    """Embedding lookup followed by three linear Dense layers (256 -> 128 -> 1).

    The embedding table is held in ``self.params``. Every Dense layer has no
    activation, an all-ones kernel initializer and an all-zeros bias
    initializer.
    """

    def __init__(self,
                 init_tensors,
                 slot_num,
                 embed_vec_size,
                 **kwargs):
        """Create the embedding variable and the Dense stack.

        Args:
            init_tensors: initial value(s) of the embedding table; concatenated
                along axis 0 to form ``self.params``.
            slot_num: number of key columns (slots) per sample.
            embed_vec_size: length of each embedding vector.
            **kwargs: forwarded to ``tf.keras.models.Model``.
        """
        super().__init__(**kwargs)

        self.slot_num = slot_num
        self.embed_vec_size = embed_vec_size
        self.init_tensors = init_tensors
        self.params = tf.Variable(initial_value=tf.concat(self.init_tensors, axis=0))

        # Settings shared by all three Dense layers.
        dense_options = dict(activation=None,
                             kernel_initializer="ones",
                             bias_initializer="zeros")
        self.fc_1 = tf.keras.layers.Dense(units=256, name='fc_1', **dense_options)
        self.fc_2 = tf.keras.layers.Dense(units=128, name='fc_2', **dense_options)
        self.fc_3 = tf.keras.layers.Dense(units=1, name='fc_3', **dense_options)

    def call(self, inputs):
        """Look up embeddings for ``inputs`` and run them through the Dense stack.

        Returns:
            ``(logit, embedding_vector)`` where ``embedding_vector`` is the
            lookup result reshaped to ``[-1, slot_num * embed_vec_size]``.
        """
        looked_up = tf.nn.embedding_lookup(params=self.params, ids=inputs)
        # Flatten the per-slot vectors into one row per sample.
        flat_width = self.slot_num * self.embed_vec_size
        embedding_vector = tf.reshape(looked_up, shape=[-1, flat_width])

        hidden = self.fc_1(embedding_vector)
        hidden = self.fc_2(hidden)
        logit = self.fc_3(hidden)

        return logit, embedding_vector

    def summary(self):
        """Print the summary of a functional model built from ``call``.

        ``call`` is invoked on a symbolic ``tf.keras.Input`` of shape
        ``(slot_num,)`` whose dtype is read from the module-level
        ``args["tf_key_type"]``.
        """
        symbolic_keys = tf.keras.Input(shape=(self.slot_num,),
                                       dtype=args["tf_key_type"],
                                       name="input_dense")
        functional_view = tf.keras.models.Model(inputs=symbolic_keys,
                                                outputs=self.call(symbolic_keys))
        return functional_view.summary()

In [5]:
def train(args):
    """Train a ``TrainModel`` on randomly generated samples and return it.

    The embedding table starts as all ones. ``global_batch_size * iter_num``
    samples are generated and consumed in batches of ``global_batch_size``,
    with one Adam step (learning rate 0.1) per batch on a binary cross-entropy
    loss computed from logits. The loss of every step is printed.

    Args:
        args: configuration dict; reads ``max_vocabulary_size``,
            ``embed_vec_size``, ``np_vector_type``, ``slot_num``,
            ``global_batch_size``, ``iter_num``, ``vocabulary_range_per_slot``
            and ``np_key_type``.

    Returns:
        The trained ``TrainModel`` instance.
    """
    table_shape = [args["max_vocabulary_size"], args["embed_vec_size"]]
    init_tensors = np.ones(shape=table_shape, dtype=args["np_vector_type"])

    model = TrainModel(init_tensors, args["slot_num"], args["embed_vec_size"])
    model.summary()
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    def _train_step(inputs, labels):
        # Forward pass under the tape, then one optimizer update.
        with tf.GradientTape() as tape:
            logit, embedding_vector = model(inputs)
            loss = loss_fn(labels, logit)
        variables = model.trainable_variables
        grads = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(grads, variables))
        return logit, embedding_vector, loss

    total_samples = args["global_batch_size"] * args["iter_num"]
    keys, labels = generate_random_samples(total_samples,
                                           args["vocabulary_range_per_slot"],
                                           args["np_key_type"])
    dataset = tf_dataset(keys, labels, args["global_batch_size"])
    for step, (batch_keys, batch_labels) in enumerate(dataset):
        _, _, loss = _train_step(batch_keys, batch_labels)
        print("-"*20, "Step {}, loss: {}".format(step, loss),  "-"*20)

    return model

In [6]:
# Train the model and keep its weights for the export cells below.
trained_model = train(args)
weights_list = trained_model.get_weights()
# Read the embedding table from the model's `params` variable by name instead
# of relying on its position in `get_weights()`.
embedding_weights = trained_model.params.numpy()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_dense (InputLayer)    [(None, 3)]               0         
                                                                 
 tf.compat.v1.nn.embedding_l  (None, 3, 16)            0         
 ookup (TFOpLambda)                                              
                                                                 
 tf.reshape (TFOpLambda)     (None, 48)                0         
                                                                 
 fc_1 (Dense)                (None, 256)               12544     
                                                                 
 fc_2 (Dense)                (None, 128)               32896     
                                                                 
 fc_3 (Dense)                (None, 1)                 129       
                                                             

2022-08-04 08:39:04.335687: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-08-04 08:39:06.822696: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14923 MB memory:  -> device: 0, name: Tesla V100-DGXS-32GB, pci bus id: 0000:07:00.0, compute capability: 7.0
2022-08-04 08:39:06.824828: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 28941 MB memory:  -> device: 1, name: Tesla V100-DGXS-32GB, pci bus id: 0000:08:00.0, compute capability: 7.0
2022-08-04 08:39:06.826844: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device

-------------------- Step 0, loss: 785784.0 --------------------
-------------------- Step 1, loss: 516876.71875 --------------------
-------------------- Step 2, loss: 325915.84375 --------------------
-------------------- Step 3, loss: 198665.296875 --------------------
-------------------- Step 4, loss: 115924.0859375 --------------------
-------------------- Step 5, loss: 64979.5078125 --------------------
-------------------- Step 6, loss: 34690.1328125 --------------------
-------------------- Step 7, loss: 17438.54296875 --------------------
-------------------- Step 8, loss: 8121.5849609375 --------------------
-------------------- Step 9, loss: 3425.77734375 --------------------


In [13]:
# Build a model spanning only the Dense stack (input of fc_1 to output of fc_3),
# print its summary and save it to args["dense_model_path"].
first_dense_layer = trained_model.get_layer("fc_1")
last_dense_layer = trained_model.get_layer("fc_3")
dense_model = tf.keras.models.Model(inputs=first_dense_layer.input,
                                    outputs=last_dense_layer.output,
                                    name='tf_dense_model')
dense_model.summary()
dense_model.save(args["dense_model_path"])

Model: "tf_dense_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 48)]              0         
                                                                 
 fc_1 (Dense)                (None, 256)               12544     
                                                                 
 fc_2 (Dense)                (None, 128)               32896     
                                                                 
 fc_3 (Dense)                (None, 1)                 129       
                                                                 
Total params: 45,569
Trainable params: 45,569
Non-trainable params: 0
_________________________________________________________________
INFO:tensorflow:Assets written to: naive_dnn_dense.model/assets


In [8]:
def convert_to_sparse_model(embeddings_weights, embedding_table_path, embedding_vec_size):
    """Write an embedding table to ``key`` and ``emb_vector`` binary files.

    Row ``i`` of ``embeddings_weights`` is stored under key ``i``. Keys are
    packed with struct format ``'q'`` into ``<embedding_table_path>/key`` and
    each row is packed as ``embedding_vec_size`` values of format ``'f'`` into
    ``<embedding_table_path>/emb_vector``, both in native byte order.

    Args:
        embeddings_weights: 2-D array exposing ``.shape[0]`` and row indexing.
        embedding_table_path: output directory; created with any missing
            parents, and reused if it already exists.
        embedding_vec_size: number of values in every row.

    Raises:
        struct.error: if a row does not hold exactly ``embedding_vec_size`` values.
        OSError: if the directory or the files cannot be created.
    """
    # os.makedirs instead of `os.system("mkdir -p ...")`: no shell is involved,
    # so paths with spaces or shell metacharacters work, and failures raise.
    os.makedirs(embedding_table_path, exist_ok=True)

    # Compile the loop-invariant formats once.
    key_packer = struct.Struct('q')
    vec_packer = struct.Struct(str(embedding_vec_size) + "f")

    key_path = os.path.join(embedding_table_path, "key")
    vec_path = os.path.join(embedding_table_path, "emb_vector")
    with open(key_path, 'wb') as key_file, open(vec_path, 'wb') as vec_file:
        for key in range(embeddings_weights.shape[0]):
            key_file.write(key_packer.pack(key))
            vec_file.write(vec_packer.pack(*embeddings_weights[key]))

In [9]:
# Export the trained embedding table as key / emb_vector files.
convert_to_sparse_model(
    embeddings_weights=embedding_weights,
    embedding_table_path=args["embedding_table_path"],
    embedding_vec_size=args["embed_vec_size"],
)

In [14]:
# import gc
# del dataset
# gc.collect()