In [1]:
import hugectr
from hugectr.tools import DataGeneratorParams, DataGenerator
from mpi4py import MPI

In [2]:
# Synthetic Parquet dataset: 1 label, 10 dense features and 5 categorical
# slots (one key per slot), with keys drawn using the PowerLaw_t.Short setting.
data_generator_params = DataGeneratorParams(
    format=hugectr.DataReaderType_t.Parquet,
    source="./data_parquet/file_list.txt",
    eval_source="./data_parquet/file_list_test.txt",
    num_files=10,
    eval_num_files=4,
    num_samples_per_file=10000,
    label_dim=1,
    dense_dim=10,
    num_slot=5,
    slot_size_array=[10, 100, 1000, 10, 10],
    nnz_array=[1] * 5,
    i64_input_key=True,
    check_type=hugectr.Check_t.Non,
    dist_type=hugectr.Distribution_t.PowerLaw,
    power_law_type=hugectr.PowerLaw_t.Short,
)

# Build the generator from the parameters above and write the dataset to disk.
data_generator = DataGenerator(data_generator_params)
data_generator.generate()

[HCTR][02:21:24.032][INFO][RK0][main]: Generate Parquet dataset
[HCTR][02:21:24.032][INFO][RK0][main]: train data folder: ./data_parquet, eval data folder: ./data_parquet, slot_size_array: 10, 100, 1000, 10, 10, nnz array: 1, 1, 1, 1, 1, #files for train: 10, #files for eval: 4, #samples per file: 10000, Use power law distribution: 1, alpha of power law: 1.3
[HCTR][02:21:24.032][INFO][RK0][main]: ./data_parquet exist
[HCTR][02:21:24.032][INFO][RK0][main]: ./data_parquet exist
[HCTR][02:21:24.032][INFO][RK0][main]: ./data_parquet/train exist
[HCTR][02:21:24.032][INFO][RK0][main]: ./data_parquet/train/gen_0.parquet
[HCTR][02:21:25.103][INFO][RK0][main]: ./data_parquet/train/gen_1.parquet
[HCTR][02:21:25.142][INFO][RK0][main]: ./data_parquet/train/gen_2.parquet
[HCTR][02:21:25.188][INFO][RK0][main]: ./data_parquet/train/gen_3.parquet
[HCTR][02:21:25.230][INFO][RK0][main]: ./data_parquet/train/gen_4.parquet
[HCTR][02:21:25.272][INFO][RK0][main]: ./data_parquet/train/gen_5.parquet
[HCTR][02

In [None]:
import os
from time import time
import re
import shutil
import glob
import warnings
# Output locations for the preprocessing stage.
BASE_DIR = "/hps_train"
train_path = os.path.join(BASE_DIR, "train")
val_path = os.path.join(BASE_DIR, "val")

# n_workers = number of comma-separated entries in CUDA_VISIBLE_DEVICES
# (the variable defaults to "0" when it is not set).
CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
n_workers = len(CUDA_VISIBLE_DEVICES.split(","))

# Tuning knobs.
frac_size = 0.15
allow_multi_gpu = False
use_rmm_pool = False
max_day = None  # (Optional) -- Limit the dataset to day 0-max_day for debugging

# Start from empty output folders: anything left by a previous run is deleted.
for output_dir in (train_path, val_path):
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

In [None]:
ls -l $train_path

In [None]:
import pandas as pd

In [None]:
# Run the external preprocessing script as a shell command, reading from and
# writing to /hps_train/. The flags are interpreted by preprocess.py itself
# (not visible in this notebook) -- see that script for their exact meaning.
!python3 /hps_train/preprocess.py --data_path /hps_train/ --out_path /hps_train/ --freq_limit 6 --feature_cross_list C1_C2,C3_C4 --device_pool_frac 0.5  --devices "0" --num_io_threads 2

In [3]:
# Solver configuration: batches of 1000 samples for both training and
# evaluation, running on the single device with id 2.
solver = hugectr.CreateSolver(
    vvgpu=[[2]],
    batchsize=1000,
    batchsize_eval=1000,
    max_eval_batches=2000,
    lr=0.001,
    i64_input_key=True,
    repeat_dataset=True,
)

In [4]:
# Parquet data reader over the generated train/eval file lists.
# slot_size_array has one entry per categorical slot (5 slots here).
reader = hugectr.DataReaderParams(
    data_reader_type=hugectr.DataReaderType_t.Parquet,
    source=["./data_parquet/file_list.txt"],
    eval_source="./data_parquet/file_list_test.txt",
    slot_size_array=[10, 100, 1000, 10, 10],
    check_type=hugectr.Check_t.Non,
)

In [5]:
# Adam optimizer with a global update type; shared by the dense layers and
# passed explicitly to the sparse embeddings defined later.
optimizer = hugectr.CreateOptimizer(
    optimizer_type=hugectr.Optimizer_t.Adam,
    update_type=hugectr.Update_t.Global,
    beta1=0.9,
    beta2=0.999,
    epsilon=1e-7,
)

In [6]:
# Create the model from the solver, data-reader and optimizer objects defined
# in the cells above; layers are attached to it afterwards with model.add(...).
model = hugectr.Model(solver, reader, optimizer)

HugeCTR Version: 3.7
[HCTR][02:22:19.230][INFO][RK0][main]: Global seed is 62272102
[HCTR][02:22:19.388][INFO][RK0][main]: Device to NUMA mapping:
  GPU 2 ->  node 1
[HCTR][02:22:21.670][INFO][RK0][main]: Start all2all warmup
[HCTR][02:22:21.672][INFO][RK0][main]: End all2all warmup
[HCTR][02:22:21.673][INFO][RK0][main]: Using All-reduce algorithm: NCCL
[HCTR][02:22:21.675][INFO][RK0][main]: Device 2: NVIDIA A100-SXM4-80GB


set_mempolicy: Operation not permitted


In [7]:
# Declare the model inputs: a 1-dimensional label, 10 dense features and two
# sparse inputs named "wide_data" and "deep_data".
# NOTE(review): the last DataReaderSparseParam argument is 2 and 26 here
# (28 in total, if it is the slot count -- TODO confirm), whereas the reader's
# slot_size_array and the generated Parquet dataset describe only 5 slots.
# This mismatch is presumably what triggers the data-reader
# "KeyType should be uint64/int64/int32/uint32" error printed below this cell;
# confirm and align the input definition with the dataset (or vice versa).
model.add(hugectr.Input(label_dim = 1, label_name = "label",
                        dense_dim = 10, dense_name = "dense",
                        data_reader_sparse_param_array = 
                        [hugectr.DataReaderSparseParam("wide_data", 1, True, 2),
                        hugectr.DataReaderSparseParam("deep_data", 2, False, 26)]))

[HCTR][02:22:28.924][INFO][RK0][main]: num of DataReader workers: 1
[HCTR][02:22:28.928][INFO][RK0][main]: Vocabulary size: 1130
[HCTR][02:22:28.929][DEBUG][RK0][tid #139708910774016]: file_name_ ./data_parquet/val/gen_0.parquet file_total_rows_ 10000


set_mempolicy: Operation not permitted
set_mempolicy: Operation not permitted
set_mempolicy: Operation not permitted
set_mempolicy: Operation not permitted
set_mempolicy: Operation not permitted
set_mempolicy: Operation not permitted


[HCTR][02:22:28.930][DEBUG][RK0][tid #139708919166720]: file_name_ ./data_parquet/train/gen_0.parquet file_total_rows_ 10000


[HCTR][02:22:28.935][ERROR][RK0][tid #139708910774016]: Runtime error: Parquet worker : cat s-hot KeyType should be uint64/int64/int32/uint32
	Error_t::WrongInput at read_a_batch(/workspace/merlin/hugectr/HugeCTR/include/data_readers/parquet_data_reader_worker.hpp:555)
[HCTR][02:22:28.935][ERROR][RK0][tid #139708919166720]: Runtime error: Parquet worker : cat s-hot KeyType should be uint64/int64/int32/uint32
	Error_t::WrongInput at read_a_batch(/workspace/merlin/hugectr/HugeCTR/include/data_readers/parquet_data_reader_worker.hpp:555)


In [8]:
# Embedding for the "wide_data" input: vectors of size 1, sum-combined.
model.add(hugectr.SparseEmbedding(
    embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
    sparse_embedding_name="sparse_embedding2",
    bottom_name="wide_data",
    embedding_vec_size=1,
    combiner="sum",
    workspace_size_per_gpu_in_mb=8,
    optimizer=optimizer,
))

# Embedding for the "deep_data" input: vectors of size 16, sum-combined.
model.add(hugectr.SparseEmbedding(
    embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
    sparse_embedding_name="sparse_embedding1",
    bottom_name="deep_data",
    embedding_vec_size=16,
    combiner="sum",
    workspace_size_per_gpu_in_mb=135,
    optimizer=optimizer,
))

[HCTR][02:22:32.879][INFO][RK0][main]: max_vocabulary_size_per_gpu_=699050
[HCTR][02:22:32.881][INFO][RK0][main]: max_vocabulary_size_per_gpu_=737280


In [9]:
# Dense part of the graph, listed in the exact order in which the layers are
# added to the model.
wdl_dense_layers = [
    # Reshape both embedding outputs.
    hugectr.DenseLayer(layer_type=hugectr.Layer_t.Reshape,
                       bottom_names=["sparse_embedding1"],
                       top_names=["reshape1"],
                       leading_dim=416),
    hugectr.DenseLayer(layer_type=hugectr.Layer_t.Reshape,
                       bottom_names=["sparse_embedding2"],
                       top_names=["reshape2"],
                       leading_dim=2),
    # Wide branch: reduce the reshaped wide embedding along axis 1.
    hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReduceSum,
                       bottom_names=["reshape2"],
                       top_names=["wide_redn"],
                       axis=1),
    # Deep branch: deep embedding + dense features -> 1024 -> 1024 -> 1,
    # with ReLU and dropout (rate 0.5) after each hidden layer.
    hugectr.DenseLayer(layer_type=hugectr.Layer_t.Concat,
                       bottom_names=["reshape1", "dense"],
                       top_names=["concat1"]),
    hugectr.DenseLayer(layer_type=hugectr.Layer_t.InnerProduct,
                       bottom_names=["concat1"],
                       top_names=["fc1"],
                       num_output=1024),
    hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                       bottom_names=["fc1"],
                       top_names=["relu1"]),
    hugectr.DenseLayer(layer_type=hugectr.Layer_t.Dropout,
                       bottom_names=["relu1"],
                       top_names=["dropout1"],
                       dropout_rate=0.5),
    hugectr.DenseLayer(layer_type=hugectr.Layer_t.InnerProduct,
                       bottom_names=["dropout1"],
                       top_names=["fc2"],
                       num_output=1024),
    hugectr.DenseLayer(layer_type=hugectr.Layer_t.ReLU,
                       bottom_names=["fc2"],
                       top_names=["relu2"]),
    hugectr.DenseLayer(layer_type=hugectr.Layer_t.Dropout,
                       bottom_names=["relu2"],
                       top_names=["dropout2"],
                       dropout_rate=0.5),
    hugectr.DenseLayer(layer_type=hugectr.Layer_t.InnerProduct,
                       bottom_names=["dropout2"],
                       top_names=["fc3"],
                       num_output=1),
    # Combine both branches and attach the loss.
    hugectr.DenseLayer(layer_type=hugectr.Layer_t.Add,
                       bottom_names=["fc3", "wide_redn"],
                       top_names=["add1"]),
    hugectr.DenseLayer(layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss,
                       bottom_names=["add1", "label"],
                       top_names=["loss"]),
]

for dense_layer in wdl_dense_layers:
    model.add(dense_layer)

In [None]:
# Compile the graph, print its summary, train, then write the graph
# definition to "hps_train.json".
model.compile()
model.summary()
# NOTE(review): snapshot (10000) is larger than max_iter (2000). If snapshot is
# an iteration interval, no "hps"-prefixed snapshot is written by this run --
# confirm this is intended.
model.fit(max_iter = 2000, display = 1000, eval_interval = 2000, snapshot = 10000, snapshot_prefix = "hps")
model.graph_to_json(graph_config_file = "hps_train.json")


In [None]:
# Single-cell version of the model definition: inputs, sparse embeddings,
# dense layers, then compile / train / export with the "wdl" prefix.
# NOTE(review): this cell adds layers to the existing `model` object. If the
# earlier cells that add the same Input/embedding/dense layers have already
# run in this kernel, every layer name here would be added a second time --
# presumably a fresh hugectr.Model(...) is expected first; confirm.
# NOTE(review): the sparse params below end in 2 and 26 (28 in total, if that
# argument is the slot count -- TODO confirm); make sure the reader in scope
# describes a dataset with a matching number of slots.

# --- Inputs ---
model.add(hugectr.Input(label_dim = 1, label_name = "label",
                        dense_dim = 10, dense_name = "dense",
                        data_reader_sparse_param_array = 
                        [hugectr.DataReaderSparseParam("wide_data", 1, True, 2),
                        hugectr.DataReaderSparseParam("deep_data", 2, False, 26)]))

# --- Sparse embeddings: size-1 vectors for "wide_data", size-16 for "deep_data" ---
model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, 
                            workspace_size_per_gpu_in_mb = 8,
                            embedding_vec_size = 1,
                            combiner = "sum",
                            sparse_embedding_name = "sparse_embedding2",
                            bottom_name = "wide_data",
                            optimizer = optimizer))
model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, 
                            workspace_size_per_gpu_in_mb = 135,
                            embedding_vec_size = 16,
                            combiner = "sum",
                            sparse_embedding_name = "sparse_embedding1",
                            bottom_name = "deep_data",
                            optimizer = optimizer))

# --- Reshape the embedding outputs; 416 = 26 * 16 and 2 = 2 * 1 ---
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape,
                            bottom_names = ["sparse_embedding1"],
                            top_names = ["reshape1"],
                            leading_dim=416))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape,
                            bottom_names = ["sparse_embedding2"],
                            top_names = ["reshape2"],
                            leading_dim=2))
# --- Wide branch: reduce the reshaped wide embedding along axis 1 ---
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReduceSum,
                            bottom_names = ["reshape2"],
                            top_names = ["wide_redn"],
                            axis = 1))
# --- Deep branch: concat with dense features, then 1024 -> 1024 -> 1 ---
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Concat,
                            bottom_names = ["reshape1", "dense"],
                            top_names = ["concat1"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["concat1"],
                            top_names = ["fc1"],
                            num_output=1024))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                            bottom_names = ["fc1"],
                            top_names = ["relu1"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout,
                            bottom_names = ["relu1"],
                            top_names = ["dropout1"],
                            dropout_rate=0.5))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["dropout1"],
                            top_names = ["fc2"],
                            num_output=1024))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                            bottom_names = ["fc2"],
                            top_names = ["relu2"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout,
                            bottom_names = ["relu2"],
                            top_names = ["dropout2"],
                            dropout_rate=0.5))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["dropout2"],
                            top_names = ["fc3"],
                            num_output=1))
# --- Combine both branches and attach the loss ---
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Add,
                            bottom_names = ["fc3", "wide_redn"],
                            top_names = ["add1"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.BinaryCrossEntropyLoss,
                            bottom_names = ["add1", "label"],
                            top_names = ["loss"]))
# --- Compile, summarize, train and write the graph definition to "wdl.json" ---
model.compile()
model.summary()
model.fit(max_iter = 21000, display = 1000, eval_interval = 4000, snapshot = 20000, snapshot_prefix = "wdl")
model.graph_to_json(graph_config_file = "wdl.json")


In [None]:
# Rebuild the generator from whatever `data_generator_params` currently holds
# in the kernel (i.e. the parameters cell that was executed most recently).
data_generator = DataGenerator(data_generator_params)

In [None]:
# Run the generation for the current `data_generator` object.
data_generator.generate()

In [None]:
# Raw-format synthetic dataset: 1 label, 13 dense features and 26 categorical
# slots, with Check_t.Sum checking. File counts and samples per file are not
# set here, so the library defaults apply.
data_generator_params = DataGeneratorParams(
    format=hugectr.DataReaderType_t.Raw,
    source="./dcn_raw/file_list.txt",
    eval_source="./dcn_raw/file_list_test.txt",
    label_dim=1,
    dense_dim=13,
    num_slot=26,
    slot_size_array=[
        39884, 39043, 17289, 7420, 20263, 3, 7120, 1543,
        39884, 39043, 17289, 7420, 20263, 3, 7120, 1543,
        63, 63,
        39884, 39043, 17289, 7420, 20263, 3, 7120, 1543,
    ],
    i64_input_key=False,
    check_type=hugectr.Check_t.Sum,
    dist_type=hugectr.Distribution_t.PowerLaw,
    power_law_type=hugectr.PowerLaw_t.Short,
)

In [None]:
# Build a generator from the current `data_generator_params`.
data_generator = DataGenerator(data_generator_params)

In [None]:
# Run the generation for the current `data_generator` object.
data_generator.generate()

In [None]:
# Print the kernel's current working directory; the relative paths used in
# this notebook (./data_parquet, ./etc_data, ...) are resolved against it.
!pwd

In [None]:
!mkdir etc_data

In [None]:
import hugectr
from hugectr.tools import DataGeneratorParams, DataGenerator

# Synthetic Parquet dataset with 4 categorical slots of size 10000 each
# (one key per slot), 1 label and 10 dense features.
data_generator_params = DataGeneratorParams(
    format=hugectr.DataReaderType_t.Parquet,
    source="./data_parquet/file_list.txt",
    eval_source="./data_parquet/file_list_test.txt",
    num_files=16,
    eval_num_files=4,
    num_samples_per_file=40960,
    label_dim=1,
    dense_dim=10,
    num_slot=4,
    slot_size_array=[10000] * 4,
    nnz_array=[1] * 4,
    i64_input_key=True,
    check_type=hugectr.Check_t.Non,
    dist_type=hugectr.Distribution_t.PowerLaw,
    power_law_type=hugectr.PowerLaw_t.Short,
)

# Build the generator and write the dataset to disk.
data_generator = DataGenerator(data_generator_params)
data_generator.generate()

In [None]:
!mkdir etc_data

In [None]:
import hugectr
from hugectr.tools import DataGenerator, DataGeneratorParams
from mpi4py import MPI
import argparse
# Argument parser for the data-generation options registered in the next cell.
parser = argparse.ArgumentParser(description="Data Generation")

In [None]:
# Options controlling how much synthetic data is generated and where it goes.
parser.add_argument("--num_files", type=int, help="number of files in training data", default = 8)
parser.add_argument("--eval_num_files", type=int, help="number of files in validation data", default = 2)
parser.add_argument('--num_samples_per_file', type=int, help="number of samples per file", default=1000000)
parser.add_argument('--dir_name', type=str, help="data directory name(Required)")

# parse_known_args() instead of parse_args(): inside a Jupyter kernel, sys.argv
# holds the kernel's own launch flags (e.g. "-f <connection file>"), which
# parse_args() rejects by exiting with "unrecognized arguments". Unknown
# arguments are ignored here, so the cell works both in a notebook and when
# the code is run as a script.
args, _ignored_argv = parser.parse_known_args()

# --dir_name has no default; fail with a clear message instead of the
# TypeError (str + None) that the path concatenation below would raise.
if args.dir_name is None:
    raise ValueError(
        "--dir_name is required: pass it on the command line, or call "
        "parser.set_defaults(dir_name=...) before this cell when running in a notebook"
    )

# Parquet dataset with 1 label, 13 dense features and 26 categorical slots,
# written under ./etc_data/<dir_name>/.
data_generator_params = DataGeneratorParams(
  format = hugectr.DataReaderType_t.Parquet,
  label_dim = 1,
  dense_dim = 13,
  num_slot = 26,
  num_files = args.num_files,
  eval_num_files = args.eval_num_files,
  i64_input_key = True,
  num_samples_per_file = args.num_samples_per_file,
  source = "./etc_data/" + args.dir_name + "/file_list.txt",
  eval_source = "./etc_data/" + args.dir_name + "/file_list_test.txt",
  slot_size_array = [12988, 7129, 8720, 5820, 15196, 4, 4914, 1020, 30, 14274, 10220, 15088, 10, 1518, 3672, 48, 4, 820, 15, 12817, 13908, 13447, 9447, 5867, 45, 33],
  # for parquet, check_type doesn't make any difference
  check_type = hugectr.Check_t.Non,
  dist_type = hugectr.Distribution_t.PowerLaw,
  power_law_type = hugectr.PowerLaw_t.Short)
data_generator = DataGenerator(data_generator_params)
data_generator.generate()


In [None]:
import hugectr
from hugectr.tools import DataGeneratorParams, DataGenerator

# Synthetic Parquet dataset with 2 labels, 10 dense features and 5 categorical
# slots (one key per slot).
# NOTE(review): this writes to the same ./data_parquet file lists as the other
# Parquet generator cells in this notebook, so it presumably replaces their
# data; the training cells must use a matching label_dim / slot layout.
data_generator_params = DataGeneratorParams(
    format=hugectr.DataReaderType_t.Parquet,
    source="./data_parquet/file_list.txt",
    eval_source="./data_parquet/file_list_test.txt",
    num_files=10,
    eval_num_files=4,
    num_samples_per_file=4000,
    label_dim=2,
    dense_dim=10,
    num_slot=5,
    slot_size_array=[10, 100, 1000, 10, 10],
    nnz_array=[1] * 5,
    i64_input_key=True,
    check_type=hugectr.Check_t.Non,
    dist_type=hugectr.Distribution_t.PowerLaw,
    power_law_type=hugectr.PowerLaw_t.Short,
)

# data_generator_params = DataGeneratorParams(
#   format = hugectr.DataReaderType_t.Norm,
#   label_dim = 1,
#   dense_dim = 13,
#   num_slot = 26,
#   i64_input_key = False,
#   source = "./dcn_norm/file_list.txt",
#   eval_source = "./dcn_norm/file_list_test.txt",
#   slot_size_array = [39884, 39043, 17289, 7420, 20263, 3, 7120, 1543, 39884, 39043, 17289, 7420, 20263, 3, 7120, 1543, 63, 63, 39884, 39043, 17289, 7420, 20263, 3, 7120,
#   1543],
#   check_type = hugectr.Check_t.Sum,
#   dist_type = hugectr.Distribution_t.PowerLaw,
#   power_law_type = hugectr.PowerLaw_t.Short)

# Build the generator from the active (uncommented) parameters defined in this
# cell and run the generation.
data_generator = DataGenerator(data_generator_params)
data_generator.generate()

In [None]:
import pandas as pd
# Load the first generated training file and display its first 10 rows as a
# quick check of the generated columns.
df = pd.read_parquet("./data_parquet/train/gen_0.parquet")
df.head(10)

In [None]:
# (rows, columns) of the file loaded into `df`.
df.shape

In [None]:

import hugectr
from mpi4py import MPI
# End-to-end "hps_demo" training cell: configure solver/reader/optimizer, build
# the graph, train it, and export graph, snapshots and predictions.
# NOTE(review): the reader below describes 4 slots of size 10000, so it needs
# the 4-slot ./data_parquet dataset; other generator cells in this notebook
# write 5-slot data to the same folder -- make sure the matching one ran last.

# --- Solver: single device (id 0), batches of 1024, 64-bit keys ---
solver = hugectr.CreateSolver(model_name = "hps_demo",
                              max_eval_batches = 1,
                              batchsize_eval = 1024,
                              batchsize = 1024,
                              lr = 0.001,
                              vvgpu = [[0]],
                              i64_input_key = True,
                              repeat_dataset = True,
                              use_cuda_graph = True)
# --- Parquet reader over the generated file lists ---
reader = hugectr.DataReaderParams(data_reader_type = hugectr.DataReaderType_t.Parquet,
                                  source = ["./data_parquet/file_list.txt"],
                                  eval_source = "./data_parquet/file_list_test.txt",
                                  check_type = hugectr.Check_t.Non,
                                  slot_size_array = [10000, 10000, 10000, 10000])
# Adam with the library's default hyper-parameters.
optimizer = hugectr.CreateOptimizer(optimizer_type = hugectr.Optimizer_t.Adam)
model = hugectr.Model(solver, reader, optimizer)
# --- Inputs: 1 label, 10 dense features, two sparse inputs "data1" and "data2" ---
model.add(hugectr.Input(label_dim = 1, label_name = "label",
                        dense_dim = 10, dense_name = "dense",
                        data_reader_sparse_param_array = 
                        [hugectr.DataReaderSparseParam("data1", [1, 1], True, 2),
                        hugectr.DataReaderSparseParam("data2", [1, 1], True, 2)]))
# --- Sparse embeddings: size-16 vectors for "data1", size-32 for "data2" ---
model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, 
                            workspace_size_per_gpu_in_mb = 4,
                            embedding_vec_size = 16,
                            combiner = "sum",
                            sparse_embedding_name = "sparse_embedding1",
                            bottom_name = "data1",
                            optimizer = optimizer))
model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, 
                            workspace_size_per_gpu_in_mb = 8,
                            embedding_vec_size = 32,
                            combiner = "sum",
                            sparse_embedding_name = "sparse_embedding2",
                            bottom_name = "data2",
                            optimizer = optimizer))
# --- Reshape the embedding outputs; 32 = 2 * 16 and 64 = 2 * 32 ---
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape,
                            bottom_names = ["sparse_embedding1"],
                            top_names = ["reshape1"],
                            leading_dim=32))                            
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape,
                            bottom_names = ["sparse_embedding2"],
                            top_names = ["reshape2"],
                            leading_dim=64))                            
# --- MLP head: concat embeddings with dense features -> 1024 -> ReLU -> 1 -> loss ---
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Concat,
                            bottom_names = ["reshape1", "reshape2", "dense"], top_names = ["concat1"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["concat1"],
                            top_names = ["fc1"],
                            num_output=1024))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                            bottom_names = ["fc1"],
                            top_names = ["relu1"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["relu1"],
                            top_names = ["fc2"],
                            num_output=1))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.BinaryCrossEntropyLoss,
                            bottom_names = ["fc2", "label"],
                            top_names = ["loss"]))
# --- Compile, summarize, dump the graph, train, export predictions ---
model.compile()
model.summary()
model.graph_to_json("hps_demo.json")
model.fit(max_iter = 1100, display = 200, eval_interval = 1000, snapshot = 1000, snapshot_prefix = "hps_demo")
# The "1000" suffix mirrors the snapshot/eval interval used in fit() above.
model.export_predictions("hps_demo_pred_" + str(1000), "hps_demo_label_" + str(1000))