In [1]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# 1. Overview

In this notebook, we provide a tutorial on how to train a Wide & Deep Learning (WDL) model using the HugeCTR high-level Python API. We use the original Criteo dataset as the training data.

1. Overview
2. Dataset Preprocessing
3. WDL Model Training
4. Inference Validation

# 2. Dataset Preprocessing
## 2.1 Generate training and validation data folders

In [2]:
# define some data folder to store the original and preprocessed data
# Standard Libraries
import os
from time import time
import re
import shutil
import glob
import warnings
# Root folder that holds both the raw text files and the preprocessed output.
BASE_DIR = "/wdl_train"
train_path = os.path.join(BASE_DIR, "train")
val_path = os.path.join(BASE_DIR, "val")

# Number of comma-separated device ids in CUDA_VISIBLE_DEVICES ("0" when unset).
CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
n_workers = len(CUDA_VISIBLE_DEVICES.split(","))

frac_size = 0.15
allow_multi_gpu = False
use_rmm_pool = False
max_day = None  # (Optional) -- Limit the dataset to day 0-max_day for debugging

# Start from empty train/val folders: remove any previous content, then recreate.
for data_dir in (train_path, val_path):
    if os.path.isdir(data_dir):
        shutil.rmtree(data_dir)
    os.makedirs(data_dir)

In [3]:
ls -l $train_path

total 0


## 2.2 Download the Original Criteo Dataset

In [2]:
!apt-get install wget

Reading package lists... Done
Building dependency tree       
Reading state information... Done
wget is already the newest version (1.20.3-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 4 not upgraded.


In [None]:
!wget -P https://storage.googleapis.com/criteo-cail-datasets/day_0.gz

In [7]:
# Decompress day_0.gz, then split it: the first 45,840,617 lines become the
# training file and the last 2,000,000 lines become the validation file.
!gzip -d -c day_0.gz > day_0
!head -n 45840617 day_0 > $train_path/train.txt
!tail -n 2000000 day_0 > $val_path/test.txt 

## 2.3 Preprocessing by NVTabular

In [1]:
%%writefile /wdl_train/preprocess.py
import os
import sys
import argparse
import glob
import time
import numpy as np
import pandas as pd
import shutil

import dask_cudf
from dask_cuda import LocalCUDACluster
from dask.distributed import Client

import cudf
import rmm
import nvtabular as nvt
from nvtabular.utils import device_mem_size
from nvtabular.ops import Categorify, Clip, FillMissing, LambdaOp, Normalize, Rename, Operator, get_embedding_sizes
#%load_ext memory_profiler

import logging

# Timestamped messages; the root logger level is NOTSET so no record is filtered by level.
logging.basicConfig(format='%(asctime)s %(message)s')
logging.root.setLevel(logging.NOTSET)
# The numba and asyncio loggers only report warnings and above.
for _quieted_logger in ('numba', 'asyncio'):
    logging.getLogger(_quieted_logger).setLevel(logging.WARNING)

# Dataset schema: 26 categorical columns C1..C26, 13 continuous columns I1..I13, one label.
CATEGORICAL_COLUMNS = [f"C{idx}" for idx in range(1, 27)]
CONTINUOUS_COLUMNS = [f"I{idx}" for idx in range(1, 14)]
LABEL_COLUMNS = ['label']
COLUMNS = [*LABEL_COLUMNS, *CONTINUOUS_COLUMNS, *CATEGORICAL_COLUMNS]
# /samples/criteo mode doesn't have dense features
criteo_COLUMN = [*LABEL_COLUMNS, *CATEGORICAL_COLUMNS]
# Filled in by process_NVT with the names of the crossed feature columns
CROSS_COLUMNS = []


# Column counts derived from the lists above (13, 26 and 1 + 13 + 26 = 40).
NUM_INTEGER_COLUMNS = len(CONTINUOUS_COLUMNS)
NUM_CATEGORICAL_COLUMNS = len(CATEGORICAL_COLUMNS)
NUM_TOTAL_COLUMNS = len(COLUMNS)


# Initialize RMM pool on ALL workers
def setup_rmm_pool(client, pool_size):
    """Re-initialise RMM with a pool allocator by way of ``client.run``.

    Args:
        client: object whose ``run`` method is given ``rmm.reinitialize``
            (the dask.distributed ``Client`` created in ``process_NVT``).
        pool_size: value forwarded as ``initial_pool_size``.

    Returns:
        None.
    """
    rmm_options = {"pool_allocator": True, "initial_pool_size": pool_size}
    client.run(rmm.reinitialize, **rmm_options)

#compute the partition size with GB
def bytesto(bytes, to, bsize=1024):
    """Convert a byte count into a larger unit.

    Args:
        bytes: number of bytes; anything ``float()`` accepts.
        to: target unit, one of 'k', 'm', 'g', 't', 'p', 'e'.
        bsize: size of one step between consecutive units (default 1024).

    Returns:
        float: ``bytes`` divided by ``bsize`` raised to the unit's exponent.

    Raises:
        KeyError: if ``to`` is not one of the supported units.
    """
    exponents = {'k': 1, 'm': 2, 'g': 3, 't': 4, 'p': 5, 'e': 6}
    # Divide the converted float; previously it was computed but the raw
    # argument was divided instead, leaving the conversion without effect.
    return float(bytes) / (bsize ** exponents[to])

#process the data with NVTabular
def _start_cluster(args, device_size):
    """Create the single-machine LocalCUDACluster described by ``args``.

    For the "ucx" protocol, UCX_TLS is set in the environment (keeping any
    value that is already there) before the cluster is created, and NVLink is
    enabled. Any other protocol gets the same cluster without those extras.
    """
    cluster_kwargs = dict(
        protocol=args.protocol,
        CUDA_VISIBLE_DEVICES=args.devices,
        n_workers=len(args.devices.split(",")),
        device_memory_limit=int(device_size * args.device_limit_frac),
        dashboard_address=":" + args.dashboard_port,
    )
    if args.protocol == "ucx":
        os.environ["UCX_TLS"] = os.environ.get("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        cluster_kwargs["enable_nvlink"] = True
    return LocalCUDACluster(**cluster_kwargs)


def _write_dataset(workflow, dataset, output_path, output_format, shuffle, conts, dict_dtypes, args):
    """Transform ``dataset`` with the fitted workflow and write it to ``output_path``.

    ``output_format`` 'hugectr' uses ``to_hugectr``; anything else uses
    ``to_parquet``, which additionally receives the ``dict_dtypes`` mapping.
    """
    writer_kwargs = dict(
        cats=CATEGORICAL_COLUMNS + CROSS_COLUMNS,
        conts=conts,
        labels=LABEL_COLUMNS,
        output_path=output_path,
        shuffle=shuffle,
        out_files_per_proc=args.out_files_per_proc,
        num_threads=args.num_io_threads,
    )
    transformed = workflow.transform(dataset)
    if output_format == 'hugectr':
        transformed.to_hugectr(**writer_kwargs)
    else:
        transformed.to_parquet(dtypes=dict_dtypes, **writer_kwargs)


def process_NVT(args):
    """Preprocess the Criteo train/validation text files with NVTabular on a Dask-CUDA cluster.

    Steps, in order:
      1. Rebuild the module-level ``CROSS_COLUMNS`` from ``args.feature_cross_list``.
      2. Recreate empty ``temp-parquet-after-conversion`` folders under
         ``<out_path>/train`` and ``<out_path>/val``.
      3. Start a LocalCUDACluster plus client (and an RMM pool when
         ``args.device_pool_frac`` is above 0.01).
      4. Convert ``<data_path>/train/train.txt`` and ``<data_path>/val/test.txt``
         (tab separated) to parquet.
      5. Fit the workflow on the training data, write the transformed training
         and validation sets, and print the embedding sizes.
      6. Close the client and the cluster, then print a run summary.

    Args:
        args: namespace produced by ``parse_args``.

    Side effects:
        Replaces the contents of the module-level ``CROSS_COLUMNS`` list and,
        when ``args.dataset_type == 'test'``, rebinds the module-level
        ``LABEL_COLUMNS`` to an empty list.
    """
    global LABEL_COLUMNS

    # Replace (not append to) the shared list so that calling this function
    # more than once does not accumulate duplicate cross-column names.
    feature_pairs = []
    if args.feature_cross_list:
        # strip() tolerates the "C1_C2, C3_C4" spelling shown in the option's help text
        feature_pairs = [pair.strip().split("_") for pair in args.feature_cross_list.split(",")]
    CROSS_COLUMNS[:] = [pair[0] + '_' + pair[1] for pair in feature_pairs]

    logging.info('NVTabular processing')
    train_output = os.path.join(args.out_path, "train")
    print("Training output data: "+train_output)
    val_output = os.path.join(args.out_path, "val")
    print("Validation output data: "+val_output)
    train_input = os.path.join(args.data_path, "train/train.txt")
    print("Training dataset: "+train_input)
    val_input = os.path.join(args.data_path, "val/test.txt")
    PREPROCESS_DIR_temp_train = os.path.join(args.out_path, 'train/temp-parquet-after-conversion')
    PREPROCESS_DIR_temp_val = os.path.join(args.out_path, "val/temp-parquet-after-conversion")

    # Make sure we have a clean parquet space for cudf conversion
    for one_path in (PREPROCESS_DIR_temp_train, PREPROCESS_DIR_temp_val):
        if os.path.exists(one_path):
            shutil.rmtree(one_path)
        os.makedirs(one_path)

    ## Get Dask Client

    # Deploy a Single-Machine Multi-GPU Cluster
    device_size = device_mem_size(kind="total")
    part_size = int(args.part_mem_frac * device_size)
    cluster = _start_cluster(args, device_size)
    client = None
    try:
        # Create the distributed client
        client = Client(cluster)
        if args.device_pool_frac > 0.01:
            setup_rmm_pool(client, int(args.device_pool_frac*device_size))

        # calculate the total processing time
        runtime = time.time()

        # test dataset without the label feature
        if args.dataset_type == 'test':
            LABEL_COLUMNS = []

        ##-----------------------------------##
        # Dask rapids converts txt to parquet
        # Dask cudf dataframe = ddf

        ## train/valid txt to parquet
        train_valid_paths = [(train_input, PREPROCESS_DIR_temp_train), (val_input, PREPROCESS_DIR_temp_val)]

        for txt_path, temp_output in train_valid_paths:

            ddf = dask_cudf.read_csv(txt_path, sep='\t', names=LABEL_COLUMNS + CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS)

            ## Convert label col to FP32
            if args.parquet_format and args.dataset_type == 'train':
                ddf["label"] = ddf['label'].astype('float32')

            # Save it as parquet format for better memory usage
            ddf.to_parquet(temp_output, header=True)
            ##-----------------------------------##

        train_paths = glob.glob(os.path.join(PREPROCESS_DIR_temp_train, "*.parquet"))
        valid_paths = glob.glob(os.path.join(PREPROCESS_DIR_temp_val, "*.parquet"))

        categorify_op = Categorify(freq_threshold=args.freq_limit)
        cat_features = CATEGORICAL_COLUMNS >> categorify_op
        cont_features = CONTINUOUS_COLUMNS >> FillMissing() >> Clip(min_value=0) >> Normalize()
        cross_cat_op = Categorify(encode_type="combo", freq_threshold=args.freq_limit)

        # NOTE(review): the `+=` statements below are kept exactly as they were;
        # they rely on NVTabular's operator overloads when a plain list meets a
        # workflow node -- confirm LABEL_COLUMNS itself is not extended in place.
        features = LABEL_COLUMNS

        if args.criteo_mode == 0:
            features += cont_features
            for pair in feature_pairs:
                features += [pair] >> cross_cat_op

        features += cat_features

        workflow = nvt.Workflow(features, client=client)

        logging.info("Preprocessing")

        output_format = 'parquet' if args.parquet_format else 'hugectr'

        # just for /samples/criteo model
        train_ds_iterator = nvt.Dataset(train_paths, engine='parquet', part_size=part_size)
        valid_ds_iterator = nvt.Dataset(valid_paths, engine='parquet', part_size=part_size)

        shuffle = None
        if args.shuffle == "PER_WORKER":
            shuffle = nvt.io.Shuffle.PER_WORKER
        elif args.shuffle == "PER_PARTITION":
            shuffle = nvt.io.Shuffle.PER_PARTITION

        logging.info('Train Datasets Preprocessing.....')

        # Output dtypes handed to to_parquet: int64 keys, float32 dense values and labels.
        dict_dtypes = {}
        for col in CATEGORICAL_COLUMNS:
            dict_dtypes[col] = np.int64
        if not args.criteo_mode:
            for col in CONTINUOUS_COLUMNS:
                dict_dtypes[col] = np.float32
        for col in CROSS_COLUMNS:
            dict_dtypes[col] = np.int64
        for col in LABEL_COLUMNS:
            dict_dtypes[col] = np.float32

        conts = CONTINUOUS_COLUMNS if not args.criteo_mode else []

        workflow.fit(train_ds_iterator)

        _write_dataset(workflow, train_ds_iterator, train_output, output_format, shuffle, conts, dict_dtypes, args)

        ###Getting slot size###
        #--------------------##
        # Taken from the fitted Categorify ops; this used to be computed and
        # printed a second time, unchanged, after the validation write.
        embeddings_dict_cat = categorify_op.get_embedding_sizes(CATEGORICAL_COLUMNS)
        embeddings_dict_cross = cross_cat_op.get_embedding_sizes(CROSS_COLUMNS)
        embeddings = [embeddings_dict_cat[c][0] for c in CATEGORICAL_COLUMNS] + [embeddings_dict_cross[c][0] for c in CROSS_COLUMNS]

        print(embeddings)
        ##--------------------##

        logging.info('Valid Datasets Preprocessing.....')

        _write_dataset(workflow, valid_ds_iterator, val_output, output_format, shuffle, conts, dict_dtypes, args)
    finally:
        ## Shutdown clusters
        # Close the client and also the cluster (previously left running),
        # even when preprocessing fails part-way.
        if client is not None:
            client.close()
        cluster.close()
    logging.info('NVTabular processing done')

    runtime = time.time() - runtime

    print("\nDask-NVTabular Criteo Preprocessing")
    print("--------------------------------------")
    print(f"data_path          | {args.data_path}")
    print(f"output_path        | {args.out_path}")
    print(f"partition size     | {'%.2f GB'%bytesto(part_size,'g')}")
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devices}")
    print(f"rmm-pool-frac      | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads     | {args.num_io_threads}")
    print(f"shuffle            | {args.shuffle}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")


def parse_args(argv=None):
    """Build the command-line parser and parse the preprocessing options.

    Args:
        argv: optional list of argument strings. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, exactly as before.

    Returns:
        argparse.Namespace holding every option defined below plus
        ``n_workers``, the number of comma-separated entries in ``--devices``.

    Raises:
        SystemExit: raised by argparse on invalid input, including a missing
            ``--data_path`` or ``--out_path``.
    """
    parser = argparse.ArgumentParser(description=("Multi-GPU Criteo Preprocessing"))

    #
    # System Options
    #

    # Both paths are joined with sub-folder names later on, so they are
    # enforced here instead of failing later with a TypeError on None.
    parser.add_argument("--data_path", type=str, required=True, help="Input dataset path (Required)")
    parser.add_argument("--out_path", type=str, required=True, help="Directory path to write output (Required)")
    parser.add_argument(
        "-d",
        "--devices",
        default=os.environ.get("CUDA_VISIBLE_DEVICES", "0"),
        type=str,
        help='Comma-separated list of visible devices (e.g. "0,1,2,3"). '
    )
    parser.add_argument(
        "-p",
        "--protocol",
        choices=["tcp", "ucx"],
        default="tcp",
        type=str,
        help="Communication protocol to use (Default 'tcp')",
    )
    parser.add_argument(
        "--device_limit_frac",
        default=0.5,
        type=float,
        help="Worker device-memory limit as a fraction of GPU capacity (Default 0.5). "
    )
    parser.add_argument(
        "--device_pool_frac",
        default=0.9,
        type=float,
        help="RMM pool size for each worker  as a fraction of GPU capacity (Default 0.9). "
        "The RMM pool frac is the same for all GPUs, make sure each one has enough memory size",
    )
    parser.add_argument(
        "--num_io_threads",
        default=0,
        type=int,
        help="Number of threads to use when writing output data (Default 0). "
        "If 0 is specified, multi-threading will not be used for IO.",
    )

    #
    # Data-Decomposition Parameters
    #

    parser.add_argument(
        "--part_mem_frac",
        default=0.125,
        type=float,
        help="Maximum size desired for dataset partitions as a fraction "
        "of GPU capacity (Default 0.125)",
    )
    parser.add_argument(
        "--out_files_per_proc",
        default=1,
        type=int,
        help="Number of output files to write on each worker (Default 1)",
    )

    #
    # Preprocessing Options
    #

    parser.add_argument(
        "-f",
        "--freq_limit",
        default=0,
        type=int,
        help="Frequency limit for categorical encoding (Default 0)",
    )
    parser.add_argument(
        "-s",
        "--shuffle",
        choices=["PER_WORKER", "PER_PARTITION", "NONE"],
        default="PER_PARTITION",
        help="Shuffle algorithm to use when writing output data to disk (Default PER_PARTITION)",
    )

    parser.add_argument(
        "--feature_cross_list", default=None, type=str, help="List of feature crossing cols (e.g. C1_C2, C3_C4)"
    )

    #
    # Diagnostics Options
    #

    parser.add_argument(
        "--profile",
        metavar="PATH",
        default=None,
        type=str,
        help="Specify a file path to export a Dask profile report (E.g. dask-report.html). "
        "If this option is excluded from the command, no profile will be exported",
    )
    parser.add_argument(
        "--dashboard_port",
        default="8787",
        type=str,
        help="Specify the desired port of Dask's diagnostics-dashboard (Default `8787`). "
        "The dashboard will be hosted at http://<IP>:<PORT>/status",
    )

    #
    # Format
    #

    parser.add_argument('--criteo_mode', type=int, default=0)
    parser.add_argument('--parquet_format', type=int, default=1)
    parser.add_argument('--dataset_type', type=str, default='train')

    args = parser.parse_args(argv)
    args.n_workers = len(args.devices.split(","))
    return args
if __name__ == '__main__':
    # Script entry point: parse the command line, then run the preprocessing.
    process_NVT(parse_args())

Overwriting /wdl_train/preprocess.py


In [2]:
import pandas as pd

In [4]:
# Run the preprocessing script on /wdl_train with frequency limit 6, the crossed
# features C1_C2 and C3_C4, an RMM pool fraction of 0.5 and 2 output I/O threads.
!python3 /wdl_train/preprocess.py --data_path /wdl_train/ --out_path /wdl_train/ --freq_limit 6 --feature_cross_list C1_C2,C3_C4 --device_pool_frac 0.5 --num_io_threads 2

2022-11-11 08:14:51,395 NVTabular processing
Training output data: /wdl_train/train
Validation output data: /wdl_train/val
Training dataset: /wdl_train/train/train.txt
2022-11-11 08:14:53,489 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2022-11-11 08:14:53,506 Unable to start CUDA Context
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/pynvml/nvml.py", line 782, in _nvmlGetFunctionPointer
    _nvmlGetFunctionPointer_cache[name] = getattr(nvmlLib, name)
  File "/usr/lib/python3.8/ctypes/__init__.py", line 386, in __getattr__
    func = self.__getitem__(name)
  File "/usr/lib/python3.8/ctypes/__init__.py", line 391, in __getitem__
    func = self._FuncPtr((name_or_ordinal, self))
AttributeError: /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1: undefined symbol: nvmlDeviceGetComputeRunningProcesses_v2

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib

## 2.4 Check the preprocessed training data

In [11]:
# List the files written to the training output folder.
!ls -ll /wdl_train/train

total 14581444
-rw-r--r-- 1 root root          34 Nov 11 07:47 _file_list.txt
-rw-r--r-- 1 root root      450252 Nov 11 07:47 _metadata
-rw-r--r-- 1 root root        1510 Nov 11 07:47 _metadata.json
lrwxrwxrwx 1 root root           8 Nov 11 07:36 day_0.gz -> day_0.gz
-rw-r--r-- 1 root root  3381186419 Nov 11 07:47 part_0.parquet
-rw-r--r-- 1 root root       27296 Nov 11 07:47 schema.pbtxt
drwxr-xr-x 2 root root        4096 Nov 11 07:46 temp-parquet-after-conversion
-rw-r--r-- 1 root root 11549710546 Nov 11 07:45 train.txt


In [12]:
import pandas as pd
# Load the preprocessed training partition and display its first two rows.
df = pd.read_parquet("/wdl_train/train/part_0.parquet")
df.head(2)

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,-0.032831,0.118178,-0.594327,0.275234,-0.100776,-0.206385,-0.064249,3.35703,-0.760031,-0.470383,...,1,1,2,1,1,1,1,335,1,2
1,-0.050565,-0.527089,-0.379705,-0.115539,0.581131,-0.206385,-0.064249,-0.279201,-0.615432,-0.470383,...,2,1,4,13472,14005,14002,1822,92,6,5


# 3. WDL Model Training

In [17]:
%%writefile './model.py'
import hugectr
from mpi4py import MPI
# Solver: training and evaluation both use batches of 2720 samples.
# NOTE(review): vvgpu = [[2]] pins the run to GPU index 2 -- change it to a
# device id that exists on your machine.
solver = hugectr.CreateSolver(max_eval_batches = 4000,
                              batchsize_eval = 2720,
                              batchsize = 2720,
                              lr = 0.001,
                              vvgpu = [[2]],
                              repeat_dataset = True,
                              i64_input_key = True)

# Parquet reader over the file lists written by the preprocessing step.
# slot_size_array has 28 entries; presumably the first two are the crossed
# (wide) slots and the remaining 26 are C1..C26 -- verify against the
# embedding sizes printed by preprocess.py.
reader = hugectr.DataReaderParams(data_reader_type = hugectr.DataReaderType_t.Parquet,
                                  source = ["/wdl_train/train/_file_list.txt"],
                                  eval_source = "/wdl_train/val/_file_list.txt",
                                  check_type = hugectr.Check_t.Non,
                                  slot_size_array = [278018, 415262,249058, 19561, 14212, 6890, 18592, 4, 6356, 1254, 52, 226170, 80508, 72308, 11, 2169, 7597, 61, 4, 923, 15, 249619, 168974, 243480, 68212, 9169, 75, 34])
optimizer = hugectr.CreateOptimizer(optimizer_type = hugectr.Optimizer_t.Adam,
                                    update_type = hugectr.Update_t.Global,
                                    beta1 = 0.9,
                                    beta2 = 0.999,
                                    epsilon = 0.0000001)
model = hugectr.Model(solver, reader, optimizer)

# Input layer: 1 label, 13 dense features and two sparse inputs --
# "wide_data" with 2 slots and "deep_data" with 26 slots.
model.add(hugectr.Input(label_dim = 1, label_name = "label",
                        dense_dim = 13, dense_name = "dense",
                        data_reader_sparse_param_array = 
                        [hugectr.DataReaderSparseParam("wide_data", 1, True, 2),
                        hugectr.DataReaderSparseParam("deep_data", 2, False, 26)]))

# Wide embedding: vector size 1, fed by "wide_data".
model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, 
                            workspace_size_per_gpu_in_mb = 80,
                            embedding_vec_size = 1,
                            combiner = "sum",
                            sparse_embedding_name = "sparse_embedding2",
                            bottom_name = "wide_data",
                            optimizer = optimizer))
# Deep embedding: vector size 16, fed by "deep_data".
model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash, 
                            workspace_size_per_gpu_in_mb = 1350,
                            embedding_vec_size = 16,
                            combiner = "sum",
                            sparse_embedding_name = "sparse_embedding1",
                            bottom_name = "deep_data",
                            optimizer = optimizer))

# Flatten the deep embedding; 416 presumably equals 26 slots x 16 values each.
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape,
                            bottom_names = ["sparse_embedding1"],
                            top_names = ["reshape1"],
                            leading_dim=416))
# Flatten the wide embedding (2 slots x 1 value) and sum it into "wide_redn".
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Reshape,
                            bottom_names = ["sparse_embedding2"],
                            top_names = ["reshape2"],
                            leading_dim=2))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReduceSum,
                            bottom_names = ["reshape2"],
                            top_names = ["wide_redn"],
                            axis = 1))
# Deep branch: concatenate the flattened embeddings with the dense features,
# then two 1024-unit InnerProduct + ReLU + Dropout(0.5) stages and a 1-unit output.
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Concat,
                            bottom_names = ["reshape1", "dense"],
                            top_names = ["concat1"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["concat1"],
                            top_names = ["fc1"],
                            num_output=1024))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                            bottom_names = ["fc1"],
                            top_names = ["relu1"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout,
                            bottom_names = ["relu1"],
                            top_names = ["dropout1"],
                            dropout_rate=0.5))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["dropout1"],
                            top_names = ["fc2"],
                            num_output=1024))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                            bottom_names = ["fc2"],
                            top_names = ["relu2"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Dropout,
                            bottom_names = ["relu2"],
                            top_names = ["dropout2"],
                            dropout_rate=0.5))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                            bottom_names = ["dropout2"],
                            top_names = ["fc3"],
                            num_output=1))
# Add the wide and deep outputs and apply the binary cross-entropy loss against the label.
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Add,
                            bottom_names = ["fc3", "wide_redn"],
                            top_names = ["add1"]))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.BinaryCrossEntropyLoss,
                            bottom_names = ["add1", "label"],
                            top_names = ["loss"]))
model.compile()
model.summary()
# snapshot = 20000 with prefix "wdl" gives the *_20000.model files that the
# inference command later in this notebook loads.
model.fit(max_iter = 21000, display = 1000, eval_interval = 4000, snapshot = 20000, snapshot_prefix = "wdl")
# Export the network description read by the inference session.
model.graph_to_json(graph_config_file = "wdl.json")

Overwriting ./model.py


In [18]:
# Train the model; snapshots use the "wdl" prefix and the graph is saved as wdl.json.
!python ./model.py

HugeCTR Version: 4.0
[HCTR][07:50:47.782][INFO][RK0][main]: Global seed is 3928542589
[HCTR][07:50:47.785][INFO][RK0][main]: Device to NUMA mapping:
  GPU 2 ->  node 0
[HCTR][07:50:49.887][INFO][RK0][main]: Start all2all warmup
[HCTR][07:50:49.887][INFO][RK0][main]: End all2all warmup
[HCTR][07:50:49.888][INFO][RK0][main]: Using All-reduce algorithm: NCCL
[HCTR][07:50:49.888][INFO][RK0][main]: Device 2: Tesla V100-SXM2-32GB
[HCTR][07:50:49.889][INFO][RK0][main]: num of DataReader workers for train: 1
[HCTR][07:50:49.889][INFO][RK0][main]: num of DataReader workers for eval: 1
[HCTR][07:50:49.894][INFO][RK0][main]: Vocabulary size: 2138588
[HCTR][07:50:49.895][INFO][RK0][main]: max_vocabulary_size_per_gpu_=6990506
[HCTR][07:50:49.910][INFO][RK0][main]: max_vocabulary_size_per_gpu_=7372800
[HCTR][07:50:49.916][INFO][RK0][main]: Graph analysis to resolve tensor dependency
[HCTR][07:50:58.374][INFO][RK0][main]: gpu0 start to init embedding
[HCTR][07:50:58.374][INFO][RK0][main]: gpu0 init e

# 4. Inference Validation

In [19]:
# List the files written to the validation output folder.
!ls -l /wdl_train/val

total 639344
-rw-r--r-- 1 root root        32 Nov 11 07:47 _file_list.txt
-rw-r--r-- 1 root root     21896 Nov 11 07:47 _metadata
-rw-r--r-- 1 root root      1509 Nov 11 07:47 _metadata.json
-rw-r--r-- 1 root root 144853353 Nov 11 07:47 part_0.parquet
-rw-r--r-- 1 root root     27296 Nov 11 07:47 schema.pbtxt
drwxr-xr-x 2 root root      4096 Nov 11 07:46 temp-parquet-after-conversion
-rw-r--r-- 1 root root 509766965 Nov 11 07:45 test.txt


In [20]:
import pandas as pd
# Load the preprocessed validation partition.
df = pd.read_parquet("/wdl_train/val/part_0.parquet")

In [21]:
# Preview the first validation rows.
df.head()

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,-0.045245,-0.501278,-0.594327,-0.157301,-0.162767,-0.206385,-0.064249,-0.253116,-0.760031,-0.470383,...,1,2,1,4,5,4,2,658,1,4
1,-0.048792,-0.230945,-0.272394,-0.139403,0.271173,-0.206385,0.743047,-0.279201,0.107562,-0.470383,...,3,2,1,127,193,145,2635,518,2,10
2,-0.061206,-0.176607,2.517687,-0.157301,-0.224758,-0.206385,-0.064249,-0.28181,2.131947,-0.470383,...,0,1,1,0,0,0,0,18,1,1
3,-0.020417,-0.548824,-0.379705,-0.154318,-0.224758,0.205918,-0.064249,-0.28181,-0.615432,1.386036,...,2,1,1,0,6070,7978,3238,342,1,14
4,0.000864,0.566469,-0.165084,-0.157301,-0.224758,-0.206385,-0.064249,-0.28181,-0.543133,-0.470383,...,0,1,9,2,2,2,0,443,2,1


In [22]:
# Save the first 10 validation rows as the CSV input for the inference script.
df.head(10).to_csv('/wdl_train/infer_test.csv', sep=',', index=False,header=True)

In [34]:
%%writefile wdl2predict.py
from hugectr.inference import InferenceParams, CreateInferenceSession

import sys
import pandas as pd
import numpy as np

# Slot sizes in the same order as the training reader's slot_size_array in
# model.py and as the key columns used below: the two crossed (wide) slots
# first, then C1..C26 (deep). The cumulative key offsets depend on this order;
# the previous list was deep-first and its first cross size did not match the
# value used for training.
slot_size_array = [278018, 415262, 249058, 19561, 14212, 6890, 18592, 4, 6356, 1254, 52, 226170, 80508, 72308, 11, 2169, 7597, 61, 4, 923, 15, 249619, 168974, 243480, 68212, 9169, 75, 34]

def wdl_inference(model_name, network_file, dense_file, embedding_file_list, data_file,enable_cache):
    """Run HugeCTR inference for the two-table WDL model on the rows of a CSV file.

    Args:
        model_name: model name passed to ``InferenceParams``.
        network_file: path of the network JSON handed to ``CreateInferenceSession``.
        dense_file: path of the dense model file.
        embedding_file_list: list of sparse model file paths.
        data_file: comma-separated CSV with columns I1..I13, C1_C2, C3_C4 and C1..C26.
        enable_cache: forwarded as ``use_gpu_embedding_cache``.

    Prints the prediction result; returns None.
    """
    wide_columns = ["C1_C2", "C3_C4"]
    deep_columns = ["C" + str(x) for x in range(1, 27)]
    CATEGORICAL_COLUMNS = wide_columns + deep_columns
    CONTINUOUS_COLUMNS = ["I" + str(x) for x in range(1, 14)]

    # Offset added to each column's keys: the sum of the sizes of all preceding
    # slots. shift[i] is applied to CATEGORICAL_COLUMNS[i] by position.
    shift = np.insert(np.cumsum(slot_size_array), 0, 0)[:-1]
    test_df = pd.read_csv(data_file, sep=',')
    config_file = network_file

    # One key per slot per sample, so the row offsets simply count upwards:
    # first for the wide table, then for the deep table. Derived from the
    # number of rows instead of being hard-coded for exactly 10 samples.
    # NOTE(review): max_batchsize below is 64 -- presumably the CSV must not
    # contain more rows than that; confirm against the HugeCTR inference docs.
    num_samples = len(test_df)
    row_ptrs = (list(range(0, num_samples * len(wide_columns) + 1))
                + list(range(0, num_samples * len(deep_columns) + 1)))
    dense_features = list(test_df[CONTINUOUS_COLUMNS].values.flatten())
    # Keep the int64 cast (its result used to be discarded).
    categorical_keys = test_df[CATEGORICAL_COLUMNS].astype(np.int64)
    # NOTE(review): keys are flattened sample by sample, as before, while
    # row_ptrs is laid out table by table -- confirm this matches the layout
    # that InferenceSession.predict expects for multiple embedding tables.
    embedding_columns = list((categorical_keys + shift).values.flatten())

    # create parameter server, embedding cache and inference session
    inference_params = InferenceParams(model_name = model_name,
                                max_batchsize = 64,
                                hit_rate_threshold = 0.9,
                                dense_model_file = dense_file,
                                sparse_model_files = embedding_file_list,
                                device_id = 0,
                                use_gpu_embedding_cache = enable_cache,
                                cache_size_percentage = 0.9,
                                i64_input_key = True,
                                use_mixed_precision = False
                                )
    inference_session = CreateInferenceSession(config_file, inference_params)
    output = inference_session.predict(dense_features, embedding_columns, row_ptrs)
    print("WDL multi-embedding table inference result is {}".format(output))
    
if __name__ == "__main__":
    # Expected arguments:
    #   <model_name> <network_json> <dense_model> <sparse_model[,sparse_model...]> <data_csv>
    if len(sys.argv) < 6:
        sys.exit("Usage: python wdl2predict.py <model_name> <network_json> <dense_model> "
                 "<sparse_model[,sparse_model...]> <data_csv>")
    model_name, network_file, dense_file, sparse_files_arg, data_file = sys.argv[1:6]
    embedding_file_list = str(sparse_files_arg).split(',')
    print(embedding_file_list)

    # The last argument enables the GPU embedding cache; pass False to disable it.
    wdl_inference(model_name, network_file, dense_file, embedding_file_list, data_file, True)


Overwriting wdl2predict.py


In [35]:
# Run inference on the 10-row CSV with the network JSON and the iteration-20000 snapshots.
!python wdl2predict.py "wdl" "wdl.json" "wdl_dense_20000.model" "wdl0_sparse_20000.model,wdl1_sparse_20000.model" "/wdl_train/infer_test.csv"

['wdl0_sparse_20000.model', 'wdl1_sparse_20000.model']
[HCTR][08:02:48.458][INFO][RK0][main]: default_emb_vec_value is not specified using default: 0
[HCTR][08:02:48.458][INFO][RK0][main]: default_emb_vec_value is not specified using default: 0
[HCTR][08:02:48.458][INFO][RK0][main]: Creating HashMap CPU database backend...
[HCTR][08:02:48.458][DEBUG][RK0][main]: Created blank database backend in local memory!
[HCTR][08:02:48.458][INFO][RK0][main]: Volatile DB: initial cache rate = 1
[HCTR][08:02:48.458][INFO][RK0][main]: Volatile DB: cache missed embeddings = 0
[HCTR][08:02:48.458][DEBUG][RK0][main]: Created raw model loader in local memory!
[HCTR][08:02:48.458][INFO][RK0][main]: Using Local file system backend.
[HCTR][08:02:49.421][INFO][RK0][main]: Table: hps_et.wdl.sparse_embedding2; cached 693280 / 693280 embeddings in volatile database (HashMapBackend); load: 693280 / 18446744073709551615 (0.00%).
[HCTR][08:02:49.421][INFO][RK0][main]: Using Local file system backend.
[HCTR][08:02