In [None]:
# Copyright 2021 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# 1. Overview

In this notebook, we provide a tutorial on how to train a standard DLRM model using the HugeCTR high-level Python API. We use the original Criteo dataset as the training data.

1. [Overview](#1)
2. [Dataset Preprocessing](#2)
3. [DLRM Model Training](#3)
4. [Save the Model Files & Inference Validation](#4)

# 2. Dataset Preprocessing
## 2.1 Generate training and validation data folders

In [1]:
# define some data folder to store the original and preprocessed data
# Standard Libraries
import os
from time import time
import re
import shutil
import glob
import warnings
# Root working directory; the raw splits live in its train/ and val/ children.
BASE_DIR = "/dlrm_train"
train_path = os.path.join(BASE_DIR, "train")
val_path = os.path.join(BASE_DIR, "val")

# One worker per GPU listed in CUDA_VISIBLE_DEVICES (defaults to GPU 0 only).
CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
n_workers = len(CUDA_VISIBLE_DEVICES.split(","))
frac_size = 0.15
allow_multi_gpu = False
use_rmm_pool = False
max_day = None  # (Optional) -- Limit the dataset to day 0-max_day for debugging

# Start from empty train/ and val/ folders: anything left from a previous run
# is deleted before the directory is recreated.
for _data_dir in (train_path, val_path):
    if os.path.isdir(_data_dir):
        shutil.rmtree(_data_dir)
    os.makedirs(_data_dir)

## 2.2 Download the Original Criteo Dataset

In [1]:
!apt-get install wget

Reading package lists... Done
Building dependency tree       
Reading state information... Done
wget is already the newest version (1.20.3-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.


In [None]:
# Download the compressed day_0 file into the training folder ($train_path is
# interpolated from the Python variable defined in the setup cell).
!wget -P $train_path https://storage.googleapis.com/criteo-cail-datasets/day_0.gz

In [2]:
# Decompress the download into ./day_0 (current working directory), then split it:
# the first 45,840,617 lines become the training set and the last 2,000,000
# lines become the validation set.
!gzip -d -c $train_path/day_0.gz > day_0
!head -n 45840617 day_0 > $train_path/train.txt 
!tail -n 2000000 day_0 > $val_path/test.txt 

## 2.3 Preprocessing by NVTabular

In [3]:
%%writefile /dlrm_train/preprocess.py
import os
import sys
import argparse
import glob
import time
import re
import warnings
from cudf.io.parquet import ParquetWriter
import numpy as np
import pandas as pd
import concurrent.futures as cf
from concurrent.futures import as_completed
import shutil

import dask_cudf
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
from dask.utils import parse_bytes
from dask.delayed import delayed

import cudf
import numpy as np
import cupy as cp
import rmm
import nvtabular as nvt
from nvtabular.io import Shuffle
from nvtabular.ops import Categorify, Clip, FillMissing, HashBucket, LambdaOp, LogOp, Normalize, Rename, get_embedding_sizes
from nvtabular.utils import _pynvml_mem_size, device_mem_size

#%load_ext memory_profiler

import logging
# Timestamp every log record and let the root logger pass all levels through;
# numba and asyncio are capped at WARNING to keep the output readable.
logging.basicConfig(format='%(asctime)s %(message)s')
logging.root.setLevel(logging.NOTSET)
for _noisy_logger in ('numba', 'asyncio'):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

# Dataset schema: one label, 13 continuous columns I1..I13 and 26 categorical
# columns C1..C26, in that order in the raw tab-separated files.
LABEL_COLUMNS = ['label']
CONTINUOUS_COLUMNS = [f"I{idx}" for idx in range(1, 14)]
CATEGORICAL_COLUMNS = [f"C{idx}" for idx in range(1, 27)]
COLUMNS = [*LABEL_COLUMNS, *CONTINUOUS_COLUMNS, *CATEGORICAL_COLUMNS]
# /samples/criteo mode doesn't have dense features
criteo_COLUMN = [*LABEL_COLUMNS, *CATEGORICAL_COLUMNS]
# Filled by process_NVT from --feature_cross_list (new feature cross columns)
CROSS_COLUMNS = []


NUM_INTEGER_COLUMNS = 13
NUM_CATEGORICAL_COLUMNS = 26
NUM_TOTAL_COLUMNS = NUM_INTEGER_COLUMNS + NUM_CATEGORICAL_COLUMNS + 1


# Initialize RMM pool on ALL workers
def setup_rmm_pool(client, pool_size):
    """Run ``rmm.reinitialize`` through ``client.run`` with a pooled allocator.

    Args:
        client: Dask distributed client used to dispatch the call.
        pool_size: Initial RMM pool size, forwarded as ``initial_pool_size``.

    Returns:
        None.
    """
    rmm_options = {"pool_allocator": True, "initial_pool_size": pool_size}
    client.run(rmm.reinitialize, **rmm_options)

#compute the partition size with GB
def bytesto(bytes, to, bsize=1024):
    """Convert a byte count into a larger unit.

    Args:
        bytes: Number of bytes to convert.
        to: Target unit prefix, one of 'k', 'm', 'g', 't', 'p', 'e'
            (case-insensitive).
        bsize: Size of one unit step; 1024 for binary units, 1000 for decimal.

    Returns:
        ``bytes`` divided by ``bsize`` raised to the power of the unit.

    Raises:
        KeyError: If ``to`` is not one of the supported prefixes.
    """
    exponents = {'k': 1, 'm': 2, 'g': 3, 't': 4, 'p': 5, 'e': 6}
    return bytes / (bsize ** exponents[to.lower()])


#process the data with NVTabular
def process_NVT(args):
    """Convert the Criteo text splits to Parquet and preprocess them with NVTabular.

    Reads ``<data_path>/train/train.txt`` and ``<data_path>/val/test.txt`` as
    tab-separated files, stages them as Parquet under
    ``<out_path>/{train,val}/temp-parquet-after-conversion``, fits an NVTabular
    workflow on the training split and writes both transformed splits as
    Parquet into ``<out_path>/train`` and ``<out_path>/val``. The per-column
    embedding sizes and a run summary are printed to stdout.

    Args:
        args: Namespace produced by ``parse_args``. The fields that drive the
            processing are ``data_path``, ``out_path``, ``feature_cross_list``,
            ``protocol``, ``devices``, ``device_limit_frac``,
            ``device_pool_frac``, ``dashboard_port``, ``dataset_type``,
            ``parquet_format``, ``part_mem_frac`` and ``shuffle``.
            ``out_files_per_proc`` and ``num_io_threads`` are only echoed in
            the summary.

    Side effects:
        Appends to the module-level ``CROSS_COLUMNS`` and, for
        ``dataset_type == 'test'``, rebinds the module-level ``LABEL_COLUMNS``
        to an empty list.
    """
    # Declared up front so it precedes every use of LABEL_COLUMNS below.
    global LABEL_COLUMNS

    if args.feature_cross_list:
        feature_pairs = [pair.split("_") for pair in args.feature_cross_list.split(",")]
        for pair in feature_pairs:
            CROSS_COLUMNS.append(pair[0]+'_'+pair[1])

    logging.info('NVTabular processing')
    train_input = os.path.join(args.data_path, "train/train.txt")
    val_input = os.path.join(args.data_path, "val/test.txt")
    PREPROCESS_DIR_temp_train = os.path.join(args.out_path, 'train/temp-parquet-after-conversion')
    PREPROCESS_DIR_temp_val = os.path.join(args.out_path, 'val/temp-parquet-after-conversion')
    train_output = os.path.join(args.out_path, "train")
    val_output = os.path.join(args.out_path, "val")

    # Make sure we have a clean parquet space for cudf conversion. makedirs
    # also creates missing parents, so one pass replaces create-then-recreate.
    for one_path in (PREPROCESS_DIR_temp_train, PREPROCESS_DIR_temp_val):
        if os.path.exists(one_path):
            shutil.rmtree(one_path)
        os.makedirs(one_path)

    ## Get Dask Client

    # Deploy a Single-Machine Multi-GPU Cluster
    device_size = device_mem_size(kind="total")
    cluster_options = dict(
        protocol=args.protocol,
        CUDA_VISIBLE_DEVICES=args.devices,
        n_workers=len(args.devices.split(",")),
        device_memory_limit=int(device_size * args.device_limit_frac),
        dashboard_address=":" + args.dashboard_port,
    )
    if args.protocol == "ucx":
        # Keep a user-provided UCX_TLS, otherwise fall back to the default transports.
        os.environ.setdefault("UCX_TLS", "tcp,cuda_copy,cuda_ipc,sockcm")
        cluster_options["enable_nvlink"] = True
    cluster = LocalCUDACluster(**cluster_options)

    client = None
    try:
        # Create the distributed client
        client = Client(cluster)
        if args.device_pool_frac > 0.01:
            setup_rmm_pool(client, int(args.device_pool_frac*device_size))

        # calculate the total processing time
        runtime = time.time()

        # test dataset without the label feature
        if args.dataset_type == 'test':
            LABEL_COLUMNS = []

        ##-----------------------------------##
        # Dask rapids converts txt to parquet
        # Dask cudf dataframe = ddf

        ## train/valid txt to parquet
        train_valid_paths = [(train_input, PREPROCESS_DIR_temp_train), (val_input, PREPROCESS_DIR_temp_val)]

        for txt_path, temp_output in train_valid_paths:

            ddf = dask_cudf.read_csv(txt_path, sep='\t', names=LABEL_COLUMNS + CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS)

            ## Convert label col to FP32
            if args.parquet_format and args.dataset_type == 'train':
                ddf["label"] = ddf['label'].astype('float32')

            # Save it as parquet format for better memory usage
            ddf.to_parquet(temp_output, header=True)
            ##-----------------------------------##

        train_paths = glob.glob(os.path.join(PREPROCESS_DIR_temp_train, "*.parquet"))
        valid_paths = glob.glob(os.path.join(PREPROCESS_DIR_temp_val, "*.parquet"))

        num_buckets = 10000000
        # NOTE(review): args.freq_limit is parsed but not forwarded to Categorify.
        # Wiring it in would change the printed cardinalities, so it is left
        # as-is here -- confirm the intended behavior.
        categorify_op = Categorify(out_path="./", max_size=num_buckets)
        cat_features = CATEGORICAL_COLUMNS >> categorify_op
        logging.info('Fillmissing processing')
        logging.info('Nomalization processing')
        cont_features = CONTINUOUS_COLUMNS >> FillMissing() >> Clip(min_value=0) >> Normalize()
        features = cat_features + cont_features + LABEL_COLUMNS
        workflow = nvt.Workflow(features, client=client)

        # Both splits are always written with to_parquet below; partitions are
        # sized as a fraction of total device memory.
        part_size = int(args.part_mem_frac * device_size)
        train_ds_iterator = nvt.Dataset(train_paths, engine='parquet', part_size=part_size)
        valid_ds_iterator = nvt.Dataset(valid_paths, engine='parquet', part_size=part_size)

        # "NONE" (or any other value) leaves shuffling disabled.
        shuffle = None
        if args.shuffle == "PER_WORKER":
            shuffle = nvt.io.Shuffle.PER_WORKER
        elif args.shuffle == "PER_PARTITION":
            shuffle = nvt.io.Shuffle.PER_PARTITION

        # Output dtypes: int64 categorical keys, float32 dense features and labels.
        dict_dtypes = {}
        for col in CATEGORICAL_COLUMNS:
            dict_dtypes[col] = np.int64
        for col in CONTINUOUS_COLUMNS:
            dict_dtypes[col] = np.float32
        for col in LABEL_COLUMNS:
            dict_dtypes[col] = np.float32

        logging.info('Train Datasets Preprocessing.....')

        # Statistics are fitted on the training split only and then applied to both.
        workflow.fit(train_ds_iterator)

        workflow.transform(train_ds_iterator).to_parquet(output_path=train_output,
                                                         shuffle=shuffle,
                                                         dtypes=dict_dtypes,
                                                         labels=LABEL_COLUMNS,
                                                         conts=CONTINUOUS_COLUMNS,
                                                         cats=CATEGORICAL_COLUMNS)

        logging.info('Valid Datasets Preprocessing.....')

        workflow.transform(valid_ds_iterator).to_parquet(output_path=val_output,
                                                         dtypes=dict_dtypes,
                                                         labels=LABEL_COLUMNS,
                                                         conts=CONTINUOUS_COLUMNS,
                                                         cats=CATEGORICAL_COLUMNS)
        #--------------------##
        # Output slot_size for each categorical feature
        embeddings = [c[0] for c in categorify_op.get_embedding_sizes(CATEGORICAL_COLUMNS).values()]
        embeddings = np.clip(a=embeddings, a_min=None, a_max=num_buckets).tolist()
        print(embeddings)
        ##--------------------##
    finally:
        ## Shutdown clusters: release the client and the cluster on both
        ## success and failure.
        if client is not None:
            client.close()
        cluster.close()

    logging.info('NVTabular processing done')

    runtime = time.time() - runtime

    print("\nDask-NVTabular Criteo Preprocessing")
    print("--------------------------------------")
    print(f"data_path          | {args.data_path}")
    print(f"output_path        | {args.out_path}")
    print(f"partition size     | {'%.2f GB'%bytesto(int(args.part_mem_frac * device_size),'g')}")
    print(f"protocol           | {args.protocol}")
    print(f"device(s)          | {args.devices}")
    print(f"rmm-pool-frac      | {(args.device_pool_frac)}")
    print(f"out-files-per-proc | {args.out_files_per_proc}")
    print(f"num_io_threads     | {args.num_io_threads}")
    print(f"shuffle            | {args.shuffle}")
    print("======================================")
    print(f"Runtime[s]         | {runtime}")
    print("======================================\n")


def parse_args(argv=None):
    """Parse the command-line options of the preprocessing script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the behavior callers
            relied on before this parameter existed.

    Returns:
        argparse.Namespace with every option below plus ``n_workers``, the
        number of comma-separated entries in ``--devices``.

    Raises:
        SystemExit: Raised by argparse on invalid input, including a missing
            ``--data_path`` or ``--out_path``.
    """
    parser = argparse.ArgumentParser(description=("Multi-GPU Criteo Preprocessing"))

    #
    # System Options
    #

    # Both paths are joined with sub-paths later on, so a missing value is
    # rejected here instead of failing further down.
    parser.add_argument("--data_path", type=str, required=True, help="Input dataset path (Required)")
    parser.add_argument("--out_path", type=str, required=True, help="Directory path to write output (Required)")
    parser.add_argument(
        "-d",
        "--devices",
        default=os.environ.get("CUDA_VISIBLE_DEVICES", "0"),
        type=str,
        help='Comma-separated list of visible devices (e.g. "0,1,2,3"). '
    )
    parser.add_argument(
        "-p",
        "--protocol",
        choices=["tcp", "ucx"],
        default="tcp",
        type=str,
        help="Communication protocol to use (Default 'tcp')",
    )
    parser.add_argument(
        "--device_limit_frac",
        default=0.5,
        type=float,
        help="Worker device-memory limit as a fraction of GPU capacity (Default 0.5). "
    )
    parser.add_argument(
        "--device_pool_frac",
        default=0.9,
        type=float,
        help="RMM pool size for each worker  as a fraction of GPU capacity (Default 0.9). "
        "The RMM pool frac is the same for all GPUs, make sure each one has enough memory size",
    )
    parser.add_argument(
        "--num_io_threads",
        default=0,
        type=int,
        help="Number of threads to use when writing output data (Default 0). "
        "If 0 is specified, multi-threading will not be used for IO.",
    )

    #
    # Data-Decomposition Parameters
    #

    parser.add_argument(
        "--part_mem_frac",
        default=0.125,
        type=float,
        help="Maximum size desired for dataset partitions as a fraction "
        "of GPU capacity (Default 0.125)",
    )
    parser.add_argument(
        "--out_files_per_proc",
        default=8,
        type=int,
        help="Number of output files to write on each worker (Default 8)",
    )

    #
    # Preprocessing Options
    #

    parser.add_argument(
        "-f",
        "--freq_limit",
        default=0,
        type=int,
        help="Frequency limit for categorical encoding (Default 0)",
    )
    parser.add_argument(
        "-s",
        "--shuffle",
        choices=["PER_WORKER", "PER_PARTITION", "NONE"],
        default="PER_PARTITION",
        help="Shuffle algorithm to use when writing output data to disk (Default PER_PARTITION)",
    )

    parser.add_argument(
        "--feature_cross_list", default=None, type=str, help="List of feature crossing cols (e.g. C1_C2, C3_C4)"
    )

    #
    # Diagnostics Options
    #

    parser.add_argument(
        "--profile",
        metavar="PATH",
        default=None,
        type=str,
        help="Specify a file path to export a Dask profile report (E.g. dask-report.html). "
        "If this option is excluded from the command, no profile will be exported",
    )
    parser.add_argument(
        "--dashboard_port",
        default="8787",
        type=str,
        help="Specify the desired port of Dask's diagnostics-dashboard (Default `8787`). "
        "The dashboard will be hosted at http://<IP>:<PORT>/status",
    )

    #
    # Format
    #

    parser.add_argument('--parquet_format', type=int, default=1)
    parser.add_argument('--dataset_type', type=str, default='train')

    args = parser.parse_args(argv)
    # One Dask worker per listed device.
    args.n_workers = len(args.devices.split(","))
    return args
if __name__ == '__main__':
    # Script entry point: parse the command line and run the preprocessing.
    process_NVT(parse_args())

Writing /dlrm_train/preprocess.py


In [4]:
!python3 ./preprocess.py --data_path /dlrm_train --out_path /dlrm_train --freq_limit 6 --device_limit_frac 0.5 --device_pool_frac 0.5 --out_files_per_proc 1  --devices "0" --num_io_threads 2

2021-11-29 06:41:04,839 NVTabular processing
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2021-11-29 06:41:29,382 Fillmissing processing
2021-11-29 06:41:29,382 Nomalization processing
2021-11-29 06:41:29,573 Train Datasets Preprocessing.....
2021-11-29 06:42:26,554 Valid Datasets Preprocessing.....
[4976199, 25419, 14705, 7112, 19283, 4, 6391, 1282, 60, 3289052, 282487, 138210, 11, 2203, 8901, 67, 4, 948, 15, 5577159, 1385790, 4348882, 178673, 10023, 88, 34]
2021-11-29 06:42:31,046 NVTabular processing done

Dask-NVTabular Criteo Preprocessing
--------------------------------------
data_path          | /dlrm_train
output_path        | /dlrm_train
partition size     | 1.97 GB
protocol           | tcp
device(s)          | 0
rmm-pool-frac      | 0.5
out-files-per-proc | 1
num_io_threads     | 2
shuffle            | PER_PARTITION
Runtime[s]         | 83.59456992149353



# 3. DLRM Model Training

In [6]:
%%writefile './model.py'
import hugectr
from mpi4py import MPI

# 1. Create Solver, DataReaderParams and Optimizer
solver = hugectr.CreateSolver(max_eval_batches = 300,
                              batchsize_eval = 16384,
                              batchsize = 16384,
                              lr = 0.001,
                              vvgpu = [[0,1,2,3]],
                              repeat_dataset = True,
                              i64_input_key = True)

# NOTE(review): the reader and the embedding below list the same 26
# cardinalities in different orders -- confirm which order matches the
# column order of the parquet files.
reader_slot_sizes = [4976199, 25419, 14705, 7112, 19283, 4, 6391, 1282, 60, 3289052, 282487, 138210, 11, 2203, 8901, 67, 4, 948, 15, 5577159, 1385790, 4348882, 178673, 10023, 88, 34]
embedding_slot_sizes = [4976199, 3289052, 282487, 138210, 11, 2203, 8901, 67, 4, 948, 15, 25419, 5577159, 1385790, 4348882, 178673, 10023, 88, 34, 14705, 7112, 19283, 4, 6391, 1282, 60]

reader = hugectr.DataReaderParams(data_reader_type = hugectr.DataReaderType_t.Parquet,
                                  source = ["./train/_file_list.txt"],
                                  eval_source = "./val/_file_list.txt",
                                  slot_size_array = reader_slot_sizes,
                                  check_type = hugectr.Check_t.Non)
optimizer = hugectr.CreateOptimizer(optimizer_type = hugectr.Optimizer_t.SGD,
                                    update_type = hugectr.Update_t.Local,
                                    atomic_update = True)

# 2. Initialize the Model instance
model = hugectr.Model(solver, reader, optimizer)


def _add_fc_relu(net, bottom_name, layer_index, width):
    """Append InnerProduct ``fc<layer_index>`` and ReLU ``relu<layer_index>``.

    Returns the name of the ReLU output so the next layer can consume it.
    """
    fc_name = "fc%d" % layer_index
    relu_name = "relu%d" % layer_index
    net.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                               bottom_names = [bottom_name],
                               top_names = [fc_name],
                               num_output = width))
    net.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.ReLU,
                               bottom_names = [fc_name],
                               top_names = [relu_name]))
    return relu_name


# 3. Construct the Model graph
model.add(hugectr.Input(label_dim = 1, label_name = "label",
                        dense_dim = 13, dense_name = "dense",
                        data_reader_sparse_param_array =
                        [hugectr.DataReaderSparseParam("data1", 2, False, 26)]))

model.add(hugectr.SparseEmbedding(embedding_type = hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
                                  workspace_size_per_gpu_in_mb = 10000,
                                  embedding_vec_size = 128,
                                  combiner = "sum",
                                  sparse_embedding_name = "sparse_embedding1",
                                  bottom_name = "data1",
                                  slot_size_array = embedding_slot_sizes,
                                  optimizer = optimizer))

# Bottom MLP on the dense features: fc1/relu1 .. fc3/relu3 (512 -> 256 -> 128)
current_top = "dense"
for layer_index, width in enumerate((512, 256, 128), start=1):
    current_top = _add_fc_relu(model, current_top, layer_index, width)

# Interaction between the bottom MLP output and the embeddings
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.Interaction,
                             bottom_names = [current_top, "sparse_embedding1"],
                             top_names = ["interaction1"]))

# Top MLP: fc4/relu4 .. fc7/relu7 (1024 -> 1024 -> 512 -> 256)
current_top = "interaction1"
for layer_index, width in enumerate((1024, 1024, 512, 256), start=4):
    current_top = _add_fc_relu(model, current_top, layer_index, width)

# Single-logit output layer (no activation) and the loss
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.InnerProduct,
                             bottom_names = [current_top],
                             top_names = ["fc8"],
                             num_output = 1))
model.add(hugectr.DenseLayer(layer_type = hugectr.Layer_t.BinaryCrossEntropyLoss,
                             bottom_names = ["fc8", "label"],
                             top_names = ["loss"]))

# 4. Dump the Model graph to JSON
model.graph_to_json(graph_config_file = "dlrm.json")

# 5. Compile & Fit
model.compile()
model.summary()
model.fit(max_iter = 21000, display = 1000, eval_interval = 4000, snapshot = 20000, snapshot_prefix = "dlrm")


Overwriting ./model.py


In [7]:
# Train the model. model.py reads ./train and ./val through relative paths, so
# this must run from the directory that holds the preprocessed data.
!python model.py

--------------------------------------------------------------------------
By default, for Open MPI 4.0 and later, infiniband ports on a device
are not used by default.  The intent is to use UCX for these devices.
You can override this policy by setting the btl_openib_allow_ib MCA parameter
to true.

  Local host:              prm-dgx-09
  Local adapter:           mlx5_0
  Local port:              1

--------------------------------------------------------------------------
--------------------------------------------------------------------------

  Local host:   prm-dgx-09
  Local device: mlx5_1
--------------------------------------------------------------------------
HugeCTR Version: 3.2.0
[29d07h02m29s][HUGECTR][INFO]: Global seed is 2643346044
[29d07h02m30s][HUGECTR][INFO]: Device to NUMA mapping:
  GPU 0 ->  node 0
  GPU 1 ->  node 0
  GPU 2 ->  node 0
  GPU 3 ->  node 0

[prm-dgx-09:01029] 1 more process has sent help message help-mpi-btl-openib.txt / ib port not selected
[prm-

[29d70h50m58s][HUGECTR][INFO]: Iter: 14000 Time(1000 iters): 12.451465s Loss: 0.147939 lr:0.001000
[29d70h60m10s][HUGECTR][INFO]: Iter: 15000 Time(1000 iters): 12.422438s Loss: 0.145342 lr:0.001000
[29d70h60m23s][HUGECTR][INFO]: Iter: 16000 Time(1000 iters): 12.411487s Loss: 0.136565 lr:0.001000
[29d70h60m24s][HUGECTR][INFO]: Evaluation, AUC: 0.625069
[29d70h60m24s][HUGECTR][INFO]: Eval Time for 300 iters: 1.658554s
[29d70h60m37s][HUGECTR][INFO]: Iter: 17000 Time(1000 iters): 14.107148s Loss: 0.138869 lr:0.001000
[29d70h60m49s][HUGECTR][INFO]: Iter: 18000 Time(1000 iters): 12.450642s Loss: 0.150656 lr:0.001000
[29d70h70m20s][HUGECTR][INFO]: Iter: 19000 Time(1000 iters): 12.438676s Loss: 0.139049 lr:0.001000
[29d70h70m14s][HUGECTR][INFO]: Iter: 20000 Time(1000 iters): 12.395778s Loss: 0.147212 lr:0.001000
[29d70h70m16s][HUGECTR][INFO]: Evaluation, AUC: 0.632774
[29d70h70m16s][HUGECTR][INFO]: Eval Time for 300 iters: 1.698418s
[29d70h70m21s][HUGECTR][INFO]: Rank0: Write hash table to fil

# 4. Save the Model Files & Inference Validation

In [8]:
# List the snapshot written at iteration 20000 (snapshot_prefix "dlrm") in the
# current working directory.
!ls -l *20000.model

-rw-r--r-- 1 root root       0 Nov 29 07:07 dlrm0_opt_sparse_20000.model
-rw-r--r-- 1 root root 9479684 Nov 29 07:07 dlrm_dense_20000.model
-rw-r--r-- 1 root root       0 Nov 29 07:07 dlrm_opt_dense_20000.model

dlrm0_sparse_20000.model:
total 10294912
-rw-r--r-- 1 root root 10379768832 Nov 29 07:07 emb_vector
-rw-r--r-- 1 root root   162183888 Nov 29 07:07 key


In [9]:
# Inspect the preprocessed validation output written by preprocess.py.
!ls -l /dlrm_train/val

total 702242
-rw-r--r-- 1 root root        33 Nov 29 06:42 _file_list.txt
-rw-r--r-- 1 root root  81092112 Nov 29 06:42 _hugectr.keyset
-rw-r--r-- 1 root root     21528 Nov 29 06:42 _metadata
-rw-r--r-- 1 root root      1437 Nov 29 06:42 _metadata.json
-rw-r--r-- 1 root root 128131055 Nov 29 06:42 part_0.parquet
-rw-r--r-- 1 root root     19945 Nov 29 06:42 schema.pbtxt
drwxr-xr-x 2 root root      4096 Nov 29 06:41 temp-parquet-after-conversion
-rw-r--r-- 1 root root 509766965 Nov 29 06:39 test.txt


In [11]:
import pandas as pd
# Load the preprocessed validation partition to build a small inference sample.
df = pd.read_parquet("/dlrm_train/val/part_0.parquet")

In [12]:
df.head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,I5,I6,I7,I8,I9,I10,I11,I12,I13,label
0,669260,28,473,18,2,1,157,4,3,19900,...,-0.209261,-0.206385,-0.064249,-0.28181,0.035263,-0.470383,-0.261958,-0.17375,-0.262248,0.0
1,1,1,31,2,20,1,51,3,1,1,...,-0.209261,-0.206385,-0.064249,-0.258333,-0.760031,-0.470383,-0.261958,-0.19454,-0.262248,0.0
2,2,25,148,52,2,1,101,6,5,2,...,0.240178,0.205918,-0.064249,-0.276593,2.204247,1.386036,0.690729,-0.277389,0.046789,0.0
3,177,61,377,2,1,1,402,17,1,209,...,0.023207,0.068484,-0.064249,-0.276593,1.987348,1.386036,0.055604,-0.289136,-0.306396,0.0
4,2,3,30,4,2,1,8,1,2,2,...,1.526501,-0.206385,0.339399,-0.28181,0.39676,-0.470383,2.596102,-0.25714,0.399973,0.0


In [14]:
# Keep the first ten validation rows as the inference test set; the comma
# separator and header row are pandas defaults, the index column is dropped.
infer_sample = df.head(10)
infer_sample.to_csv('/dlrm_train/infer_test.csv', index=False)

In [35]:
%%writefile /dlrm_train/dlrm2predict.py
from hugectr.inference import InferenceParams, CreateInferenceSession
import hugectr
import pandas as pd
import numpy as np
import sys
from mpi4py import MPI

config_file = "/dlrm_train/dlrm.json"
CATEGORICAL_COLUMNS=["C" + str(x) for x in range(1, 27)]
CONTINUOUS_COLUMNS=["I" + str(x) for x in range(1, 14)]
LABEL_COLUMNS = ['label']
# NOTE(review): these cardinalities follow the order used by the embedding
# layer in model.py, not C1..C26 -- confirm that the positional shift below
# lines up with CATEGORICAL_COLUMNS.
emb_size = [4976199, 3289052, 282487, 138210, 11, 2203, 8901, 67, 4, 948, 15, 25419, 5577159, 1385790, 4348882, 178673, 10023, 88, 34, 14705, 7112, 19283, 4, 6391, 1282, 60]
# Per-slot key offset: the cumulative size of all preceding slots.
shift = np.insert(np.cumsum(emb_size), 0, 0)[:-1]
test_df=pd.read_csv("/dlrm_train/infer_test.csv",sep=',')
# One key per slot per sample, so the row pointers are 0..rows*slots
# inclusive; derived from the data instead of hard-coding 261 for ten rows.
row_ptrs = list(range(0, len(test_df) * len(CATEGORICAL_COLUMNS) + 1))
dense_features = list(test_df[CONTINUOUS_COLUMNS].values.flatten())
# astype returns a new frame; assign it back so the int64 cast takes effect.
test_df[CATEGORICAL_COLUMNS] = test_df[CATEGORICAL_COLUMNS].astype(np.int64)
embedding_columns = list((test_df[CATEGORICAL_COLUMNS]+shift).values.flatten())


# create parameter server, embedding cache and inference session
inference_params = InferenceParams(model_name = "dlrm",
                                max_batchsize = 64,
                                hit_rate_threshold = 0.5,
                                dense_model_file = "/dlrm_train/dlrm_dense_20000.model",
                                sparse_model_files = ["/dlrm_train/dlrm0_sparse_20000.model"],
                                device_id = 0,
                                use_gpu_embedding_cache = True,
                                cache_size_percentage = 0.2,
                                i64_input_key = True,
                                use_mixed_precision = False)
inference_session = CreateInferenceSession(config_file, inference_params)
output = inference_session.predict(dense_features, embedding_columns, row_ptrs)
print(output)

Overwriting /dlrm_train/dlrm2predict.py


In [36]:
# Run the inference script on the ten-row sample and print the predictions.
!python /dlrm_train/dlrm2predict.py

--------------------------------------------------------------------------
By default, for Open MPI 4.0 and later, infiniband ports on a device
are not used by default.  The intent is to use UCX for these devices.
You can override this policy by setting the btl_openib_allow_ib MCA parameter
to true.

  Local host:              prm-dgx-09
  Local adapter:           mlx5_0
  Local port:              1

--------------------------------------------------------------------------
--------------------------------------------------------------------------

  Local host:   prm-dgx-09
  Local device: mlx5_2
--------------------------------------------------------------------------
[29d07h19m36s][HUGECTR][INFO]: default_emb_vec_value is not specified using default: 0.000000
[prm-dgx-09:01716] 2 more processes have sent help message help-mpi-btl-openib.txt / ib port not selected
[prm-dgx-09:01716] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages
[29d07h20m33s][HUG