# ETL with NVTabular

* Following the tutorial [here](https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/main/examples/getting-started-session-based/01-ETL-with-NVTabular.ipynb)

In [1]:
import os
import glob

import numpy as np
import pandas as pd

import nvtabular as nvt
from nvtabular.ops import *
from merlin.schema.tags import Tags

In [None]:
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
PROJECT_NUM = !gcloud projects list --filter="$PROJECT_ID" --format="value(PROJECT_NUMBER)"
PROJECT_NUM = PROJECT_NUM[0]
LOCATION = 'us-central1'
REGION = "us-central1"

# VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com'
VERTEX_SA = 'jt-vertex-sa@hybrid-vertex.iam.gserviceaccount.com'

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"PROJECT_NUM: {PROJECT_NUM}")
print(f"LOCATION: {LOCATION}")
print(f"REGION: {REGION}")
print(f"VERTEX_SA: {VERTEX_SA}")

In [2]:
# Show the notebook's working directory; the workspace paths configured below are relative to it.
!pwd

/home/jupyter/t4rec-nvidia-docs


In [31]:
# Root folder (relative to the working directory) for everything this notebook writes.
REPO_WORKSPACE = 'workspace'
DATA_DIR = 'data'

# Every artifact location hangs off INPUT_DATA_DIR.
INPUT_DATA_DIR = '/'.join((REPO_WORKSPACE, DATA_DIR))
TRANSFORMED_WORKFLOW = INPUT_DATA_DIR + '/processed_nvt'   # transformed parquet output
OUTPUT_WORKFLOW_DIR = INPUT_DATA_DIR + '/workflow_etl'     # saved workflow
OUTPUT_DIR = INPUT_DATA_DIR + '/sessions_by_day'           # per-day exports
TRAIN_PATHS = OUTPUT_DIR + '/1'                            # folder "1"; the parquet file name is appended later

# Echo the resolved locations, one "NAME: value" line each.
print(
    "\n".join(
        f"{label}: {location}"
        for label, location in (
            ("INPUT_DATA_DIR", INPUT_DATA_DIR),
            ("TRANSFORMED_WORKFLOW", TRANSFORMED_WORKFLOW),
            ("OUTPUT_WORKFLOW_DIR", OUTPUT_WORKFLOW_DIR),
            ("OUTPUT_DIR", OUTPUT_DIR),
            ("TRAIN_PATHS", TRAIN_PATHS),
        )
    )
)

INPUT_DATA_DIR: workspace/data
TRANSFORMED_WORKFLOW: workspace/data/processed_nvt
OUTPUT_WORKFLOW_DIR: workspace/data/workflow_etl
OUTPUT_DIR: workspace/data/sessions_by_day
TRAIN_PATHS: workspace/data/sessions_by_day/1


In [32]:
# Make the training subfolder
# ! rm -rf {REPO_WORKSPACE}
! mkdir {REPO_WORKSPACE}
! mkdir {INPUT_DATA_DIR}
! mkdir {TRANSFORMED_WORKFLOW}
! mkdir {OUTPUT_WORKFLOW_DIR}
! mkdir {OUTPUT_DIR}
! mkdir {TRAIN_PATHS}

In [3]:
# INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", "/workspace/data/")
# INPUT_DATA_DIR

'/workspace/data/'

In [26]:
# !ls

'[JT] 01-ETL-w-NVTabular.ipynb'   __pycache__   categories   data_utils.py


## Create synthetic data

In [33]:
# Number of synthetic interaction rows; override with the NUM_ROWS environment variable.
# Environment values arrive as strings while the default is an int, so normalise to int
# once here instead of leaving a value whose type depends on how it was supplied.
NUM_ROWS = int(os.environ.get("NUM_ROWS", 100000))

In [34]:
# Seed NumPy's global RNG so the synthetic dataset (and everything derived from it)
# is identical on every Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# NUM_ROWS may be a string when it comes from the environment; convert once.
n_rows = int(NUM_ROWS)

# Item ids drawn from a log-normal (long-tailed) distribution, truncated to int32
# and clipped to the range [1, 50000].
long_tailed_item_distribution = np.clip(
    np.random.lognormal(3., 1., n_rows).astype(np.int32), 1, 50000
)

# generate random item interaction features: session ids are uniform ints in [70000, 90000)
df = pd.DataFrame(np.random.randint(70000, 90000, n_rows), columns=['session_id'])
df['item_id'] = long_tailed_item_distribution

# generate category mapping for each item-id: 334 equal-width bins over item_id, labelled 1..334
df['category'] = pd.cut(df['item_id'], bins=334, labels=np.arange(1, 335)).astype(np.int32)
# continuous features, uniform in [0, 1)
df['age_days'] = np.random.uniform(0, 1, n_rows).astype(np.float32)
df['weekday_sin'] = np.random.uniform(0, 1, n_rows).astype(np.float32)

# generate day mapping for each session: one random day in 1..9 per unique session_id,
# so every row of a session shares the same day
map_day = dict(zip(df.session_id.unique(), np.random.randint(1, 10, size=(df.session_id.nunique()))))
df['day'] = df.session_id.map(map_day)

In [35]:
# Preview the first rows of the synthetic interactions table.
df.head()

Unnamed: 0,session_id,item_id,category,age_days,weekday_sin,day
0,82415,9,2,0.81763,0.732386,6
1,75804,6,2,0.059395,0.336924,2
2,87913,35,8,0.420358,0.248626,3
3,77737,30,7,0.332786,0.454536,3
4,87705,3,1,0.458632,0.05669,1


## Feature Engineering with NVTabular

In [36]:
# Maximum number of elements kept in each per-session list (ListSlice keeps the last N).
SESSIONS_MAX_LENGTH = 20
# Sessions with fewer interactions than this are dropped: a single-item session is
# not valid for next-item prediction training and evaluation.
MINIMUM_SESSION_LENGTH = 2

# Categorify categorical features
categ_feats = ['session_id', 'item_id', 'category'] >> nvt.ops.Categorify()

# Columns fed into the per-session aggregation: the encoded categoricals plus the raw columns
groupby_feats = categ_feats + ['day', 'age_days', 'weekday_sin']

# Group interaction features by session. Output columns are named
# "<column><name_sep><agg>", e.g. "item_id-list", "item_id-count", "day-first".
groupby_features = groupby_feats >> nvt.ops.Groupby(
    groupby_cols=["session_id"],
    aggs={
        "item_id": ["list", "count"],
        "category": ["list"],
        "day": ["first"],
        "age_days": ["list"],
        "weekday_sin": ["list"],
    },
    name_sep="-",
)

# Select and truncate the sequential features
sequence_features_truncated = (
    groupby_features['category-list']
    >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH)
)

# The item-id sequence is additionally tagged as the item id. The op is referenced through
# `nvt.ops` like every other op in this cell rather than through the star import.
sequence_features_truncated_item = (
    groupby_features['item_id-list']
    >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH)
    >> nvt.ops.TagAsItemID()
)

# Continuous sequences are truncated the same way and tagged as continuous.
sequence_features_truncated_cont = (
    groupby_features['age_days-list', 'weekday_sin-list']
    >> nvt.ops.ListSlice(-SESSIONS_MAX_LENGTH)
    >> nvt.ops.AddMetadata(tags=[Tags.CONTINUOUS])
)

# Per-session scalar columns plus all truncated sequences
selected_features = (
    groupby_features['item_id-count', 'day-first', 'session_id']
    + sequence_features_truncated_item
    + sequence_features_truncated
    + sequence_features_truncated_cont
)

# Filter out sessions shorter than MINIMUM_SESSION_LENGTH
filtered_sessions = selected_features >> nvt.ops.Filter(
    f=lambda gdf: gdf["item_id-count"] >= MINIMUM_SESSION_LENGTH
)

# Run ValueCount over the list columns (the output schema then carries
# value_count.min / value_count.max for them).
seq_feats_list = (
    filtered_sessions['item_id-list', 'category-list', 'age_days-list', 'weekday_sin-list']
    >> nvt.ops.ValueCount()
)

workflow = nvt.Workflow(filtered_sessions['session_id', 'day-first', 'item_id-count'] + seq_feats_list)

# cpu=False: build the dataset on the GPU backend
dataset = nvt.Dataset(df, cpu=False)
# Generate statistics for the features
workflow.fit(dataset)
# Apply the preprocessing and return an NVTabular dataset
sessions_ds = workflow.transform(dataset)
# Convert the NVTabular dataset to a Dask cuDF dataframe (`to_ddf()`) and then to cuDF dataframe (`.compute()`)
sessions_gdf = sessions_ds.to_ddf().compute()



In [37]:
# Peek at the first three transformed session rows.
sessions_gdf.head(3)

Unnamed: 0,session_id,day-first,item_id-count,item_id-list,category-list,age_days-list,weekday_sin-list
0,1,5,16,"[38, 27, 70, 19, 5, 13, 48, 88, 9, 44, 17, 22,...","[8, 7, 14, 5, 3, 4, 10, 18, 2, 9, 4, 5, 53, 2,...","[0.5251261, 0.87913096, 0.7250461, 0.43472463,...","[0.5770256, 0.35902506, 0.079794206, 0.9617632..."
1,2,3,16,"[34, 84, 28, 36, 5, 38, 2, 176, 5, 46, 44, 23,...","[6, 18, 7, 8, 3, 8, 1, 34, 3, 10, 9, 5, 9, 10,...","[0.10864424, 0.5010008, 0.86823916, 0.10962348...","[0.31416067, 0.6658211, 0.26234916, 0.27628836..."
2,3,5,15,"[35, 19, 2, 21, 4, 22, 46, 14, 77, 9, 17, 1, 3...","[8, 5, 1, 5, 1, 5, 10, 3, 16, 2, 4, 1, 6, 11, 1]","[0.77158105, 0.56738484, 0.120141625, 0.657774...","[0.33148926, 0.66894, 0.59043574, 0.6323525, 0..."


In [38]:
# Inspect the schema of the workflow's output columns (names, tags, dtypes, properties).
workflow.output_schema

Unnamed: 0,name,tags,dtype,is_list,is_ragged,properties.num_buckets,properties.freq_threshold,properties.max_size,properties.start_index,properties.cat_path,properties.domain.min,properties.domain.max,properties.domain.name,properties.embedding_sizes.cardinality,properties.embedding_sizes.dimension,properties.value_count.min,properties.value_count.max
0,session_id,(Tags.CATEGORICAL),int64,False,False,,0.0,0.0,0.0,.//categories/unique.session_id.parquet,0.0,19867.0,session_id,19868.0,408.0,,
1,day-first,(),int64,False,False,,,,,,,,,,,,
2,item_id-count,(Tags.CATEGORICAL),int32,False,False,,0.0,0.0,0.0,.//categories/unique.item_id.parquet,0.0,490.0,item_id,491.0,51.0,,
3,item_id-list,"(Tags.CATEGORICAL, Tags.LIST, Tags.ITEM_ID, Ta...",int64,True,True,,0.0,0.0,0.0,.//categories/unique.item_id.parquet,0.0,490.0,item_id,491.0,51.0,2.0,16.0
4,category-list,"(Tags.CATEGORICAL, Tags.LIST)",int64,True,True,,0.0,0.0,0.0,.//categories/unique.category.parquet,0.0,146.0,category,147.0,26.0,2.0,16.0
5,age_days-list,"(Tags.CONTINUOUS, Tags.LIST)",float32,True,True,,,,,,,,,,,2.0,16.0
6,weekday_sin-list,"(Tags.CONTINUOUS, Tags.LIST)",float32,True,True,,,,,,,,,,,2.0,16.0


In [39]:
# workflow.fit_transform(dataset).to_parquet(os.path.join(INPUT_DATA_DIR, "processed_nvt"))

workflow.fit_transform(dataset).to_parquet(TRANSFORMED_WORKFLOW)

!ls $TRANSFORMED_WORKFLOW



_file_list.txt	_metadata  _metadata.json  part_0.parquet  schema.pbtxt


In [40]:
# Earlier variant that built the path inline:
# workflow.save(os.path.join(INPUT_DATA_DIR, "workflow_etl"))

# Save the workflow to OUTPUT_WORKFLOW_DIR.
workflow.save(OUTPUT_WORKFLOW_DIR)

# List what was written.
!ls $OUTPUT_WORKFLOW_DIR

categories  metadata.json  workflow.pkl


## Export pre-processed data by day

In [41]:
# Earlier variant that read the location from the environment:
# OUTPUT_DIR = os.environ.get("OUTPUT_DIR",os.path.join(INPUT_DATA_DIR, "sessions_by_day"))

# List the current contents of OUTPUT_DIR before the per-day export is written.
!ls $OUTPUT_DIR

1


In [42]:
from transformers4rec.data.preprocessing import save_time_based_splits

# Export the processed sessions under OUTPUT_DIR, partitioned by the 'day-first' column.
# NOTE(review): the exact folder/file layout is decided by save_time_based_splits —
# presumably one sub-folder per 'day-first' value; confirm against the Transformers4Rec docs.
# NOTE(review): the synthetic data has no timestamp column, so 'session_id' is passed as
# timestamp_col — presumably it is only used for ordering; confirm.
save_time_based_splits(
    data=nvt.Dataset(sessions_gdf),
    output_dir= OUTPUT_DIR,
    partition_col='day-first',
    timestamp_col='session_id', 
)

Creating time-based splits: 100% 9/9 [00:00<00:00, 26.74it/s]


## Checking the preprocessed outputs

In [43]:
# Location of the training parquet file inside the TRAIN_PATHS folder.
TRAIN_DATA = TRAIN_PATHS + '/train.parquet'
print("TRAIN_DATA: " + TRAIN_DATA)

TRAIN_DATA: workspace/data/sessions_by_day/1/train.parquet


In [44]:
# Load the training split back with pandas to check what was written.
# Note: this rebinds `df`, which earlier in the notebook held the raw synthetic interactions.
df = pd.read_parquet(TRAIN_DATA)
df

Unnamed: 0,session_id,item_id-count,item_id-list,category-list,age_days-list,weekday_sin-list
0,9,14,"[49, 36, 130, 20, 48, 15, 3, 126, 96, 31, 24, ...","[10, 8, 26, 3, 10, 4, 1, 23, 20, 6, 5, 4, 1, 4]","[0.48491085, 0.86484003, 0.13697912, 0.6252731...","[0.014036216, 0.3720214, 0.5400261, 0.6242002,..."
1,35,12,"[34, 16, 34, 7, 6, 9, 24, 37, 3, 48, 1, 9]","[6, 4, 6, 2, 1, 2, 5, 8, 1, 10, 1, 2]","[0.9514152, 0.9970582, 0.5264118, 0.057201244,...","[0.5489233, 0.3369006, 0.80283314, 0.9907031, ..."
2,66,12,"[3, 8, 14, 63, 20, 25, 36, 12, 52, 18, 2, 30]","[1, 3, 3, 14, 3, 7, 8, 2, 12, 4, 1, 6]","[0.59860885, 0.7828945, 0.8959286, 0.59513724,...","[0.28273273, 0.8098918, 0.86178094, 0.30507022..."
4,86,12,"[38, 62, 12, 35, 27, 15, 16, 46, 17, 8, 12, 58]","[8, 13, 2, 8, 7, 4, 4, 10, 4, 3, 2, 13]","[0.06666987, 0.7871961, 0.8365424, 0.49183917,...","[0.47157973, 0.78612095, 0.30395415, 0.6325353..."
5,87,12,"[94, 20, 23, 31, 24, 22, 72, 7, 11, 4, 43, 6]","[20, 3, 5, 6, 5, 5, 15, 2, 2, 1, 9, 1]","[0.43543532, 0.4677896, 0.531344, 0.97253376, ...","[0.058378555, 0.07599554, 0.87675023, 0.737522..."
...,...,...,...,...,...,...
2016,19137,2,"[1, 2]","[1, 1]","[0.6261729, 0.4914603]","[0.9872869, 0.33724102]"
2017,19138,2,"[354, 99]","[73, 22]","[0.527412, 0.34500536]","[0.6040353, 0.14440879]"
2018,19139,2,"[32, 239]","[6, 58]","[0.97328794, 0.52698237]","[0.6735554, 0.0022826127]"
2019,19140,2,"[15, 36]","[4, 8]","[0.33646894, 0.19001935]","[0.8783984, 0.05833189]"


In [45]:
# Drop the reference to the loaded frame and run a garbage-collection pass.
# gc.collect() returns the number of unreachable objects it found, which is the cell's output.
import gc
del df
gc.collect()

1501