# Preprocessing

> Convert a BigQuery (BQ) table to Parquet files
> and orchestrate the job with `Vertex Pipelines`

The data was originally converted to Parquet using the job config below:

```
# One-off export of a BigQuery table to Parquet files in Cloud Storage.
from google.cloud import bigquery

# Source table coordinates and export destination.
BUCKET = "gs://spotify-builtin-2t"
PROJECT = "hybrid-vertex"
DATASET_ID = "spotify_train_3"
TABLE = "train_flatten"
TABLE_SMALL = "train_json_export_table_small"  # defined here but not used below
LOCATION = "us-central1"

client = bigquery.Client()

# The `*` wildcard in the URI allows the export to be written as multiple files.
# NOTE(review): the files are named *.snappy.parquet, but no compression is set
# on the job config below -- confirm which compression the export actually used.
destination_uri = BUCKET + "/train_data_parquet/*.snappy.parquet"

dataset_ref = bigquery.DatasetReference(PROJECT, DATASET_ID)
table_ref = dataset_ref.table(TABLE)

# Request Parquet as the output format of the extract job.
job_config = bigquery.job.ExtractJobConfig(
    destination_format=bigquery.DestinationFormat.PARQUET,
)

# Issue the API request; `location` must match that of the source table.
extract_job = client.extract_table(
    table_ref,
    destination_uri,
    job_config=job_config,
    location=LOCATION,
)

# Block until the export job has completed.
extract_job.result()
```

## Setup

### pip

In [2]:
# Install the notebook's dependencies into the current kernel environment:
# the Vertex AI SDK, the Google Cloud pipeline components, the BigQuery
# Storage client, and the Kubeflow Pipelines (kfp) SDK.
!pip install google-cloud-aiplatform 
!pip install google-cloud-pipeline-components 
!pip install google-cloud-bigquery-storage 
!pip install kfp

Collecting protobuf<4.0.0dev,>=3.19.0
  Downloading protobuf-3.20.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.0/1.0 MB 17.5 MB/s eta 0:00:00
Installing collected packages: protobuf
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tfx-bsl 1.9.0 requires google-api-python-client<2,>=1.7.11, but you have google-api-python-client 2.52.0 which is incompatible.
tfx-bsl 1.9.0 requires pyarrow<6,>=1, but you have pyarrow 8.0.0 which is incompatible.
tensorflow 2.9.0rc2 requires tensorboard<2.10,>=2.9, but you have tensorboard 2.8.0 which is incompatible.
tensorflow-transform 1.9.0 requires pyarrow<6,>=1, but you have pyarrow 8.0.0 which is incompatible.
tensorflow-serving-api 2.9.0 requires tensorflow<3,>=2.9.0, but you have tensorflow 2.9.

### import packages

In [3]:
import os
import json
from datetime import datetime
from google.cloud import aiplatform as vertex_ai
from kfp.v2 import compiler

In [5]:
# --- Google Cloud project settings (TODO: edit for your environment) ---
PROJECT_ID = "hybrid-vertex"  # replace with your project ID
REGION = "us-central1"  # replace with your region

# Service account address (TODO: replace with your own).
# The account must have Vertex AI Admin permissions.
VERTEX_SA = "934903580331-compute@developer.gserviceaccount.com"

# --- Cloud Storage buckets (TODO: define your own; names only, no gs:// scheme) ---
BUCKET_parquet = "spotify-builtin-2t"  # bucket used in the Parquet input file patterns
BUCKET = "spotify-merlin-v1"  # bucket under which WORKSPACE is rooted

## Define preprocess pipeline

In [6]:
# Bucket definitions: version tag and app name namespace this run's artifacts.
VERSION = 'v00-subset'
APP = 'spotify'
MODEL_DISPLAY_NAME = f'nvt-preprocessing-{APP}-{VERSION}'
# GCS prefix for this run, rooted in BUCKET.
WORKSPACE = f'gs://{BUCKET}/{MODEL_DISPLAY_NAME}'

# Docker definitions
IMAGE_NAME = 'nvt-preprocessing'
IMAGE_URI = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'
DOCKERNAME = 'nvtabular'

# Pipeline definitions
PREPROCESS_PARQUET_PIPELINE_NAME = 'nvt-parquet-pipeline'
# This is a GCS URI, so it is joined with a literal "/" (like WORKSPACE above)
# rather than os.path.join, whose separator depends on the host OS.
PREPROCESS_PARQUET_PIPELINE_ROOT = f'{WORKSPACE}/{PREPROCESS_PARQUET_PIPELINE_NAME}'

# Instance configuration. Values are kept as strings; they are later copied
# into os.environ, which only accepts strings.
GPU_LIMIT = '2'
GPU_TYPE = 'NVIDIA_TESLA_A100'
CPU_LIMIT = '96'
# NOTE(review): no unit suffix here -- confirm how the consumer interprets '680'.
MEMORY_LIMIT = '680'

In [None]:
# Mirror the notebook's configuration into process environment variables.
# Keys are set in the same order as they are listed here.
os.environ.update({
    # Project / storage
    'PROJECT_ID': PROJECT_ID,
    'REGION': REGION,
    'BUCKET': BUCKET,
    'WORKSPACE': WORKSPACE,
    # Container image and pipeline
    'NVT_IMAGE_URI': IMAGE_URI,
    'PREPROCESS_PARQUET_PIPELINE_NAME': PREPROCESS_PARQUET_PIPELINE_NAME,
    'PREPROCESS_PARQUET_PIPELINE_ROOT': PREPROCESS_PARQUET_PIPELINE_ROOT,
    'DOCKERNAME': DOCKERNAME,
    # Instance resources
    'GPU_LIMIT': GPU_LIMIT,
    'GPU_TYPE': GPU_TYPE,
    'CPU_LIMIT': CPU_LIMIT,
    'MEMORY_LIMIT': MEMORY_LIMIT,
})

In [None]:
# Value passed to Cloud Build as the _FILE_LOCATION substitution.
FILE_LOCATION = './src'
# Submit a Cloud Build job using src/cloudbuild.yaml, with a 2h timeout on an
# e2-highcpu-8 build machine. In an IPython `!` command, $NAME is replaced with
# the value of the Python variable NAME when one exists, so DOCKERNAME,
# IMAGE_URI and FILE_LOCATION are taken from the notebook namespace.
! gcloud builds submit --config src/cloudbuild.yaml --substitutions _DOCKERNAME=$DOCKERNAME,_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION --timeout=2h --machine-type=e2-highcpu-8

# Parquet Preprocessing Pipeline

### Inputs

In [None]:
# Subset: glob patterns that select only the shards whose names start with a
# run of zeros.
# NOTE(review): these patterns carry no gs:// scheme, unlike WORKSPACE --
# confirm that the consumer adds it.
# NOTE(review): the train pattern uses 10 zeros + `**`, the validation pattern
# 11 zeros + `*` -- confirm the difference is intended.
TRAIN_FILES = BUCKET_parquet + "/train_data_parquet/0000000000**.snappy.parquet"
VALID_FILES = BUCKET_parquet + "/validation_data_parquet/00000000000*.snappy.parquet"

# full dataset
# TRAIN_FILES = f"{BUCKET_parquet}/train_data_parquet/*.snappy.parquet"
# VALID_FILES = f"{BUCKET_parquet}/validation_data_parquet/*.snappy.parquet"

# NOTE(review): presumably the maximum padded list length used in
# preprocessing -- confirm against the pipeline code.
MAX_PADDING = 375

### Outputs

In [None]:
# Output locations under WORKSPACE. These are GCS URIs, so they are joined with
# a literal "/" instead of os.path.join, whose separator depends on the host OS.
OUTPUT_PATH = f"{WORKSPACE}/nvt-processed"
OUTPUT_TRAIN_DIR = f"{OUTPUT_PATH}/train/"
OUTPUT_VALID_DIR = f"{OUTPUT_PATH}/valid/"
OUTPUT_WORKFLOW_DIR = f"{OUTPUT_PATH}/workflow/"


# Echo the resolved train/valid directories as a quick sanity check.
print(f"Train data dir: {OUTPUT_TRAIN_DIR}\nValid data dir: {OUTPUT_VALID_DIR}")

## TODO: Create pipeline parameters

In [None]:
# # Training files
# TRAIN_PATHS = ['gs://<PATH TO CSV FILES>'] # Change to the GCS path where CSV files are located
# # Validation files
# VALID_PATHS = ['gs://<PATH TO CSV FILES>'] # Change to the GCS path where CSV files are located

# num_output_files_train = 24 # Number of output files after converting CSV to Parquet
# num_output_files_valid = 1 # Number of output files after converting CSV to Parquet

# csv_parameter_values = {
#     'train_paths': json.dumps(TRAIN_PATHS),
#     'valid_paths': json.dumps(VALID_PATHS),
#     'num_output_files_train': num_output_files_train,
#     'num_output_files_valid': num_output_files_valid,
#     'shuffle': json.dumps(None) # select PER_PARTITION, PER_WORKER, FULL, or None.
# }

## Compile KFP pipeline

In [None]:
# from src.pipelines.preprocessing_pipelines import preprocessing_csv

# csv_compiled_pipeline_path = f'{PREPROCESS_CSV_PIPELINE_NAME}.json'
# compiler.Compiler().compile(
#        pipeline_func=preprocessing_csv,
#        package_path=csv_compiled_pipeline_path
# )

## Submit pipeline to Vertex AI

In [None]:
# job_name = f'{datetime.now().strftime("%Y%m%d%H%M%S")}_{PREPROCESS_CSV_PIPELINE_NAME}'

# pipeline_job = vertex_ai.PipelineJob(
#     display_name=job_name,
#     template_path=csv_compiled_pipeline_path,
#     enable_caching=False,
#     parameter_values=csv_parameter_values,
# )

# pipeline_job.submit(service_account=VERTEX_SA)