# Preprocessing pipeline


> Convert BQ table to parquet files

> orchestrate job with `Vertex Pipelines`

![](https://user-images.githubusercontent.com/39886184/181261762-69e9edbc-6efd-4a83-92a1-8b822acfe1d6.png)

Data originally converted to parquet using the job config below:

```
BUCKET = 'gs://spotify-builtin-2t'
PROJECT = 'hybrid-vertex'
DATASET_ID = 'spotify_train_3'
TABLE = 'train_flatten'
TABLE_SMALL = 'train_json_export_table_small'
LOCATION = 'us-central1'

from google.cloud import bigquery
client = bigquery.Client()

destination_uri = f"{BUCKET}/train_data_parquet/*.snappy.parquet"
dataset_ref = bigquery.DatasetReference(PROJECT, DATASET_ID)
table_ref = dataset_ref.table(TABLE)
job_config = bigquery.job.ExtractJobConfig()
job_config.destination_format = bigquery.DestinationFormat.PARQUET
extract_job = client.extract_table(
    table_ref,
    destination_uri,
    job_config=job_config,
    # Location must match that of the source table.
    location=LOCATION,
)  # API request
extract_job.result()  # Waits for job to complete.
```

## Setup

### pip

In [2]:
!pip install google-cloud-aiplatform 
!pip install google-cloud-pipeline-components 
!pip install google-cloud-bigquery-storage 
!pip install kfp

Collecting protobuf<4.0.0dev,>=3.19.0
  Downloading protobuf-3.20.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: protobuf
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tfx-bsl 1.9.0 requires google-api-python-client<2,>=1.7.11, but you have google-api-python-client 2.52.0 which is incompatible.
tfx-bsl 1.9.0 requires pyarrow<6,>=1, but you have pyarrow 8.0.0 which is incompatible.
tensorflow 2.9.0rc2 requires tensorboard<2.10,>=2.9, but you have tensorboard 2.8.0 which is incompatible.
tensorflow-transform 1.9.0 requires pyarrow<6,>=1, but you have pyarrow 8.0.0 which is incompatible.
tensorflow-serving-api 2.9.0 requires tensorflow<3,>=2.9.0, but you have tensorflow 2.9.

### import packages

In [1]:
import os
import json
from datetime import datetime
from google.cloud import aiplatform as vertex_ai
from kfp.v2 import compiler

In [2]:
# TODO: Project definitions
PROJECT_ID = 'hybrid-vertex' # Change to your project ID.
REGION = 'us-central1' # Change to your region.

# TODO: Service Account address
VERTEX_SA = '934903580331-compute@developer.gserviceaccount.com' # Change to your service account with Vertex AI Admin permitions.

# TODO: define GCS Bucket
BUCKET_parquet = 'spotify-builtin-2t'
BUCKET = 'spotify-merlin-v1'

## Define preprocess pipeline

In [3]:
# Bucket definitions
VERSION = 'v32-subset'
APP = 'spotify'
MODEL_DISPLAY_NAME = f'nvt-preprocessing-{APP}-{VERSION}'
WORKSPACE = f'gs://{BUCKET}/{MODEL_DISPLAY_NAME}'

# Docker definitions
IMAGE_NAME = 'nvt-preprocessing'
IMAGE_URI = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'
DOCKERNAME = 'nvtabular'

# Pipeline definitions
PREPROCESS_PARQUET_PIPELINE_NAME = f'nvtabular-parquet-pipeline-{VERSION}'
PREPROCESS_PARQUET_PIPELINE_ROOT = os.path.join(WORKSPACE, PREPROCESS_PARQUET_PIPELINE_NAME)

# # Instance configuration
GPU_LIMIT = '4'
GPU_TYPE = 'NVIDIA_TESLA_T4'
CPU_LIMIT = '64'
MEMORY_LIMIT = '624G'
INSTANCE_TYPE = "n1-highmem-64"

# # Instance configuration
# GPU_LIMIT = '1'
# GPU_TYPE = 'NVIDIA_TESLA_A100'
# CPU_LIMIT = '96' # '64'
# MEMORY_LIMIT = '680' #'624G'
# INSTANCE_TYPE = "a2-highgpu-1g"

In [5]:
os.environ['PROJECT_ID'] = PROJECT_ID
os.environ['REGION'] = REGION
os.environ['BUCKET'] = BUCKET
os.environ['WORKSPACE'] = WORKSPACE

os.environ['NVT_IMAGE_URI'] = IMAGE_URI
os.environ['PREPROCESS_PARQUET_PIPELINE_NAME'] = PREPROCESS_PARQUET_PIPELINE_NAME
os.environ['PREPROCESS_PARQUET_PIPELINE_ROOT'] = PREPROCESS_PARQUET_PIPELINE_ROOT
os.environ['DOCKERNAME'] = DOCKERNAME

os.environ['GPU_LIMIT'] = GPU_LIMIT
os.environ['GPU_TYPE'] = GPU_TYPE
os.environ['CPU_LIMIT'] = CPU_LIMIT
os.environ['MEMORY_LIMIT'] = MEMORY_LIMIT
os.environ['INSTANCE_TYPE'] = INSTANCE_TYPE

### TODO: Write Config File

In [6]:
# """Vertex pipeline configurations."""

# import os

# PROJECT_ID = os.getenv("PROJECT_ID", "")
# REGION = os.getenv("REGION", "us-central1")
# BUCKET = os.getenv("BUCKET", "")
# BUCKET_NAME = os.getenv("BUCKET_NAME", "")
# VERTEX_SA = os.getenv("VERTEX_SA",
#                       f"vertex-sa@{PROJECT_ID}.iam.gserviceaccount.com")

# VERSION = os.getenv("VERSION", "")

# MODEL_DISPLAY_NAME = os.getenv("MODEL_DISPLAY_NAME", "")

# WORKSPACE = os.getenv("WORKSPACE", "")
# NVT_IMAGE_URI = os.getenv("NVT_IMAGE_URI", "")
# PREPROCESS_PARQUET_PIPELINE_NAME = os.getenv("PREPROCESS_PARQUET_PIPELINE_NAME", "")
# PREPROCESS_PARQUET_PIPELINE_ROOT = os.getenv("PREPROCESS_PARQUET_PIPELINE_ROOT", "")
# DOCKERNAME = os.getenv("DOCKERNAME", "")

# INSTANCE_TYPE = os.getenv("INSTANCE_TYPE", "n1-highmem-64")
# CPU_LIMIT = os.getenv("CPU_LIMIT", "64")
# MEMORY_LIMIT = os.getenv("MEMORY_LIMIT", "624G")
# GPU_LIMIT = os.getenv("GPU_LIMIT", "4")
# GPU_TYPE = os.getenv("GPU_TYPE", "NVIDIA_TESLA_T4")

# # INSTANCE_TYPE = os.getenv("INSTANCE_TYPE", "a2-highgpu-1g")
# # CPU_LIMIT = os.getenv("CPU_LIMIT", "96")
# # MEMORY_LIMIT = os.getenv("MEMORY_LIMIT", "680")
# # GPU_LIMIT = os.getenv("GPU_LIMIT", "2")
# # GPU_TYPE = os.getenv("GPU_TYPE", "NVIDIA_TESLA_A100")

# # train & valid parquet files
# # TRAIN_DIR_PARQUET = os.getenv("TRAIN_DIR_PARQUET", "gs://spotify-builtin-2t/train_data_parquet/0000000000**.snappy.parquet")
# # VALID_DIR_PARQUET = os.getenv("VALID_DIR_PARQUET", "gs://spotify-builtin-2t/validation_data_parquet/00000000000*.snappy.parquet")

In [7]:
MEMORY_LIMIT

'624G'

In [8]:
# Initialize Vertex AI API
vertex_ai.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=os.path.join(WORKSPACE, 'stg') 
)

In [9]:
os.chdir('/home/jupyter/spotify-merlin')
os.getcwd()

'/home/jupyter/spotify-merlin'

In [10]:
FILE_LOCATION = './src'
! gcloud builds submit --config src/cloudbuild.yaml --substitutions _DOCKERNAME=$DOCKERNAME,_IMAGE_URI=$IMAGE_URI,_FILE_LOCATION=$FILE_LOCATION --timeout=2h --machine-type=e2-highcpu-8

Creating temporary tarball archive of 78 file(s) totalling 1.7 MiB before compression.
Uploading tarball of [.] to [gs://hybrid-vertex_cloudbuild/source/1662949103.31352-dcd9628e835a4874b86242625a32387d.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/hybrid-vertex/locations/global/builds/f0c01f3a-0c7c-4d02-a1f7-08c5fa4d701d].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/f0c01f3a-0c7c-4d02-a1f7-08c5fa4d701d?project=934903580331].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "f0c01f3a-0c7c-4d02-a1f7-08c5fa4d701d"

FETCHSOURCE
Fetching storage object: gs://hybrid-vertex_cloudbuild/source/1662949103.31352-dcd9628e835a4874b86242625a32387d.tgz#1662949103772589
Copying gs://hybrid-vertex_cloudbuild/source/1662949103.31352-dcd9628e835a4874b86242625a32387d.tgz#1662949103772589...
/ [1 files][204.6 KiB/204.6 KiB]                                                
Operation completed over 1 objects/204.6 KiB

# Parquet Preprocessing Pipeline

### Pipeline Params

* Get Train & Valid file counts
* Define data dirs and files

In [9]:
# Subset
# TRAIN_DIR_PARQUET = f"{BUCKET_parquet}/train_data_parquet/0000000000**.snappy.parquet"
# VALID_DIR_PARQUET = f"{BUCKET_parquet}/validation_data_parquet/00000000000*.snappy.parquet"

# full dataset
# TRAIN_DIR_PARQUET = f"{BUCKET_parquet}/train_data_parquet/*.snappy.parquet"
# VALID_DIR_PARQUET = f"{BUCKET_parquet}/validation_data_parquet/*.snappy.parquet"

# MAX_PADDING = 375

In [11]:
# TRAIN_PATH='gs://spotify-merlin-v1/nvt-preprocessing-spotify-v10-subset/nvt-processed/train'

TRAIN_PATH = 'gs://spotify-builtin-2t/train_data_parquet'

# !gsutil du $TRAIN_PATH | wc -l
!gsutil ls -lR $TRAIN_PATH | tail -n 1

TOTAL: 5000 objects, 326467491281 bytes (304.05 GiB)


In [12]:
# VALID_PATH='gs://spotify-merlin-v1/nvt-preprocessing-spotify-v10-subset/nvt-processed/valid'

VALID_PATH = 'gs://spotify-builtin-2t/validation_data_parquet'

!gsutil ls -lR $VALID_PATH | tail -n 1

TOTAL: 152 objects, 16521928490 bytes (15.39 GiB)


In [13]:
from google.cloud import storage
storage_client = storage.Client()

BUCKET_NAME = 'spotify-builtin-2t' # 'spotify-merlin-v1' | spotify-builtin-2t
delimiter = '/'
FILE_PATTERN = "*.parquet"
# FILE_PATTERN = '0000000000**.snappy.parquet'
# ===========================
# train data
# ===========================

TRAIN_PREFIX = 'train_data_parquet'
# TRAIN_PREFIX = 'train_data_parquet/0000000000**.snappy.parquet' # subset of data
# TRAIN_PREFIX = 'nvt-preprocessing-spotify-v10-subset/nvt-processed/train'
train_files = []
    
train_blobs = storage_client.list_blobs(BUCKET_NAME, prefix=f'{TRAIN_PREFIX}/', delimiter=delimiter)
for blob in train_blobs:
    if blob.name[-7:] == 'parquet':
        train_files.append(f'gs://{BUCKET_NAME}/{blob.name}')
    
# ===========================
# valid data
# ===========================

VALID_PREFIX = 'validation_data_parquet'
# VALID_PREFIX = 'nvt-preprocessing-spotify-v10-subset/nvt-processed/valid'
valid_files = []

valid_blobs = storage_client.list_blobs(BUCKET_NAME, prefix=f'{VALID_PREFIX}/', delimiter=delimiter)
for blob in valid_blobs:
    if blob.name[-7:] == 'parquet':
        valid_files.append(f'gs://{BUCKET_NAME}/{blob.name}')
    

# ===========================
# get info
# ===========================
COUNT_TRAIN_FILES = len(train_files)
COUNT_VALID_FILES = len(valid_files)

print(f'COUNT_TRAIN_FILES : {COUNT_TRAIN_FILES}')
print(f'COUNT_VALID_FILES : {COUNT_VALID_FILES}')
# COUNT_ORIGINAL_TRAIN_FILES : 5000
# COUNT_ORIGINAL_VALID_FILES : 152

TRAIN_PATH = f'gs://{BUCKET_NAME}/{TRAIN_PREFIX}/{FILE_PATTERN}'
VALID_PATH = f'gs://{BUCKET_NAME}/{VALID_PREFIX}/{FILE_PATTERN}'

print(f'TRAIN_PATH : {TRAIN_PATH}')
print(f'VALID_PATH : {VALID_PATH}')

print(f'TRAIN_PREFIX : {TRAIN_PREFIX}')
print(f'VALID_PREFIX : {VALID_PREFIX}')

COUNT_TRAIN_FILES : 5000
COUNT_VALID_FILES : 152
TRAIN_PATH : gs://spotify-builtin-2t/train_data_parquet/*.parquet
VALID_PATH : gs://spotify-builtin-2t/validation_data_parquet/*.parquet
TRAIN_PREFIX : train_data_parquet
VALID_PREFIX : validation_data_parquet


In [14]:
train_files[:1]

['gs://spotify-builtin-2t/train_data_parquet/000000000000.snappy.parquet']

In [15]:
valid_files[:1]

['gs://spotify-builtin-2t/validation_data_parquet/000000000000.snappy.parquet']

### Parquet files from BQ extract

In [16]:
# Training files
# TRAIN_PATHS = train_files

# Validation files
# VALID_PATHS = valid_files

### Outputs

In [17]:
# Define output directories
OUTPUT_DEFINED_DIR = os.path.join(WORKSPACE, "nvt-defined")
OUTPUT_WORKFLOW_DIR = os.path.join(WORKSPACE, "nvt-analyzed")
OUTPUT_TRANSFORMED_DIR = os.path.join(WORKSPACE, "nvt-processed")

print(f"WORKSPACE: {WORKSPACE}")
      
print(f"OUTPUT_DEFINED_DIR: {OUTPUT_DEFINED_DIR}\nOUTPUT_WORKFLOW_DIR: {OUTPUT_WORKFLOW_DIR}\nOUTPUT_TRANSFORMED_DIR: {OUTPUT_TRANSFORMED_DIR}")

WORKSPACE: gs://spotify-merlin-v1/nvt-preprocessing-spotify-v32-subset
OUTPUT_DEFINED_DIR: gs://spotify-merlin-v1/nvt-preprocessing-spotify-v32-subset/nvt-defined
OUTPUT_WORKFLOW_DIR: gs://spotify-merlin-v1/nvt-preprocessing-spotify-v32-subset/nvt-analyzed
OUTPUT_TRANSFORMED_DIR: gs://spotify-merlin-v1/nvt-preprocessing-spotify-v32-subset/nvt-processed


## TODO: Create pipeline parameters

In [18]:
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

# trying to achieve avg file size of ~100 mb
num_output_files_train = 100 #0 # Number of output Parquet files
num_output_files_valid = 10 #2 # Number of output Parquet files

# TRAIN_PATTERN_small = "gs://spotify-builtin-2t/train_data_parquet/0000000000**.snappy.parquet"
# VALID_PATTERN_small = "gs://spotify-builtin-2t/validation_data_parquet/00000000000*.snappy.parquet"


parq_parameter_values = {
    'bucket_name': BUCKET_NAME,
    # 'train_paths': TRAIN_PATH,                                              
    # 'valid_paths': VALID_PATH,
    # 'train_pattern':f'{TRAIN_PATTERN_small}',
    # 'valid_pattern':f'{VALID_PATTERN_small}',             
    'train_prefix': f'{TRAIN_PREFIX}',
    'valid_prefix': f'{VALID_PREFIX}',
    'num_output_files_train': num_output_files_train,
    'num_output_files_valid': num_output_files_valid,
    'output_path_defined_dir': f'{OUTPUT_DEFINED_DIR}',
    'output_path_analyzed_dir': f'{OUTPUT_WORKFLOW_DIR}',
    'output_path_transformed_dir': f'{OUTPUT_TRANSFORMED_DIR}',
    'version':f'{VERSION}',
    'shuffle': json.dumps(None) # select PER_PARTITION, PER_WORKER, FULL, or None.
}

In [19]:
from pprint import pprint

# pprint(parq_parameter_values)

print(f"num_output_files_train: {parq_parameter_values['num_output_files_train']}")
print(f"num_output_files_valid: {parq_parameter_values['num_output_files_valid']}")
print(f"output_path_defined_dir: {parq_parameter_values['output_path_defined_dir']}")
print(f"output_path_analyzed_dir: {parq_parameter_values['output_path_analyzed_dir']}")
print(f"output_path_transformed_dir: {parq_parameter_values['output_path_transformed_dir']}")
print(f"shuffle: {parq_parameter_values['shuffle']}")
# print(f"train_paths[0]: {parq_parameter_values['train_paths'][:74]} ...]") # json array
# print(f"valid_paths[1]: {parq_parameter_values['valid_paths'][:79]} ...]") # json array

num_output_files_train: 100
num_output_files_valid: 10
output_path_defined_dir: gs://spotify-merlin-v1/nvt-preprocessing-spotify-v32-subset/nvt-defined
output_path_analyzed_dir: gs://spotify-merlin-v1/nvt-preprocessing-spotify-v32-subset/nvt-analyzed
output_path_transformed_dir: gs://spotify-merlin-v1/nvt-preprocessing-spotify-v32-subset/nvt-processed
shuffle: null


### Adapting pipeline params

* Two limtis exist:
> * pipeline params = 200k bytes
> * Vertex `CustomJob` input argument size limit = 100k bytes
* see [b/203570894](https://b.corp.google.com/issues/203570894)
* specific error when running pipeline:

```
com.google.cloud.ai.platform.common.errors.AiPlatformException: code=INVALID_ARGUMENT, 
message=Metadata field cannot exceed 204800 bytes, but was 382496 bytes., cause=null
```

#### Workaround:
* get train and & valid files in pipeline component

In [20]:
import sys

print(f"Size of parq_parameter_values: {sys.getsizeof(parq_parameter_values)} bytes")
print(f"Size of TRAIN_PATH: {sys.getsizeof(TRAIN_PATH)} bytes")
print(f"Size of train_files: {sys.getsizeof(train_files)} bytes")
print(f"Size of VALID_PATH: {sys.getsizeof(VALID_PATH)} bytes")
print(f"Size of VALID_PATH: {sys.getsizeof(valid_files)} bytes")

Size of parq_parameter_values: 376 bytes
Size of TRAIN_PATH: 101 bytes
Size of train_files: 43048 bytes
Size of VALID_PATH: 106 bytes
Size of VALID_PATH: 1456 bytes


## Compile KFP pipeline

In [21]:
#list the current work dir
os.getcwd()

'/home/jupyter/spotify-merlin'

In [22]:
PREPROCESS_PARQUET_PIPELINE_NAME

'nvtabular-parquet-pipeline-v32-subset'

In [24]:
os.chdir('/home/jupyter/spotify-merlin/src')

from pipelines.preprocessing_pipelines import preprocessing_parquet

_compiled_pipeline_path = f'{PREPROCESS_PARQUET_PIPELINE_NAME}.json'

compiler.Compiler().compile(
       pipeline_func=preprocessing_parquet,
       package_path=_compiled_pipeline_path
)

In [25]:
parq_parameter_values

{'bucket_name': 'spotify-builtin-2t',
 'train_prefix': 'train_data_parquet',
 'valid_prefix': 'validation_data_parquet',
 'num_output_files_train': 100,
 'num_output_files_valid': 10,
 'output_path_defined_dir': 'gs://spotify-merlin-v1/nvt-preprocessing-spotify-v32-subset/nvt-defined',
 'output_path_analyzed_dir': 'gs://spotify-merlin-v1/nvt-preprocessing-spotify-v32-subset/nvt-analyzed',
 'output_path_transformed_dir': 'gs://spotify-merlin-v1/nvt-preprocessing-spotify-v32-subset/nvt-processed',
 'version': 'v32-subset',
 'shuffle': 'null'}

In [26]:
CPU_LIMIT

'64'

## Submit pipeline to Vertex AI

In [27]:
PREPROCESS_PARQUET_PIPELINE_NAME

'nvtabular-parquet-pipeline-v32-subset'

In [28]:
job_name = f'{PREPROCESS_PARQUET_PIPELINE_NAME}_{TIMESTAMP}' #{TIMESTAMP}'

pipeline_job = vertex_ai.PipelineJob(
    display_name=job_name,
    template_path=_compiled_pipeline_path,
    enable_caching=False,
    parameter_values=parq_parameter_values,
)

pipeline_job.submit(service_account=VERTEX_SA)

Creating PipelineJob
PipelineJob created. Resource name: projects/934903580331/locations/us-central1/pipelineJobs/nvtabular-parquet-pipeline-v32-subset-20220912022430
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/934903580331/locations/us-central1/pipelineJobs/nvtabular-parquet-pipeline-v32-subset-20220912022430')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/nvtabular-parquet-pipeline-v32-subset-20220912022430?project=934903580331


In [39]:
data_paths = []
bucket_name = 'spotify-builtin-2t'
data_blobs = storage_client.list_blobs(bucket_name, prefix=f'{TRAIN_PREFIX}/', delimiter=delimiter)
for blob in data_blobs:
    if blob.name[-7:] == 'parquet':
        data_paths.append(f'gs://{bucket_name}/{blob.name}')
        
len(data_paths)

5000

# Local testing

In [44]:
from google.cloud import storage
from google.cloud.storage.bucket import Bucket
from google.cloud.storage.blob import Blob

def _upload_blob_gcs(gcs_uri, source_file_name, destination_blob_name):
    """Uploads a file to GCS bucket"""
    client = storage.Client()
    blob = Blob.from_string(os.path.join(gcs_uri, destination_blob_name))
    blob.bucket._client = client
    blob.upload_from_filename(source_file_name)
    
def _read_blob_gcs(bucket_name, source_blob_name, destination_filename):
    """Downloads a file from GCS to local directory"""
    client = storage.Client()
    # blob = Blob.from_string(os.path.join(gcs_uri, gcs_file_name))
    # blob.bucket._client = client
    bucket = client.get_bucket(bucket_name)
    # blob = bucket.get_blob(file_path)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_filename)

In [34]:
!gsutil cp gs://spotify-merlin-v1/nvt-preprocessing-spotify-v09-subset/nvt-processed/train/_file_list.txt ./local_file_list.txt

# spotify-merlin/src/_file_list.txt

Copying gs://spotify-merlin-v1/nvt-preprocessing-spotify-v09-subset/nvt-processed/train/_file_list.txt...
/ [1 files][  4.7 KiB/  4.7 KiB]                                                
Operation completed over 1 objects/4.7 KiB.                                      


In [37]:
local_directory = os.getcwd()
local_directory

'/home/jupyter/spotify-merlin/src'

In [48]:
local_directory = os.getcwd()

bucket_name='spotify-merlin-v1'
prefix='nvt-preprocessing-spotify-v09-subset/nvt-processed/train'
destination_filename = f'{local_directory}/local_file_list.txt'
FILENAME='_file_list.txt'

_SOURCE_BLOB_NAME = 'nvt-preprocessing-spotify-v09-subset/nvt-processed/train/_file_list.txt'

# local_file = f'{local_directory}/local_file_list.txt'

local_file_dest = f'{local_directory}/local_file_list2.txt'
print(f"local_file_dest: {local_file_dest}")

_read_blob_gcs(bucket_name=bucket_name,source_blob_name=f'{_SOURCE_BLOB_NAME}', destination_filename=local_file_dest)
print(f"local_file_dest: {local_file_dest}")

new_lines = []
with open(local_file_dest, 'r') as fp:
    lines = fp.readlines()
    new_lines.append(lines[0])
    for line in lines[1:]:
        new_lines.append(line.replace('gs://', '/gcs/'))

new_local_filename = f'{local_directory}/_gcs_file_list2.txt'
print(f"new_local_filename: {new_local_filename}")

with open(new_local_filename, 'w') as fp:
    fp.writelines(new_lines)
    
_GCS_URI = 'gs://jt-merlin-test/folder2'

_upload_blob_gcs(gcs_uri=_GCS_URI, source_file_name=new_local_filename, destination_blob_name='_gcs_file_list.txt')

local_file_dest: /home/jupyter/spotify-merlin/src/local_file_list2.txt
local_file_dest: /home/jupyter/spotify-merlin/src/local_file_list2.txt
new_local_filename: /home/jupyter/spotify-merlin/src/_gcs_file_list2.txt


In [42]:
new_local_filename

'/home/jupyter/spotify-merlin/src/_gcs_file_list.txt'

In [30]:
from google.cloud import storage

storage_client = storage.Client()

# spotify-merlin-v1/nvt-preprocessing-spotify-v09-subset/nvt-processed/train
# spotify-merlin-v1/nvt-preprocessing-spotify-v09-subset/nvt-processed/train/_file_list.txt
bucket_name='spotify-merlin-v1'
prefix='nvt-preprocessing-spotify-v09-subset/nvt-processed/train/'
delimiter='/'
FILENAME='_file_list.txt'

local_file = _read_blob_gcs(bucket=bucket_name,file_path=f'{prefix}/{FILENAME}',destination_filename='local_file_list.txt')
local_file
# new_lines = []
# with open(local_file, 'r') as fp:
#     lines = fp.readlines()
#     new_lines.append(lines[0])
#     for line in lines[1:]:
#         new_lines.append(line.replace('gs://', '/gcs/'))

# Note: Client.list_blobs requires at least package version 1.17.0.
# blobs = storage_client.list_blobs(bucket_name, prefix=prefix, delimiter=delimiter)

# print("Blobs:")
# for blob in blobs:
#     print(blob.name)

!gsutil -cp gs://spotify-merlin-v1/nvt-preprocessing-spotify-v09-subset/nvt-processed/train/_file_list.txt .

AttributeError: 'NoneType' object has no attribute 'download_to_filename'

In [31]:
# gs://spotify-merlin-v1/nvt-preprocessing-spotify-v01-subset/nvt-parquet-pipeline/934903580331/nvt-parquet-pipeline-20220720101605/convert-parquet-op_-8784796089988415488/output_dataset/_file_list.txt
bucket_name='spotify-merlin-v1'
prefix='nvt-preprocessing-spotify-v01-subset/nvt-parquet-pipeline/934903580331/nvt-parquet-pipeline-20220720101605/convert-parquet-op_-8784796089988415488/output_dataset/'
delimiter='/'
FILENAME='_file_list.txt'

local_file = _read_blob_gcs(bucket=bucket_name,file_path=f'{prefix}/{FILENAME}',destination_filename='local_file_list.txt')
local_file

AttributeError: 'NoneType' object has no attribute 'download_to_filename'

In [None]:
_read_blob_gcs(bucket=bucket_name,file_path=f'{prefix}/{FILENAME}',destination_filename='local_file_list.txt')

In [23]:
# logging.info(f'output_path_transformed_dir/split: gs://spotify-merlin-v1/nvt-preprocessing-spotify-v09-subset/nvt-processed/valid')
# file_list = os.path.join('gs://spotify-merlin-v1/nvt-preprocessing-spotify-v09-subset/nvt-processed/valid', '_file_list.txt')
file_list = 'gs://spotify-merlin-v1/nvt-preprocessing-spotify-v09-subset/nvt-processed/valid/_file_list.txt'
print(f"file_list: {file_list}")

new_lines = []
with open(file_list, 'r') as fp:
    lines = fp.readlines()
    new_lines.append(lines[0])
    for line in lines[1:]:
        new_lines.append(line.replace('gs://', '/gcs/'))

file_list: gs://spotify-merlin-v1/nvt-preprocessing-spotify-v09-subset/nvt-processed/valid/__file_list.txt


FileNotFoundError: [Errno 2] No such file or directory: 'gs://spotify-merlin-v1/nvt-preprocessing-spotify-v09-subset/nvt-processed/valid/__file_list.txt'

In [None]:
gcs_file_list = os.path.join(f'gs://spotify-merlin-v1/nvt-preprocessing-spotify-v09-subset/nvt-processed/valid', f'_gcs_file_list.txt')

with open(gcs_file_list, 'w') as fp:
    fp.writelines(new_lines)