# Environment Setup for training with TFRS

## Install Packages
Run pip requirements.txt in either (1) the notebook cell below or (2) in a notebook terminal window

In [1]:
# !pwd

In [1]:
# naming convention for all cloud resources
VERSION        = "v1"                  # TODO
PREFIX         = f'ndr-{VERSION}'      # TODO

print(f"PREFIX = {PREFIX}")

PREFIX = ndr-v1


## GCP project

In [2]:
# creds, PROJECT_ID = google.auth.default()
GCP_PROJECTS             = !gcloud config get-value project
PROJECT_ID               = GCP_PROJECTS[0]

PROJECT_NUM              = !gcloud projects describe $PROJECT_ID --format="value(projectNumber)"
PROJECT_NUM              = PROJECT_NUM[0]

VERTEX_SA                = f'{PROJECT_NUM}-compute@developer.gserviceaccount.com'

VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

# locations / regions for cloud resources
LOCATION                 = 'us-central1'        
REGION                   = LOCATION
BQ_LOCATION              = 'US'

print(f"PROJECT_ID       = {PROJECT_ID}")
print(f"PROJECT_NUM      = {PROJECT_NUM}")
print(f"VPC_NETWORK_NAME = {VPC_NETWORK_NAME}")
print(f"LOCATION         = {LOCATION}")
print(f"REGION           = {REGION}")
print(f"BQ_LOCATION      = {BQ_LOCATION}")

PROJECT_ID       = myproject32549
PROJECT_NUM      = 683169793466
VPC_NETWORK_NAME = ucaip-haystack-vpc-network
LOCATION         = us-central1
REGION           = us-central1
BQ_LOCATION      = US


In [3]:
APP                      = 'sp'
MODEL_TYPE               = '2tower'
FRAMEWORK                = 'tfrs'
DATA_VERSION             = "v1" # version tag for dataflow pipeline | "v2-0-0" # v1-0-0
TRACK_HISTORY            = 5        # length of playlist tracks to consider

In [4]:
# GCS bucket and paths
BUCKET_NAME              = f'{PREFIX}-{PROJECT_ID}-bucket'
BUCKET_URI               = f'gs://{BUCKET_NAME}'
SOURCE_BUCKET            = 'spotify-million-playlist-dataset'

# Location to write TF-Records
DATA_GCS_PREFIX          = "data"
DATA_PATH                = f"{BUCKET_URI}/{DATA_GCS_PREFIX}"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = 'vocab_dict.pkl'

CANDIDATE_PREFIX         = 'candidates'
TRAIN_DIR_PREFIX         = 'train'
VALID_DIR_PREFIX         = 'valid'

VPC_NETWORK_FULL         = f"projects/{PROJECT_NUM}/global/networks/{VPC_NETWORK_NAME}"

# BigQuery parameters
BQ_DATASET               = 'spotify_e2e_test'
BQ_TABLE_TRAIN           = 'train_flatten_last_5'
BQ_TABLE_VALID           = 'train_flatten_valid_last_5'
BQ_TABLE_CANDIDATES      = 'candidates'

# repo
REPO_SRC                 = 'src'
PIPELINES_SUB_DIR        = 'feature_pipes'

# container registry
REPOSITORY               = f'{PREFIX}-spotify'
IMAGE_NAME               = f'train-{VERSION}'
REMOTE_IMAGE_NAME        = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE_NAME}"
DOCKERNAME               = f'tfrs'

# serving images
SERVING_IMAGE_URI_CPU    = 'us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-11:latest'
SERVING_IMAGE_URI_GPU    = 'us-docker.pkg.dev/vertex-ai/prediction/tf2-gpu.2-11:latest'

print(f"BUCKET_NAME              : {BUCKET_NAME}")
print(f"BUCKET_URI               : {BUCKET_URI}")
print(f"SOURCE_BUCKET            : {SOURCE_BUCKET}")

print(f"DATA_GCS_PREFIX          : {DATA_GCS_PREFIX}")
print(f"DATA_PATH                : {DATA_PATH}")
print(f"VOCAB_SUBDIR             : {VOCAB_SUBDIR}")
print(f"VOCAB_FILENAME           : {VOCAB_FILENAME}")

print(f"CANDIDATE_PREFIX         : {CANDIDATE_PREFIX}")
print(f"TRAIN_DIR_PREFIX         : {TRAIN_DIR_PREFIX}")
print(f"VALID_DIR_PREFIX         : {VALID_DIR_PREFIX}")

print(f"VPC_NETWORK_FULL         : {VPC_NETWORK_FULL}")

print(f"BQ_DATASET               : {BQ_DATASET}")
print(f"BQ_TABLE_TRAIN           : {BQ_TABLE_TRAIN}")
print(f"BQ_TABLE_VALID           : {BQ_TABLE_VALID}")
print(f"BQ_TABLE_CANDIDATES      : {BQ_TABLE_CANDIDATES}")

print(f"REPO_SRC                 : {REPO_SRC}")
print(f"PIPELINES_SUB_DIR        : {PIPELINES_SUB_DIR}")

print(f"REPOSITORY               : {REPOSITORY}")
print(f"IMAGE_NAME               : {IMAGE_NAME}")
print(f"REMOTE_IMAGE_NAME        : {REMOTE_IMAGE_NAME}")
print(f"DOCKERNAME               : {DOCKERNAME}")

print(f"SERVING_IMAGE_URI_CPU    : {SERVING_IMAGE_URI_CPU}")
print(f"SERVING_IMAGE_URI_GPU    : {SERVING_IMAGE_URI_GPU}")

BUCKET_NAME              : ndr-v1-myproject32549-bucket
BUCKET_URI               : gs://ndr-v1-myproject32549-bucket
SOURCE_BUCKET            : spotify-million-playlist-dataset
DATA_GCS_PREFIX          : data
DATA_PATH                : gs://ndr-v1-myproject32549-bucket/data
VOCAB_SUBDIR             : vocabs
VOCAB_FILENAME           : vocab_dict.pkl
CANDIDATE_PREFIX         : candidates
TRAIN_DIR_PREFIX         : train
VALID_DIR_PREFIX         : valid
VPC_NETWORK_FULL         : projects/683169793466/global/networks/ucaip-haystack-vpc-network
BQ_DATASET               : spotify_e2e_test
BQ_TABLE_TRAIN           : train_flatten_last_5
BQ_TABLE_VALID           : train_flatten_valid_last_5
BQ_TABLE_CANDIDATES      : candidates
REPO_SRC                 : src
PIPELINES_SUB_DIR        : feature_pipes
REPOSITORY               : ndr-v1-spotify
IMAGE_NAME               : train-v1
REMOTE_IMAGE_NAME        : us-central1-docker.pkg.dev/myproject32549/ndr-v1-spotify/train-v1
DOCKERNAME               :

In [5]:
# create bucket
! gsutil mb -l $REGION $BUCKET_URI

Creating gs://ndr-v1-myproject32549-bucket/...
ServiceException: 409 A Cloud Storage bucket named 'ndr-v1-myproject32549-bucket' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [6]:
# ! gsutil iam ch serviceAccount:{VERTEX_SA}:roles/storage.objects.get $BUCKET_URI
# ! gsutil iam ch serviceAccount:{VERTEX_SA}:roles/storage.objects.get $BUCKET_URI

## Save Notebook Configuration Data
If you want to avoid having to re-enter these across notebooks

In [7]:
config = f"""
PROJECT_ID               = \"{PROJECT_ID}\"
PROJECT_NUM              = \"{PROJECT_NUM}\"
LOCATION                 = \"{LOCATION}\"

REGION                   = \"{REGION}\"
BQ_LOCATION              = \"{BQ_LOCATION}\"
VPC_NETWORK_NAME         = \"{VPC_NETWORK_NAME}\"

VERTEX_SA                = \"{VERTEX_SA}\"

PREFIX                   = \"{PREFIX}\"
VERSION                  = \"{VERSION}\"

APP                      = \"{APP}\"
MODEL_TYPE               = \"{MODEL_TYPE}\"
FRAMEWORK                = \"{FRAMEWORK}\"
DATA_VERSION             = \"{DATA_VERSION}\"
TRACK_HISTORY            = \"{TRACK_HISTORY}\"

BUCKET_NAME              = \"{BUCKET_NAME}\"
BUCKET_URI               = \"{BUCKET_URI}\"
SOURCE_BUCKET            = \"{SOURCE_BUCKET}\"

DATA_GCS_PREFIX          = \"{DATA_GCS_PREFIX}\"
DATA_PATH                = \"{DATA_PATH}\"
VOCAB_SUBDIR             = \"{VOCAB_SUBDIR}\"
VOCAB_FILENAME           = \"{VOCAB_FILENAME}\"

CANDIDATE_PREFIX         = \"{CANDIDATE_PREFIX}\"
TRAIN_DIR_PREFIX         = \"{TRAIN_DIR_PREFIX}\"
VALID_DIR_PREFIX         = \"{VALID_DIR_PREFIX}\"

VPC_NETWORK_FULL         = \"{VPC_NETWORK_FULL}\"

BQ_DATASET               = \"{BQ_DATASET}\"
BQ_TABLE_TRAIN           = \"{BQ_TABLE_TRAIN}\"
BQ_TABLE_VALID           = \"{BQ_TABLE_VALID}\"
BQ_TABLE_CANDIDATES      = \"{BQ_TABLE_CANDIDATES}\"

REPO_SRC                 = \"{REPO_SRC}\"
PIPELINES_SUB_DIR        = \"{PIPELINES_SUB_DIR}\"

REPOSITORY               = \"{REPOSITORY}\"
IMAGE_NAME               = \"{IMAGE_NAME}\"
REMOTE_IMAGE_NAME        = \"{REMOTE_IMAGE_NAME}\"
DOCKERNAME               = \"{DOCKERNAME}\"

SERVING_IMAGE_URI_CPU    = \"{SERVING_IMAGE_URI_CPU}\"
SERVING_IMAGE_URI_GPU    = \"{SERVING_IMAGE_URI_GPU}\"
"""
print(config)


PROJECT_ID               = "myproject32549"
PROJECT_NUM              = "683169793466"
LOCATION                 = "us-central1"

REGION                   = "us-central1"
BQ_LOCATION              = "US"
VPC_NETWORK_NAME         = "ucaip-haystack-vpc-network"

VERTEX_SA                = "683169793466-compute@developer.gserviceaccount.com"

PREFIX                   = "ndr-v1"
VERSION                  = "v1"

APP                      = "sp"
MODEL_TYPE               = "2tower"
FRAMEWORK                = "tfrs"
DATA_VERSION             = "v1"
TRACK_HISTORY            = "5"

BUCKET_NAME              = "ndr-v1-myproject32549-bucket"
BUCKET_URI               = "gs://ndr-v1-myproject32549-bucket"
SOURCE_BUCKET            = "spotify-million-playlist-dataset"

DATA_GCS_PREFIX          = "data"
DATA_PATH                = "gs://ndr-v1-myproject32549-bucket/data"
VOCAB_SUBDIR             = "vocabs"
VOCAB_FILENAME           = "vocab_dict.pkl"

CANDIDATE_PREFIX         = "candidates"
TRAIN_DIR_PREFIX  

In [8]:
!echo '{config}' | gsutil cp - {BUCKET_URI}/config/notebook_env.py

Copying from <STDIN>...
/ [1 files][    0.0 B/    0.0 B]                                                
Operation completed over 1 objects.                                              


In [9]:
!gsutil ls $BUCKET_URI

gs://ndr-v1-myproject32549-bucket/config/


## Create BigQuery Dataset

In [10]:
import time
from google.cloud import bigquery

bigquery_client = bigquery.Client(project=PROJECT_ID, location=BQ_LOCATION)

In [11]:
# Create a bigquery dataset (one time operation)
# Construct a full Dataset object to send to the API.
dataset = bigquery.Dataset(f"`{PROJECT_ID}.{BQ_DATASET}`")

# TODO(developer): Specify the geographic location where the dataset should reside.
dataset.location = BQ_LOCATION

# Send the dataset to the API for creation, with an explicit timeout.
# Raises google.api_core.exceptions.Conflict if the Dataset already
# exists within the project.
dataset = bigquery_client.create_dataset(BQ_DATASET, timeout=30)  # Make an API request.
print("Created dataset {}.{}".format(bigquery_client.project, dataset.dataset_id))

Conflict: 409 POST https://bigquery.googleapis.com/bigquery/v2/projects/myproject32549/datasets?prettyPrint=false: Already Exists: Dataset myproject32549:spotify_e2e_test

## gitignore

In [None]:
%%writefile .gitignore
__init__.cpython*
candidate_pipeline.cypthon*
*.cpython-310.pyc
*.cpython-37.pyc
*-checkpoint.py*
*.ipynb_checkpoints
*.ipynb_checkpoints/*
.tensorboard-*
*WIP*
*ARCHIVED*
# .gcloudignore
# .git
.github
*__pycache__
*cpython-37.pyc
.gitignore
.DS_Store

# Pyhon byte-compiled / optimized files
__pycache__/
*.py[cod]
*$py.class

### Delete `__pycache__` directories

First run `LIST_CMD` to validate query results

In [12]:
LIST_CMD = 'find . | grep -E "(/__pycache__$|\.pyc$|\.pyo$)"'
DELETE_CMD = 'find . | grep -E "(/__pycache__$|\.pyc$|\.pyo$)" | xargs rm -rf'

# set variables if running in terminal
print("copy these commands into terminal:\n")
print(f"{LIST_CMD}")
print(f"{DELETE_CMD}")

copy these commands into terminal:

find . | grep -E "(/__pycache__$|\.pyc$|\.pyo$)"
find . | grep -E "(/__pycache__$|\.pyc$|\.pyo$)" | xargs rm -rf


**Finished**