In [1]:
from google.cloud import aiplatform
from datetime import datetime
from IPython.core.magic import register_line_cell_magic

@register_line_cell_magic
def writetemplate(line, cell=None):
    """Write a cell to a file after filling in ``{name}`` placeholders.

    Meant to be used as a cell magic: put ``%%writetemplate <path>`` on the
    first line of a cell. The remainder of the cell is rendered with
    ``str.format`` against this module's globals and written to ``<path>``,
    replacing any existing file.

    Args:
        line: Text following the magic name; used as the output file path.
        cell: Body of the cell (the template). It is ``None`` when the magic
            is invoked in line form (``%writetemplate <path>``), which has no
            body to render.

    Raises:
        ValueError: If invoked in line form, i.e. without a cell body.
        KeyError, IndexError: Propagated from ``str.format`` when the template
            references a placeholder that cannot be resolved.
    """
    if cell is None:
        # The decorator registers the line form too; fail with a clear message
        # instead of a missing-argument TypeError.
        raise ValueError(
            "writetemplate must be used as a cell magic: "
            "%%writetemplate <path> followed by the template body"
        )

    # Render before opening the file: opening with 'w' truncates it, so a
    # template error must surface while the previous contents are still intact.
    rendered = cell.format(**globals())
    with open(line, 'w') as f:
        f.write(rendered)

def get_timestamp():
    """Return the current local time as a compact ``YYYYMMDDHHMMSS`` string."""
    current_time = datetime.now()
    return f"{current_time:%Y%m%d%H%M%S}"

# Google Cloud project used when initialising the Vertex AI SDK.
PROJECT_ID = "mwrite-a835"

# Trainer distribution identity (name and version are also substituted into
# the setup.cfg template) and the bucket the built package is uploaded to.
PACKAGE_NAME = "mpr-research-trainer"
PACKAGE_VERSION = "0.0.3"
PACKAGE_GCS_BUCKET_NAME = "mpr-research-package"

# Local path of the built sdist, and the GCS object it is copied to.
source_package_file_name = (
    f"./dist/{PACKAGE_NAME}-{PACKAGE_VERSION}.tar.gz"
)
python_package_gcs_uri = f"gs://{PACKAGE_GCS_BUCKET_NAME}/trainer-{PACKAGE_VERSION}.tar.gz"

# Module handed to the training job as python_module_name.
python_module_name = "trainer.task"

# Container image handed to the training job as container_uri.
PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI = "us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-11:latest"

# Set the SDK-wide defaults. The staging bucket is passed as a full gs:// URI,
# which is the form the SDK documents for staging_bucket; a bare bucket name
# is not guaranteed to be accepted by every code path that stages artifacts.
aiplatform.init(
    project=PROJECT_ID,
    staging_bucket=f"gs://{PACKAGE_GCS_BUCKET_NAME}",
)

In [2]:
%%writetemplate ./setup.cfg
# This cell is rendered by the writetemplate magic: the body is passed through
# str.format with the notebook globals before being written to ./setup.cfg.
# Any literal curly brace added to this file must therefore be doubled.
[options]

# src layout: the package root is mapped to ./src and packages are found there.
package_dir =
    = src
python_requires = >=3.7
packages = find:

# Dependencies of the trainer package, each pinned with ==.
install_requires =
    transformers == 4.19
    datasets == 2.2
    torch == 1.11
    tqdm == 4.64
    openpyxl == 3.0
    cloudml-hypertune == 0.1.0.dev6


include_package_data = True

[options.packages.find]
where = src

# name and version below are placeholders filled in from the notebook
# variables PACKAGE_NAME and PACKAGE_VERSION when the cell is rendered.
[metadata]
name = {PACKAGE_NAME}
version = {PACKAGE_VERSION}
# author = Kapotaksha
# author_email = takposha@umich.edu
description = Used to train and tune MWrite data using PeerBERT
long_description = file: README.md
long_description_content_type = text/markdown
# url = https://github.com/pypa/sampleproject
# project_urls =
#    Bug Tracker = https://github.com/pypa/sampleproject/issues
classifiers =
    Programming Language :: Python :: 3
    Operating System :: OS Independent

In [3]:
# Build the trainer distribution from setup.cfg with the `build` front end.
# The upload step reads the resulting .tar.gz from ./dist.
!python -m build

[1m* Creating virtualenv isolated environment...[0m
[1m* Installing packages in isolated environment... (setuptools>=42)[0m
[1m* Getting dependencies for sdist...[0m
running egg_info
creating src/mpr_research_trainer.egg-info
writing src/mpr_research_trainer.egg-info/PKG-INFO
writing dependency_links to src/mpr_research_trainer.egg-info/dependency_links.txt
writing requirements to src/mpr_research_trainer.egg-info/requires.txt
writing top-level names to src/mpr_research_trainer.egg-info/top_level.txt
writing manifest file 'src/mpr_research_trainer.egg-info/SOURCES.txt'
reading manifest file 'src/mpr_research_trainer.egg-info/SOURCES.txt'
writing manifest file 'src/mpr_research_trainer.egg-info/SOURCES.txt'
[1m* Building sdist...[0m
running sdist
running egg_info
writing src/mpr_research_trainer.egg-info/PKG-INFO
writing dependency_links to src/mpr_research_trainer.egg-info/dependency_links.txt
writing requirements to src/mpr_research_trainer.egg-info/requires.txt
writing top-le

In [4]:
# Copy the locally built sdist to its GCS location, then list the object to
# confirm the upload (size and timestamp).
# NOTE(review): the local filename uses the hyphenated package name; newer
# setuptools releases may normalise sdist names to underscores - confirm the
# path in source_package_file_name if this copy fails.
!gsutil cp {source_package_file_name} {python_package_gcs_uri}
!gsutil ls -l {python_package_gcs_uri}

Copying file://./dist/mpr-research-trainer-0.0.3.tar.gz [Content-Type=application/x-tar]...
/ [1 files][  6.0 KiB/  6.0 KiB]                                                
Operation completed over 1 objects/6.0 KiB.                                      
      6177  2022-07-18T15:29:12Z  gs://mpr-research-package/trainer-0.0.3.tar.gz
TOTAL: 1 objects, 6177 bytes (6.03 KiB)


In [5]:
# Submit one single-replica GPU training job per tier level.
#
# Each job is started with sync=False, so job.run() returns without waiting
# for training to finish and the jobs run concurrently. Because `job` and
# `model` are rebound on every iteration, every submitted job is also kept in
# `submitted_jobs` so that none of the handles is lost (for example to wait on
# or inspect the jobs in a later cell).
submitted_jobs = []

for tierLevel in [1,2]:
    # The tier level is part of the name, so the jobs stay distinguishable
    # even when both are created within the same second.
    JOB_NAME = f"{PACKAGE_NAME}-tierLevel-{tierLevel}-pytorch-pkg-ar-{get_timestamp()}"

    job = aiplatform.CustomPythonPackageTrainingJob(
        display_name=JOB_NAME,
        python_package_gcs_uri=python_package_gcs_uri,
        python_module_name=python_module_name,
        container_uri=PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI,
    )

    # Command-line arguments forwarded to the trainer module; their meaning is
    # defined by trainer.task, which is not shown in this notebook.
    training_args = ['--tier-level', str(tierLevel), '--hidden-layers', '-1' ]

    model = job.run(
        replica_count=1,
        machine_type='n1-standard-8',
        accelerator_type='NVIDIA_TESLA_V100',
        accelerator_count=1,
        args=training_args,
        sync=False,
    )
    submitted_jobs.append(job)


Training Output directory:
gs://mpr-research-package/aiplatform-custom-training-2022-07-18-15:29:17.429 
Training Output directory:
gs://mpr-research-package/aiplatform-custom-training-2022-07-18-15:29:17.437 
View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/6184632234838851584?project=824680261042
View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/8976864003808559104?project=824680261042
CustomPythonPackageTrainingJob projects/824680261042/locations/us-central1/trainingPipelines/6184632234838851584 current state:
PipelineState.PIPELINE_STATE_RUNNING
View backing custom job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/4848769587556122624?project=824680261042
CustomPythonPackageTrainingJob projects/824680261042/locations/us-central1/trainingPipelines/8976864003808559104 current state:
PipelineState.PIPELINE_STATE_RUNNING
View backing custom job:
https://console.cloud.google.