Adds gcloud jobs and logs commands
psoto committed Aug 7, 2017
1 parent 8d025a7 commit 1073d24
Showing 1 changed file with 115 additions and 9 deletions.
luminoth/tools/cloud/gcloud.py: 124 changes (115 additions & 9 deletions)
@@ -1,5 +1,6 @@
 import click
 import os.path
+import time
 import tensorflow as tf
 import googleapiclient.discovery as discovery

@@ -9,6 +10,11 @@
 from google.oauth2 import service_account


+@click.group(help='Train models in Google Cloud ML')
+def gc():
+    pass
+
+
 def get_bucket(service_account_json, bucket_name):
     storage_client = storage.Client.from_service_account_json(
         service_account_json)
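
The @click.group added above is the mounting point for every command in this commit. How the group gets wired into luminoth's top-level CLI is outside this diff, but the usual click pattern is a sketch like the following, where the parent group name (cli) is an assumption:

# Sketch only: mounting the `gc` group into a parent click CLI.
# The parent group name (`cli`) is an assumption, not shown in this diff.
import click

from luminoth.tools.cloud.gcloud import gc

@click.group()
def cli():
    pass

cli.add_command(gc)  # exposes: cli gc train / cli gc jobs / cli gc logs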
@@ -26,14 +32,23 @@ def upload_file(bucket, base_path, filename):
     return path


-@click.command(help='Start a training job in Google Cloud ML')
+def get_credentials(file):
+    return service_account.Credentials.from_service_account_file(file)
+
+
+def cloud_service(credentials, service, version='v1'):
+    return discovery.build(service, version, credentials=credentials)
+
+
+@gc.command(help='Start a training job')
 @click.option('--job-id', help='JobId for saving models and logs.')
 @click.option('--project-id', required=True)
 @click.option('--service-account-json', required=True)
-@click.option('--bucket', 'bucket_name', required=True, help='Where to save models and logs.')
-@click.option('--dataset', required=True, help='Bucket where the dataset is located.')
-@click.option('--config')
-def gc(job_id, project_id, service_account_json, bucket_name, config, dataset):
+@click.option('--bucket', 'bucket_name', required=True, help='Where to save models and logs.')  # noqa
+@click.option('--dataset', required=True, help='Bucket where the dataset is located.')  # noqa
+@click.option('--config', help='Path to config to use in training.')
+def train(job_id, project_id, service_account_json, bucket_name, config,
+          dataset):
     args = []

     if not job_id:
@@ -60,13 +75,14 @@ def gc(job_id, project_id, service_account_json, bucket_name, config, dataset):
         path = upload_file(bucket, base_path, config)
         args.extend(['--config', 'gs://{}/{}'.format(bucket_name, path)])

-    credentials = service_account.Credentials.from_service_account_file(
-        service_account_json)
-    cloudml = discovery.build('ml', 'v1', credentials=credentials)
+    credentials = get_credentials(service_account_json)
+    cloudml = cloud_service(credentials, 'ml')

     training_inputs = {
         'scaleTier': 'BASIC_GPU',
-        'packageUris': ['gs://luminoth-config/luminoth-0.0.1-py2-none-any.whl'],
+        'packageUris': [
+            'gs://luminoth-config/luminoth-0.0.1-py2-none-any.whl'
+        ],
         'pythonModule': 'luminoth.train',
         'args': args,
         'region': 'us-central1',
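
The hunk above stops at the collapsed region where training_inputs is actually submitted. Judging by the Cloud ML Engine v1 API, the hidden call presumably resembles the sketch below; the exact body the commit builds is not visible here:

# Sketch of a Cloud ML Engine v1 job submission, assuming the hidden
# lines of this diff pair `job_id` with the `training_inputs` built above.
job_spec = {'jobId': job_id, 'trainingInput': training_inputs}
request = cloudml.projects().jobs().create(
    body=job_spec,
    parent='projects/{}'.format(project_id))
response = request.execute()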
@@ -91,3 +107,93 @@ def gc(job_id, project_id, service_account_json, bucket_name, config, dataset):
             'There was an error creating the training job. '
             'Check the details: \n{}'.format(err._get_reason())
         )
+
+
+@gc.command(help='List project jobs')
+@click.option('--project-id', required=True)
+@click.option('--service-account-json', required=True)
+@click.option('--running', is_flag=True, help='List only jobs that are running.')  # noqa
+def jobs(project_id, service_account_json, running):
+    credentials = get_credentials(service_account_json)
+    cloudml = cloud_service(credentials, 'ml')
+    request = cloudml.projects().jobs().list(
+        parent='projects/{}'.format(project_id))
+
+    try:
+        response = request.execute()
+        jobs = response.get('jobs')
+
+        if not jobs:
+            click.echo('There are no jobs for this project.')
+            return
+
+        if running:
+            jobs = [j for j in jobs if j['state'] == 'RUNNING']
+            if not jobs:
+                click.echo('There are no jobs running.')
+                return
+
+        for job in jobs:
+            click.echo('Id: {} Created: {} State: {}'.format(
+                job['jobId'], job['createTime'], job['state']))
+    except Exception as err:
+        click.echo(
+            'There was an error fetching jobs. '
+            'Check the details: \n{}'.format(err._get_reason())
+        )
+
+
+@gc.command(help='Show logs from a running job')
+@click.argument('job_id')
+@click.option('--project-id', required=True)
+@click.option('--service-account-json', required=True)
+@click.option('--polling-interval', default=60, help='Polling interval in seconds.')  # noqa
+def logs(job_id, project_id, service_account_json, polling_interval):
+    credentials = get_credentials(service_account_json)
+    cloudlog = cloud_service(credentials, 'logging', 'v2')
+
+    job_filter = 'resource.labels.job_id = "{}"'.format(job_id)
+    last_timestamp = None
+    while True:
+        filters = [job_filter]
+        if last_timestamp:
+            filters.append('timestamp > "{}"'.format(last_timestamp))
+
+        # Fetch all pages.
+        entries = []
+        next_page = None
+        while True:
+            request = cloudlog.entries().list(body={
+                'resourceNames': ['projects/{}'.format(project_id)],
+                'filter': ' AND '.join(filters),
+                'pageToken': next_page,
+            })
+
+            try:
+                response = request.execute()
+                next_page = response.get('nextPageToken', None)
+                entries.extend(response.get('entries', []))
+                if not next_page:
+                    break
+            except Exception as err:
+                click.echo(
+                    'There was an error fetching the logs. '
+                    'Check the details: \n{}'.format(err._get_reason())
+                )
+                break
+
+        for entry in entries:
+            last_timestamp = entry['timestamp']
+
+            if 'jsonPayload' in entry:
+                message = entry['jsonPayload']['message']
+            elif 'textPayload' in entry:
+                message = entry['textPayload']
+            else:
+                continue
+
+            click.echo('{:30} :: {:7} :: {}'.format(
+                entry['timestamp'], entry['severity'], message.strip()
+            ))
+
+        time.sleep(polling_interval)

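Together the three commands make a small job-management surface: train submits a job, jobs lists them, logs tails one. A minimal sketch of exercising the two new commands with click's test runner follows; the project ID, key-file path and job ID are placeholders, and the only assumption is that gc is importable from this module:

# Sketch: invoking the new commands via click's test runner.
# 'my-project', 'key.json' and 'some-job-id' are placeholders.
from click.testing import CliRunner

from luminoth.tools.cloud.gcloud import gc

runner = CliRunner()

# List only the jobs that are currently running.
result = runner.invoke(gc, [
    'jobs',
    '--project-id', 'my-project',
    '--service-account-json', 'key.json',
    '--running',
])
print(result.output)

# Tail a job's logs, polling every 30 seconds. Note that `logs`
# loops until interrupted, so from a shell this runs until Ctrl-C.
result = runner.invoke(gc, [
    'logs', 'some-job-id',
    '--project-id', 'my-project',
    '--service-account-json', 'key.json',
    '--polling-interval', '30',
])

The same flags apply when the group is invoked through whatever entry point luminoth registers for it.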