In [6]:
# --- Notebook configuration: edit these values for your own environment ---

# Replace with your PROJECT
PROJECT = "qwiklabs-gcp-ml-49b827b781ab"

# Replace with your BUCKET
BUCKET = "qwiklabs-gcp-ml-49b827b781ab"

# Choose an available region for Cloud MLE
REGION = "us-central1"

# TF version for CMLE to use
TFVERSION = "1.14"

In [None]:
# Library imports for the notebook.
# NOTE(review): apache_beam, shutil and os are not referenced in the cells
# visible here; presumably they are needed further down - confirm before
# removing any of them.
import tensorflow as tf
import apache_beam as beam
import shutil
import os
print(tf.__version__)  # Show which TensorFlow version this kernel has loaded

In [1]:
# BigQuery SQL that builds the babyweight dataset from the public natality
# sample table.
#
# * CTE_Raw_Data keeps rows after year 2000 whose weight, mother's age,
#   plurality, gestation weeks and month are all positive, casts is_male to a
#   string, and derives `hashmonth` as the absolute FARM_FINGERPRINT of the
#   concatenated year and month.
# * The final result is a UNION ALL of two views over that CTE, so every source
#   row appears twice:
#     - "Ultrasound":    is_male kept as-is, plurality mapped to a detailed
#                        label (Single ... Quintuplets, otherwise "NULL").
#     - "No ultrasound": is_male replaced by "Unknown", plurality collapsed to
#                        "Single(1)" or "Multiple(2+)".
# * `hashmonth` is carried through so that later cells can split the rows with
#   MOD(hashmonth, 100).
query = """
WITH CTE_Raw_Data AS (
SELECT
  weight_pounds,
  CAST(is_male AS STRING) AS is_male,
  mother_age,
  plurality,
  gestation_weeks,
  ABS(FARM_FINGERPRINT(CONCAT(CAST(YEAR AS STRING), CAST(month AS STRING)))) AS hashmonth
FROM
  publicdata.samples.natality
WHERE
  year > 2000
  AND weight_pounds > 0
  AND mother_age > 0
  AND plurality > 0
  AND gestation_weeks > 0
  AND month > 0)

-- Ultrasound
SELECT
  weight_pounds,
  is_male,
  mother_age,
  CASE
    WHEN plurality = 1 THEN "Single(1)"
    WHEN plurality = 2 THEN "Twins(2)"
    WHEN plurality = 3 THEN "Triplets(3)"
    WHEN plurality = 4 THEN "Quadruplets(4)"
    WHEN plurality = 5 THEN "Quintuplets(5)"
    ELSE "NULL"
  END AS plurality,
  gestation_weeks,
  hashmonth
FROM
  CTE_Raw_Data
UNION ALL
-- No ultrasound
SELECT
  weight_pounds,
  "Unknown" AS is_male,
  mother_age,
  CASE
    WHEN plurality = 1 THEN "Single(1)"
    WHEN plurality > 1 THEN "Multiple(2+)"
  END AS plurality,
  gestation_weeks,
  hashmonth
FROM
    CTE_Raw_Data
"""

In [2]:
from google.api_core.exceptions import Conflict
from google.cloud import bigquery

# Construct a BigQuery client object.
client = bigquery.Client()

# Set dataset_id to the ID of the dataset to create.
dataset_name = "temp_babyweight_dataset"
dataset_id = "{}.{}".format(client.project, dataset_name)

# Construct a full Dataset object to send to the API.
dataset = bigquery.Dataset.from_string(dataset_id)

# Specify the geographic location where the dataset should reside.
dataset.location = "US"

# Send the dataset to the API for creation.
# create_dataset raises google.api_core.exceptions.Conflict if the Dataset
# already exists within the project. Only that case is treated as benign;
# any other failure (credentials, permissions, network, ...) propagates
# instead of being misreported as "already exists".
try:
    dataset = client.create_dataset(dataset)  # API request
    print("Created dataset {}.{}".format(client.project, dataset.dataset_id))
except Conflict:
    print("Dataset {}.{} already exists".format(client.project, dataset.dataset_id))

Dataset qwiklabs-gcp-ml-49b827b781ab.temp_babyweight_dataset already exists


In [4]:
# WHERE predicates that split the rows by bucketing hashmonth into 100 bins:
# [0, 80) -> train, [80, 90) -> eval; any other step name falls back to
# [90, 100).
split_predicates = {
    "train": "MOD(hashmonth, 100) < 80",
    "eval": "MOD(hashmonth, 100) >= 80 AND MOD(hashmonth, 100) < 90",
}
fallback_predicate = "MOD(hashmonth, 100) >= 90"

# A single config object is reused for every split; only the destination
# table changes between iterations.
job_config = bigquery.QueryJobConfig()
job_config.write_disposition = "WRITE_TRUNCATE"

for step in ["train", "eval"]:
    selquery = "SELECT * FROM ({}) WHERE {}".format(
        query, split_predicates.get(step, fallback_predicate))

    # Set the destination table
    table_name = "babyweight_{}".format(step)
    table_ref = client.dataset(dataset_name).table(table_name)
    job_config.destination = table_ref

    # Start the query, passing in the extra configuration.
    # Location must match that of the dataset(s) referenced in the query
    # and of the destination table.
    query_job = client.query(
        query=selquery, location="US", job_config=job_config
    )  # API request - starts the query
    query_job.result()  # Waits for the query to finish

    print("Query results loaded to table {}".format(table_ref.path))

Query results loaded to table /projects/qwiklabs-gcp-ml-49b827b781ab/datasets/temp_babyweight_dataset/tables/babyweight_train
Query results loaded to table /projects/qwiklabs-gcp-ml-49b827b781ab/datasets/temp_babyweight_dataset/tables/babyweight_eval


<img src="20191206_bigquery_1.png" alt="Drawing" style="width: 800px;"/>

In [7]:
# Reference to the dataset that holds the per-split tables.
dataset_ref = client.dataset(dataset_id=dataset_name, project=PROJECT)

for step in ["train", "eval"]:
    table_name = "babyweight_{}".format(step)
    table_ref = dataset_ref.table(table_name)

    # The "*" in the object name is a wildcard for the exported shard files.
    destination_uri = "gs://{}/babyweight/bq_data/{}*.csv".format(BUCKET, step)

    # Location must match that of the source table.
    extract_job = client.extract_table(
        table_ref, destination_uri, location="US"
    )  # API request
    extract_job.result()  # Waits for job to complete.

    print("Exported {}:{}.{} to {}".format(PROJECT, dataset_name, table_name, destination_uri))

Exported qwiklabs-gcp-ml-49b827b781ab:temp_babyweight_dataset.babyweight_train to gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/bq_data/train*.csv
Exported qwiklabs-gcp-ml-49b827b781ab:temp_babyweight_dataset.babyweight_eval to gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/bq_data/eval*.csv


<img src="20191206_storage_1.png" alt="Drawing" style="width: 800px;"/>

# View results

In [9]:
# List the exported objects whose names contain the shard suffix 000000000000.
# $BUCKET is filled in from the notebook's BUCKET variable before the shell
# command runs.
!gsutil ls gs://$BUCKET/babyweight/bq_data/*000000000000*

gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/bq_data/eval000000000000.csv
gs://qwiklabs-gcp-ml-49b827b781ab/babyweight/bq_data/train000000000000.csv
