In [1]:
""" Union of Hybas and GADM in BigQuery.
-------------------------------------------------------------------------------

Performance has been significantly improved with the help of Google Experts on
the BigQuery forum.

Author: Rutger Hofste
Date: 20181114
Kernel: python35
Docker: rutgerhofste/gisdocker:ubuntu16.04

"""

# Script identity; the version is zero-padded into every derived output name.
SCRIPT_NAME = 'Y2018M11D14_RH_Hybas_Union_GADM_BQ_V02'
OUTPUT_VERSION = 2

# Run flags.
TESTING = 0
OVERWRITE_OUTPUT = 1

# BigQuery project, dataset and table names.
BQ_PROJECT_ID = "aqueduct30"
BQ_DATASET_NAME = "geospatial_geog_v01"
BQ_INPUT_TABLE_LEFT = "y2018m11d12_rh_hybas_rds_to_bq_v01_v01"
BQ_INPUT_TABLE_RIGHT = "y2018m11d12_rh_gadm36_level1_rds_to_bq_v01_v01"
BQ_OUTPUT_TABLE_NAME = "{}_v{:02.0f}".format(SCRIPT_NAME, OUTPUT_VERSION).lower()

# Output folders: local volume and the S3 location it is copied to.
ec2_output_path = "/volumes/data/{}/output_V{:02.0f}/".format(SCRIPT_NAME, OUTPUT_VERSION)
s3_output_path = "s3://wri-projects/Aqueduct30/processData/{}/output_V{:02.0f}/".format(SCRIPT_NAME, OUTPUT_VERSION)

# Echo the resolved settings so the run log records what was used.
settings_report = (
    "\nBQ_DATASET_NAME: ", BQ_DATASET_NAME,
    "\nBQ_INPUT_TABLE_LEFT: ", BQ_INPUT_TABLE_LEFT,
    "\nBQ_INPUT_TABLE_RIGHT: ", BQ_INPUT_TABLE_RIGHT,
    "\nBQ_OUTPUT_TABLE_NAME: ", BQ_OUTPUT_TABLE_NAME,
    "\nec2_output_path:", ec2_output_path,
    "\ns3_output_path:", s3_output_path,
)
print(*settings_report)



BQ_DATASET_NAME:  geospatial_geog_v01 
BQ_INPUT_TABLE_LEFT:  y2018m11d12_rh_hybas_rds_to_bq_v01_v01 
BQ_INPUT_TABLE_RIGHT:  y2018m11d12_rh_gadm36_level1_rds_to_bq_v01_v01 
BQ_OUTPUT_TABLE_NAME:  y2018m11d14_rh_hybas_union_gadm_bq_v02_v02 
ec2_output_path: /volumes/data/Y2018M11D14_RH_Hybas_Union_GADM_BQ_V02/output_V02/ 
s3_output_path: s3://wri-projects/Aqueduct30/processData/Y2018M11D14_RH_Hybas_Union_GADM_BQ_V02/output_V02/


In [2]:
import time, datetime, sys

# Stamp the run with the current UTC date and time. time.strftime() formats
# *local* time unless a struct_time is passed, so gmtime() is supplied
# explicitly to keep the "UTC" label true regardless of the host time zone.
utc_now = time.gmtime()
dateString = time.strftime("Y%YM%mD%d", utc_now)
timeString = time.strftime("UTC %H:%M", utc_now)

# Wall-clock start; the final cell subtracts this to report the total runtime.
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2018M11D21 UTC 15:08


'3.5.4 |Anaconda, Inc.| (default, Nov 20 2017, 18:44:38) \n[GCC 7.2.0]'

In [3]:
!rm -r {ec2_output_path}
!mkdir -p {ec2_output_path}

In [4]:
import os
import sqlalchemy
import pandas as pd
import geopandas as gpd
from google.cloud import bigquery

# Point the Google client libraries at the credentials file, and use
# BQ_PROJECT_ID for both the environment default and the explicit client so the
# two project settings cannot drift apart.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/.google.json"
os.environ["GOOGLE_CLOUD_PROJECT"] = BQ_PROJECT_ID
client = bigquery.Client(project=BQ_PROJECT_ID)

In [5]:
# SQL for a "union" overlay of the two polygon tables. The result holds three
# kinds of rows (see the `pairs` CTE):
#   1. the intersection of every intersecting (pfaf_id, gid_1) pair,
#   2. what remains of each left-table polygon after subtracting the union of
#      its intersections (gid_1 is NULL),
#   3. what remains of each right-table polygon, likewise (pfaf_id is NULL).
# Rows whose geometry is empty are dropped, and id_pfafgadm is built as
# "<pfaf_id>-<gid_1>" with 'nodata' standing in for a NULL side.
# The four positional placeholders are filled, in order, with
# dataset + left table and dataset + right table.
q = """
  -- input data
WITH
  polys1 AS (
  SELECT
    t1.pfaf_id,
    t1.geog as g
  FROM
    `{}.{}` t1 ),
  polys2 AS (
  SELECT
    t1.gid_1,
    t1.gid_0,
    t1.geog as g
  FROM
    `{}.{}` t1 ),
  -- intersections
  intersections AS (
    SELECT pfaf_id, gid_1, ST_INTERSECTION(a.g, b.g) i, a.g AS g1, b.g AS g2 
    FROM polys1 a, polys2 b WHERE ST_INTERSECTS(a.g, b.g)
  ),
  -- per-row union of intersections with this row
  diff1 AS (
    SELECT pfaf_id, ST_UNION_AGG(i) i FROM intersections GROUP BY pfaf_id
  ),
  diff2 AS (
    SELECT gid_1, ST_UNION_AGG(i) i FROM intersections GROUP BY gid_1
  ),
  -- various combinations of intersections
  pairs AS (
    SELECT pfaf_id, gid_1, i AS g FROM intersections
    UNION ALL
    SELECT p.pfaf_id, NULL, IF(i IS NULL, g, ST_DIFFERENCE(g, i)) FROM polys1 p LEFT JOIN diff1 d ON p.pfaf_id = d.pfaf_id
    UNION ALL 
    SELECT NULL, p.gid_1, IF(i IS NULL, g, ST_DIFFERENCE(g, i)) FROM polys2 p LEFT JOIN diff2 d ON p.gid_1 = d.gid_1
  )
  SELECT CONCAT(COALESCE(CAST(pfaf_id AS STRING),'nodata'),
         "-",
         COALESCE(gid_1,'nodata')) AS id_pfafgadm, 
         *
  FROM pairs WHERE NOT ST_IsEmpty(g)
""".format(BQ_DATASET_NAME,BQ_INPUT_TABLE_LEFT,BQ_DATASET_NAME,BQ_INPUT_TABLE_RIGHT)

In [6]:
# Configuration for the overlay query job. BigQuery's default write disposition
# for a query with a destination table is WRITE_EMPTY, which makes the job fail
# once the output table already holds data (i.e. on every re-run). When
# OVERWRITE_OUTPUT is set, replace the existing table contents instead.
job_config = bigquery.QueryJobConfig()
if OVERWRITE_OUTPUT:
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

In [7]:
# Handle for the dataset named by BQ_DATASET_NAME; the output table is created under it.
destination_dataset_ref = client.dataset(BQ_DATASET_NAME)

In [8]:
# Handle for the output table (BQ_OUTPUT_TABLE_NAME) inside that dataset.
destination_table_ref = destination_dataset_ref.table(BQ_OUTPUT_TABLE_NAME)

In [9]:
# Send the query result to the output table; a later cell reads it back from there.
job_config.destination = destination_table_ref

In [10]:
# Submit the overlay SQL held in `q` using the job configuration built above.
query_job = client.query(
    query=q,
    job_config=job_config,
)

In [11]:
# Retrieve the job result. `rows` is not used by later cells, which query the
# destination table instead.
rows = query_job.result()

In [12]:
# Read-back query against the output table: pfaf_id and gid_1 are renamed to
# id / name and the geography is serialized with ST_AsGeoJSON.
# NOTE(review): LIMIT 10 is unconditional (the TESTING flag is not consulted),
# so the GeoPackage written from this result holds at most 10 rows — confirm
# that a sample export is what is intended.
# This rebinds `q`, replacing the overlay SQL defined earlier.
q = """
SELECT
    pfaf_id as id,
    gid_1 as name,
    ST_AsGeoJSON(g) geom   
FROM 
    {}.{} 
LIMIT 
    10
""".format(BQ_DATASET_NAME,BQ_OUTPUT_TABLE_NAME)

In [13]:
# Run the read-back query (standard SQL dialect) and load the rows into pandas.
df = pd.read_gbq(
    query=q,
    dialect='standard',
)

In [14]:
# Preview the first rows returned by the read-back query.
df.head()

Unnamed: 0,id,name,geom
0,752176,MEX.7_1,"{ ""type"": ""Polygon"", ""coordinates"": [ [ [-103...."
1,614000,GRD.2_1,"{ ""type"": ""Polygon"", ""coordinates"": [ [ [-61.6..."
2,122722,ZWE.6_1,"{ ""type"": ""Polygon"", ""coordinates"": [ [ [28.90..."
3,294691,IRQ.16_1,"{ ""type"": ""Polygon"", ""coordinates"": [ [ [42.06..."
4,641607,URY.17_1,"{ ""type"": ""MultiPolygon"", ""coordinates"": [ [ [..."


In [15]:
import json
from shapely.geometry import MultiPolygon, shape

In [16]:
def _as_multipolygon(geojson_text):
    """Parse a GeoJSON geometry string into a shapely geometry.

    A Polygon is promoted to a single-part MultiPolygon so polygonal rows share
    one geometry type. Any other geometry, including one that is already a
    MultiPolygon (the query does return those), is passed through unchanged
    rather than being nested inside another MultiPolygon.
    """
    geometry = shape(json.loads(geojson_text))
    if geometry.geom_type == "Polygon":
        return MultiPolygon([geometry])
    return geometry


# Series.apply takes no axis argument; the stray positional `1` was dropped.
df["geom_shapely"] = df["geom"].apply(_as_multipolygon)

In [17]:
# Drop the raw GeoJSON text column now that shapely geometries exist.
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally.
df = df.drop("geom", axis=1)

In [18]:
# Wrap the frame in a GeoDataFrame, using the shapely column as its geometry.
gdf = gpd.GeoDataFrame(data=df,geometry="geom_shapely")

In [19]:
# Assign EPSG:4326 as the coordinate reference system of the GeoDataFrame.
# NOTE(review): the "+init=" spelling is reported as deprecated by newer pyproj
# releases — confirm before upgrading the environment.
gdf.crs = "+init=epsg:4326"

In [20]:
# Full path of the GeoPackage inside the local output folder. os.path.join is
# used because ec2_output_path already ends with "/", so gluing the parts with
# another "/" produced a doubled separator in the path.
output_file_path = os.path.join(
    ec2_output_path,
    "{}_V{:02.0f}.gpkg".format(SCRIPT_NAME, OUTPUT_VERSION),
)

In [21]:
# Write the GeoDataFrame to disk as a GeoPackage at the path built above.
gdf.to_file(
    filename=output_file_path,
    driver="GPKG",
)

In [22]:
# Copy everything in the local output folder to the matching S3 prefix.
!aws s3 cp {ec2_output_path} {s3_output_path} --recursive

Completed 192.0 KiB/192.0 KiB (59.2 KiB/s) with 1 file(s) remainingupload: ../../../../data/Y2018M11D14_RH_Hybas_Union_GADM_BQ_V02/output_V02/Y2018M11D14_RH_Hybas_Union_GADM_BQ_V02_V02.gpkg to s3://wri-projects/Aqueduct30/processData/Y2018M11D14_RH_Hybas_Union_GADM_BQ_V02/output_V02/Y2018M11D14_RH_Hybas_Union_GADM_BQ_V02_V02.gpkg


In [23]:
# Report the total wall-clock runtime, measured from `start` (set in the
# timestamp cell near the top of the notebook).
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

0:09:21.846651


Previous Runs:  
0:13:06.158866  
0:09:21.846651
    