In [1]:
""" Process master shapefile and store in multiple formats.
-------------------------------------------------------------------------------

Author: Rutger Hofste
Date: 20181206
Kernel: python35
Docker: rutgerhofste/gisdocker:ubuntu16.04

"""

SCRIPT_NAME = "Y2018M12D06_RH_Master_Shape_V01"
OUTPUT_VERSION = 2

NODATA_VALUE = -9999

S3_INPUT_PATH = "s3://wri-projects/Aqueduct30/processData/Y2018M12D06_RH_Master_Shape_Dissolve_01/output"
INPUT_FILE_NAME = "Y2018M12D06_RH_Master_Shape_Dissolved_V01.shp"

BQ_PROJECT_ID = "aqueduct30"
BQ_OUTPUT_DATASET_NAME = "aqueduct30v01"

RDS_DATABASE_ENDPOINT = "aqueduct30v05.cgpnumwmfcqc.eu-central-1.rds.amazonaws.com"
RDS_DATABASE_NAME = "database01"

OUTPUT_TABLE_NAME = "{}_v{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION).lower()

ec2_input_path = "/volumes/data/{}/input_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION) 
ec2_output_path = "/volumes/data/{}/output_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION) 

s3_output_path = "s3://wri-projects/Aqueduct30/processData/{}/output_V{:02.0f}/".format(SCRIPT_NAME,OUTPUT_VERSION)

print("S3_INPUT_PATH: ",S3_INPUT_PATH,
      "\nec2_input_path: ",ec2_input_path,
      "\nec2_output_path: ",ec2_output_path,
      "\nBQ_OUTPUT_DATASET_NAME: ", BQ_OUTPUT_DATASET_NAME,
      "\nOUTPUT_TABLE_NAME: ",OUTPUT_TABLE_NAME,
      "\ns3_output_path: ", s3_output_path
      )

S3_INPUT_PATH:  s3://wri-projects/Aqueduct30/processData/Y2018M12D06_RH_Master_Shape_Dissolve_01/output 
ec2_input_path:  /volumes/data/Y2018M12D06_RH_Master_Shape_V01/input_V02 
ec2_output_path:  /volumes/data/Y2018M12D06_RH_Master_Shape_V01/output_V02 
BQ_OUTPUT_DATASET_NAME:  aqueduct30v01 
OUTPUT_TABLE_NAME:  y2018m12d06_rh_master_shape_v01_v02 
s3_output_path:  s3://wri-projects/Aqueduct30/processData/Y2018M12D06_RH_Master_Shape_V01/output_V02/


In [2]:
import time, datetime, sys
dateString = time.strftime("Y%YM%mD%d")
timeString = time.strftime("UTC %H:%M")
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2018M12D07 UTC 15:48


'3.5.4 |Anaconda, Inc.| (default, Nov 20 2017, 18:44:38) \n[GCC 7.2.0]'

In [3]:
!rm -r {ec2_input_path}
!rm -r {ec2_output_path}
!mkdir -p {ec2_input_path}
!mkdir -p {ec2_output_path}

In [4]:
!aws s3 cp {S3_INPUT_PATH} {ec2_input_path} --recursive 

download: s3://wri-projects/Aqueduct30/processData/Y2018M12D06_RH_Master_Shape_Dissolve_01/output/Y2018M12D06_RH_Master_Shape_Dissolved_V01.cpg to ../../../../data/Y2018M12D06_RH_Master_Shape_V01/input_V02/Y2018M12D06_RH_Master_Shape_Dissolved_V01.cpg
download: s3://wri-projects/Aqueduct30/processData/Y2018M12D06_RH_Master_Shape_Dissolve_01/output/Y2018M12D06_RH_Master_Shape_Dissolved_V01.shp.xml to ../../../../data/Y2018M12D06_RH_Master_Shape_V01/input_V02/Y2018M12D06_RH_Master_Shape_Dissolved_V01.shp.xml
download: s3://wri-projects/Aqueduct30/processData/Y2018M12D06_RH_Master_Shape_Dissolve_01/output/Y2018M12D06_RH_Master_Shape_Dissolved_V01.prj to ../../../../data/Y2018M12D06_RH_Master_Shape_V01/input_V02/Y2018M12D06_RH_Master_Shape_Dissolved_V01.prj
download: s3://wri-projects/Aqueduct30/processData/Y2018M12D06_RH_Master_Shape_Dissolve_01/output/Y2018M12D06_RH_Master_Shape_Dissolved_V01.sbx to ../../../../data/Y2018M12D06_RH_Master_Shape_V01/input_V02/Y2018M12D06_RH_Master_Shape_Di

In [5]:
import os
import sqlalchemy
import multiprocessing
import pandas as pd
import geopandas as gpd
import numpy as np
from google.cloud import bigquery
from shapely.geometry.multipolygon import MultiPolygon
from geoalchemy2 import Geometry, WKTElement

pd.set_option('display.max_columns', 500)

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/.google.json"
os.environ["GOOGLE_CLOUD_PROJECT"] = "aqueduct30"
client = bigquery.Client(project=BQ_PROJECT_ID)

In [6]:
F = open("/.password","r")
password = F.read().splitlines()[0]
F.close()

engine = sqlalchemy.create_engine("postgresql://rutgerhofste:{}@{}:5432/{}".format(password,RDS_DATABASE_ENDPOINT,RDS_DATABASE_NAME))
connection = engine.connect()

In [7]:
input_path = "{}/{}".format(ec2_input_path,INPUT_FILE_NAME)

In [8]:
gdf = gpd.read_file(input_path)

In [9]:
gdf.head()

Unnamed: 0,string_id,geometry
0,111011-EGY.11_1-3365,"POLYGON ((31.90590570688292 29.85788703615783,..."
1,111011-EGY.15_1-3365,"POLYGON ((32.37500000014998 30.09166666628367,..."
2,111011-EGY.15_1-None,"(POLYGON ((32.5295365298621 29.95075831581867,..."
3,111011-None-3365,(POLYGON ((32.46194054146073 29.89250514754305...
4,111011-None-None,"(POLYGON ((32.5295365298621 29.95075831581867,..."


In [10]:
gdf.shape

(68511, 2)

In [11]:
gdf.dtypes

string_id    object
geometry     object
dtype: object

In [12]:
gdf[['pfaf_id','gid_1','aqid']] = gdf.string_id.str.split('-', expand=True)

In [13]:
gdf.replace("None",str(NODATA_VALUE),inplace=True)

In [14]:
gdf["pfaf_id"] = pd.to_numeric(gdf["pfaf_id"])
gdf["aqid"] = pd.to_numeric(gdf["aqid"])

In [15]:
gdf = gdf.sort_values("string_id")

In [16]:
gdf["aq30_id"] = gdf.index

In [17]:
gdf = gdf.reindex(sorted(gdf.columns), axis=1)

In [18]:
gdf.head()

Unnamed: 0,aq30_id,aqid,geometry,gid_1,pfaf_id,string_id
0,0,3365,"POLYGON ((31.90590570688292 29.85788703615783,...",EGY.11_1,111011,111011-EGY.11_1-3365
1,1,3365,"POLYGON ((32.37500000014998 30.09166666628367,...",EGY.15_1,111011,111011-EGY.15_1-3365
2,2,-9999,"(POLYGON ((32.5295365298621 29.95075831581867,...",EGY.15_1,111011,111011-EGY.15_1-None
3,3,3365,(POLYGON ((32.46194054146073 29.89250514754305...,-9999,111011,111011-None-3365
4,4,-9999,"(POLYGON ((32.5295365298621 29.95075831581867,...",-9999,111011,111011-None-None


In [19]:
def uploadGDFtoPostGIS(gdf,tableName,saveIndex):
    # this function uploads a polygon shapefile to table in AWS RDS. 
    # It handles combined polygon/multipolygon geometry and stores it in valid multipolygon in epsg 4326.
    
    # gdf = input geoDataframe
    # tableName = postGIS table name (string)
    # saveIndex = save index column in separate column in postgresql, otherwise discarded. (Boolean)
    
    
    gdf["type"] = gdf.geometry.geom_type    
    geomTypes = ["Polygon","MultiPolygon"]
    
    for geomType in geomTypes:
        gdfType = gdf.loc[gdf["type"]== geomType]
        geomTypeLower = str.lower(geomType)
        gdfType['geom'] = gdfType['geometry'].apply(lambda x: WKTElement(x.wkt, srid=4326))
        gdfType.drop(["geometry","type"],1, inplace=True)      
        print("Create table temp%s" %(geomTypeLower)) 
        gdfType.to_sql(
            name = "temp%s" %(geomTypeLower),
            con = engine,
            if_exists='replace',
            index= saveIndex, 
            dtype={'geom': Geometry(str.upper(geomType), srid= 4326)}
        )
        
    # Merge both tables and make valid
    sql = []
    sql.append("DROP TABLE IF EXISTS %s"  %(tableName))
    sql.append("ALTER TABLE temppolygon ALTER COLUMN geom type geometry(MultiPolygon, 4326) using ST_Multi(geom);")
    sql.append("CREATE TABLE %s AS (SELECT * FROM temppolygon UNION SELECT * FROM tempmultipolygon);" %(tableName))
    sql.append("UPDATE %s SET geom = st_makevalid(geom);" %(tableName))
    sql.append("DROP TABLE temppolygon,tempmultipolygon")

    for statement in sql:
        print(statement)
        result = connection.execute(statement)    
    gdfFromSQL =gpd.GeoDataFrame.from_postgis("select * from %s" %(tableName),connection,geom_col='geom' )
    return gdfFromSQL

In [20]:
gdf.shape

(68511, 6)

In [None]:
gdfFromSQL = uploadGDFtoPostGIS(gdf,OUTPUT_TABLE_NAME,False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Create table temppolygon


In [None]:
gdfFromSQL.shape

In [None]:
gdfFromSQL.head()

In [None]:
destination_table = "{}.{}".format(BQ_OUTPUT_DATASET_NAME,OUTPUT_TABLE_NAME)

In [None]:
gdfFromSQL.to_gbq(destination_table=destination_table,
                  project_id=BQ_PROJECT_ID,
                  chunksize=1000,
                  if_exists="replace")

In [None]:
output_file_path = "{}/{}".format(ec2_output_path,SCRIPT_NAME)

In [None]:
gdf.to_pickle(output_file_path + ".pkl")

In [None]:
gdf.to_file(output_file_path + ".shp",driver="ESRI Shapefile")

In [None]:
!aws s3 cp {ec2_output_path} {s3_output_path} --recursive

In [None]:
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

Previous runs:   
0:01:12.245867  
0:48:09.273757