In [1]:
""" Upload WHYMAP geospatial data to RDS.
-------------------------------------------------------------------------------

The script requires a file called .password to be stored in the current working
directory with the password to the database.

Please note that columns with uppercase should be referred to by using double 
quotes whereas strings need single quotes. Please note that the script will 
consolidate two polygons in Russia that spans two hemispheres into one.

Edit: 2019 07 09 added shapefile to export format.


Author: Rutger Hofste
Date: 20181114
Kernel: python35
Docker: rutgerhofste/gisdocker:ubuntu16.04

Args:
    TESTING (Boolean) : Toggle testing case.
    SCRIPT_NAME (string) : Script name.
    OUTPUT_VERSION (integer) : output version.
    DATABASE_ENDPOINT (string) : RDS or postGreSQL endpoint.
    DATABASE_NAME (string) : Database name.
    TABLE_NAME_AREA_30SPFAF06 (string) : Table name used for areas. Must exist
        on same database as used in rest of script.
    S3_INPUT_PATH_RIVERDISCHARGE (string) : AWS S3 input path for 
        riverdischarge.    
    S3_INPUT_PATH_DEMAND (string) : AWS S3 input path for 
        demand.    

"""

SCRIPT_NAME = "Y2018M11D14_RH_WHYMAP_to_RDS_V01"
OUTPUT_VERSION= 1

S3_INPUT_PATH = "s3://wri-projects/Aqueduct30/rawData/Deltares/groundwater/Final_Oct_2017/data/Spatial_Units/"
INPUT_FILENAME = "whymap_wgs1984.shp" 

# Database settings
RDS_DATABASE_ENDPOINT = "aqueduct30v05.cgpnumwmfcqc.eu-central-1.rds.amazonaws.com"
RDS_DATABASE_NAME = "database01"
OUTPUT_TABLE_NAME = "{}_v{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION).lower()

ec2_input_path = "/volumes/data/{}/input_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION)
ec2_output_path = "/volumes/data/{}/output_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION)
s3_output_path = "s3://wri-projects/Aqueduct30/processData/{}/output_V{:02.0f}/".format(SCRIPT_NAME,OUTPUT_VERSION)

print("\nInput ec2: " + ec2_input_path,
      "\nInput s3 : " + S3_INPUT_PATH,
      "\nOutput postGIS table : " + OUTPUT_TABLE_NAME)



Input ec2: /volumes/data/Y2018M11D14_RH_WHYMAP_to_RDS_V01/input_V01 
Input s3 : s3://wri-projects/Aqueduct30/rawData/Deltares/groundwater/Final_Oct_2017/data/Spatial_Units/ 
Output postGIS table : y2018m11d14_rh_whymap_to_rds_v01_v01


In [2]:
import time, datetime, sys
dateString = time.strftime("Y%YM%mD%d")
timeString = time.strftime("UTC %H:%M")
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2019M07D09 UTC 11:40


'3.5.4 |Anaconda, Inc.| (default, Nov 20 2017, 18:44:38) \n[GCC 7.2.0]'

In [3]:
!rm -r {ec2_input_path}
!rm -r {ec2_output_path}

!mkdir -p {ec2_input_path}
!mkdir -p {ec2_output_path}

rm: cannot remove '/volumes/data/Y2018M11D14_RH_WHYMAP_to_RDS_V01/input_V01': No such file or directory
rm: cannot remove '/volumes/data/Y2018M11D14_RH_WHYMAP_to_RDS_V01/output_V01': No such file or directory


In [4]:
!aws s3 cp {S3_INPUT_PATH} {ec2_input_path} --recursive --quiet

In [5]:
import os
import sqlalchemy
import geopandas as gpd
from shapely.geometry.multipolygon import MultiPolygon
from geoalchemy2 import Geometry, WKTElement

In [6]:
F = open("/.password","r")
password = F.read().splitlines()[0]
F.close()

engine = sqlalchemy.create_engine("postgresql://rutgerhofste:{}@{}:5432/{}".format(password,RDS_DATABASE_ENDPOINT,RDS_DATABASE_NAME))
connection = engine.connect()

In [7]:
input_file_path = "{}/{}".format(ec2_input_path,INPUT_FILENAME)

In [8]:
gdf = gpd.read_file(input_file_path)

In [9]:
def uploadGDFtoPostGIS(gdf,tableName,saveIndex):
    # this function uploads a polygon shapefile to table in AWS RDS. 
    # It handles combined polygon/multipolygon geometry and stores it in valid multipolygon in epsg 4326.
    
    # gdf = input geoDataframe
    # tableName = postGIS table name (string)
    # saveIndex = save index column in separate column in postgresql, otherwise discarded. (Boolean)
    
    
    gdf["type"] = gdf.geometry.geom_type    
    geomTypes = ["Polygon","MultiPolygon"]
    
    for geomType in geomTypes:
        gdfType = gdf.loc[gdf["type"]== geomType]
        geomTypeLower = str.lower(geomType)
        gdfType['geom'] = gdfType['geometry'].apply(lambda x: WKTElement(x.wkt, srid=4326))
        gdfType.drop(["geometry","type"],1, inplace=True)      
        print("Create table temp%s" %(geomTypeLower)) 
        gdfType.to_sql(
            name = "temp%s" %(geomTypeLower),
            con = engine,
            if_exists='replace',
            index= saveIndex, 
            dtype={'geom': Geometry(str.upper(geomType), srid= 4326)}
        )
        
    # Merge both tables and make valid
    sql = []
    sql.append("DROP TABLE IF EXISTS %s"  %(tableName))
    sql.append("ALTER TABLE temppolygon ALTER COLUMN geom type geometry(MultiPolygon, 4326) using ST_Multi(geom);")
    sql.append("CREATE TABLE %s AS (SELECT * FROM temppolygon UNION SELECT * FROM tempmultipolygon);" %(tableName))
    sql.append("UPDATE %s SET geom = st_makevalid(geom);" %(tableName))
    sql.append("DROP TABLE temppolygon,tempmultipolygon")

    for statement in sql:
        print(statement)
        result = connection.execute(statement)    
    gdfFromSQL =gpd.GeoDataFrame.from_postgis("select * from %s" %(tableName),connection,geom_col='geom' )
    return gdfFromSQL

In [10]:
gdf.head()

Unnamed: 0,AqID,geometry
0,0,POLYGON ((-27.61888898379225 81.47499585950835...
1,1,POLYGON ((-39.03055869110457 83.28637865242807...
2,2,POLYGON ((-38.84027676756097 83.10748143284015...
3,3,POLYGON ((-41.25250193478435 83.28526081079953...
4,4,POLYGON ((-42.14888349535629 83.24136222699917...


In [11]:
gdf.columns = map(str.lower, gdf.columns)

In [12]:
gdfFromSQL = uploadGDFtoPostGIS(gdf,OUTPUT_TABLE_NAME,False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Create table temppolygon
Create table tempmultipolygon
DROP TABLE IF EXISTS y2018m11d14_rh_whymap_to_rds_v01_v01
ALTER TABLE temppolygon ALTER COLUMN geom type geometry(MultiPolygon, 4326) using ST_Multi(geom);
CREATE TABLE y2018m11d14_rh_whymap_to_rds_v01_v01 AS (SELECT * FROM temppolygon UNION SELECT * FROM tempmultipolygon);
UPDATE y2018m11d14_rh_whymap_to_rds_v01_v01 SET geom = st_makevalid(geom);
DROP TABLE temppolygon,tempmultipolygon


In [13]:
filename_gpkg = "{}/{}.gpkg".format(ec2_output_path,SCRIPT_NAME)
filename_shp = "{}/{}.shp".format(ec2_output_path,SCRIPT_NAME)

In [14]:
gdfFromSQL.to_file(filename=filename_gpkg,
                   driver="GPKG",
                   encoding="UTF-8")

In [15]:
gdfFromSQL.to_file(filename=filename_shp,
                   driver="ESRI Shapefile",
                   encoding="UTF-8")

In [16]:
!aws s3 cp {ec2_output_path} {s3_output_path} --recursive

upload: ../../../../data/Y2018M11D14_RH_WHYMAP_to_RDS_V01/output_V01/Y2018M11D14_RH_WHYMAP_to_RDS_V01.cpg to s3://wri-projects/Aqueduct30/processData/Y2018M11D14_RH_WHYMAP_to_RDS_V01/output_V01/Y2018M11D14_RH_WHYMAP_to_RDS_V01.cpg
upload: ../../../../data/Y2018M11D14_RH_WHYMAP_to_RDS_V01/output_V01/Y2018M11D14_RH_WHYMAP_to_RDS_V01.shx to s3://wri-projects/Aqueduct30/processData/Y2018M11D14_RH_WHYMAP_to_RDS_V01/output_V01/Y2018M11D14_RH_WHYMAP_to_RDS_V01.shx
upload: ../../../../data/Y2018M11D14_RH_WHYMAP_to_RDS_V01/output_V01/Y2018M11D14_RH_WHYMAP_to_RDS_V01.dbf to s3://wri-projects/Aqueduct30/processData/Y2018M11D14_RH_WHYMAP_to_RDS_V01/output_V01/Y2018M11D14_RH_WHYMAP_to_RDS_V01.dbf
upload: ../../../../data/Y2018M11D14_RH_WHYMAP_to_RDS_V01/output_V01/Y2018M11D14_RH_WHYMAP_to_RDS_V01.gpkg to s3://wri-projects/Aqueduct30/processData/Y2018M11D14_RH_WHYMAP_to_RDS_V01/output_V01/Y2018M11D14_RH_WHYMAP_to_RDS_V01.gpkg
upload: ../../../../data/Y2018M11D14_RH_WHYMAP_to_RDS_V01/output_V01/Y2018

In [17]:
engine.dispose()

In [18]:
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

0:00:22.256098


Previous runs:  
0:00:15.656671  
0:00:15.656671