In [1]:
""" Add hydrobasins geometry and table to postGIS database. 
-------------------------------------------------------------------------------

The script requires a file called .password to be stored in the current working
directory with the password to the database.

Please note that columns with uppercase should be referred to by using double 
quotes whereas strings need single quotes. Please note that the script will 
consolidate two polygons in Russia that spans two hemispheres into one.

Author: Rutger Hofste
Date: 20171115
Kernel: python35
Docker: rutgerhofste/gisdocker:ubuntu16.04

Args:
    TESTING (Boolean) : Toggle testing case.
    SCRIPT_NAME (string) : Script name.
    OUTPUT_VERSION (integer) : output version.
    DATABASE_ENDPOINT (string) : RDS or postGreSQL endpoint.
    DATABASE_NAME (string) : Database name.
    TABLE_NAME_AREA_30SPFAF06 (string) : Table name used for areas. Must exist
        on same database as used in rest of script.
    S3_INPUT_PATH_RIVERDISCHARGE (string) : AWS S3 input path for 
        riverdischarge.    
    S3_INPUT_PATH_DEMAND (string) : AWS S3 input path for 
        demand.    

"""

SCRIPT_NAME = "Y2017M11D15_RH_Add_HydroBasins_postGIS_V01"
OUTPUT_VERSION= 4

S3_INPUT_PATH = "s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V02/output_V04/"
INPUT_FILENAME = "hybas_lev06_v1c_merged_fiona_V04" 

# Database settings
DATABASE_IDENTIFIER = "aqueduct30v05"
DATABASE_NAME = "database01"
OUTPUT_TABLE_NAME = "hybas06_v{:02.0f}".format(OUTPUT_VERSION)

ec2_input_path = "/volumes/data/{}/input_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION)
ec2_output_path = "/volumes/data/{}/output_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION)
s3_output_path = "s3://wri-projects/Aqueduct30/processData/{}/output_V{:02.0f}/".format(SCRIPT_NAME,OUTPUT_VERSION)

print("\nInput ec2: " + ec2_input_path,
      "\nInput s3 : " + S3_INPUT_PATH,
      "\nOutput postGIS table : " + OUTPUT_TABLE_NAME)



Input ec2: /volumes/data/Y2017M11D15_RH_Add_HydroBasins_postGIS_V01/input_V04 
Input s3 : s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V02/output_V04/ 
Output postGIS table : hybas06_v04


In [2]:
import time, datetime, sys
dateString = time.strftime("Y%YM%mD%d")
timeString = time.strftime("UTC %H:%M")
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2018M05D24 UTC 12:12


'3.5.4 |Anaconda, Inc.| (default, Nov 20 2017, 18:44:38) \n[GCC 7.2.0]'

In [3]:
!rm -r {ec2_input_path}
!rm -r {ec2_output_path}

!mkdir -p {ec2_input_path}
!mkdir -p {ec2_output_path}

rm: cannot remove '/volumes/data/Y2017M11D15_RH_Add_HydroBasins_postGIS_V01/input_V04': No such file or directory
rm: cannot remove '/volumes/data/Y2017M11D15_RH_Add_HydroBasins_postGIS_V01/output_V04': No such file or directory


In [4]:
!aws s3 cp {S3_INPUT_PATH} {ec2_input_path} --recursive --quiet

In [5]:
import os
import boto3
import botocore
import pandas as pd
import geopandas as gpd
from sqlalchemy import *
from shapely.geometry.multipolygon import MultiPolygon
from geoalchemy2 import Geometry, WKTElement

In [6]:
def rdsConnect(database_identifier,database_name):
    rds = boto3.client('rds')
    F = open("/.password","r")
    password = F.read().splitlines()[0]
    F.close()
    response = rds.describe_db_instances(DBInstanceIdentifier="%s"%(database_identifier))
    status = response["DBInstances"][0]["DBInstanceStatus"]
    print("Status:",status)
    endpoint = response["DBInstances"][0]["Endpoint"]["Address"]
    print("Endpoint:",endpoint)
    engine = create_engine('postgresql://rutgerhofste:%s@%s:5432/%s' %(password,endpoint,database_name))
    connection = engine.connect()
    return engine, connection

def uploadGDFtoPostGIS(gdf,tableName,saveIndex):
    # this function uploads a polygon shapefile to table in AWS RDS. 
    # It handles combined polygon/multipolygon geometry and stores it in valid multipolygon in epsg 4326.
    
    # gdf = input geoDataframe
    # tableName = postGIS table name (string)
    # saveIndex = save index column in separate column in postgresql, otherwise discarded. (Boolean)
    
    
    gdf["type"] = gdf.geometry.geom_type    
    geomTypes = ["Polygon","MultiPolygon"]
    
    for geomType in geomTypes:
        gdfType = gdf.loc[gdf["type"]== geomType]
        geomTypeLower = str.lower(geomType)
        gdfType['geom'] = gdfType['geometry'].apply(lambda x: WKTElement(x.wkt, srid=4326))
        gdfType.drop(["geometry","type"],1, inplace=True)      
        print("Create table temp%s" %(geomTypeLower)) 
        gdfType.to_sql(
            name = "temp%s" %(geomTypeLower),
            con = engine,
            if_exists='replace',
            index= saveIndex, 
            dtype={'geom': Geometry(str.upper(geomType), srid= 4326)}
        )
        
    # Merge both tables and make valid
    sql = []
    sql.append("DROP TABLE IF EXISTS %s"  %(tableName))
    sql.append("ALTER TABLE temppolygon ALTER COLUMN geom type geometry(MultiPolygon, 4326) using ST_Multi(geom);")
    sql.append("CREATE TABLE %s AS (SELECT * FROM temppolygon UNION SELECT * FROM tempmultipolygon);" %(tableName))
    sql.append("UPDATE %s SET geom = st_makevalid(geom);" %(tableName))
    sql.append("DROP TABLE temppolygon,tempmultipolygon")

    for statement in sql:
        print(statement)
        result = connection.execute(statement)    
    gdfFromSQL =gpd.GeoDataFrame.from_postgis("select * from %s" %(tableName),connection,geom_col='geom' )
    return gdfFromSQL


In [7]:
engine, connection = rdsConnect(DATABASE_IDENTIFIER,DATABASE_NAME)

Status: available
Endpoint: aqueduct30v05.cgpnumwmfcqc.eu-central-1.rds.amazonaws.com


In [8]:
gdf = gpd.read_file(os.path.join(ec2_input_path,INPUT_FILENAME+".shp"))

In [9]:
gdf.shape

(16397, 14)

In [10]:
gdf.columns = map(str.lower, gdf.columns)

In [11]:
gdf = gdf.set_index("pfaf_id", drop=False)

In [12]:
gdf.head()

Unnamed: 0_level_0,hybas_id,next_down,next_sink,main_bas,dist_sink,dist_main,sub_area,up_area,pfaf_id,endo,coast,order,sort,geometry
pfaf_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
811101,8060000010,0,8060000010,8060000010,0.0,0.0,928.6,928.6,811101,0,1,0,1,POLYGON ((-137.9333333333333 58.90416666666669...
811102,8060000060,0,8060000060,8060000060,0.0,0.0,28068.3,28068.3,811102,0,0,1,2,"POLYGON ((-136.425 59.8416666666667, -136.4283..."
811103,8060000070,0,8060000070,8060000070,0.0,0.0,4585.4,4585.4,811103,0,1,0,3,(POLYGON ((-139.6430555555555 59.6625000000000...
811104,8060000290,0,8060000290,8060000290,0.0,0.0,3774.6,3775.0,811104,0,0,1,4,POLYGON ((-139.6791666666666 60.74583333333334...
811105,8060000300,0,8060000300,8060000300,0.0,0.0,1351.1,1351.1,811105,0,1,0,5,"POLYGON ((-139.9791666666667 59.7791666666667,..."


Dissolve polygon in Siberia with pfaf_id 353020

In [13]:
gdf = gdf.dissolve(by="pfaf_id")

Defaulting to column but this will raise an ambiguity error in a future version
  aggregated_data = data.groupby(by=by).agg(aggfunc)
Defaulting to column but this will raise an ambiguity error in a future version
  g = self.groupby(by=by, group_keys=False)[self.geometry.name].agg(merge_geometries)


In [14]:
gdf["pfaf_id"] = gdf.index

In [15]:
gdfFromSQL = uploadGDFtoPostGIS(gdf,OUTPUT_TABLE_NAME,False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Create table temppolygon
Create table tempmultipolygon
DROP TABLE IF EXISTS hybas06_v04
ALTER TABLE temppolygon ALTER COLUMN geom type geometry(MultiPolygon, 4326) using ST_Multi(geom);
CREATE TABLE hybas06_v04 AS (SELECT * FROM temppolygon UNION SELECT * FROM tempmultipolygon);
UPDATE hybas06_v04 SET geom = st_makevalid(geom);
DROP TABLE temppolygon,tempmultipolygon


### Testing

In [16]:
gdfFromSQL.head()

Unnamed: 0,hybas_id,next_down,next_sink,main_bas,dist_sink,dist_main,sub_area,up_area,endo,coast,order,sort,pfaf_id,geom
0,1060000160,0,1060000160,1060000160,0.0,0.0,16638.1,16638.1,0,1,0,5,111015,(POLYGON ((34.00000000000003 26.62500000000002...
1,1060001090,0,1060001090,1060001090,0.0,0.0,6566.3,6566.3,0,1,0,9,111019,(POLYGON ((36.24444444444446 23.55833333333337...
2,1060001370,0,1060001370,1060001370,0.0,0.0,11678.4,11678.5,0,0,1,10,111020,(POLYGON ((35.14166666666669 22.58333333333336...
3,1060001510,0,1060001510,1060001510,0.0,0.0,13728.7,42400.1,0,0,1,12,111041,(POLYGON ((36.52916666666668 21.25000000000003...
4,1060001520,0,1060001520,1060001520,0.0,0.0,16198.5,16198.5,0,1,0,21,111050,(POLYGON ((37.19583333333335 20.00416666666668...


In [17]:
gdfFromSQL.shape

(16396, 14)

In [18]:
connection.close()

In [19]:
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

0:05:42.930054


Previous Runs:  
0:05:42.930054
