### Add HydroBasins data to a PostGIS database server

* Purpose of script: Ingest the HydroBasins data produced by the previous processing step into a PostGIS database
* Author: Rutger Hofste
* Kernel used: python35
* Date created: 20171110

The script requires a file called .password to be stored in the current working directory with the password to the database.

Please note that in PostgreSQL, column names containing uppercase letters must be wrapped in double quotes, whereas string literals use single quotes.

In [1]:
import time, datetime, sys

# Format the run timestamp from an explicit UTC time tuple. time.strftime()
# without a tuple formats *local* time, so the "UTC" label was only correct on
# machines whose clock is configured for UTC.
utcNow = time.gmtime()
dateString = time.strftime("Y%YM%mD%d", utcNow)
timeString = time.strftime("UTC %H:%M", utcNow)

# Start of the run as a datetime object (local clock).
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2017M11D15 UTC 17:54


'3.5.4 |Continuum Analytics, Inc.| (default, Aug 14 2017, 13:26:58) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]'

In [2]:
# Name of this script; also used as the per-script folder under /volumes/data.
SCRIPT_NAME = "Y2017M11D15_RH_Add_HydroBasins_postGIS_V01"

# Version of the input file; rendered as a two-digit suffix in INPUT_FILENAME.
INPUT_VERSION = 1

# Local input/output folders for this script.
EC2_INPUT_PATH = "/volumes/data/{}/input".format(SCRIPT_NAME)
EC2_OUTPUT_PATH = "/volumes/data/{}/output".format(SCRIPT_NAME)

# S3 prefix that holds the input files.
S3_INPUT_PATH = "s3://wri-projects/Aqueduct30/processData/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/output/"

# Base name (no extension) shared by the .shp and .csv inputs.
INPUT_FILENAME = "hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V" + "%0.2d" % INPUT_VERSION

# Database settings
DATABASE_IDENTIFIER = "aqueduct30v01"
DATABASE_NAME = "database01"
TABLE_NAME = "hybasvalid03"

In [3]:
import os
import boto3
import botocore
from sqlalchemy import *
import geopandas as gpd
import pandas as pd
from shapely.geometry.multipolygon import MultiPolygon
from geoalchemy2 import Geometry, WKTElement

In [4]:
# Create a boto3 client for the RDS service; it is used below to look up the database instance.
rds = boto3.client('rds')

In [5]:
# Read the database password from the first line of the ".password" file in the
# current working directory. The context manager closes the handle even when
# reading or indexing fails (e.g. the file is empty), which the explicit
# open()/close() pair did not guarantee.
with open(".password","r") as F:
    password = F.read().splitlines()[0]

In [6]:
# Ask RDS to describe the instance named in the configuration cell.
response = rds.describe_db_instances(DBInstanceIdentifier=DATABASE_IDENTIFIER)

In [7]:
# Status of the first instance in the response (presumably the only one, since the
# query was made by instance identifier).
status = response["DBInstances"][0]["DBInstanceStatus"]

In [8]:
# Show the instance status so the reader can check it before connecting.
print(status)

available


In [9]:
# Host name of the database endpoint, taken from the same instance description.
endpoint = response["DBInstances"][0]["Endpoint"]["Address"]

In [10]:
# Show the endpoint that will be used in the connection URL.
print(endpoint)

aqueduct30v01.cgpnumwmfcqc.eu-central-1.rds.amazonaws.com


In [11]:
# Build the SQLAlchemy engine for the PostgreSQL database on port 5432.
# The password is percent-encoded first: characters such as "@", "/", ":" or "%"
# would otherwise be interpreted as URL syntax and break (or silently alter)
# the connection URL.
# NOTE(review): the user name is hard-coded in the URL.
from urllib.parse import quote_plus

engine = create_engine('postgresql://rutgerhofste:%s@%s:5432/%s' %(quote_plus(password),endpoint,DATABASE_NAME))

In [12]:
# Open a connection on the engine; it is used below for the raw SQL statements.
connection = engine.connect()

In [None]:
# Remove the local input/output folders of this script (recursively), then
# recreate them empty. mkdir -p also creates any missing parent folders.
!rm -r {EC2_INPUT_PATH}
!rm -r {EC2_OUTPUT_PATH}

!mkdir -p {EC2_INPUT_PATH}
!mkdir -p {EC2_OUTPUT_PATH}

In [None]:
# Copy everything under the S3 input prefix into the local input folder.
!aws s3 cp {S3_INPUT_PATH} {EC2_INPUT_PATH} --recursive --quiet

In [13]:
# Read the input shapefile (INPUT_FILENAME + ".shp") from the local input folder with geopandas.
gdf = gpd.read_file(os.path.join(EC2_INPUT_PATH,INPUT_FILENAME+".shp"))

In [14]:
# Index the rows by PFAF_ID; drop=False keeps PFAF_ID available as a regular column too.
gdf = gdf.set_index("PFAF_ID", drop=False)

In [15]:
# Lower-case every column name.
gdf.columns = [str.lower(label) for label in gdf.columns]

For PostgreSQL it is better to avoid duplicate column names across tables (the join below selects all columns from both tables), whereas in pandas sharing a column name is convenient. Renaming the columns.

In [18]:
# Rename the columns positionally to 'pfaf_id2' and 'geometry'.
# NOTE(review): this assumes gdf has exactly two columns in this order at this point;
# the execution counter jumps from 15 to 18, so confirm no removed cell is required.
gdf.columns = ['pfaf_id2', 'geometry']

In [19]:
# Preview the first rows after renaming.
gdf.head()

Unnamed: 0_level_0,pfaf_id2,geometry
PFAF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
611001,611001,(POLYGON ((-78.99722222222219 9.45416666666669...
611002,611002,POLYGON ((-77.00416666666663 5.770833333333362...
611003,611003,POLYGON ((-76.88749999999997 7.679166666666696...
611004,611004,POLYGON ((-76.51249999999996 7.587500000000028...
611005,611005,(POLYGON ((-76.17638888888887 9.37500000000002...


In [20]:
# Work on a copy and tag every row with the type of its geometry.
gdf2 = gdf.copy()
gdf2["type"] = gdf2.geometry.geom_type

# Split the rows into single polygons and multipolygons; each group is uploaded
# to its own table further down.
gdfPolygon = gdf2.loc[gdf2["type"].eq("Polygon")]
gdfMultiPolygon = gdf2.loc[gdf2["type"].eq("MultiPolygon")]

# Independent copies that the following cells modify.
gdfPolygon2, gdfMultiPolygon2 = gdfPolygon.copy(), gdfMultiPolygon.copy()

In [21]:
def _asWktElement(shape):
    """Wrap a geometry's WKT text in a GeoAlchemy2 WKTElement with SRID 4326."""
    return WKTElement(shape.wkt, srid=4326)

# Add a 'geom' column holding the database-ready form of each geometry.
gdfPolygon2['geom'] = gdfPolygon['geometry'].apply(_asWktElement)
gdfMultiPolygon2['geom'] = gdfMultiPolygon['geometry'].apply(_asWktElement)

In [22]:
# Drop the original 'geometry' column now that 'geom' carries the geometry.
# axis is passed by keyword: the positional form drop(label, 1) is deprecated
# and raises a TypeError from pandas 2.0 onwards.
gdfPolygon2.drop("geometry", axis=1, inplace=True)
gdfMultiPolygon2.drop("geometry", axis=1, inplace=True)

In [23]:
# Drop the helper 'type' column, which was only needed to split the rows.
# axis is passed by keyword: the positional form drop(label, 1) is deprecated
# and raises a TypeError from pandas 2.0 onwards.
gdfPolygon2.drop("type", axis=1, inplace=True)
gdfMultiPolygon2.drop("type", axis=1, inplace=True)

In [24]:
# Preview the polygon rows that will be uploaded.
gdfPolygon2.head()

Unnamed: 0_level_0,pfaf_id2,geom
PFAF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
611002,611002,POLYGON ((-77.00416666666663 5.770833333333362...
611003,611003,POLYGON ((-76.88749999999997 7.679166666666696...
611004,611004,POLYGON ((-76.51249999999996 7.587500000000028...
611006,611006,"POLYGON ((-76.0208333333333 7.32083333333336, ..."
611008,611008,POLYGON ((-75.18333333333331 10.53750000000002...


In [25]:
# Names of the three database tables, all derived from TABLE_NAME.
tableNamePolygon = "{}polygon".format(TABLE_NAME)
tableNameMultiPolygon = "{}multipolygon".format(TABLE_NAME)
tableNameAttributes = "{}attributes".format(TABLE_NAME)

In [26]:
# Upload the polygon rows, replacing any existing table of the same name.
# The 'geom' column is stored as a POLYGON geometry with SRID 4326.
polygonColumnTypes = {'geom': Geometry('POLYGON', srid=4326)}
gdfPolygon2.to_sql(
    tableNamePolygon,
    engine,
    if_exists='replace',
    index=False,
    dtype=polygonColumnTypes,
)

In [27]:
# Keep an untouched copy of the freshly uploaded polygon table under "<name>_pristine".
pristinePolygonTable = tableNamePolygon + "_pristine"
sql = "create table %s as select * from %s" % (pristinePolygonTable, tableNamePolygon)
result = connection.execute(sql)

In [28]:
# Upload the multipolygon rows, replacing any existing table of the same name.
# The 'geom' column is stored as a MULTIPOLYGON geometry with SRID 4326.
multiPolygonColumnTypes = {'geom': Geometry('MULTIPOLYGON', srid=4326)}
gdfMultiPolygon2.to_sql(
    tableNameMultiPolygon,
    engine,
    if_exists='replace',
    index=False,
    dtype=multiPolygonColumnTypes,
)

In [29]:
# Keep an untouched copy of the freshly uploaded multipolygon table under "<name>_pristine".
pristineMultiPolygonTable = tableNameMultiPolygon + "_pristine"
sql = "create table %s as select * from %s" % (pristineMultiPolygonTable, tableNameMultiPolygon)
result = connection.execute(sql)

In [30]:
# Read the attribute table (INPUT_FILENAME + ".csv") from the local input folder.
df = pd.read_csv(os.path.join(EC2_INPUT_PATH,INPUT_FILENAME+".csv"))

In [31]:
# Index the rows by PFAF_ID; drop=False keeps PFAF_ID available as a regular column too.
df = df.set_index("PFAF_ID", drop=False)

In [32]:
# Lower-case every column name.
df.columns = [str.lower(label) for label in df.columns]

In [33]:
# Upload the attribute table, replacing any existing table of the same name;
# index=False leaves the DataFrame index out of the table.
df.to_sql(tableNameAttributes,engine,if_exists='replace', index=False)

In [34]:
# Keep an untouched copy of the freshly uploaded attribute table under "<name>_pristine".
pristineAttributesTable = tableNameAttributes + "_pristine"
sql = "create table %s as select * from %s" % (pristineAttributesTable, tableNameAttributes)
result = connection.execute(sql)

We now have three tables: Polygons, Multipolygons and Attributes. We will perform some operations and perform a join. 

In [35]:
# Change the polygon table's geom column to type MultiPolygon (SRID 4326),
# converting the existing values with ST_Multi.
sql = "ALTER TABLE %s ALTER COLUMN geom type geometry(MultiPolygon, 4326) using ST_Multi(geom);" %(tableNamePolygon)
result = connection.execute(sql)

In [39]:
# Join the polygon table to the attribute table on the PFAF id and store the
# result as a new table. The table names now come from tableNamePolygon and
# tableNameAttributes rather than hard-coded "hybasvalid03..." literals, so the
# join follows TABLE_NAME when it changes. With the current TABLE_NAME the
# generated SQL is identical to the previous hard-coded statement.
joinedTableName = "test02"
sql = "CREATE TABLE {joined} AS SELECT * FROM {polygons}, {attributes} WHERE {attributes}.pfaf_id = {polygons}.pfaf_id2;".format(
    joined=joinedTableName,
    polygons=tableNamePolygon,
    attributes=tableNameAttributes,
)
result = connection.execute(sql)


In [None]:
# Scratch SQL kept for reference only. It is not Python, so as live code it
# raises a SyntaxError in this kernel and stops "Run All" before the connection
# is closed.
# NOTE(review): it references polygonselection and hybasvalid02attributes, which
# this notebook never creates; confirm the intended tables before reusing it.
# SELECT geom
#     FROM polygonselection, hybasvalid02attributes
#     WHERE hybasvalid02attributes.pfaf_id = polygonselection.pfaf_id;

In [None]:
# Close the database connection opened at the top of the notebook.
connection.close()