### Add HydroBasin data to Postgis Database server

* Purpose of script: Ingest Data from HydroBasins to postgis. Data includes geometries and attribute data
* Author: Rutger Hofste
* Kernel used: python35
* Date created: 20171110

The script requires a file called .password to be stored in the current working directory with the password to the database.

Please note that columns with uppercase should be referred to by using double quotes whereas strings need single quotes. Please note that the script will consolidate two polygons in Russia that spans two hemispheres into one. 

In [1]:
%matplotlib inline
import time, datetime, sys
dateString = time.strftime("Y%YM%mD%d")
timeString = time.strftime("UTC %H:%M")
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2017M11D23 UTC 15:34


'3.5.4 |Continuum Analytics, Inc.| (default, Aug 14 2017, 13:26:58) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]'

In [2]:
SCRIPT_NAME = "Y2017M11D15_RH_Add_HydroBasins_postGIS_V01"

INPUT_VERSION = 3
OUTPUT_VERSION= 1

EC2_INPUT_PATH = "/volumes/data/%s/input" %(SCRIPT_NAME)
EC2_OUTPUT_PATH = "/volumes/data/%s/output" %(SCRIPT_NAME)

S3_INPUT_PATH = "s3://wri-projects/Aqueduct30/processData/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/output/"

INPUT_FILENAME = "hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V%0.2d" %(INPUT_VERSION)

# Database settings
DATABASE_IDENTIFIER = "aqueduct30v02"
DATABASE_NAME = "database01"
TABLE_NAME = "hydrobasin6_v%0.2d" %(OUTPUT_VERSION)

In [3]:
!rm -r {EC2_INPUT_PATH}
!rm -r {EC2_OUTPUT_PATH}

!mkdir -p {EC2_INPUT_PATH}
!mkdir -p {EC2_OUTPUT_PATH}

In [4]:
!aws s3 cp {S3_INPUT_PATH} {EC2_INPUT_PATH} --recursive --quiet

In [5]:
import os
import boto3
import botocore
from sqlalchemy import *
import geopandas as gpd
import pandas as pd
from shapely.geometry.multipolygon import MultiPolygon
from geoalchemy2 import Geometry, WKTElement

In [6]:
def rdsConnect(database_identifier,database_name):
    rds = boto3.client('rds')
    F = open(".password","r")
    password = F.read().splitlines()[0]
    F.close()
    response = rds.describe_db_instances(DBInstanceIdentifier="%s"%(database_identifier))
    status = response["DBInstances"][0]["DBInstanceStatus"]
    print("Status:",status)
    endpoint = response["DBInstances"][0]["Endpoint"]["Address"]
    print("Endpoint:",endpoint)
    engine = create_engine('postgresql://rutgerhofste:%s@%s:5432/%s' %(password,endpoint,database_name))
    connection = engine.connect()
    return engine, connection

def uploadGDFtoPostGIS(gdf,tableName,saveIndex):
    # this function uploads a polygon shapefile to table in AWS RDS. 
    # It handles combined polygon/multipolygon geometry and stores it in valid multipolygon in epsg 4326.
    
    # gdf = input geoDataframe
    # tableName = postGIS table name (string)
    # saveIndex = save index column in separate column in postgresql, otherwise discarded. (Boolean)
    
    
    gdf["type"] = gdf.geometry.geom_type    
    geomTypes = ["Polygon","MultiPolygon"]
    
    for geomType in geomTypes:
        gdfType = gdf.loc[gdf["type"]== geomType]
        geomTypeLower = str.lower(geomType)
        gdfType['geom'] = gdfType['geometry'].apply(lambda x: WKTElement(x.wkt, srid=4326))
        gdfType.drop(["geometry","type"],1, inplace=True)      
        print("Create table temp%s" %(geomTypeLower)) 
        gdfType.to_sql(
            name = "temp%s" %(geomTypeLower),
            con = engine,
            if_exists='replace',
            index= saveIndex, 
            dtype={'geom': Geometry(str.upper(geomType), srid= 4326)}
        )
        
    # Merge both tables and make valid
    sql = []
    sql.append("DROP TABLE IF EXISTS %s"  %(tableName))
    sql.append("ALTER TABLE temppolygon ALTER COLUMN geom type geometry(MultiPolygon, 4326) using ST_Multi(geom);")
    sql.append("CREATE TABLE %s AS (SELECT * FROM temppolygon UNION SELECT * FROM tempmultipolygon);" %(tableName))
    sql.append("UPDATE %s SET geom = st_makevalid(geom);" %(tableName))
    sql.append("DROP TABLE temppolygon,tempmultipolygon")

    for statement in sql:
        print(statement)
        result = connection.execute(statement)    
    gdfFromSQL =gpd.GeoDataFrame.from_postgis("select * from %s" %(tableName),connection,geom_col='geom' )
    return gdfFromSQL


In [7]:
engine, connection = rdsConnect(DATABASE_IDENTIFIER,DATABASE_NAME)

Status: available
Endpoint: aqueduct30v02.cgpnumwmfcqc.eu-central-1.rds.amazonaws.com


In [8]:
gdf = gpd.read_file(os.path.join(EC2_INPUT_PATH,INPUT_FILENAME+".shp"))

In [9]:
gdf.shape

(16397, 2)

In [10]:
gdf.columns = map(str.lower, gdf.columns)

In [11]:
gdf = gdf.set_index("pfaf_id", drop=False)

In [12]:
gdf.head()

Unnamed: 0_level_0,pfaf_id,geometry
pfaf_id,Unnamed: 1_level_1,Unnamed: 2_level_1
611001,611001,(POLYGON ((-78.99722222222219 9.45416666666669...
611002,611002,POLYGON ((-77.00416666666663 5.770833333333362...
611003,611003,POLYGON ((-76.88749999999997 7.679166666666696...
611004,611004,POLYGON ((-76.51249999999996 7.587500000000028...
611005,611005,(POLYGON ((-76.17638888888887 9.37500000000002...


Dissolve polygon in Siberia with pfaf_id 353020

In [13]:
gdf = gdf.dissolve(by="pfaf_id")

Defaulting to column but this will raise an ambiguity error in a future version
  aggregated_data = data.groupby(by=by).agg(aggfunc)
Defaulting to column but this will raise an ambiguity error in a future version
  g = self.groupby(by=by, group_keys=False)[self.geometry.name].agg(merge_geometries)


In [14]:
gdf["pfaf_id"] = gdf.index

In [15]:
gdf.shape

(16396, 2)

In [16]:
#gdf = gdf.drop_duplicates(subset="pfaf_id",keep='first')

In [17]:
df = pd.read_csv(os.path.join(EC2_INPUT_PATH,INPUT_FILENAME+".csv"))

In [18]:
df.columns = map(str.lower, df.columns)

In [19]:
df = df.drop_duplicates(subset="pfaf_id",keep='first')

In [20]:
df.dtypes

pfaf_id                   int64
hybas_id2                 int64
hybas_id                  int64
next_down                 int64
next_sink                 int64
main_bas                  int64
dist_sink               float64
dist_main               float64
sub_area                float64
up_area                 float64
pfaf_id.1                 int64
endo                      int64
coast                     int64
order                     int64
sort                      int64
upstream_hybas_ids       object
upstream_pfaf_ids        object
downstream_hybas_ids     object
downstream_pfaf_ids      object
next_sink_pfaf          float64
basin_hybas_ids          object
basin_pfaf_ids           object
sub_name                 object
maj_name                 object
faoid_copy               object
dtype: object

Select attributes that are NF 1-3 compliant

In [21]:
df2 = df[["pfaf_id","hybas_id","next_down","next_sink","main_bas","dist_sink","dist_main","sub_area","up_area","endo","coast","order","sort"]]

In [22]:
gdf2 = gdf.merge(df2,on="pfaf_id")

In [23]:
gdf2 = gdf2.set_index("pfaf_id",drop=False)

In [24]:
gdf2.head()

Unnamed: 0_level_0,geometry,pfaf_id,hybas_id,next_down,next_sink,main_bas,dist_sink,dist_main,sub_area,up_area,endo,coast,order,sort
pfaf_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
111011,"POLYGON ((32.36666666666668 29.6791666666667, ...",111011,1060000010,0,1060000010,1060000010,0.0,0.0,1890.8,1890.8,0,1,0,1
111012,"POLYGON ((31.73333333333336 29.88333333333337,...",111012,1060000100,0,1060000100,1060000100,0.0,0.0,2925.9,2925.9,0,0,1,2
111013,"POLYGON ((32.38750000000002 29.38333333333336,...",111013,1060000110,0,1060000110,1060000110,0.0,0.0,893.5,893.5,0,1,0,3
111014,"POLYGON ((31.97500000000002 29.1666666666667, ...",111014,1060000150,0,1060000150,1060000150,0.0,0.0,4217.3,4217.4,0,0,1,4
111015,(POLYGON ((33.70694444444447 27.75000000000003...,111015,1060000160,0,1060000160,1060000160,0.0,0.0,16638.1,16638.1,0,1,0,5


In [25]:
gdf2.shape

(16396, 14)

In [26]:
gdfFromSQL = uploadGDFtoPostGIS(gdf2,TABLE_NAME,False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Create table temppolygon
Create table tempmultipolygon
DROP TABLE IF EXISTS hydrobasin6_v01
ALTER TABLE temppolygon ALTER COLUMN geom type geometry(MultiPolygon, 4326) using ST_Multi(geom);
CREATE TABLE hydrobasin6_v01 AS (SELECT * FROM temppolygon UNION SELECT * FROM tempmultipolygon);
UPDATE hydrobasin6_v01 SET geom = st_makevalid(geom);
DROP TABLE temppolygon,tempmultipolygon


### Testing

In [27]:
gdfFromSQL.head()

Unnamed: 0,pfaf_id,hybas_id,next_down,next_sink,main_bas,dist_sink,dist_main,sub_area,up_area,endo,coast,order,sort,geom
0,111015,1060000160,0,1060000160,1060000160,0.0,0.0,16638.1,16638.1,0,1,0,5,(POLYGON ((34.00000000000003 26.62500000000002...
1,111019,1060001090,0,1060001090,1060001090,0.0,0.0,6566.3,6566.3,0,1,0,9,(POLYGON ((36.24444444444446 23.55833333333337...
2,111020,1060001370,0,1060001370,1060001370,0.0,0.0,11678.4,11678.5,0,0,1,10,(POLYGON ((35.14166666666669 22.58333333333336...
3,111041,1060001510,0,1060001510,1060001510,0.0,0.0,13728.7,42400.1,0,0,1,12,(POLYGON ((36.52916666666668 21.25000000000003...
4,111050,1060001520,0,1060001520,1060001520,0.0,0.0,16198.5,16198.5,0,1,0,21,"(POLYGON ((37.3402777777778 20.77916666666669,..."


In [28]:
gdfFromSQL.shape

(16396, 14)

In [29]:
connection.close()

In [30]:
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

0:04:21.254325
