### Add HydroBasin data to Postgis Database server

* Purpose of script: Ingest Data from HydroBasins last step into a postGIS database
* Author: Rutger Hofste
* Kernel used: python35
* Date created: 20171110

The script requires a file called .password to be stored in the current working directory with the password to the database.

Please note that, in SQL, column names containing uppercase letters must be wrapped in double quotes, whereas string literals need single quotes. Please also note that the script will consolidate two polygons in Russia that span two hemispheres into one. 

In [1]:
import time, datetime, sys

# Take a single UTC snapshot so that the date and time labels agree with each
# other and with the "UTC" prefix. time.strftime() without a time tuple formats
# the machine's local time, which is only UTC when the host happens to run in UTC.
nowUtc = time.gmtime()
dateString = time.strftime("Y%YM%mD%d", nowUtc)
timeString = time.strftime("UTC %H:%M", nowUtc)
# Wall-clock start; the final cell subtracts it to report the elapsed run time.
start = datetime.datetime.now()
print(dateString,timeString)
# Last expression of the cell: shows the interpreter version in the output.
sys.version

Y2017M11D17 UTC 11:16


'3.5.4 |Continuum Analytics, Inc.| (default, Aug 14 2017, 13:26:58) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]'

In [2]:
# Name of this script; also used as the working-folder name below.
SCRIPT_NAME = "Y2017M11D15_RH_Add_HydroBasins_postGIS_V01"

# Version number embedded (zero-padded to two digits) in the input file name.
INPUT_VERSION = 3

# Local working folders, derived from the script name.
EC2_INPUT_PATH = "/volumes/data/{}/input".format(SCRIPT_NAME)
EC2_OUTPUT_PATH = "/volumes/data/{}/output".format(SCRIPT_NAME)

# S3 prefix that is copied into EC2_INPUT_PATH.
S3_INPUT_PATH = "s3://wri-projects/Aqueduct30/processData/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/output/"

# Base name (without extension) shared by the .shp and .csv inputs.
INPUT_FILENAME = "hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V{:02d}".format(INPUT_VERSION)

# Database settings
# DATABASE_IDENTIFIER is the RDS instance identifier, DATABASE_NAME the database
# inside that instance, TABLE_NAME the output table (and prefix of the helper tables).
DATABASE_IDENTIFIER = "aqueduct30v01"
DATABASE_NAME = "database01"
TABLE_NAME = "hybasvalid02"

In [3]:
import os
from urllib.parse import quote_plus

import boto3
import botocore
from sqlalchemy import *
import geopandas as gpd
import pandas as pd
from shapely.geometry.multipolygon import MultiPolygon
from geoalchemy2 import Geometry, WKTElement

In [4]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting call is visible in this notebook - confirm this magic is still needed.
%matplotlib inline

In [5]:
# Boto3 client for Amazon RDS; the following cells use it to look up the
# database instance status and endpoint. No region or credentials are passed
# here, so they presumably come from the environment / AWS config - verify.
rds = boto3.client('rds')

In [6]:
# Read the database password from the first line of the ".password" file in the
# current working directory. The with-block closes the file even if reading fails.
with open(".password","r") as F:
    passwordLines = F.read().splitlines()
# Fail with a clear message instead of an IndexError when the file is empty.
if not passwordLines:
    raise ValueError(".password is empty; expected the database password on its first line")
password = passwordLines[0]

In [7]:
# Ask RDS for the description of the configured database instance.
response = rds.describe_db_instances(DBInstanceIdentifier=DATABASE_IDENTIFIER)

In [8]:
# The first entry of the response describes the requested instance.
dbInstance = response["DBInstances"][0]
status = dbInstance["DBInstanceStatus"]

In [9]:
# Show the instance status (the recorded run printed "available").
# NOTE(review): nothing stops the notebook when the status is something else.
print(status)

available


In [10]:
# Host name of the database server, taken from the instance's endpoint record.
dbEndpoint = response["DBInstances"][0]["Endpoint"]
endpoint = dbEndpoint["Address"]

In [11]:
# Show the database host name.
# NOTE(review): the printed host name is stored in the saved notebook output;
# consider clearing this output before sharing the notebook.
print(endpoint)

aqueduct30v01.cgpnumwmfcqc.eu-central-1.rds.amazonaws.com


In [12]:
# Build the SQLAlchemy engine for the PostgreSQL/PostGIS database on port 5432.
# The password is URL-escaped with quote_plus: characters such as "@", ":" or "/"
# would otherwise be parsed as part of the URL structure and break the connection.
# NOTE(review): the user name is hard-coded in the URL.
engine = create_engine('postgresql://rutgerhofste:%s@%s:5432/%s' %(quote_plus(password),endpoint,DATABASE_NAME))

In [13]:
# Open the connection that the raw SQL statements further down are executed on;
# it is closed near the end of the notebook.
connection = engine.connect()

In [14]:
!rm -r {EC2_INPUT_PATH}
!rm -r {EC2_OUTPUT_PATH}

!mkdir -p {EC2_INPUT_PATH}
!mkdir -p {EC2_OUTPUT_PATH}

In [15]:
# Copy everything under the S3 input prefix into the local input folder;
# --quiet suppresses the per-file progress output.
!aws s3 cp {S3_INPUT_PATH} {EC2_INPUT_PATH} --recursive --quiet

In [16]:
# Load the input shapefile (INPUT_FILENAME + ".shp") from the local input folder.
gdf = gpd.read_file(os.path.join(EC2_INPUT_PATH,INPUT_FILENAME+".shp"))

In [17]:
# Lower-case all column names so they can be used unquoted in PostgreSQL.
gdf.columns = [columnName.lower() for columnName in gdf.columns]

In [18]:
# Index the rows by pfaf_id; drop=False keeps pfaf_id as a regular column too.
gdf = gdf.set_index("pfaf_id", drop=False)

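For PostgreSQL it is better if the tables that will be joined do not share column names, whereas in pandas identical column names are convenient. Therefore the `pfaf_id` column of the geometry table is renamed to `pfaf_id2`.  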

In [19]:
# Rename the id column by name instead of assigning a positional list of names:
# a positional assignment silently mislabels the columns if their order differs
# and fails outright if the shapefile carries any additional column.
# The index keeps its name pfaf_id; only the column becomes pfaf_id2.
gdf = gdf.rename(columns={"pfaf_id": "pfaf_id2"})

In [20]:
# Preview: the id column is now pfaf_id2 while the index is still named pfaf_id.
gdf.head()

Unnamed: 0_level_0,pfaf_id2,geometry
pfaf_id,Unnamed: 1_level_1,Unnamed: 2_level_1
611001,611001,(POLYGON ((-78.99722222222219 9.45416666666669...
611002,611002,POLYGON ((-77.00416666666663 5.770833333333362...
611003,611003,POLYGON ((-76.88749999999997 7.679166666666696...
611004,611004,POLYGON ((-76.51249999999996 7.587500000000028...
611005,611005,(POLYGON ((-76.17638888888887 9.37500000000002...


Dissolve polygon in Siberia with pfaf_id 353020

In [21]:
# Merge all rows that share the same index value (pfaf_id) into one geometry.
# Non-geometry columns are aggregated with dissolve's default aggfunc.
gdf = gdf.dissolve(by=gdf.index)

In [22]:
# Work on a copy and tag every row with its geometry type.
gdf2 = gdf.copy()
gdf2["type"] = gdf2.geometry.geom_type

# Split into Polygon rows and MultiPolygon rows so that each set can be
# uploaded with a matching PostGIS column type.
# NOTE(review): rows of any other geometry type end up in neither frame.
isPolygon = gdf2["type"] == "Polygon"
isMultiPolygon = gdf2["type"] == "MultiPolygon"
gdfPolygon = gdf2.loc[isPolygon]
gdfMultiPolygon = gdf2.loc[isMultiPolygon]

# Independent copies that the following cells modify for the upload.
gdfPolygon2 = gdfPolygon.copy()
gdfMultiPolygon2 = gdfMultiPolygon.copy()

In [23]:
def toWktElement(shape):
    """Wrap a shapely geometry as a geoalchemy2 WKTElement with SRID 4326."""
    return WKTElement(shape.wkt, srid=4326)

# Add the upload column "geom" to both frames.
gdfPolygon2['geom'] = gdfPolygon['geometry'].apply(toWktElement)
gdfMultiPolygon2['geom'] = gdfMultiPolygon['geometry'].apply(toWktElement)

In [24]:
# Remove the shapely "geometry" column; the "geom" column is what gets uploaded.
# axis is passed by keyword: the positional form drop(label, 1) is deprecated
# since pandas 1.3 and rejected by pandas 2.x.
gdfPolygon2.drop("geometry", axis=1, inplace=True)
gdfMultiPolygon2.drop("geometry", axis=1, inplace=True)

In [25]:
# Remove the helper "type" column; it was only needed to split the frame.
# axis is passed by keyword: the positional form drop(label, 1) is deprecated
# since pandas 1.3 and rejected by pandas 2.x.
gdfPolygon2.drop("type", axis=1, inplace=True)
gdfMultiPolygon2.drop("type", axis=1, inplace=True)

In [26]:
# Preview of the polygon frame as it will be uploaded (pfaf_id2 and geom columns).
gdfPolygon2.head()

Unnamed: 0_level_0,pfaf_id2,geom
pfaf_id,Unnamed: 1_level_1,Unnamed: 2_level_1
111011,111011,"POLYGON ((32.36666666666668 29.6791666666667, ..."
111012,111012,"POLYGON ((31.73333333333336 29.88333333333337,..."
111013,111013,"POLYGON ((32.38750000000002 29.38333333333336,..."
111014,111014,"POLYGON ((31.97500000000002 29.1666666666667, ..."
111016,111016,"POLYGON ((33.66250000000003 26.21250000000002,..."


In [27]:
# Helper tables share the output table name as prefix; they are removed again
# once the output table has been built.
tableNamePolygon = "{}polygon".format(TABLE_NAME)
tableNameMultiPolygon = "{}multipolygon".format(TABLE_NAME)
tableNameGeometries = "{}geometries".format(TABLE_NAME)
tableNameAttributes = "{}attributes".format(TABLE_NAME)
# The final, joined table.
tableNameOut = TABLE_NAME

In [28]:
# Upload the polygon rows. index=False skips the pfaf_id index (the id travels
# in the pfaf_id2 column); if_exists='replace' overwrites an earlier upload.
polygonDtype = {'geom': Geometry('POLYGON', srid=4326)}
gdfPolygon2.to_sql(
    tableNamePolygon,
    engine,
    if_exists='replace',
    index=False,
    dtype=polygonDtype
)

In [29]:
# Upload the multipolygon rows. index=False skips the pfaf_id index (the id
# travels in the pfaf_id2 column); if_exists='replace' overwrites an earlier upload.
multiPolygonDtype = {'geom': Geometry('MULTIPOLYGON', srid=4326)}
gdfMultiPolygon2.to_sql(
    tableNameMultiPolygon,
    engine,
    if_exists='replace',
    index=False,
    dtype=multiPolygonDtype
)

In [30]:
# Load the attribute table (INPUT_FILENAME + ".csv") from the local input folder.
df = pd.read_csv(os.path.join(EC2_INPUT_PATH,INPUT_FILENAME+".csv"))

In [31]:
# Lower-case all column names so they can be used unquoted in PostgreSQL.
df.columns = [columnName.lower() for columnName in df.columns]

In [32]:
# Index the rows by pfaf_id; drop=False keeps pfaf_id as a regular column too.
df = df.set_index("pfaf_id", drop=False)

In [33]:
# Preview the attribute table.
df.head()

Unnamed: 0_level_0,pfaf_id,hybas_id2,hybas_id,next_down,next_sink,main_bas,dist_sink,dist_main,sub_area,up_area,...,upstream_hybas_ids,upstream_pfaf_ids,downstream_hybas_ids,downstream_pfaf_ids,next_sink_pfaf,basin_hybas_ids,basin_pfaf_ids,sub_name,maj_name,faoid_copy
pfaf_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
611001,611001,6060000010,6060000010,0,6060000010,6060000010,0.0,0.0,4317.4,4317.4,...,[],[],[],[],611001.0,[6060000010],[611001],"['Archipielago de San Blas Coast', 'Altrato 1']","['Caribbean Coast', 'Caribbean Coast']","['MAJ_BAS_3001_SUB_BASE_0001002', 'MAJ_BAS_300..."
611002,611002,6060000200,6060000200,0,6060000200,6060000200,0.0,0.0,35995.5,35996.7,...,[],[],[],[],611002.0,[6060000200],[611002],"['Altrato 1', 'Sucio', 'Altrato 2']","['Caribbean Coast', 'Caribbean Coast', 'Caribb...","['MAJ_BAS_3001_SUB_BASE_0001003', 'MAJ_BAS_300..."
611003,611003,6060000210,6060000210,0,6060000210,6060000210,0.0,0.0,443.9,443.9,...,[],[],[],[],611003.0,[6060000210],[611003],"['Altrato 1', 'Golfo del Darien Coast']","['Caribbean Coast', 'Caribbean Coast']","['MAJ_BAS_3001_SUB_BASE_0001003', 'MAJ_BAS_300..."
611004,611004,6060000240,6060000240,0,6060000240,6060000240,0.0,0.0,2186.3,2186.3,...,[],[],[],[],611004.0,[6060000240],[611004],['Golfo del Darien Coast'],['Caribbean Coast'],['MAJ_BAS_3001_SUB_BASE_0001006']
611005,611005,6060000250,6060000250,0,6060000250,6060000250,0.0,0.0,6533.8,6533.8,...,[],[],[],[],611005.0,[6060000250],[611005],"['Golfo del Darien Coast', 'Sinu']","['Caribbean Coast', 'Caribbean Coast']","['MAJ_BAS_3001_SUB_BASE_0001006', 'MAJ_BAS_300..."


In [34]:
# Row and column count before duplicate ids are removed.
df.shape

(16397, 25)

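Drop the duplicate attribute row for the basin in Siberia with pfaf_id 353020 (its two polygons were dissolved into one geometry above); the first row is kept.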

In [35]:
# Keep only the first attribute row for every pfaf_id.
df = df.drop_duplicates(subset="pfaf_id", keep='first')

In [36]:
# Row and column count after de-duplication (one row fewer in the recorded run).
df.shape

(16396, 25)

In [37]:
# Upload the attribute table. index=False skips the pfaf_id index (pfaf_id is
# still present as a column); if_exists='replace' overwrites an earlier upload.
df.to_sql(tableNameAttributes,engine,if_exists='replace', index=False)

### Outer Join

We now have three tables: polygons, multipolygons and attributes. The polygon table is first converted to multipolygons, the two geometry tables are combined and made valid, and the result is joined to the attributes with a left outer join, so every geometry is kept.

In [38]:
# Change the geom column of the polygon table to MultiPolygon (converting the
# existing values with ST_Multi) so that both geometry tables share one column type.
sql = "ALTER TABLE %s ALTER COLUMN geom type geometry(MultiPolygon, 4326) using ST_Multi(geom);" %(tableNamePolygon)
result = connection.execute(sql)

In [39]:
# Combine the polygon and multipolygon tables into one geometries table.
# The helper table is dropped first: CREATE TABLE fails when the name already
# exists, which happens when an earlier run stopped before the cleanup cell.
connection.execute("DROP TABLE IF EXISTS %s;" %(tableNameGeometries))
# UNION ALL instead of UNION: the two inputs come from a split of one frame by
# geometry type, so they cannot contain the same row and UNION's duplicate
# elimination would only add the cost of comparing geometries.
sql = "CREATE TABLE %s AS (SELECT * FROM %s UNION ALL SELECT * FROM %s);" %(tableNameGeometries, tableNamePolygon,tableNameMultiPolygon)
result = connection.execute(sql)

In [40]:
# Replace every geometry in the geometries table with its st_makevalid result.
# NOTE(review): the geometry type of the result is not checked here - confirm
# all rows are still MultiPolygon afterwards.
sql = "update %s set geom = st_makevalid(geom);" %(tableNameGeometries)
result = connection.execute(sql)

In [41]:
# Build the output table: every geometry row (l) with its attributes (r) matched
# on pfaf_id2 = pfaf_id. LEFT JOIN keeps geometries that have no attribute row.
# NOTE(review): unlike the to_sql uploads (if_exists='replace') this CREATE TABLE
# has no guard, so re-running the notebook presumably fails here while the
# output table from a previous run still exists - drop it manually or bump TABLE_NAME.
sql = "CREATE TABLE %s AS SELECT * FROM %s l LEFT JOIN %s r ON l.pfaf_id2 = r.pfaf_id;" %(tableNameOut,tableNameGeometries,tableNameAttributes)
result = connection.execute(sql)

In [42]:
# Remove helper/duplicate id columns from the output table. IF EXISTS skips
# columns that are not present; "pfaf_id.1" needs double quotes because of the dot.
sql = 'ALTER TABLE %s DROP COLUMN IF EXISTS pfaf_id2, DROP COLUMN IF EXISTS "pfaf_id.1", DROP COLUMN IF EXISTS hybas_id2' %(tableNameOut)
result = connection.execute(sql)

In [43]:
# Clean up: remove the four helper tables, leaving only the output table.
sql = "DROP TABLE %s,%s,%s,%s" %(tableNamePolygon,tableNameMultiPolygon,tableNameAttributes,tableNameGeometries)
result = connection.execute(sql)

### Testing

In [44]:
# Query that reads the complete output table back for a sanity check.
sql = "select * from %s" %(tableNameOut)

In [45]:
# Read the output table back into a GeoDataFrame (geometry in column "geom")
# and index it by pfaf_id while keeping pfaf_id as a column.
gdfRaw = gpd.GeoDataFrame.from_postgis(sql, connection, geom_col='geom')
gdfFromSQL = gdfRaw.set_index("pfaf_id", drop=False)

In [46]:
# Preview the table as stored in the database.
gdfFromSQL.head()

Unnamed: 0_level_0,geom,pfaf_id,hybas_id,next_down,next_sink,main_bas,dist_sink,dist_main,sub_area,up_area,...,upstream_hybas_ids,upstream_pfaf_ids,downstream_hybas_ids,downstream_pfaf_ids,next_sink_pfaf,basin_hybas_ids,basin_pfaf_ids,sub_name,maj_name,faoid_copy
pfaf_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111019,(POLYGON ((36.24444444444446 23.55833333333337...,111019,1060001090,0,1060001090,1060001090,0.0,0.0,6566.3,6566.3,...,[],[],[],[],111019.0,[1060001090],[111019],['Egyptian east coast'],"['Africa, Red Sea - Gulf of Aden Coast']",['MAJ_BAS_7019_SUB_BASE_0190313']
111084,(POLYGON ((38.56250000000002 15.88750000000003...,111084,1060550700,1060525050,1060002760,1060002760,217.4,217.4,12507.3,12507.6,...,[],[],"[1060525050, 1060002760]","[111083, 111081]",111081.0,"[1060525050, 1060002760, 1060550700]","[111083, 111081, 111084]",['Nahr al Qash'],"['Africa, Red Sea - Gulf of Aden Coast']",['MAJ_BAS_7019_SUB_BASE_0190318']
111087,(POLYGON ((38.22916666666669 16.00833333333337...,111087,1060581710,1060550940,1060002760,1060002760,326.1,326.1,11457.1,16392.8,...,"[1060606710, 1060606390]","[111088, 111089]","[1060550940, 1060525050, 1060002760]","[111085, 111083, 111081]",111081.0,"[1060550940, 1060525050, 1060002760, 106060671...","[111085, 111083, 111081, 111088, 111089, 111087]",['Nahr al Qash'],"['Africa, Red Sea - Gulf of Aden Coast']",['MAJ_BAS_7019_SUB_BASE_0190318']
112012,(POLYGON ((47.55833333333336 8.212500000000016...,112012,1060965550,1060040270,1060040270,1060040270,0.2,0.2,18049.7,18050.3,...,[],[],[1060040270],[112011],112011.0,"[1060040270, 1060965550]","[112011, 112012]",['Ogaden'],"['Africa, Red Sea - Gulf of Aden Coast']",['MAJ_BAS_7019_SUB_BASE_0193491']
112046,(POLYGON ((46.65000000000002 4.733333333333354...,112046,1061028620,1061038600,1060040390,1060040390,88.7,88.7,15641.3,15641.3,...,[],[],"[1061038600, 1061043750, 1060040390]","[112045, 112043, 112041]",112041.0,"[1061038600, 1061043750, 1060040390, 1061028620]","[112045, 112043, 112041, 112046]",['Ogaden'],"['Africa, Red Sea - Gulf of Aden Coast']",['MAJ_BAS_7019_SUB_BASE_0193491']


In [47]:
# Row and column count of the stored table; in the recorded run the row count
# equals that of the de-duplicated attribute table.
gdfFromSQL.shape

(16396, 24)

In [48]:
# Release the database connection opened earlier in the notebook.
connection.close()

In [49]:
# Report the wall-clock duration of the run, measured from `start` (set in the first cell).
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

0:05:11.968857
