In [1]:
""" Add the FAO Names to the HydroBasins shapefile.
-------------------------------------------------------------------------------
Spatially join FAO Names hydrobasins to the official HydroBasins level 6 
polygons

Author: Rutger Hofste
Date: 20170825
Kernel: python35
Docker: rutgerhofste/gisdocker:ubuntu16.04

Args:
    TESTING (Boolean) : Toggle testing case.
    SCRIPT_NAME (string) : Script name.
    OUTPUT_VERSION (integer) : output version.
    DATABASE_ENDPOINT (string) : RDS or postGreSQL endpoint.
    DATABASE_NAME (string) : Database name.
    TABLE_NAME_AREA_30SPFAF06 (string) : Table name used for areas. Must exist
        on same database as used in rest of script.
    S3_INPUT_PATH_RIVERDISCHARGE (string) : AWS S3 input path for 
        riverdischarge.    
    S3_INPUT_PATH_DEMAND (string) : AWS S3 input path for 
        demand.    

"""

SCRIPT_NAME = "Y2017M08D25_RH_spatial_join_FAONames_V01"
OUTPUT_VERSION = 7

S3_INPUT_PATH_FAO = "s3://wri-projects/Aqueduct30/processData/Y2017M08D23_RH_Buffer_FAONames_V01/output_V02/"
S3_INPUT_PATH_HYBAS = "s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V02/output_V04/"

INPUT_FILE_NAME_FAO = "hydrobasins_fao_fiona_merged_buffered_v01.shp"
INPUT_FILE_NAME_HYBAS = "hybas_lev06_v1c_merged_fiona_V04.shp"

OUTPUT_FILE_NAME = "hybas_lev06_v1c_merged_fiona_withFAO_V%0.2d" %(OUTPUT_VERSION)

ec2_input_path = "/volumes/data/{}/input_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION)
ec2_output_path = "/volumes/data/{}/output_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION)
s3_output_path = "s3://wri-projects/Aqueduct30/processData/{}/output_V{:02.0f}/".format(SCRIPT_NAME,OUTPUT_VERSION)

print("Input ec2: " + ec2_input_path,
      "\nInput s3 FAO: " + S3_INPUT_PATH_FAO,
      "\nInput s3 Hybas: " + S3_INPUT_PATH_HYBAS,
      "\nOutput ec2: " + ec2_output_path,
      "\nOutput s3: " + s3_output_path)



Input ec2: /volumes/data/Y2017M08D25_RH_spatial_join_FAONames_V01/input_V07 
Input s3 FAO: s3://wri-projects/Aqueduct30/processData/Y2017M08D23_RH_Buffer_FAONames_V01/output_V02/ 
Input s3 Hybas: s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V02/output_V04/ 
Output ec2: /volumes/data/Y2017M08D25_RH_spatial_join_FAONames_V01/output_V07 
Output s3: s3://wri-projects/Aqueduct30/processData/Y2017M08D25_RH_spatial_join_FAONames_V01/output_V07/


In [2]:
import time, datetime, sys
dateString = time.strftime("Y%YM%mD%d")
timeString = time.strftime("UTC %H:%M")
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2018M05D24 UTC 11:11


'3.5.4 |Anaconda, Inc.| (default, Nov 20 2017, 18:44:38) \n[GCC 7.2.0]'

In [3]:
import os
if 'GDAL_DATA' not in os.environ:
    os.environ['GDAL_DATA'] = r'/usr/share/gdal/2.1'
from osgeo import gdal,ogr,osr
'GDAL_DATA' in os.environ
# If false, the GDAL_DATA variable is set incorrectly. You need this variable to obtain the spatial reference
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import time
%matplotlib notebook

In [4]:
!rm -r {ec2_input_path}
!rm -r {ec2_output_path}

!mkdir -p {ec2_input_path}
!mkdir -p {ec2_output_path}

rm: cannot remove '/volumes/data/Y2017M08D25_RH_spatial_join_FAONames_V01/input_V07': No such file or directory
rm: cannot remove '/volumes/data/Y2017M08D25_RH_spatial_join_FAONames_V01/output_V07': No such file or directory


In [5]:
!aws s3 cp {S3_INPUT_PATH_FAO} {ec2_input_path} --recursive 

download: s3://wri-projects/Aqueduct30/processData/Y2017M08D23_RH_Buffer_FAONames_V01/output_V02/hydrobasins_fao_fiona_merged_buffered_v01.cpg to ../../../../data/Y2017M08D25_RH_spatial_join_FAONames_V01/input_V07/hydrobasins_fao_fiona_merged_buffered_v01.cpg
download: s3://wri-projects/Aqueduct30/processData/Y2017M08D23_RH_Buffer_FAONames_V01/output_V02/hydrobasins_fao_fiona_merged_buffered_v01.prj to ../../../../data/Y2017M08D25_RH_spatial_join_FAONames_V01/input_V07/hydrobasins_fao_fiona_merged_buffered_v01.prj
download: s3://wri-projects/Aqueduct30/processData/Y2017M08D23_RH_Buffer_FAONames_V01/output_V02/hydrobasins_fao_fiona_merged_buffered_v01.shx to ../../../../data/Y2017M08D25_RH_spatial_join_FAONames_V01/input_V07/hydrobasins_fao_fiona_merged_buffered_v01.shx
download: s3://wri-projects/Aqueduct30/processData/Y2017M08D23_RH_Buffer_FAONames_V01/output_V02/hydrobasins_fao_fiona_merged_buffered_v01.dbf to ../../../../data/Y2017M08D25_RH_spatial_join_FAONames_V01/input_V07/hydrob

In [6]:
!aws s3 cp {S3_INPUT_PATH_HYBAS} {ec2_input_path} --recursive --exclude *.tif

download: s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V02/output_V04/hybas_lev00_v1c_merged_fiona_V04.cpg to ../../../../data/Y2017M08D25_RH_spatial_join_FAONames_V01/input_V07/hybas_lev00_v1c_merged_fiona_V04.cpg
download: s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V02/output_V04/hybas_lev06_v1c_merged_fiona_V04.prj to ../../../../data/Y2017M08D25_RH_spatial_join_FAONames_V01/input_V07/hybas_lev06_v1c_merged_fiona_V04.prj
download: s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V02/output_V04/hybas_lev00_v1c_merged_fiona_V04.prj to ../../../../data/Y2017M08D25_RH_spatial_join_FAONames_V01/input_V07/hybas_lev00_v1c_merged_fiona_V04.prj
download: s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V02/output_V04/hybas_lev06_v1c_merged_fiona_V04.cpg to ../../../../data/Y2017M08D25_RH_spatial_join_FAONames_V01/input_V07/hybas_lev06_v1c_merged_fiona_V04.cpg
download: s3://wri-proje

In [7]:
gdfFAO = gpd.read_file(os.path.join(ec2_input_path,INPUT_FILE_NAME_FAO))

In [8]:
list(gdfFAO)

['area',
 'SUB_BAS',
 'TO_BAS',
 'MAJ_BAS',
 'SUB_NAME',
 'MAJ_NAME',
 'SUB_AREA',
 'MAJ_AREA',
 'LEGEND',
 'index1',
 'geometry']

In [9]:
gdfHybas = gpd.read_file(os.path.join(ec2_input_path,INPUT_FILE_NAME_HYBAS))

In [10]:
list(gdfHybas)

['HYBAS_ID',
 'NEXT_DOWN',
 'NEXT_SINK',
 'MAIN_BAS',
 'DIST_SINK',
 'DIST_MAIN',
 'SUB_AREA',
 'UP_AREA',
 'PFAF_ID',
 'ENDO',
 'COAST',
 'ORDER',
 'SORT',
 'geometry']

In [11]:
gdfHybas.dtypes

HYBAS_ID       int64
NEXT_DOWN      int64
NEXT_SINK      int64
MAIN_BAS       int64
DIST_SINK    float64
DIST_MAIN    float64
SUB_AREA     float64
UP_AREA      float64
PFAF_ID        int64
ENDO           int64
COAST          int64
ORDER          int64
SORT           int64
geometry      object
dtype: object

In [12]:
gdfFAO.dtypes

area        float64
SUB_BAS       int64
TO_BAS        int64
MAJ_BAS       int64
SUB_NAME     object
MAJ_NAME     object
SUB_AREA      int64
MAJ_AREA      int64
LEGEND        int64
index1        int64
geometry     object
dtype: object

In [13]:
gdfFAO['index1_copy'] = gdfFAO['index1']

In [14]:
gdfFAO = gdfFAO.set_index('index1')

In [15]:
gdfFAO.index.name

'index1'

A spatial join was performed on the data. However the FAO polygons were stored as polygons and not as multi-polygons. The data also lacked a unique Identifier. The identifier consists of a combination of MAJ_BAS and SUB_BASE. The maximum length of MAJ_BAS is 4 and 6 for SUB_BAS (279252). We will store the identifier as a string with the format: MAJ_BASxxxxSUB_BASxxxxxxx (4,7)

In [16]:
gdfFAO['FAOid'] = gdfFAO.apply(lambda x:'MAJ_BAS_%0.4d_SUB_BAS_%0.7d' % (x['MAJ_BAS'],x['SUB_BAS']),axis=1)

In [17]:
gdfFAO.index.name

'index1'

In [18]:
dfFAO = gdfFAO.drop('geometry',1)

In [19]:
dfFAO.head()

Unnamed: 0_level_0,area,SUB_BAS,TO_BAS,MAJ_BAS,SUB_NAME,MAJ_NAME,SUB_AREA,MAJ_AREA,LEGEND,index1_copy,FAOid
index1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,12.432765,1001,1019,5001,Herlen Gol / Hulun Nur,Amur,104012,2086009,1,0,MAJ_BAS_5001_SUB_BAS_0001001
1,7.367752,1002,1006,5001,Onon,Amur,59873,2086009,1,1,MAJ_BAS_5001_SUB_BAS_0001002
2,6.239719,1003,-888,5001,Solonchak Zun Torey / Solonchak,Amur,50635,2086009,1,2,MAJ_BAS_5001_SUB_BAS_0001003
3,4.837981,1004,1011,5001,Ingoda,Amur,37746,2086009,1,3,MAJ_BAS_5001_SUB_BAS_0001004
4,1.094161,1005,1010,5001,Aga,Amur,8627,2086009,1,4,MAJ_BAS_5001_SUB_BAS_0001005


In [20]:
gdfFAO['FAOid_copy'] = gdfFAO['FAOid']

In [21]:
gdfFAO.index.name

'index1'

In [22]:
list(gdfFAO)

['area',
 'SUB_BAS',
 'TO_BAS',
 'MAJ_BAS',
 'SUB_NAME',
 'MAJ_NAME',
 'SUB_AREA',
 'MAJ_AREA',
 'LEGEND',
 'geometry',
 'index1_copy',
 'FAOid',
 'FAOid_copy']

In [23]:
gdfFAO = gdfFAO.dissolve(by='FAOid')

In [24]:
list(gdfFAO)

['geometry',
 'area',
 'SUB_BAS',
 'TO_BAS',
 'MAJ_BAS',
 'SUB_NAME',
 'MAJ_NAME',
 'SUB_AREA',
 'MAJ_AREA',
 'LEGEND',
 'index1_copy',
 'FAOid_copy']

In [25]:
dfFAO = gdfFAO.drop('geometry',1)

In [26]:
dfFAO.head()

Unnamed: 0_level_0,area,SUB_BAS,TO_BAS,MAJ_BAS,SUB_NAME,MAJ_NAME,SUB_AREA,MAJ_AREA,LEGEND,index1_copy,FAOid_copy
FAOid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
MAJ_BAS_1001_SUB_BAS_0001001,0.865953,1001,1005,1001,Upper Roanoke,"Gulf of Mexico, North Atlantic Coast",8689,701385,1,21235,MAJ_BAS_1001_SUB_BAS_0001001
MAJ_BAS_1001_SUB_BAS_0001002,0.150201,1002,1004,1001,Banister,"Gulf of Mexico, North Atlantic Coast",1540,701385,1,21236,MAJ_BAS_1001_SUB_BAS_0001002
MAJ_BAS_1001_SUB_BAS_0001003,0.435553,1003,1004,1001,Upper Dan,"Gulf of Mexico, North Atlantic Coast",4403,701385,1,21237,MAJ_BAS_1001_SUB_BAS_0001003
MAJ_BAS_1001_SUB_BAS_0001004,0.414318,1004,1005,1001,Lower Dan,"Gulf of Mexico, North Atlantic Coast",4204,701385,1,21238,MAJ_BAS_1001_SUB_BAS_0001004
MAJ_BAS_1001_SUB_BAS_0001005,0.634339,1005,-999,1001,Lower Roanoke,"Gulf of Mexico, North Atlantic Coast",6496,701385,1,21239,MAJ_BAS_1001_SUB_BAS_0001005


In [27]:
validGeom = gdfFAO.geometry.is_valid

In [28]:
gdfFAO.crs = {'init': u'epsg:4326'}

In [29]:
gdfFAO = gdfFAO.set_index('index1_copy')

In [30]:
gdfJoined = gpd.sjoin(gdfHybas, gdfFAO ,how="left", op='intersects')

In [31]:
gdfJoined.shape

(24078, 25)

In [32]:
series = gdfJoined.groupby('PFAF_ID')['SUB_NAME'].apply(list)
series2 = gdfJoined.groupby('PFAF_ID')['MAJ_NAME'].apply(list)
series3 = gdfJoined.groupby('PFAF_ID')['FAOid_copy'].apply(list)

In [33]:
df_new1 = series.to_frame()
df_new2 = series2.to_frame()
df_new3 = series3.to_frame()

In [34]:
df_new1.head()

Unnamed: 0_level_0,SUB_NAME
PFAF_ID,Unnamed: 1_level_1
111011,[Wadi El Naqa]
111012,[Egyptian east coast]
111013,[Egyptian east coast]
111014,[Egyptian east coast]
111015,[Egyptian east coast]


In [35]:
df_out = df_new1.merge(right = df_new2, how = "outer", left_index = True, right_index = True )

In [36]:
df_out = df_out.merge(right = df_new3, how = "outer", left_index = True, right_index = True )

In [37]:
df_out.dtypes

SUB_NAME      object
MAJ_NAME      object
FAOid_copy    object
dtype: object

In [38]:
df_out.to_csv(os.path.join(ec2_output_path,OUTPUT_FILE_NAME+".csv"),encoding="UTF-8")

In [39]:
df_out.to_pickle(os.path.join(ec2_output_path,OUTPUT_FILE_NAME+".pkl"))

## Linking table

In [40]:
df_link = gdfJoined[["PFAF_ID","FAOid_copy"]]

In [41]:
print(os.path.join(ec2_output_path,OUTPUT_FILE_NAME+".csv"))

/volumes/data/Y2017M08D25_RH_spatial_join_FAONames_V01/output_V07/hybas_lev06_v1c_merged_fiona_withFAO_V07.csv


In [42]:
df_link.to_csv(os.path.join(ec2_output_path,OUTPUT_FILE_NAME+"_link.csv"),encoding="UTF-8")

In [43]:
df_link.to_pickle(os.path.join(ec2_output_path,OUTPUT_FILE_NAME+"_link.pkl"))

In [44]:
df_link.head()

Unnamed: 0,PFAF_ID,FAOid_copy
0,811101,MAJ_BAS_1010_SUB_BAS_0010075
1,811102,MAJ_BAS_1010_SUB_BAS_0010075
1,811102,MAJ_BAS_1010_SUB_BAS_0010085
1,811102,MAJ_BAS_1010_SUB_BAS_0010088
2,811103,MAJ_BAS_1010_SUB_BAS_0010075


In [45]:
!aws s3 cp {ec2_output_path} {s3_output_path} --recursive

upload: ../../../../data/Y2017M08D25_RH_spatial_join_FAONames_V01/output_V07/hybas_lev06_v1c_merged_fiona_withFAO_V07_link.pkl to s3://wri-projects/Aqueduct30/processData/Y2017M08D25_RH_spatial_join_FAONames_V01/output_V07/hybas_lev06_v1c_merged_fiona_withFAO_V07_link.pkl
upload: ../../../../data/Y2017M08D25_RH_spatial_join_FAONames_V01/output_V07/hybas_lev06_v1c_merged_fiona_withFAO_V07_link.csv to s3://wri-projects/Aqueduct30/processData/Y2017M08D25_RH_spatial_join_FAONames_V01/output_V07/hybas_lev06_v1c_merged_fiona_withFAO_V07_link.csv
upload: ../../../../data/Y2017M08D25_RH_spatial_join_FAONames_V01/output_V07/hybas_lev06_v1c_merged_fiona_withFAO_V07.csv to s3://wri-projects/Aqueduct30/processData/Y2017M08D25_RH_spatial_join_FAONames_V01/output_V07/hybas_lev06_v1c_merged_fiona_withFAO_V07.csv
upload: ../../../../data/Y2017M08D25_RH_spatial_join_FAONames_V01/output_V07/hybas_lev06_v1c_merged_fiona_withFAO_V07.pkl to s3://wri-projects/Aqueduct30/processData/Y2017M08D25_RH_spatial_jo

In [46]:
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

0:06:58.588292


Previous Runs:  
0:06:43.013521