In [1]:
""" Combine zonal statistics of different indicators and calculate flux. 
-------------------------------------------------------------------------------
zonal_stats_ca_aq21ee_export.csv

Author: Rutger Hofste
Date: 20180619
Kernel: python35
Docker: rutgerhofste/gisdocker:ubuntu16.04

Args:


"""

OVERWRITE = 1
TESTING = 0
SCRIPT_NAME = "Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01"
OUTPUT_VERSION = 6

GCS_INPUT_PATH = "gs://aqueduct30_v01/Y2018M06D18_RH_QA_AQ21_AQ30_Demand_Zonal_Stats_EE_V01/output_V03"

AQ21_SHAPEFILE_S3_INPUT_PATH = "s3://wri-projects/Aqueduct30/qaData/Y2018M06D05_RH_QA_Aqueduct21_Flux_Shapefile_V01/output_V05"
AQ30_SHAPEFILE_S3_INPUT_PATH = "s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V02/output_V04/"
AQ21PROJ_SHAPEFILE_S3_INPUT_PATH = "s3://wri-projects/Aqueduct30/qaData/Y2018M06D19_RH_QA_Download_Aq21projection_Shapefile_V01/output_V01"

AQ21_INPUT_FILE_NAME = "aqueduct21_flux"
AQ30_INPUT_FILE_NAME = "hybas_lev06_v1c_merged_fiona_V04"
AQ21PROJ_INPUT_FILE_NAME = "aqueduct21projection_flux"

ECKERT_IV_PROJ4_STRING = "+proj=eck4 +lon_0=0 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs"

ec2_input_path = "/volumes/data/{}/input_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION)
ec2_output_path = "/volumes/data/{}/output_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION) 
s3_output_path = "s3://wri-projects/Aqueduct30/qaData/{}/output_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION)

print("Input GCS : " + GCS_INPUT_PATH +
      "\nInput ec2: " + ec2_input_path + 
      "\nOutput ec2: " + ec2_output_path +
      "\nOutput s3: " + ec2_output_path)

Input GCS : gs://aqueduct30_v01/Y2018M06D18_RH_QA_AQ21_AQ30_Demand_Zonal_Stats_EE_V01/output_V03
Input ec2: /volumes/data/Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01/input_V06
Output ec2: /volumes/data/Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01/output_V06
Output s3: /volumes/data/Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01/output_V06


In [2]:
import time, datetime, sys, logging
dateString = time.strftime("Y%YM%mD%d")
timeString = time.strftime("UTC %H:%M")
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2018M06D20 UTC 19:51


'3.5.4 |Anaconda, Inc.| (default, Nov 20 2017, 18:44:38) \n[GCC 7.2.0]'

In [3]:
import geopandas as gpd
import pandas as pd
pd.set_option("display.max_columns",500)

In [4]:
if OVERWRITE:
    !rm -r {ec2_input_path}
    !rm -r {ec2_output_path}
    !mkdir -p {ec2_input_path}
    !mkdir -p {ec2_output_path}
else: 
    !mkdir -p {ec2_input_path}
    !mkdir -p {ec2_output_path}

In [5]:
# Aq 21 shapefile
!aws s3 cp {AQ21_SHAPEFILE_S3_INPUT_PATH} {ec2_input_path} --recursive

download: s3://wri-projects/Aqueduct30/qaData/Y2018M06D05_RH_QA_Aqueduct21_Flux_Shapefile_V01/output_V05/aqueduct21_flux.cpg to ../../../../data/Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01/input_V06/aqueduct21_flux.cpg
download: s3://wri-projects/Aqueduct30/qaData/Y2018M06D05_RH_QA_Aqueduct21_Flux_Shapefile_V01/output_V05/aqueduct21_flux.prj to ../../../../data/Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01/input_V06/aqueduct21_flux.prj
download: s3://wri-projects/Aqueduct30/qaData/Y2018M06D05_RH_QA_Aqueduct21_Flux_Shapefile_V01/output_V05/aqueduct21_flux.shx to ../../../../data/Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01/input_V06/aqueduct21_flux.shx
download: s3://wri-projects/Aqueduct30/qaData/Y2018M06D05_RH_QA_Aqueduct21_Flux_Shapefile_V01/output_V05/aqueduct21_flux.dbf to ../../../../data/Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01/input_V06/aqueduct21_flux.dbf
download: s3://wri-projects/Aqueduct30/qaData/Y2018M06D05_RH_QA_Aqueduct21_Flux_Shapefile_V01/output_V05/aqueduc

In [6]:
# Aq 30 shapefile
!aws s3 cp {AQ30_SHAPEFILE_S3_INPUT_PATH} {ec2_input_path} --recursive --exclude "*" --include "hybas_lev06_v1c_merged_fiona_V04*"

download: s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V02/output_V04/hybas_lev06_v1c_merged_fiona_V04.cpg to ../../../../data/Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01/input_V06/hybas_lev06_v1c_merged_fiona_V04.cpg
download: s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V02/output_V04/hybas_lev06_v1c_merged_fiona_V04.prj to ../../../../data/Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01/input_V06/hybas_lev06_v1c_merged_fiona_V04.prj
download: s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V02/output_V04/hybas_lev06_v1c_merged_fiona_V04.shx to ../../../../data/Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01/input_V06/hybas_lev06_v1c_merged_fiona_V04.shx
download: s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V02/output_V04/hybas_lev06_v1c_merged_fiona_V04.dbf to ../../../../data/Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01/input_V06/hybas_lev06_v1c_merged_fiona_V04.dbf


In [7]:
# Aq 21 proj shapefile
!aws s3 cp {AQ21PROJ_SHAPEFILE_S3_INPUT_PATH} {ec2_input_path} --recursive

download: s3://wri-projects/Aqueduct30/qaData/Y2018M06D19_RH_QA_Download_Aq21projection_Shapefile_V01/output_V01/aqueduct21projection_flux.cpg to ../../../../data/Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01/input_V06/aqueduct21projection_flux.cpg
download: s3://wri-projects/Aqueduct30/qaData/Y2018M06D19_RH_QA_Download_Aq21projection_Shapefile_V01/output_V01/aqueduct21projection_flux.prj to ../../../../data/Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01/input_V06/aqueduct21projection_flux.prj
download: s3://wri-projects/Aqueduct30/qaData/Y2018M06D19_RH_QA_Download_Aq21projection_Shapefile_V01/output_V01/aqueduct21projection_flux.shx to ../../../../data/Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01/input_V06/aqueduct21projection_flux.shx
download: s3://wri-projects/Aqueduct30/qaData/Y2018M06D19_RH_QA_Download_Aq21projection_Shapefile_V01/output_V01/aqueduct21projection_flux.shp to ../../../../data/Y2018M06D19_RH_QA_AQ21_AQ30_Demand_Cleanup_V01/input_V06/aqueduct21projection_flux.shp


In [8]:
# Zonal Stats

!gsutil cp {GCS_INPUT_PATH}/* {ec2_input_path}

Copying gs://aqueduct30_v01/Y2018M06D18_RH_QA_AQ21_AQ30_Demand_Zonal_Stats_EE_V01/output_V03/zonal_stats_ca_aq21ee_export.csv...
Copying gs://aqueduct30_v01/Y2018M06D18_RH_QA_AQ21_AQ30_Demand_Zonal_Stats_EE_V01/output_V03/zonal_stats_ca_aq21projee_export.csv...
Copying gs://aqueduct30_v01/Y2018M06D18_RH_QA_AQ21_AQ30_Demand_Zonal_Stats_EE_V01/output_V03/zonal_stats_ca_aq30ee_export.csv...
Copying gs://aqueduct30_v01/Y2018M06D18_RH_QA_AQ21_AQ30_Demand_Zonal_Stats_EE_V01/output_V03/zonal_stats_cd_aq21ee_export.csv...
- [4 files][ 86.4 MiB/ 86.4 MiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m -o ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://aqueduct30_v01/Y2018M06D18_RH_QA_AQ21_AQ30_Demand_Zonal_Stats_EE_V01/output_V03/zonal_stats_cd_aq21projee_export.csv...
Copying gs://a

In [23]:
# Read Shapefiles of Aq2.1 Aq3.0 and Aq21proj

aq21_input_file_path = "{}/{}.shp".format(ec2_input_path,AQ21_INPUT_FILE_NAME)
gdf_aq21 = gpd.read_file(aq21_input_file_path )
gdf_aq21 = gdf_aq21.set_index("GU",drop=False)

aq30_input_file_path = "{}/{}.shp".format(ec2_input_path,AQ30_INPUT_FILE_NAME)
gdf_aq30 = gpd.read_file(aq30_input_file_path)
gdf_aq30["PFAF_ID_COPY"] = gdf_aq30["PFAF_ID"]
gdf_aq30 = gdf_aq30.dissolve(by='PFAF_ID_COPY', aggfunc='first')

gdf_aq30_eckert4 = gdf_aq30.to_crs(ECKERT_IV_PROJ4_STRING)
gdf_aq30["area_m2"] = gdf_aq30_eckert4.geometry.area
gdf_aq30 = gdf_aq30.set_index("PFAF_ID",drop=False)

In [21]:
aq21proj_input_file_path = "{}/{}.shp".format(ec2_input_path,AQ21PROJ_INPUT_FILE_NAME)
gdf_aq21proj = gpd.read_file(aq21proj_input_file_path )
gdf_aq21proj = gdf_aq21proj.set_index("BasinID",drop=False)

In [11]:
assert gdf_aq21proj.shape[0] == 15006

In [24]:
assert gdf_aq30.shape[0]== 16397-1 #(There is one basin with a shared PFAF_ID)

In [27]:
aqueduct_versions = ["aq21","aq30","aq21proj"]
sectors = ["a","d","i","t"]
demand_types = ["c","u"]

aqueduct_versions = ["aq30"]

for aqueduct_version in aqueduct_versions:  
    if aqueduct_version == "aq21":
        gdf_left = gdf_aq21.copy()
        index_name = "GU"
    elif aqueduct_version == "aq30":
        gdf_left = gdf_aq30.copy()
        index_name = "PFAF_ID"
    elif aqueduct_version == "aq21proj":
        gdf_left = gdf_aq21proj.copy()
        index_name = "BasinID"        
    else:
        break
    df_merge = pd.DataFrame(gdf_left[index_name])
    
    
    
    input_file_name = "zonal_stats_ct_aq30ee_export.csv"
    input_file_path = ec2_input_path + "/" + input_file_name
    df_right = pd.read_csv(input_file_path)
    
    
    
    """
    for demand_type in demand_types:
        for sector in sectors:
            print(aqueduct_version,demand_type,sector)
            input_file_name = "zonal_stats_{}{}_{}ee_export.csv".format(demand_type,sector,aqueduct_version)
            input_file_path = ec2_input_path + "/" + input_file_name
            df_right = pd.read_csv(input_file_path)
            
            df_right = df_right[["sum","count",index_name]].copy()
            df_right = df_right.set_index(index_name,drop=False)
            
            df_right = df_right.rename(columns={"sum":"sum_{}{}_m3".format(demand_type,sector),
                                                "count":"count_{}{}_dimensionless".format(demand_type,sector)})
            
            df_merge  = df_merge.merge(right=df_right,
                                       how="left",
                                       left_on = index_name,
                                       right_on = index_name)
            #gdf_left["sum_{}{}_m".format(demand_type,sector)] = gdf_left["sum_{}{}_m3".format(demand_type,sector)]/gdf_left["area_m2"]
            print(df_merge.shape)
            
    #gdf_to_disk = gdf_left.copy()
    #gdf_to_disk_geom_only = gdf_to_disk[[index_name,"geometry"]]
    #df_to_disk = pd.DataFrame(gdf_to_disk.drop("geometry",1))
    #output_file_path_no_ext = "{}/{}".format(ec2_output_path,aqueduct_version)
    
    #gdf_to_disk.to_file(driver='ESRI Shapefile', filename=output_file_path_no_ext+".shp")
    #gdf_to_disk_geom_only.to_file(driver='ESRI Shapefile', filename=output_file_path_no_ext+"_geom_only.shp")
    #df_to_disk.to_csv(output_file_path_no_ext+".csv")
    """
    

Unnamed: 0,system:index,COAST,DIST_MAIN,DIST_SINK,ENDO,HYBAS_ID,MAIN_BAS,NEXT_DOWN,NEXT_SINK,ORDER,PFAF_ID,SORT,SUB_AREA,UP_AREA,count,sum,.geo
0,0000deed656876d04411,1,0.0,0.0,0,6060013760,6060013760,0,6060013760,0,635900,1183,22525.3,22525.3,30020,334913400.0,
1,0000a3d8b52542354151,0,100.2,100.2,0,6060735790,6060013750,6060737700,6060013750,2,635804,1178,3165.1,3165.1,4174,33867170.0,
2,0000006fa5b6909cdbf5,0,136.0,136.0,0,6060736580,6060013750,6060735770,6060013750,1,635807,1179,4364.5,34578.1,5725,61184220.0,
3,00007d055559e7784057,0,277.5,277.5,0,6060744470,6060013750,6060736580,6060013750,1,635809,1181,21622.3,21623.1,28024,456593700.0,
4,0000fe52d1ca380c0136,0,277.5,277.5,0,6060744370,6060013750,6060736580,6060013750,2,635808,1182,8591.6,8591.6,11155,24667190.0,


In [None]:
for aqueduct_version in aqueduct_versions:  
    
    if aqueduct_version == "aq21":
        gdf_left = gdf_aq21.copy()
        index_name = "GU"
    elif aqueduct_version == "aq30":
        gdf_left = gdf_aq30.copy()
        index_name = "PFAF_ID"
    elif aqueduct_version == "aq21proj":
        gdf_left = gdf_aq21proj.copy()
        index_name = "BasinID"        
        
    else:
        break
    

In [None]:
gdf_left.shape

In [None]:
df_right.shape

In [None]:
!aws s3 cp {ec2_output_path} {s3_output_path} --recursive

In [None]:
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

Previous runs:  
0:08:42.263441