In [1]:
"""Post-process the aggregations from EE and combine them with other datasets.
-------------------------------------------------------------------------------

Combines the different datasets into one result table:

    indicator weights
    bws withdrawal per sector
    bwd withdrawal per sector
    iav withdrawal per sector
    sev withdrawal per sector

Author: Rutger Hofste
Date: 20190128
Kernel: python35
Docker: rutgerhofste/gisdocker:ubuntu16.04

"""

# --- Run identification -------------------------------------------------------
TESTING = 0
SCRIPT_NAME = "Y2019M01D28_RH_GA_Zonal_Stats_Table_V01"
OUTPUT_VERSION = 3

# --- Input: Google Cloud Storage folder that is copied to the local input path
GCS_INPUT_PATH = "gs://aqueduct30_v01/Y2019M01D17_RH_GA_Zonal_Stats_Weighted_Indicators_EE_V01/output_V05"

# --- BigQuery: input table and the versioned, lower-cased output table name ---
BQ_PROJECT_ID = "aqueduct30"
BQ_DATASET_NAME = "aqueduct30v01"
BQ_INPUT_TABLE_NAME = "y2018m12d04_rh_master_merge_rawdata_gpd_v02_v05"
BQ_OUTPUT_TABLE_NAME = "{}_v{:02.0f}".format(
    SCRIPT_NAME,
    OUTPUT_VERSION,
).lower()

# --- Local working folders and S3 destination, all tagged with the version ----
ec2_input_path = "/volumes/data/{}/input_V{:02.0f}".format(
    SCRIPT_NAME,
    OUTPUT_VERSION,
)
ec2_output_path = "/volumes/data/{}/output_V{:02.0f}".format(
    SCRIPT_NAME,
    OUTPUT_VERSION,
)
s3_output_path = "s3://wri-projects/Aqueduct30/processData/{}/output_V{:02.0f}/".format(
    SCRIPT_NAME,
    OUTPUT_VERSION,
)

# Echo the resolved locations so the run is traceable from the cell output.
print("GCS_INPUT_PATH: ", GCS_INPUT_PATH,
      "\nec2_input_path: ", ec2_input_path,
      "\nec2_output_path: ", ec2_output_path,
      "\ns3_output_path: ", s3_output_path,
      sep="")



GCS_INPUT_PATH: gs://aqueduct30_v01/Y2019M01D17_RH_GA_Zonal_Stats_Weighted_Indicators_EE_V01/output_V05
ec2_input_path: /volumes/data/Y2019M01D28_RH_GA_Zonal_Stats_Table_V01/input_V03
ec2_output_path: /volumes/data/Y2019M01D28_RH_GA_Zonal_Stats_Table_V01/output_V03
s3_output_path: s3://wri-projects/Aqueduct30/processData/Y2019M01D28_RH_GA_Zonal_Stats_Table_V01/output_V03/


In [2]:
import time, datetime, sys

# Format both stamps from a single UTC snapshot. time.strftime() without an
# explicit time tuple formats *local* time, which makes the "UTC" label wrong
# on any machine whose timezone is not UTC.
utc_now = time.gmtime()
dateString = time.strftime("Y%YM%mD%d", utc_now)
timeString = time.strftime("UTC %H:%M", utc_now)

# Wall-clock start; the last cell subtracts this to report the elapsed time.
start = datetime.datetime.now()
print(dateString,timeString)

# Last expression of the cell: displays the interpreter version in the output.
sys.version

Y2019M01D29 UTC 15:17


'3.5.4 |Anaconda, Inc.| (default, Nov 20 2017, 18:44:38) \n[GCC 7.2.0]'

In [3]:
# Create the local input and output folders. With -p, mkdir also creates
# missing parent folders and does not fail when the folder already exists.
!mkdir -p {ec2_input_path}
!mkdir -p {ec2_output_path}

In [4]:
# Copy every object directly under the GCS input folder into the local input
# folder (-m runs the transfers in parallel).
!gsutil -m cp {GCS_INPUT_PATH}/* {ec2_input_path}

Copying gs://aqueduct30_v01/Y2019M01D17_RH_GA_Zonal_Stats_Weighted_Indicators_EE_V01/output_V05/Dom_weighted_bwd_sumee_export.csv...
Copying gs://aqueduct30_v01/Y2019M01D17_RH_GA_Zonal_Stats_Weighted_Indicators_EE_V01/output_V05/Dom_weighted_bws_sumee_export.csv...
Copying gs://aqueduct30_v01/Y2019M01D17_RH_GA_Zonal_Stats_Weighted_Indicators_EE_V01/output_V05/Dom_weighted_cep_sumee_export.csv...
Copying gs://aqueduct30_v01/Y2019M01D17_RH_GA_Zonal_Stats_Weighted_Indicators_EE_V01/output_V05/Dom_weighted_cfr_sumee_export.csv...
Copying gs://aqueduct30_v01/Y2019M01D17_RH_GA_Zonal_Stats_Weighted_Indicators_EE_V01/output_V05/Dom_weighted_drr_sumee_export.csv...
Copying gs://aqueduct30_v01/Y2019M01D17_RH_GA_Zonal_Stats_Weighted_Indicators_EE_V01/output_V05/Dom_weighted_gtd_sumee_export.csv...
Copying gs://aqueduct30_v01/Y2019M01D17_RH_GA_Zonal_Stats_Weighted_Indicators_EE_V01/output_V05/Dom_weighted_iav_sumee_export.csv...
Copying gs://aqueduct30_v01/Y2019M01D17_RH_GA_Zonal_Stats_Weighted_In

- [70/70 files][ 11.0 MiB/ 11.0 MiB] 100% Done                                  
Operation completed over 70 objects/11.0 MiB.                                    


In [5]:
import os
import numpy as np
import pandas as pd
from google.cloud import bigquery

# Tell the Google client libraries which credentials file and default project
# to use, then create the BigQuery client for the configured project.
os.environ.update({
    "GOOGLE_APPLICATION_CREDENTIALS": "/.google.json",
    "GOOGLE_CLOUD_PROJECT": "aqueduct30",
})
client = bigquery.Client(project=BQ_PROJECT_ID)

In [6]:
# Query returning one row per (label, indicator) pair of the input table,
# with the average of "cat" over that pair, ordered by indicator and cat.
# The three placeholders are filled with project, dataset and table name.
sql_template = """
SELECT
  indicator,
  AVG(cat) AS cat,
  label
FROM
  `{}.{}.{}`
GROUP BY
  label, indicator
ORDER BY
  indicator, cat
"""
sql = sql_template.format(BQ_PROJECT_ID,
                          BQ_DATASET_NAME,
                          BQ_INPUT_TABLE_NAME)

In [7]:
# Run the label query against BigQuery (standard SQL dialect) and load the
# result into a DataFrame with the columns indicator, cat and label.
df_labels = pd.read_gbq(query=sql,
                        project_id =BQ_PROJECT_ID,
                        dialect="standard")

In [8]:
# Preview the first rows of the indicator/cat/label lookup table.
df_labels.head()

Unnamed: 0,indicator,cat,label
0,,,
1,bwd,,
2,bwd,-1.0,Arid and Low Water Use
3,bwd,0.0,Low
4,bwd,1.0,Low - Medium


## BWS, BWD, IAV, SEV

In [9]:
def score_to_category(score):
    """Map a numeric score to its integer category.

    Args:
        score (float): score to classify; may be NaN.

    Returns:
        float or int: NaN when the score is NaN, the floor of the score when
            it is below 5, and 4 for every score of 5 or higher.
    """
    # Missing scores stay missing.
    if np.isnan(score):
        return np.nan
    # Everything from 5 upwards falls into the top category (4).
    return int(np.floor(score)) if score < 5 else 4

In [10]:
# Sector prefixes used in the input file names.
sectors = ["Tot","Dom","Ind","Irr","Liv"]

# Indicators processed by this notebook. A broader list (additionally gtd, drr,
# rfr, cfr, ucw, cep, udw, usa and rri) used to be assigned here and was
# immediately overwritten; only these four were ever in effect.
indicators = ["bws","bwd","iav","sev"]

In [11]:
def _read_ee_sum(file_name, value_column):
    """Read one "sum" export CSV from the local input folder.

    Args:
        file_name (str): name of the CSV file inside ec2_input_path.
        value_column (str): new name for the exported "sum" column.

    Returns:
        pandas.DataFrame: the file content without the "system:index" and
            ".geo" columns and with "sum" renamed to value_column.
    """
    file_path = "{}/{}".format(ec2_input_path, file_name)
    return (pd.read_csv(file_path)
            .drop(columns=["system:index", ".geo"])
            .rename(columns={"sum": value_column}))


# Collect one frame per (sector, indicator) and concatenate once at the end;
# growing a DataFrame with .append() inside the loop copies all previous rows
# on every iteration and the method no longer exists in pandas >= 2.0.
frames = []
for sector in sectors:
    # Sum of the weights per region (gid_1) for this sector.
    df_weights = _read_ee_sum("{}_weights_sumee_export.csv".format(sector),
                              "sum_weights")

    for indicator in indicators:
        print("sector:" , sector , "indicator: ", indicator)

        # Sum of weight * indicator per region (gid_1) for this sector.
        df = _read_ee_sum(
            "{}_weighted_{}_sumee_export.csv".format(sector, indicator),
            "sum_weighted_indicator")
        df["indicator_name"] = indicator
        df["sector"] = sector

        # Join weights and weighted indicators on the region identifier.
        df_merged = pd.merge(left=df_weights,
                             right=df,
                             how="inner",
                             on="gid_1")

        # Weighted average score. NOTE(review): a zero sum_weights yields
        # inf/NaN here rather than an error - confirm that is acceptable.
        df_merged["score"] = df_merged["sum_weighted_indicator"] / df_merged["sum_weights"]

        # The cat -> label is different for each indicator. Using a link table instead.
        df_merged["cat"] = df_merged["score"].apply(score_to_category)
        frames.append(df_merged)

# pd.concat raises on an empty list, so fall back to an empty frame when
# there is nothing to combine (empty sectors or indicators).
df_vertical = pd.concat(frames) if frames else pd.DataFrame()
    

sector: Tot indicator:  bws
sector: Tot indicator:  bwd
sector: Tot indicator:  iav
sector: Tot indicator:  sev
sector: Dom indicator:  bws
sector: Dom indicator:  bwd
sector: Dom indicator:  iav
sector: Dom indicator:  sev
sector: Ind indicator:  bws
sector: Ind indicator:  bwd
sector: Ind indicator:  iav
sector: Ind indicator:  sev
sector: Irr indicator:  bws
sector: Irr indicator:  bwd
sector: Irr indicator:  iav
sector: Irr indicator:  sev
sector: Liv indicator:  bws
sector: Liv indicator:  bwd
sector: Liv indicator:  iav
sector: Liv indicator:  sev


In [12]:
# Some provinces have scores > 5 due to an unknown caveat. Replacing with 5's.
# Assign the clipped column back instead of clipping in place: an in-place
# clip on df_vertical["score"] operates on an intermediate Series and is not
# guaranteed to write through to the frame (it is a silent no-op under pandas
# Copy-on-Write).
df_vertical["score"] = df_vertical["score"].clip(upper=5)

In [13]:
# Recompute the category for every row from the (capped) score column.
df_vertical["cat"] = df_vertical["score"].apply(score_to_category)

In [14]:
# Preview the first rows of the combined per-sector, per-indicator table.
df_vertical.head()

Unnamed: 0,gid_1,sum_weights,sum_weighted_indicator,indicator_name,sector,score,cat
0,BRA.19_1,2514.995991,4058.518346,bws,Tot,1.613728,1.0
1,BRA.13_1,5463.364571,2765.254934,bws,Tot,0.506145,0.0
2,BRA.25_1,9152.996072,9952.655191,bws,Tot,1.087366,1.0
3,BRA.8_1,1419.280556,1585.571265,bws,Tot,1.117165,1.0
4,BRA.5_1,4196.830095,3338.599288,bws,Tot,0.795505,0.0


In [15]:
# Attach the label to each row by matching (indicator_name, cat) in the
# results against (indicator, cat) in the label table. The left join keeps
# every result row, including rows for which no label is found. Because the
# indicator key is named differently on both sides, the result carries both
# "indicator_name" and "indicator".
df_out = pd.merge(left=df_vertical,
                   right=df_labels,
                   how="left",
                   left_on=["indicator_name","cat"],
                   right_on=["indicator","cat"])

In [16]:
# "indicator" came in from the label table and duplicates "indicator_name";
# keep only the latter.
df_out = df_out.drop(columns=["indicator"])

In [17]:
# Local CSV path: <ec2_output_path>/<SCRIPT_NAME>_V<two-digit version>.csv
output_file_path_ec2 = "{}/{}_V{:02.0f}.csv".format(ec2_output_path,SCRIPT_NAME,OUTPUT_VERSION)

In [18]:
# Write the result table to the local output folder (the index is written as
# the first, unnamed column because to_csv is called with its defaults).
df_out.to_csv(path_or_buf=output_file_path_ec2)

In [19]:
# NOTE(review): these three assignments repeat, with identical values, the
# constants already set in the configuration cell at the top of the notebook.
BQ_PROJECT_ID = "aqueduct30"
BQ_DATASET_NAME = "aqueduct30v01"
BQ_INPUT_TABLE_NAME = "y2018m12d04_rh_master_merge_rawdata_gpd_v02_v05"

In [20]:
# BigQuery destination in "<dataset>.<table>" form, as passed to to_gbq below.
destination_table = "{}.{}".format(BQ_DATASET_NAME,BQ_OUTPUT_TABLE_NAME)

In [21]:
# Upload the result table to BigQuery; if_exists="replace" overwrites a table
# of the same name left by an earlier run.
df_out.to_gbq(destination_table=destination_table,
              project_id=BQ_PROJECT_ID,
              if_exists="replace")

8it [00:27,  3.47s/it]


In [22]:
# Copy the whole local output folder to the S3 output location.
!aws s3 cp {ec2_output_path} {s3_output_path} --recursive

upload: ../../../../data/Y2019M01D28_RH_GA_Zonal_Stats_Table_V01/output_V03/Y2019M01D28_RH_GA_Zonal_Stats_Table_V01_V03.csv to s3://wri-projects/Aqueduct30/processData/Y2019M01D28_RH_GA_Zonal_Stats_Table_V01/output_V03/Y2019M01D28_RH_GA_Zonal_Stats_Table_V01_V03.csv


In [23]:
# Report the wall-clock duration of the run; "start" was recorded in the
# timestamp cell near the top of the notebook.
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

0:00:44.409718


Previous runs:   
0:00:37.409272
