# Calculate average PCRGlobWB supply using EE

* Purpose of script: Join the CSV tables from GCS into a single file using pandas.
* Author: Rutger Hofste
* Kernel used: python35
* Date created: 20170914

In [1]:
import time

# time.strftime() formats *local* time unless a struct_time is passed, so the
# "UTC" label is only correct on machines whose clock is set to UTC. Take one
# explicit UTC snapshot and format both strings from it.
utcNow = time.gmtime()
dateString = time.strftime("Y%YM%mD%d", utcNow)
timeString = time.strftime("UTC %H:%M", utcNow)
print(dateString,timeString)

Y2017M09D14 UTC 19:40


In [64]:
# GCS folder holding the csv tables to merge.
GCS_INPUT_PATH = "gs://aqueduct30_v01/Y2017M09D11_RH_zonal_stats_EE_V15/"

# Local working folders for this step.
EC2_INPUT_PATH = "/volumes/data/Y2017M09D14_RH_merge_EE_results_V01/input"
EC2_OUTPUT_PATH = "/volumes/data/Y2017M09D14_RH_merge_EE_results_V01/output"

# Suffix cut off every file name to obtain its short key,
# e.g. IrrLinearWW_monthY2014M12V15ee_export.csv -> IrrLinearWW_monthY2014M12
STRING_TRIM = "V15ee_export.csv"

# Name patterns of the auxiliary files.
AUXFILES = ["Hybas06", "area_30s_m2", "ones_30s"]

# Columns removed from every table after it has been read.
DROP_COLUMNS = [".geo", "system:index"]




In [65]:
# Create the local input and output folders via the shell; with -p an
# already existing folder is not an error.
!mkdir -p {EC2_INPUT_PATH}
!mkdir -p {EC2_OUTPUT_PATH}

In [66]:
#!gsutil cp -r {GCS_INPUT_PATH} {EC2_INPUT_PATH} 

In [67]:
import pandas as pd
import os
import re

In [81]:
def createRegex(aList):
    """Combine the strings in aList into one regex alternation.

    Args:
        aList: iterable of strings; each item becomes one alternative and is
            used as-is (nothing is escaped).

    Returns:
        The items joined by "|", e.g. ["a", "b"] -> "a|b".
    """
    alternationSign = '|'
    pattern = alternationSign.join(aList)
    return pattern

def prepareFile(oneFile):
    """Read one csv file from the input folder and wrap it in a dict.

    Args:
        oneFile: file name (not a path) inside the module-level `folder`.

    Returns:
        dict with two entries:
            "df": the table after it has been passed through prepareDf,
            "trimFileName": oneFile with its last len(STRING_TRIM)
                characters cut off.
    """
    suffixLength = len(STRING_TRIM)
    shortName = oneFile[:-suffixLength]
    csvPath = os.path.join(folder, oneFile)
    rawTable = pd.read_csv(csvPath)
    return {
        "df": prepareDf(rawTable),
        "trimFileName": shortName,
    }
        

        
        

def prepareDf(df):
    """Index a table by its PfafID column and drop the unused columns.

    The first column whose name contains "PfafID" becomes the index, after
    which the columns listed in DROP_COLUMNS are removed.

    Args:
        df: pandas DataFrame as read from one of the csv files.

    Returns:
        A new DataFrame indexed by the PfafID column, or None when no column
        name contains "PfafID".

    Raises:
        KeyError: if a column listed in DROP_COLUMNS is missing from df.
    """
    for column in df.columns:
        if re.search("PfafID", column):
            indexed = df.set_index(column)
            # Pass axis by keyword: the positional form drop(labels, 1) is
            # deprecated and rejected by pandas >= 2.0.
            return indexed.drop(DROP_COLUMNS, axis=1)
    # No PfafID column found; returned explicitly rather than by falling
    # off the end of the function.
    return None


In [82]:
# Subfolder of EC2_INPUT_PATH carrying the same name as the last component of
# GCS_INPUT_PATH; presumably where the gsutil copy places the csv files.
folder = os.path.join(EC2_INPUT_PATH,"Y2017M09D11_RH_zonal_stats_EE_V15/")

In [83]:
# Names (not full paths) of all entries in the input folder.
files = os.listdir(folder)

## Process Auxiliary Datasets (PfafID, Area, Ones)

In [84]:
# Load one table per auxiliary dataset, keyed by its AUXFILES pattern.
dAux = {}
for regex in AUXFILES:
    r = re.compile(regex)
    # r.match anchors at the start of the file name; take the first hit in
    # directory-listing order.
    oneFile = next((fileName for fileName in files if r.match(fileName)), None)
    if oneFile is None:
        # Fail with a message naming the pattern instead of a bare
        # "list index out of range".
        raise FileNotFoundError(
            "No file in {} matches auxiliary pattern {!r}".format(folder, regex))
    dAux[regex] = prepareFile(oneFile)

In [85]:
# One alternation pattern that matches any of the auxiliary file names.
regex = createRegex(AUXFILES)

In [86]:
# Show the combined pattern for a visual check.
print(regex)

Hybas06|area_30s_m2|ones_30s


In [88]:
# Read every exported table once and split the results into data tables (d)
# and auxiliary tables (dAux), both keyed by the file name without the
# STRING_TRIM suffix.
d = {}
dAux = {}
auxPattern = re.compile(regex)
for oneFile in files:
    # Check the suffix first; otherwise a file with a different suffix would
    # be stored under a wrongly trimmed key without any warning.
    if not oneFile.endswith(STRING_TRIM):
        print("Unrecognized file name, check STRING_TRIM variable", oneFile)
        continue
    trimFileName = oneFile[:-len(STRING_TRIM)]
    if auxPattern.search(oneFile):
        dAux[trimFileName] = prepareFile(oneFile)
    else:
        d[trimFileName] = prepareFile(oneFile)
        

In [98]:
# Left table of the merge: the Hybas06 auxiliary table.
dfLeft = dAux["Hybas06"]["df"]

In [99]:
# Right table of the merge: a single data table, IrrLinearWN_monthY2014M01.
dfRight = d["IrrLinearWN_monthY2014M01"]["df"]

In [103]:
# Full outer join of the two tables on their indexes, with the join keys
# sorted in the result.
dfMerge = pd.merge(
    dfLeft,
    dfRight,
    how="outer",
    left_index=True,
    right_index=True,
    sort=True,
)

In [104]:
# Preview the first rows of the merged table.
dfMerge.head()

Unnamed: 0_level_0,count_Hybas06,mean_Hybas06,count_IrrLinearWN_monthY2014M01,mean_IrrLinearWN_monthY2014M01
PfafID_Hybas06,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
111011,2536,111011.0,2536,3e-05
111012,3921,111012.0,3921,0.0
111013,1194,111013.0,1194,0.0
111014,5605,111014.0,5605,0.0
111015,21873,111015.0,21769,0.0
