# Add upstream, downstream and basin information to the dataframe

* Purpose of script: add contextual data to the dataframe. The script will sum the volumetric information of all upstream, downstream and basin parameters of the dataframe. The script was revised on 12 October 2017 to restrict negative runoff.
* Author: Rutger Hofste
* Kernel used: python35
* Date created: 20170915

In [1]:
import datetime
import sys
import time

# Take one UTC timestamp and format both strings from it: the "UTC" label is
# then correct whatever the machine's local time zone is, and the date and
# time cannot straddle a minute/day boundary.
_utcNow = time.gmtime()
dateString = time.strftime("Y%YM%mD%d", _utcNow)
timeString = time.strftime("UTC %H:%M", _utcNow)
# Wall-clock start; the last cell subtracts this to report the elapsed time.
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2017M10D12 UTC 15:37


'3.5.4 |Continuum Analytics, Inc.| (default, Aug 14 2017, 13:26:58) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]'

In [2]:
# --- Versions ---------------------------------------------------------------
INPUT_VERSION = 18   # version suffix of the merged Earth Engine results to read
OUTPUT_VERSION = 1   # version suffix used in the output file name

SCRIPT_NAME = "Y2017M09D15_RH_Add_Basin_Data_V02"

# --- Local working folders (named after the script) -------------------------
EC2_INPUT_PATH = "/volumes/data/%s/input" % SCRIPT_NAME
EC2_OUTPUT_PATH = "/volumes/data/%s/output" % SCRIPT_NAME

# --- S3 locations -----------------------------------------------------------
S3_INPUT_PATH_EE  = "s3://wri-projects/Aqueduct30/processData/Y2017M09D14_RH_merge_EE_results_V01/output/"
S3_INPUT_PATH_HYDROBASINS = "s3://wri-projects/Aqueduct30/processData/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/output/"
S3_OUTPUT_PATH = "s3://wri-projects/Aqueduct30/processData/%s/output/" % SCRIPT_NAME

# --- File names -------------------------------------------------------------
INPUT_FILENAME_EE =  "mergedZonalStatsEE_V%0.2d.pkl" % INPUT_VERSION
INPUT_FILENAME_HYDROBASINS =  "hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V01.pkl"

# Output base name; the ".pkl" / ".csv" extension is appended when saving.
OUTPUT_FILENAME = "Y2017M09D15_RH_Add_Basin_Data_V%0.2d" % OUTPUT_VERSION

Note: There are two polygons with the same PFAF_ID (353020). This is because both polygons would otherwise cross the 180-degree meridian.

In [3]:
# Start from a clean slate: remove the local input and output folders left by an earlier run.
!rm -r {EC2_INPUT_PATH} 
!rm -r {EC2_OUTPUT_PATH} 

In [4]:
# Recreate the (now empty) local input and output folders; -p also creates missing parents.
!mkdir -p {EC2_INPUT_PATH} 
!mkdir -p {EC2_OUTPUT_PATH} 

In [5]:
# Download everything under the merged Earth Engine results prefix into the local input folder.
!aws s3 cp {S3_INPUT_PATH_EE} {EC2_INPUT_PATH} --recursive

download: s3://wri-projects/Aqueduct30/processData/Y2017M09D14_RH_merge_EE_results_V01/output/mergedZonalStatsEE_V18.pkl to ../../../../data/Y2017M09D15_RH_Add_Basin_Data_V02/input/mergedZonalStatsEE_V18.pkl
download: s3://wri-projects/Aqueduct30/processData/Y2017M09D14_RH_merge_EE_results_V01/output/mergedZonalStatsEE_V18.csv to ../../../../data/Y2017M09D15_RH_Add_Basin_Data_V02/input/mergedZonalStatsEE_V18.csv


In [6]:
# Download only the .pkl files from the HydroBasins prefix (exclude everything, then re-include *.pkl).
!aws s3 cp {S3_INPUT_PATH_HYDROBASINS} {EC2_INPUT_PATH} --recursive --exclude "*" --include "*.pkl"

download: s3://wri-projects/Aqueduct30/processData/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/output/hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V01.pkl to ../../../../data/Y2017M09D15_RH_Add_Basin_Data_V02/input/hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V01.pkl


In [7]:
import os
import pandas as pd
import multiprocessing as mp
import pickle
import numpy as np
import itertools
import logging
import pprint
import ast

In [8]:
# Full local paths of the two pickles downloaded into the input folder.
inputLocationEE = os.path.join(EC2_INPUT_PATH,INPUT_FILENAME_EE)
inputLocationHydroBasins = os.path.join(EC2_INPUT_PATH,INPUT_FILENAME_HYDROBASINS)

In [9]:
# Load the merged Earth Engine zonal statistics.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project's own pipeline.
df_ee = pd.read_pickle(inputLocationEE)

In [10]:
# Label the index as PFAF_ID (presumably it already holds the PFAF_ID values — TODO confirm).
df_ee.index.names = ['PFAF_ID']

In [11]:
# Load the HydroBasins table with the upstream / downstream / basin ID lists.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project's own pipeline.
df_HydroBasins = pd.read_pickle(inputLocationHydroBasins)


In [12]:
# Left join on the index: keep every HydroBasins row and attach the Earth Engine columns where available.
df_complete = df_HydroBasins.merge(df_ee,how="left",left_index=True, right_index=True)

Note: There are two polygons with the same PFAF_ID (353020). This is because both polygons would otherwise cross the 180-degree meridian. The next cell keeps only the first row for each PFAF_ID.

In [13]:
# Drop rows whose PFAF_ID column value repeats, keeping the first occurrence.
df_complete = df_complete.drop_duplicates(subset='PFAF_ID', keep='first')

## Functions

In [14]:
def calculateTotalDemand(useType,temporalResolution,year,month):
    """Add one total-demand column to the module-level ``dfDemand``.

    The new column is the sum, per sub-basin, of the ``df_complete`` columns of
    every demand type listed in the module-level ``demandTypes`` for the given
    use type and period.

    Args:
        useType: use-type tag embedded in the column names (e.g. "WW", "WN").
        temporalResolution: "year" or another resolution tag (e.g. "month").
        year: year used in the column names.
        month: month used in the column names. For yearly totals it is still
            needed, because only the yearly "IrrLinear" source column is
            named without a month suffix.

    Returns:
        The updated ``dfDemand`` frame.
    """
    yearly = temporalResolution == "year"

    # Name of the column that receives the total.
    if yearly:
        totalColumn = "total_volume_Tot%s_%s_Y%0.4d" %(useType,temporalResolution,year)
    else:
        totalColumn = "total_volume_Tot%s_%s_Y%0.4dM%0.2d" %(useType,temporalResolution,year,month)

    # Start at zero and accumulate one demand type at a time.
    dfDemand[totalColumn] = 0
    for demandType in demandTypes:
        withoutMonth = yearly and demandType == "IrrLinear"
        if withoutMonth:
            sourceColumn = "total_volume_%s%s_%sY%0.4d" %(demandType,useType,temporalResolution,year)
        else:
            sourceColumn = "total_volume_%s%s_%sY%0.4dM%0.2d" %(demandType,useType,temporalResolution,year,month)
        dfDemand[totalColumn] = dfDemand[totalColumn] + df_complete[sourceColumn]
    return dfDemand





# These functions take a single argument because they are mapped over a multiprocessing pool.
def addUpstream2(listje):
    """Sum area and volume columns over all upstream catchments of each sub-basin.

    Reads the module-level ``df_complete``. For every selected sub-basin the
    rows listed in its ``Upstream_PFAF_IDs`` are looked up, the matching
    ``total_*`` columns are summed, and the result is stored under an
    ``upstream_`` prefix. Summed runoff columns are clipped at zero so that a
    negative upstream total cannot propagate downstream.

    Args:
        listje: index labels (PFAF_IDs) of the sub-basins to process.

    Returns:
        DataFrame indexed like the selected rows of ``df_complete`` with one
        ``upstream_``-prefixed column per summed input column plus
        ``errorCode`` (0 = success, 1 = the row could not be processed; its
        other columns are then NaN).
    """
    # Runoff columns are included here so the non-negativity clip below has
    # something to act on.
    columnPattern = r"^total_(area|volume_(P|I|runoff|reducedmeanrunoff))"
    runoffPattern = r"^upstream_total_volume_(reducedmeanrunoff|runoff)"

    df_part = df_complete[df_complete.index.isin(listje)]

    rows = []
    for i, index in enumerate(df_part.index, start=1):
        print("i: ",i  ," index: ", index)
        try:
            upstreamCatchments = df_part.loc[index, "Upstream_PFAF_IDs"]
            # The ID list may still be a string or may already have been
            # parsed into a list by an earlier cell; accept both.
            if isinstance(upstreamCatchments, str):
                upstreamCatchments = ast.literal_eval(upstreamCatchments)
            df_upstream = df_complete[df_complete.index.isin(upstreamCatchments)]
            df_upstream = df_upstream.filter(regex=columnPattern).add_prefix("upstream_")

            sumSeries = df_upstream.sum(axis=0)
            # Clip the *summed* runoff (not the inputs) and keep every other total.
            runoffKeys = sumSeries.filter(regex=runoffPattern).index
            if len(runoffKeys) > 0:
                sumSeries.loc[runoffKeys] = sumSeries.loc[runoffKeys].clip(lower=0)
            sumSeries["errorCode"] = 0
        except Exception as err:
            # Best effort: flag the row and carry on with the next sub-basin.
            print("error", index, repr(err))
            sumSeries = pd.Series({"errorCode": 1})
        rows.append(sumSeries)

    # Build the frame once instead of filling it cell by cell.
    return pd.DataFrame(rows, index=df_part.index)
    
    
def addDownstream2(listje):
    """Sum the ``total_*`` columns over all downstream catchments of each sub-basin.

    Reads the module-level ``df_complete``. For every selected sub-basin the
    rows listed in its ``Downstream_PFAF_IDs`` are looked up, the matching
    columns are summed, and the result is stored under a ``downstream_``
    prefix.

    Args:
        listje: index labels (PFAF_IDs) of the sub-basins to process.

    Returns:
        DataFrame indexed like the selected rows of ``df_complete`` with one
        ``downstream_``-prefixed column per summed input column plus
        ``errorCode`` (0 = success, 1 = the row could not be processed; its
        other columns are then NaN).
    """
    # NOTE(review): the trailing "*" makes the group optional, so this pattern
    # matches every column starting with "total_", unlike the upstream and
    # basin variants. Left unchanged; confirm which selection is intended.
    columnPattern = "^total_(area|volume_(P|I))*"

    df_part = df_complete[df_complete.index.isin(listje)]

    rows = []
    for i, index in enumerate(df_part.index, start=1):
        print("i: ",i  ," index: ", index)
        try:
            downstreamCatchments = df_part.loc[index, "Downstream_PFAF_IDs"]
            # Accept both the stringified list and an already parsed list.
            if isinstance(downstreamCatchments, str):
                downstreamCatchments = ast.literal_eval(downstreamCatchments)
            df_downstream = df_complete[df_complete.index.isin(downstreamCatchments)]
            df_downstream = df_downstream.filter(regex=columnPattern).add_prefix("downstream_")

            sumSeries = df_downstream.sum(axis=0)
            sumSeries["errorCode"] = 0
        except Exception as err:
            # Best effort: flag the row and carry on with the next sub-basin.
            print("error", index, repr(err))
            sumSeries = pd.Series({"errorCode": 1})
        rows.append(sumSeries)

    # Build the frame once instead of filling it cell by cell.
    return pd.DataFrame(rows, index=df_part.index)


def addBasin2(listje):
    """Sum area and volume columns over all catchments in each sub-basin's basin.

    Reads the module-level ``df_complete``. For every selected sub-basin the
    rows listed in its ``Basin_PFAF_IDs`` are looked up, the matching
    ``total_*`` columns are summed, and the result is stored under a
    ``basin_`` prefix.

    Args:
        listje: index labels (PFAF_IDs) of the sub-basins to process.

    Returns:
        DataFrame indexed like the selected rows of ``df_complete`` with one
        ``basin_``-prefixed column per summed input column plus ``errorCode``
        (0 = success, 1 = the row could not be processed; its other columns
        are then NaN).
    """
    columnPattern = "^total_(area|volume_(P|I))"

    df_part = df_complete[df_complete.index.isin(listje)]

    rows = []
    for i, index in enumerate(df_part.index, start=1):
        print("i: ",i  ," index: ", index)
        try:
            basinCatchments = df_part.loc[index, "Basin_PFAF_IDs"]
            # Accept both the stringified list and an already parsed list.
            if isinstance(basinCatchments, str):
                basinCatchments = ast.literal_eval(basinCatchments)
            df_basin = df_complete[df_complete.index.isin(basinCatchments)]
            df_basin = df_basin.filter(regex=columnPattern).add_prefix("basin_")

            sumSeries = df_basin.sum(axis=0)
            sumSeries["errorCode"] = 0
        except Exception as err:
            # Best effort: flag the row and carry on with the next sub-basin.
            print("error", index, repr(err))
            sumSeries = pd.Series({"errorCode": 1})
        rows.append(sumSeries)

    # Build the frame once instead of filling it cell by cell.
    return pd.DataFrame(rows, index=df_part.index)

## Script

In [15]:
# Tags used to build the demand column names read by calculateTotalDemand.
demandTypes = ["PDom","PInd","IrrLinear","PLiv"]   # demand-type tags; their columns are summed into one total
useTypes = ["WW","WN"]                              # use-type tags; one total is produced per tag
temporalResolutions = ["year","month"]
years = [2014]

In [16]:
# Build the total-demand table: one column per use type for the year, and one
# per use type and month for the monthly resolution.
dfDemand = pd.DataFrame(index=df_complete.index)
for temporalResolution in temporalResolutions:
    # Yearly source columns carry the M12 suffix; monthly ones run over months 1-12.
    months = [12] if temporalResolution == "year" else range(1,13)
    for useType in useTypes:
        for year in years:
            for month in months:
                print(useType,temporalResolution,year,month)
                dfDemand = calculateTotalDemand(useType,temporalResolution,year,month)

WW year 2014 12
WN year 2014 12
WW month 2014 1
WW month 2014 2
WW month 2014 3
WW month 2014 4
WW month 2014 5
WW month 2014 6
WW month 2014 7
WW month 2014 8
WW month 2014 9
WW month 2014 10
WW month 2014 11
WW month 2014 12
WN month 2014 1
WN month 2014 2
WN month 2014 3
WN month 2014 4
WN month 2014 5
WN month 2014 6
WN month 2014 7
WN month 2014 8
WN month 2014 9
WN month 2014 10
WN month 2014 11
WN month 2014 12


In [17]:
dfDemand.head()

Unnamed: 0_level_0,total_volume_TotWW_year_Y2014,total_volume_TotWN_year_Y2014,total_volume_TotWW_month_Y2014M01,total_volume_TotWW_month_Y2014M02,total_volume_TotWW_month_Y2014M03,total_volume_TotWW_month_Y2014M04,total_volume_TotWW_month_Y2014M05,total_volume_TotWW_month_Y2014M06,total_volume_TotWW_month_Y2014M07,total_volume_TotWW_month_Y2014M08,...,total_volume_TotWN_month_Y2014M03,total_volume_TotWN_month_Y2014M04,total_volume_TotWN_month_Y2014M05,total_volume_TotWN_month_Y2014M06,total_volume_TotWN_month_Y2014M07,total_volume_TotWN_month_Y2014M08,total_volume_TotWN_month_Y2014M09,total_volume_TotWN_month_Y2014M10,total_volume_TotWN_month_Y2014M11,total_volume_TotWN_month_Y2014M12
PFAF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111011,62577410.0,29152340.0,5071952.0,5039243.0,5096070.0,5268799.0,5233163.0,5282180.0,5382082.0,5472195.0,...,2372698.0,2448676.0,2443976.0,2468962.0,2509266.0,2543447.0,2498844.0,2432985.0,2380002.0,2353863.0
111012,8204290.0,3388488.0,682756.6,682916.6,683184.6,683658.3,684056.9,684325.0,684404.2,684421.5,...,281867.8,282341.5,282740.1,283008.2,283087.4,283104.7,282938.7,282631.1,282095.8,281633.4
111013,7569537.0,3111568.0,630127.7,630235.5,630426.1,630766.9,631049.6,631247.4,631314.5,631322.9,...,258928.7,259269.5,259552.2,259750.0,259817.1,259825.5,259707.5,259483.8,259103.6,258762.0
111014,1046668.0,502430.9,86513.5,86643.06,86851.63,87216.48,87518.61,87708.51,87754.53,87772.85,...,41498.58,41863.42,42165.56,42355.45,42401.48,42419.8,42272.04,42047.55,41654.68,41301.91
111015,111843100.0,46686470.0,9301079.0,9303778.0,9309980.0,9319181.0,9327492.0,9333573.0,9335649.0,9335846.0,...,3881478.0,3889610.0,3896934.0,3902287.0,3904112.0,3904281.0,3900583.0,3894476.0,3886032.0,3877061.0


In [18]:
# Attach the total-demand columns to the main table (left join on the index).
df_complete = df_complete.merge(dfDemand,how="left",left_index=True,right_index=True)

In [19]:
df_complete.dtypes

HYBAS_ID2                              int64
Unnamed: 0                             int64
HYBAS_ID                               int64
NEXT_DOWN                              int64
NEXT_SINK                              int64
MAIN_BAS                               int64
DIST_SINK                            float64
DIST_MAIN                            float64
SUB_AREA                             float64
UP_AREA                              float64
PFAF_ID                                int64
ENDO                                   int64
COAST                                  int64
ORDER                                  int64
SORT                                   int64
Upstream_HYBAS_IDs                    object
Upstream_PFAF_IDs                     object
Downstream_HYBAS_IDs                  object
Downstream_PFAF_IDs                   object
NEXT_SINK_PFAF                       float64
Basin_HYBAS_IDs                       object
Basin_PFAF_IDs                        object
SUB_NAME  

## Runoff Routing  
Prevent negative runoff from propagating downstream.

In [20]:
# Parse the stringified list of upstream PFAF_IDs into a real list. Values that
# are already lists are left untouched, so re-running this cell is harmless.
df_complete["Upstream_PFAF_IDs"] = df_complete["Upstream_PFAF_IDs"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

In [21]:
# Number of upstream catchments per sub-basin (0 for headwaters).
df_complete["numberUpstream_PFAF_IDs"] = df_complete["Upstream_PFAF_IDs"].apply(len)

In [24]:
# Empty accumulator; the routing loop adds the processed sub-basins to it.
df_out = pd.DataFrame()

In [27]:
# Route runoff through the network in order of increasing number of upstream
# catchments. Only the headwater step (no upstream catchments) is implemented;
# the full range would be:
#for numberUpstream in range(0,df_complete["numberUpstream_PFAF_IDs"].max()+1):
for numberUpstream in range(0,1):
    print(numberUpstream)
    df_filtered = df_complete.loc[df_complete['numberUpstream_PFAF_IDs'] == numberUpstream].copy()
    if numberUpstream == 0:
        # NOTE(review): temporalResolution is not read anywhere in this cell.
        temporalResolution = "year"
        # Headwaters: accumulated runoff is the local runoff; the available
        # runoff is the local runoff minus the local total WN demand. Both are
        # clipped at zero so no negative volume is passed on.
        localRunoff = df_filtered["total_volume_runoff_yearY2014M12"]
        localDemand = df_filtered["total_volume_TotWN_year_Y2014"]
        df_filtered["total_volume_accumulatedRunoff_year_Y2014"] = localRunoff.clip(lower=0)
        df_filtered["total_volume_accumulatedRunoffAvailable_year_Y2014"] = (localRunoff - localDemand).clip(lower=0)

        print(df_filtered["total_volume_accumulatedRunoffAvailable_year_Y2014"].min())

    else:
        # TODO: not implemented yet. Planned approach for sub-basins with upstream catchments:
        #   accumulated           = accumulatedRunoff_upstream + local runoff
        #   accumulated available = accumulatedRunoffAvailable_upstream + local runoff - local consumption
        # where the upstream terms are the sums of the "total_volume_accumulated*"
        # columns over the rows listed in Upstream_PFAF_IDs.
        pass
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df_out = pd.concat([df_out, df_filtered])
            
        

0
0.0


In [28]:
df_filtered.head()

Unnamed: 0_level_0,HYBAS_ID2,Unnamed: 0,HYBAS_ID,NEXT_DOWN,NEXT_SINK,MAIN_BAS,DIST_SINK,DIST_MAIN,SUB_AREA,UP_AREA,...,total_volume_TotWN_month_Y2014M06,total_volume_TotWN_month_Y2014M07,total_volume_TotWN_month_Y2014M08,total_volume_TotWN_month_Y2014M09,total_volume_TotWN_month_Y2014M10,total_volume_TotWN_month_Y2014M11,total_volume_TotWN_month_Y2014M12,numberUpstream_PFAF_IDs,total_volume_accumulatedRunoff_year_Y2014,total_volume_accumulatedRunoffAvailable_year_Y2014
PFAF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111011,1060000010,6675,1060000010,0,1060000010,1060000010,0.0,0.0,1890.8,1890.8,...,2468962.0,2509266.0,2543447.0,2498844.0,2432985.0,2380002.0,2353863.0,0,1343305.0,0.0
111012,1060000100,6676,1060000100,0,1060000100,1060000100,0.0,0.0,2925.9,2925.9,...,283008.2,283087.4,283104.7,282938.7,282631.1,282095.8,281633.4,0,567387.4,0.0
111013,1060000110,6677,1060000110,0,1060000110,1060000110,0.0,0.0,893.5,893.5,...,259750.0,259817.1,259825.5,259707.5,259483.8,259103.6,258762.0,0,169254.2,0.0
111014,1060000150,6678,1060000150,0,1060000150,1060000150,0.0,0.0,4217.3,4217.4,...,42355.45,42401.48,42419.8,42272.04,42047.55,41654.68,41301.91,0,1.544113e-05,0.0
111015,1060000160,6679,1060000160,0,1060000160,1060000160,0.0,0.0,16638.1,16638.1,...,3902287.0,3904112.0,3904281.0,3900583.0,3894476.0,3886032.0,3877061.0,0,9114.805,0.0


In [None]:
df_filtered

In [None]:
df_upstream

In [None]:
# The routed frame is named df_out; "dfOut" is never defined.
df_out

In [None]:
# Export the routed frame (named df_out; "dfOut" is never defined).
# NOTE(review): this writes to the same path as the later CSV export of
# df_complete, which will overwrite it.
df_out.to_csv(os.path.join(EC2_OUTPUT_PATH,OUTPUT_FILENAME+".csv"))

In [None]:
mp.cpu_count()

In [None]:
# Split all index labels into one chunk per CPU core; each chunk is handled by one pool worker.
indices_full = df_complete.index.values
indices_split = np.array_split(indices_full, mp.cpu_count())

In [None]:
indices_split[1].shape

In [None]:
# Send the multiprocessing module's log messages to stderr.
mp.log_to_stderr()

In [None]:
# Show INFO-level (and higher) messages from the multiprocessing logger.
logger = mp.get_logger()
logger.setLevel(logging.INFO)

In [None]:
# One worker process per CPU core.
pool = mp.Pool(mp.cpu_count())

In [None]:
# Compute the upstream totals chunk by chunk in the pool and stack the per-chunk frames.
df_upstream = pd.concat(pool.map(addUpstream2, indices_split))

In [None]:
df_upstream.loc[155697]["upstream_total_volume_reducedmeanrunoff_year_Y1960Y2014"]

In [None]:
# Compute the downstream totals chunk by chunk in the pool and stack the per-chunk frames.
df_downstream = pd.concat(pool.map(addDownstream2, indices_split))

In [None]:
# Compute the whole-basin totals chunk by chunk in the pool and stack the per-chunk frames.
df_basin = pd.concat(pool.map(addBasin2, indices_split))

In [None]:
# Stop accepting new tasks, then wait for the worker processes to exit so none are left behind.
pool.close()
pool.join()

In [None]:
# Attach the upstream totals. The status column is renamed first because the
# downstream and basin results carry an "errorCode" column of their own.
df_complete = df_complete.merge(
    df_upstream.rename(columns={"errorCode": "upstream_errorCode"}),
    how="left",left_index=True,right_index=True)

In [None]:
# Attach the downstream totals. The status column is renamed first because the
# upstream and basin results carry an "errorCode" column of their own.
df_complete = df_complete.merge(
    df_downstream.rename(columns={"errorCode": "downstream_errorCode"}),
    how="left",left_index=True,right_index=True)

In [None]:
# Attach the whole-basin totals. The status column is renamed first because the
# upstream and downstream results carry an "errorCode" column of their own.
df_complete = df_complete.merge(
    df_basin.rename(columns={"errorCode": "basin_errorCode"}),
    how="left",left_index=True,right_index=True)

In [None]:
# Save the enriched table as a pickle in the local output folder.
df_complete.to_pickle(os.path.join(EC2_OUTPUT_PATH,OUTPUT_FILENAME+".pkl"))

In [None]:
# Save the same table as CSV next to the pickle.
df_complete.to_csv(os.path.join(EC2_OUTPUT_PATH,OUTPUT_FILENAME+".csv"))

In [None]:
# Upload everything in the local output folder to the script's S3 output prefix.
!aws s3 cp {EC2_OUTPUT_PATH} {S3_OUTPUT_PATH} --recursive

In [None]:
df_complete.head()

In [None]:
# Report the wall-clock run time since `start` was recorded in the first cell.
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)