# Add upstream, downstream and basin information to the dataframe

* Purpose of script: add contextual data to the datafram. 
* Author: Rutger Hofste
* Kernel used: python35
* Date created: 20170915

In [1]:
import time
dateString = time.strftime("Y%YM%mD%d")
timeString = time.strftime("UTC %H:%M")
print(dateString,timeString)

Y2017M09D28 UTC 13:19


In [2]:
S3_INPUT_PATH_EE  = "s3://wri-projects/Aqueduct30/processData/Y2017M09D14_RH_merge_EE_results_V01/output/"

S3_INPUT_PATH_HYDROBASINS = "s3://wri-projects/Aqueduct30/processData/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/output/"

EC2_INPUT_PATH = "/volumes/data/Y2017M09D15_RH_Add_Basin_Data_V01/input"
EC2_OUTPUT_PATH = "/volumes/data/Y2017M09D15_RH_Add_Basin_Data_V01/output"

INPUT_FILENAME_EE =  "mergedZonalStatsEE_V12.pkl"
INPUT_FILENAME_HYDROBASINS =  "hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V01.pkl"


In [3]:
!rm -r {EC2_INPUT_PATH} 
!rm -r {EC2_OUTPUT_PATH} 

In [4]:
!mkdir -p {EC2_INPUT_PATH} 
!mkdir -p {EC2_OUTPUT_PATH} 

In [5]:
!aws s3 cp {S3_INPUT_PATH_EE} {EC2_INPUT_PATH} --recursive

download: s3://wri-projects/Aqueduct30/processData/Y2017M09D14_RH_merge_EE_results_V01/output/mergedZonalStatsEE_V12.pkl to ../../../../data/Y2017M09D15_RH_Add_Basin_Data_V01/input/mergedZonalStatsEE_V12.pkl
download: s3://wri-projects/Aqueduct30/processData/Y2017M09D14_RH_merge_EE_results_V01/output/mergedZonalStatsEE_V12.csv to ../../../../data/Y2017M09D15_RH_Add_Basin_Data_V01/input/mergedZonalStatsEE_V12.csv


In [6]:
!aws s3 cp {S3_INPUT_PATH_HYDROBASINS} {EC2_INPUT_PATH} --recursive --exclude "*" --include "*.pkl"

download: s3://wri-projects/Aqueduct30/processData/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/output/hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V01.pkl to ../../../../data/Y2017M09D15_RH_Add_Basin_Data_V01/input/hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V01.pkl


In [7]:
import os
import pandas as pd
import multiprocessing as mp
import pickle
import numpy as np
import itertools
import logging
import pprint
import ast

In [8]:
inputLocationEE = os.path.join(EC2_INPUT_PATH,INPUT_FILENAME_EE)
inputLocationHydroBasins = os.path.join(EC2_INPUT_PATH,INPUT_FILENAME_HYDROBASINS)

In [9]:
df_ee = pd.read_pickle(inputLocationEE)

In [10]:
df_ee.index.names = ['PFAF_ID']

In [11]:
df_HydroBasins = pd.read_pickle(inputLocationHydroBasins)


In [12]:
df_complete = df_HydroBasins.merge(df_ee,how="left",left_index=True, right_index=True)

In [13]:
df_complete.dtypes

HYBAS_ID2                                   int64
Unnamed: 0                                  int64
HYBAS_ID                                    int64
NEXT_DOWN                                   int64
NEXT_SINK                                   int64
MAIN_BAS                                    int64
DIST_SINK                                 float64
DIST_MAIN                                 float64
SUB_AREA                                  float64
UP_AREA                                   float64
PFAF_ID                                     int64
ENDO                                        int64
COAST                                       int64
ORDER                                       int64
SORT                                        int64
Upstream_HYBAS_IDs                         object
Upstream_PFAF_IDs                          object
Downstream_HYBAS_IDs                       object
Downstream_PFAF_IDs                        object
NEXT_SINK_PFAF                            float64


## Functions

In [65]:
def calculateTotalDemand(useType,temporalResolution,year,month):
    # This function will add Dom Ind IrrLinear and Livestock of all basins in the input list
    
    if temporalResolution == "year":
        keyTotal = "local_sum_volumem3_Tot%s_%s_Y%0.4d" %(useType,temporalResolution,year)
    else:
        keyTotal = "local_sum_volumem3_Tot%s_%s_Y%0.4dM%0.2d" %(useType,temporalResolution,year,month)
    
    # Create Column with zeros
    dfDemand[keyTotal] = 0
    for demandType in demandTypes:
        if demandType == "IrrLinear" and temporalResolution == "year":
            key = "total_volume_%s%s_%sY%0.4d" %(demandType,useType,temporalResolution,year)
        else:
            key = "total_volume_%s%s_%sY%0.4dM%0.2d" %(demandType,useType,temporalResolution,year,month)
        dfDemand[keyTotal] = dfDemand[keyTotal] + df_complete[key]
    return dfDemand   


def calculateUpstream():
    # This function will add upstream data to the dataFrame 
    # standard column format: upstream_sum_volumem3_TotWW_monthY2014M12
    pass

dfTest = pd.DataFrame()
 
def addUpstream2(listje):
    df_full_temp = df_complete.copy()
    df_part_temp = df_full_temp[df_full_temp.index.isin(listje)]
    df_part_temp2 = df_part_temp.copy()
    df_out = df_part_temp2.copy()
    
    i = 0
    for index, row in df_part_temp2.iterrows():
        i += 1
        print("i: ",i  ," index: ", index)
        upstreamCatchments = df_part_temp2.loc[index, "Upstream_PFAF_IDs"]
        upstreamCatchments = ast.literal_eval(upstreamCatchments)
        df_upstream = df_full_temp[df_full_temp.index.isin(upstreamCatchments)]
        # selecting columns based on regular expression
        df_upstream = df_upstream.filter(regex=("total*"))
        df_upstream = df_upstream.add_prefix("upstream_")
        sumSeries = df_upstream.sum(0)
        for key, value in sumSeries.iteritems():
            df_out.loc[index, key] = value
        df_out.loc[index, "errorCode"] = 0
        
        
    
    return(df_upstream,df_out)
        
    
    
def addUpstream(listje):
    df_full_temp = df_complete.copy()
    df_part_temp = df_full_temp[df_full_temp.index.isin(listje)]
    df_part_temp2 = df_part_temp.copy()
    df_out = df_part_temp2.copy()
    i = 0
    for index, row in df_part_temp2.iterrows():
        i += 1
        print("i: ",i  ," index: ", index)
        try:
            upstreamCatchments = df_part_temp2.loc[index, "Upstream_PFAF_IDs"]
            upstreamCatchments = ast.literal_eval(upstreamCatchments)
            df_upstream = df_full_temp.loc[upstreamCatchments]
            area = df_upstream["countarea30sm2"] * df_upstream["meanarea30sm2"]

            df_new = pd.DataFrame()
            df_new["aream2"] = area

            for parameter in parameterList:
                df_new["count_" + parameter] = df_upstream["count" + parameter]
                df_new["volumem3_" + parameter] = area * df_upstream["mean" + parameter]

            sumSeries = df_new.sum()

            for key, value in sumSeries.iteritems():
                newKey = "upstream_sum_" + key
                df_out.loc[index, newKey] = value
            df_out.loc[index, "errorCode"] = 0
        except:
            print("error")
            df_out.loc[index, "errorCode"] = 1
            pass

    return df_out   
    
    

## Script

In [18]:
demandTypes = ["PDom","PInd","IrrLinear","PLiv"]
useTypes = ["WW","WN"]
temporalResolutions = ["year","month"]
years = [2014]

In [19]:
dfDemand = pd.DataFrame(index=df_complete.index)
for temporalResolution in temporalResolutions:
    for useType in useTypes:
        for year in years:
            if temporalResolution == "year":
                month = 12
                print(useType,temporalResolution,year,month)
                dfDemand = calculateTotalDemand(useType,temporalResolution,year,month)
            else:
                for month in range(1,13):
                    print(useType,temporalResolution,year,month)
                    dfDemand = calculateTotalDemand(useType,temporalResolution,year,month)          

WW year 2014 12
WN year 2014 12
WW month 2014 1
WW month 2014 2
WW month 2014 3
WW month 2014 4
WW month 2014 5
WW month 2014 6
WW month 2014 7
WW month 2014 8
WW month 2014 9
WW month 2014 10
WW month 2014 11
WW month 2014 12
WN month 2014 1
WN month 2014 2
WN month 2014 3
WN month 2014 4
WN month 2014 5
WN month 2014 6
WN month 2014 7
WN month 2014 8
WN month 2014 9
WN month 2014 10
WN month 2014 11
WN month 2014 12


In [20]:
dfDemand.head()

Unnamed: 0_level_0,local_sum_volumem3_TotWW_year_Y2014,local_sum_volumem3_TotWN_year_Y2014,local_sum_volumem3_TotWW_month_Y2014M01,local_sum_volumem3_TotWW_month_Y2014M02,local_sum_volumem3_TotWW_month_Y2014M03,local_sum_volumem3_TotWW_month_Y2014M04,local_sum_volumem3_TotWW_month_Y2014M05,local_sum_volumem3_TotWW_month_Y2014M06,local_sum_volumem3_TotWW_month_Y2014M07,local_sum_volumem3_TotWW_month_Y2014M08,...,local_sum_volumem3_TotWN_month_Y2014M03,local_sum_volumem3_TotWN_month_Y2014M04,local_sum_volumem3_TotWN_month_Y2014M05,local_sum_volumem3_TotWN_month_Y2014M06,local_sum_volumem3_TotWN_month_Y2014M07,local_sum_volumem3_TotWN_month_Y2014M08,local_sum_volumem3_TotWN_month_Y2014M09,local_sum_volumem3_TotWN_month_Y2014M10,local_sum_volumem3_TotWN_month_Y2014M11,local_sum_volumem3_TotWN_month_Y2014M12
PFAF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111011,62577410.0,29152340.0,5071952.0,5039243.0,5096070.0,5268799.0,5233163.0,5282180.0,5382082.0,5472195.0,...,2372698.0,2448676.0,2443976.0,2468962.0,2509266.0,2543447.0,2498844.0,2432985.0,2380002.0,2353863.0
111012,8204290.0,3388488.0,682756.6,682916.6,683184.6,683658.3,684056.9,684325.0,684404.2,684421.5,...,281867.8,282341.5,282740.1,283008.2,283087.4,283104.7,282938.7,282631.1,282095.8,281633.4
111013,7569537.0,3111568.0,630127.7,630235.5,630426.1,630766.9,631049.6,631247.4,631314.5,631322.9,...,258928.7,259269.5,259552.2,259750.0,259817.1,259825.5,259707.5,259483.8,259103.6,258762.0
111014,1046668.0,502430.9,86513.5,86643.06,86851.63,87216.48,87518.61,87708.51,87754.53,87772.85,...,41498.58,41863.42,42165.56,42355.45,42401.48,42419.8,42272.04,42047.55,41654.68,41301.91
111015,111843100.0,46686470.0,9301079.0,9303778.0,9309980.0,9319181.0,9327492.0,9333573.0,9335649.0,9335846.0,...,3881478.0,3889610.0,3896934.0,3902287.0,3904112.0,3904281.0,3900583.0,3894476.0,3886032.0,3877061.0


In [21]:
df_smaller = 

In [30]:
listje = [292107]

In [62]:
df_upstream_sum,df_out = addUpstream2(listje)

In [63]:
df_upstream_sum

Unnamed: 0_level_0,upstream_sum_total_area_30s_m2,upstream_sum_total_volume_IrrLinearWN_monthY2014M01,upstream_sum_total_volume_IrrLinearWN_monthY2014M04,upstream_sum_total_volume_PLivWN_monthY2014M03,upstream_sum_total_volume_runoff_yearY2014M12,upstream_sum_total_volume_IrrLinearWW_monthY2014M06,upstream_sum_total_volume_reducedmeanrunoff_month_Y1960Y2014M8,upstream_sum_total_volume_PIrrWW_monthY2014M04,upstream_sum_total_volume_IrrLinearWW_monthY2014M09,upstream_sum_total_volume_PIrrWW_monthY2014M11,...,upstream_sum_total_volume_PIrrWN_monthY2014M10,upstream_sum_total_volume_runoff_monthY2014M02,upstream_sum_total_volume_PLivWN_monthY2014M10,upstream_sum_total_volume_PDomWN_yearY2014M12,upstream_sum_total_volume_runoff_monthY2014M10,upstream_sum_total_volume_PIrrWW_monthY2014M08,upstream_sum_total_volume_IrrLinearWW_monthY2014M04,upstream_sum_total_volume_PLivWW_yearY2014M12,upstream_sum_total_volume_PIrrWN_monthY2014M07,upstream_sum_total_volume_IrrLinearWN_monthY2014M05
PFAF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
292108,3591820000.0,0.0,0.0,6776.336645,89183570.0,0.0,6073141.0,0.0,0.0,0.0,...,0.0,20867310.0,7167.88471,2621435.0,140290.3,0.0,0.0,85100.494113,0.0,0.0
292109,18085580000.0,7386851.0,8708965.0,64981.950872,1227492000.0,103537200.0,101544500.0,12390520.0,72413790.0,31151290.0,...,19669650.0,388454500.0,68148.543201,21174590.0,15560810.0,108218800.0,23039590.0,813498.382533,44159100.0,27216790.0


In [49]:
test = df_upstream_sum.sum(0)

In [64]:
df_out.head()

Unnamed: 0_level_0,HYBAS_ID2,Unnamed: 0,HYBAS_ID,NEXT_DOWN,NEXT_SINK,MAIN_BAS,DIST_SINK,DIST_MAIN,SUB_AREA,UP_AREA,...,upstream_sum_total_volume_PIrrWN_monthY2014M10,upstream_sum_total_volume_runoff_monthY2014M02,upstream_sum_total_volume_PLivWN_monthY2014M10,upstream_sum_total_volume_PDomWN_yearY2014M12,upstream_sum_total_volume_runoff_monthY2014M10,upstream_sum_total_volume_PIrrWW_monthY2014M08,upstream_sum_total_volume_IrrLinearWW_monthY2014M04,upstream_sum_total_volume_PLivWW_yearY2014M12,upstream_sum_total_volume_PIrrWN_monthY2014M07,upstream_sum_total_volume_IrrLinearWN_monthY2014M05
PFAF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
292107,2060877600,6355,2060877600,2060879580,2060085630,2060085630,19.0,19.0,155.4,21859.5,...,19669650.0,409321800.0,75316.427911,23796020.0,15701100.0,108218800.0,23039590.0,898598.876645,44159100.0,27216790.0


In [None]:
mp.cpu_count()

In [None]:
print(inputLocation)

In [None]:
indices_full = df_full.index.values
indices_split = np.array_split(indices_full, mp.cpu_count())

In [None]:
print(indices_split)

In [None]:
mp.log_to_stderr()

In [None]:
logger = mp.get_logger()
logger.setLevel(logging.INFO)

In [None]:
pool = mp.Pool(mp.cpu_count())

In [None]:
df_out = pd.concat(pool.map(addUpstream, indices_split))

In [None]:
df_out = addUpstream([1])

In [None]:
df_out.head()