# Add upstream, downstream and basin information to the dataframe

* Purpose of script: add contextual data (upstream, downstream and basin information) to the dataframe.
* Author: Rutger Hofste
* Kernel used: python35
* Date created: 20170915

In [1]:
import time

# Read the clock once, in UTC, so that the "UTC" label is truthful regardless
# of the machine's local time zone and both strings describe the same instant.
_nowUtc = time.gmtime()
dateString = time.strftime("Y%YM%mD%d", _nowUtc)
timeString = time.strftime("UTC %H:%M", _nowUtc)
print(dateString,timeString)

Y2017M09D26 UTC 14:41


In [2]:
# S3 folders holding the outputs of the two earlier steps consumed here.
_S3_PROCESS_DATA = "s3://wri-projects/Aqueduct30/processData"
S3_INPUT_PATH_EE = _S3_PROCESS_DATA + "/Y2017M09D14_RH_merge_EE_results_V01/output/"
S3_INPUT_PATH_HYDROBASINS = _S3_PROCESS_DATA + "/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/output/"

# Working folders for this step on the EC2 volume.
_EC2_STEP_ROOT = "/volumes/data/Y2017M09D15_RH_Add_Basin_Data_V01"
EC2_INPUT_PATH = _EC2_STEP_ROOT + "/input"
EC2_OUTPUT_PATH = _EC2_STEP_ROOT + "/output"

# Names of the two input files that are read after the download.
INPUT_FILENAME_EE = "mergedZonalStatsEE_V12.pkl"
INPUT_FILENAME_HYDROBASINS = "hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V01.csv"


In [3]:
!rm -r {EC2_INPUT_PATH} 
!rm -r {EC2_OUTPUT_PATH} 

In [4]:
# (Re)create the input and output folders, including any missing parent folders.
!mkdir -p {EC2_INPUT_PATH} 
!mkdir -p {EC2_OUTPUT_PATH} 

In [5]:
# Download everything in the merged zonal-statistics output folder. Only the file
# named by INPUT_FILENAME_EE is read afterwards; other files come along as well.
!aws s3 cp {S3_INPUT_PATH_EE} {EC2_INPUT_PATH} --recursive

download: s3://wri-projects/Aqueduct30/processData/Y2017M09D14_RH_merge_EE_results_V01/output/mergedZonalStatsEE_V12.csv to ../../../../data/Y2017M09D15_RH_Add_Basin_Data_V01/input/mergedZonalStatsEE_V12.csv
download: s3://wri-projects/Aqueduct30/processData/Y2017M09D14_RH_merge_EE_results_V01/output/mergedZonalStatsEE_V12.pkl to ../../../../data/Y2017M09D15_RH_Add_Basin_Data_V01/input/mergedZonalStatsEE_V12.pkl


In [6]:
# Download only the .csv files from the HydroBasins folder: exclude everything
# first, then re-include the *.csv pattern.
!aws s3 cp {S3_INPUT_PATH_HYDROBASINS} {EC2_INPUT_PATH} --recursive --exclude "*" --include "*.csv"

download: s3://wri-projects/Aqueduct30/processData/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/output/hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V01.csv to ../../../../data/Y2017M09D15_RH_Add_Basin_Data_V01/input/hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V01.csv


In [7]:
import ast
import itertools
import logging
import multiprocessing as mp
import os
import pickle

import numpy as np
import pandas as pd

In [8]:
# Full paths of the two downloaded input files inside the EC2 input folder.
inputLocationEE, inputLocationHydroBasins = (
    os.path.join(EC2_INPUT_PATH, fileName)
    for fileName in (INPUT_FILENAME_EE, INPUT_FILENAME_HYDROBASINS)
)

In [9]:
# Load the merged zonal statistics. NOTE: unpickling executes code embedded in the
# file, so this must only ever be pointed at pickles from a trusted source.
df_ee = pd.read_pickle(inputLocationEE)

In [10]:
# Load the HydroBasins table with the upstream / downstream / FAO-name columns.
df_HydroBasins = pd.read_csv(inputLocationHydroBasins)

In [11]:
# Preview the first rows of the zonal statistics frame.
df_ee.head()

Unnamed: 0_level_0,count_Hybas06,mean_Hybas06,count_area_30s_m2,mean_area_30s_m2,total_area_30s_m2,count_IrrLinearWN_monthY2014M01,mean_IrrLinearWN_monthY2014M01,total_volume_IrrLinearWN_monthY2014M01,count_IrrLinearWN_monthY2014M04,mean_IrrLinearWN_monthY2014M04,...,total_volume_IrrLinearWW_monthY2014M04,count_PLivWW_yearY2014M12,mean_PLivWW_yearY2014M12,total_volume_PLivWW_yearY2014M12,count_PIrrWN_monthY2014M07,mean_PIrrWN_monthY2014M07,total_volume_PIrrWN_monthY2014M07,count_IrrLinearWN_monthY2014M05,mean_IrrLinearWN_monthY2014M05,total_volume_IrrLinearWN_monthY2014M05
PfafID_Hybas06,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111011,2536,111011.0,2536,743658.186761,1885917000.0,2536,3e-05,55779.511103,2536,5.9e-05,...,294146.068317,2536,1.536236e-06,2897.213351,2536,7.1e-05,134465.393295,2536,4.7e-05,89361.642352
111012,3921,111012.0,3921,746186.462653,2925797000.0,3921,0.0,0.0,3921,0.0,...,0.0,3921,7.462877e-06,21834.864422,3921,0.0,0.0,3921,0.0,0.0
111013,1194,111013.0,1194,747422.836265,892422900.0,1194,0.0,0.0,1194,0.0,...,0.0,1194,5.765556e-07,514.531435,1194,0.0,0.0,1194,0.0,0.0
111014,5605,111014.0,5605,750449.27045,4206268000.0,5605,0.0,0.0,5605,0.0,...,0.0,5605,2.387179e-06,10041.114809,5605,0.0,0.0,5605,0.0,0.0
111015,21873,111015.0,21873,758792.279231,16597060000.0,21769,0.0,0.0,21769,0.0,...,0.0,21769,3.035635e-07,5038.263451,21769,0.0,0.0,21769,0.0,0.0


In [12]:
# Preview the first rows of the HydroBasins frame.
df_HydroBasins.head()

Unnamed: 0.1,Unnamed: 0,HYBAS_ID_x,NEXT_DOWN_x,NEXT_SINK_x,MAIN_BAS_x,DIST_SINK_x,DIST_MAIN_x,SUB_AREA_x,UP_AREA_x,PFAF_ID,...,Upstream_HYBAS_IDs,Upstream_PFAF_IDs,Downstream_HYBAS_IDs,Downstream_PFAF_IDs,NEXT_SINK_PFAF,Basin_HYBAS_IDs,Basin_PFAF_IDs,SUB_NAME,MAJ_NAME,FAOid_copy
0,0,6060000010,0,6060000010,6060000010,0.0,0.0,4317.4,4317.4,611001,...,[],[],[],[],611001.0,[6060000010],[611001],"['Archipielago de San Blas Coast', 'Altrato 1']","['Caribbean Coast', 'Caribbean Coast']","['MAJ_BAS_3001_SUB_BASE_0001002', 'MAJ_BAS_300..."
1,1,6060000200,0,6060000200,6060000200,0.0,0.0,35995.5,35996.7,611002,...,[],[],[],[],611002.0,[6060000200],[611002],"['Altrato 1', 'Sucio', 'Altrato 2']","['Caribbean Coast', 'Caribbean Coast', 'Caribb...","['MAJ_BAS_3001_SUB_BASE_0001003', 'MAJ_BAS_300..."
2,2,6060000210,0,6060000210,6060000210,0.0,0.0,443.9,443.9,611003,...,[],[],[],[],611003.0,[6060000210],[611003],"['Altrato 1', 'Golfo del Darien Coast']","['Caribbean Coast', 'Caribbean Coast']","['MAJ_BAS_3001_SUB_BASE_0001003', 'MAJ_BAS_300..."
3,3,6060000240,0,6060000240,6060000240,0.0,0.0,2186.3,2186.3,611004,...,[],[],[],[],611004.0,[6060000240],[611004],['Golfo del Darien Coast'],['Caribbean Coast'],['MAJ_BAS_3001_SUB_BASE_0001006']
4,4,6060000250,0,6060000250,6060000250,0.0,0.0,6533.8,6533.8,611005,...,[],[],[],[],611005.0,[6060000250],[611005],"['Golfo del Darien Coast', 'Sinu']","['Caribbean Coast', 'Caribbean Coast']","['MAJ_BAS_3001_SUB_BASE_0001006', 'MAJ_BAS_300..."


In [None]:
# Building blocks of the column names: every sector is combined with every
# parameter and every temporal scale further down.
sectors = [
    "Dom",
    "Ind",
    "Irr",
    "IrrLinear",
    "Liv",
]
parameters = [
    "WW",
    "WN",
]
temporalScales = [
    "year",
    "month",
]
runoffparameters = [
    "runoff",
    "reducedmeanrunoff",
]

In [None]:
# One "<sector><parameter>_<temporalScale>" name per combination, e.g. "DomWW_year".
# Built in a single pass instead of re-creating the list on every iteration.
demandList = [
    "%s%s_%s" % combination
    for combination in itertools.product(sectors, parameters, temporalScales)
]

In [None]:
# Show the generated sector/parameter/temporal-scale names.
print(demandList)

In [None]:
def addUpstream(listje, df=None, parameters=None):
    """Sum area, cell counts and volumes over the upstream basins of each selected row.

    For every row whose index label is in ``listje`` the string stored in the
    ``Upstream_PFAF_IDs`` column is parsed into a list of index labels. Those
    rows are looked up in the full frame and summed into new columns:

    * ``upstream_sum_aream2``         -- ``countarea30sm2 * meanarea30sm2``
    * ``upstream_sum_count_<p>``      -- ``count<p>``
    * ``upstream_sum_volumem3_<p>``   -- area * ``mean<p>``

    for every ``<p>`` in ``parameters``.

    Args:
        listje: Index labels to process (one chunk of the index when the
            function is run through ``pool.map``).
        df: Full frame to read from, indexed by the labels used in
            ``Upstream_PFAF_IDs``. Defaults to the notebook global ``df_full``.
        parameters: Column-name suffixes to aggregate. Defaults to the notebook
            global ``parameterList``.

    Returns:
        A copy of the selected rows with the ``upstream_sum_*`` columns added
        and an ``errorCode`` column (0 = processed, 1 = the row failed; the
        failure is printed and the upstream columns stay empty for that row).

    Raises:
        NameError: if ``df`` / ``parameters`` are omitted and the corresponding
            global has not been defined.

    NOTE(review): neither ``df_full`` nor ``parameterList`` is assigned in the
    visible notebook cells, and the column names used here ("countarea30sm2")
    differ from the ones shown for df_ee ("count_area_30s_m2") -- confirm the
    intended inputs.
    """
    # Resolve the inputs outside the try-block so that a missing global fails
    # loudly once instead of being recorded as a per-row error.
    if df is None:
        df = df_full
    if parameters is None:
        parameters = parameterList

    # Boolean-mask selection already yields a new frame; one explicit copy is
    # enough to keep the caller's frame untouched.
    df_out = df[df.index.isin(listje)].copy()

    # Iterate over a snapshot of the labels because df_out gains columns in the loop.
    for i, index in enumerate(list(df_out.index), start=1):
        print("i: ", i, " index: ", index)
        try:
            upstreamCatchments = ast.literal_eval(df.loc[index, "Upstream_PFAF_IDs"])
            df_upstream = df.loc[upstreamCatchments]
            area = df_upstream["countarea30sm2"] * df_upstream["meanarea30sm2"]

            df_new = pd.DataFrame({"aream2": area})
            for parameter in parameters:
                df_new["count_" + parameter] = df_upstream["count" + parameter]
                df_new["volumem3_" + parameter] = area * df_upstream["mean" + parameter]

            for key, value in df_new.sum().items():
                df_out.loc[index, "upstream_sum_" + key] = value
            df_out.loc[index, "errorCode"] = 0
        except Exception as error:
            # Best effort per row: flag the row and keep going, but say what went wrong.
            print("error", index, repr(error))
            df_out.loc[index, "errorCode"] = 1

    return df_out

In [None]:
def addUpstream2(listje):
    """Print a running counter and the index label of every selected row.

    Selects the rows of the global ``df_full`` whose index label is in
    ``listje`` and prints one line per row. Nothing is computed or returned;
    this is a stripped-down loop skeleton.
    """
    selection = df_full[df_full.index.isin(listje)]
    for counter, index in enumerate(selection.index, start=1):
        print("i: ", counter, " index: ", index)
    

In [None]:
# Number of CPUs reported for this machine; used below both as the number of
# index chunks and as the number of worker processes.
mp.cpu_count()

In [None]:
# Show where the two input files are read from. (`inputLocation` itself is never
# defined in this notebook; the two locations below are.)
print(inputLocationEE)
print(inputLocationHydroBasins)

In [None]:
# Preview the combined frame that the upstream aggregation works on.
# NOTE(review): df_full is not assigned in any of the visible cells (only df_ee
# and df_HydroBasins are loaded) -- presumably a join of those two; add or
# confirm the cell that builds it so the notebook survives Restart & Run All.
df_full.head()

In [None]:
# Partition all index labels into one chunk per CPU. np.array_split allows the
# chunks to differ in length when the labels do not divide evenly.
indices_full = df_full.index.values
indices_split = np.array_split(indices_full, mp.cpu_count())

In [None]:
# Inspect the chunks that will be handed to the workers.
print(indices_split)

In [None]:
# Route multiprocessing's own log messages to stderr so worker activity is visible.
mp.log_to_stderr()

In [None]:
# Raise the verbosity of multiprocessing's logger to INFO.
logger = mp.get_logger()
logger.setLevel(logging.INFO)

In [None]:
# Start one worker process per CPU.
pool = mp.Pool(mp.cpu_count())

In [None]:
# Process every index chunk in a worker and stitch the partial frames together.
try:
    df_out = pd.concat(pool.map(addUpstream, indices_split))
finally:
    # Always release the worker processes, also when a chunk fails; re-create
    # the pool before running this cell again.
    pool.close()
    pool.join()

In [None]:
# Single-label debug run without the pool. Stored under its own name so that it
# does not overwrite the full parallel result in df_out.
# NOTE(review): the index labels previewed above look like 111011, so [1]
# probably selects no rows -- confirm the label intended here.
df_out_single = addUpstream([1])

In [None]:
# Preview the first rows of the result frame.
df_out.head()