In [1]:
""" Combine and simplify demand and riverdischarge dataframes.
-------------------------------------------------------------------------------

Combine the area, demand and riverdischarge dataframes and put them in a 
simplified and cleaned format. A community question has been posted at: 
https://stackoverflow.com/questions/50486168/is-it-ok-to-split-value-and-parameter-in-database/50488411#50488411

Args:
    TESTING (Boolean) : Toggle testing case.
    SCRIPT_NAME (string) : Script name.
    OUTPUT_VERSION (integer) : output version.

"""

TESTING = 0
OVERWRITE = 0
SCRIPT_NAME = "Y2018M05D23_RH_Simplify_DataFrames_Pandas_30sPfaf06_V02"
OUTPUT_VERSION = 3

DATABASE_ENDPOINT = "aqueduct30v05.cgpnumwmfcqc.eu-central-1.rds.amazonaws.com"
DATABASE_NAME = "database01"

# Area 
TABLE_NAME_AREA_30SPFAF06 = "area_m2_30spfaf06"

# Riverdischarge
S3_INPUT_PATH_RIVERDISCHARGE = "s3://wri-projects/Aqueduct30/processData/Y2018M05D16_RH_Final_Riverdischarge_30sPfaf06_V01/output_V03"

# Demand
S3_INPUT_PATH_DEMAND = "s3://wri-projects/Aqueduct30/processData/Y2018M04D22_RH_Zonal_Stats_Demand_EE_V01/output_V01"

ec2_input_path = "/volumes/data/{}/input_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION)
ec2_output_path = "/volumes/data/{}/output_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION)

print("\nInput ec2: " + ec2_input_path,
      "\nInput postGIS table area: " + TABLE_NAME_AREA_30SPFAF06 ,
      "\nInput s3 riverdischarge: " + S3_INPUT_PATH_RIVERDISCHARGE,
      "\nInput s3 demand: " + S3_INPUT_PATH_DEMAND)



Input ec2: /volumes/data/Y2018M05D23_RH_Simplify_DataFrames_Pandas_30sPfaf06_V02/input_V03 
Input postGIS table area: area_m2_30spfaf06 
Input s3 riverdischarge: s3://wri-projects/Aqueduct30/processData/Y2018M05D16_RH_Final_Riverdischarge_30sPfaf06_V01/output_V03 
Input s3 demand: s3://wri-projects/Aqueduct30/processData/Y2018M04D22_RH_Zonal_Stats_Demand_EE_V01/output_V01


In [2]:
import time, datetime, sys
dateString = time.strftime("Y%YM%mD%d")
timeString = time.strftime("UTC %H:%M")
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2018M05D23 UTC 17:45


'3.5.4 |Anaconda, Inc.| (default, Nov 20 2017, 18:44:38) \n[GCC 7.2.0]'

In [3]:
if OVERWRITE:
    !rm -r {ec2_input_path}
    !rm -r {ec2_output_path}
    !mkdir -p {ec2_input_path}
    !mkdir -p {ec2_output_path}
    !aws s3 cp {S3_INPUT_PATH_RIVERDISCHARGE} {ec2_input_path} --recursive --exclude="*" --include="*.pkl"
    !aws s3 cp {S3_INPUT_PATH_DEMAND} {ec2_input_path} --recursive --exclude="*" --include="*.pkl"

In [4]:
# imports
import re
import os
import numpy as np
import pandas as pd
import aqueduct3
from datetime import timedelta
from sqlalchemy import *
pd.set_option('display.max_columns', 500)

In [5]:
def get_area_df():
    F = open("/.password","r")
    password = F.read().splitlines()[0]
    F.close()
    
    engine = create_engine("postgresql://rutgerhofste:{}@{}:5432/{}".format(password,DATABASE_ENDPOINT,DATABASE_NAME))
    connection = engine.connect()

    if TESTING:
        query = "SELECT * FROM {} LIMIT 100".format(TABLE_NAME_AREA_30SPFAF06)
    else:
        query = "SELECT * FROM {}".format(TABLE_NAME_AREA_30SPFAF06)
    df_area = pd.read_sql(query,connection)
    return df_area

def pre_process_area(df):
    df_out = df[["pfafid_30spfaf06","area_m2_30spfaf06","count"]]
    df_out.rename(columns={"count":"area_count_30spfaf06"},inplace=True)
    df_out.set_index("pfafid_30spfaf06",inplace=True)
    return df_out


def get_file_names(file_names,temporal_resolution,year,month):
    """ Finds the filenames for riverdischarge and demand using regex.
    -------------------------------------------------------------------------------
    
    WARNING: Month is set to 1 for yearly (annual) data for riverdischarge
    whereas for demand month is set to 12. 
    
    
    Args:
        file_names (list) : list of all file names.
        temporal_resolution (string) : 'month' or 'year'
        year (integer) : year [1960:2014]
        month (integer) : month [1:12]. Not used if temporal_resolution is 'year'
    
    Returns:
        matching_file_names (dict) : dictionary with matching filenames for 
            demand and discharge.
    
    """   
    
    matching_file_names = {}    
    matching_file_names["riverdischarge"] = []
    matching_file_names["demand"] = []
    
    if temporal_resolution == "year":
        month_riverdischarge = 1
        month_demand = 12
        riverdischarge_pattern = "global_historical_combinedriverdischarge_{}_millionm3_30sPfaf06_1960_2014_I\d\d\dY{:04.0f}M{:02.0f}.pkl".format(temporal_resolution,year,month_riverdischarge)
        demand_pattern = "global_historical_P....._{}_m_5min_1960_2014_I\d\d\dY{:04.0f}M{:02.0f}_reduced_06_30s_mean.pkl".format(temporal_resolution,year,month_demand)      
    else:
        riverdischarge_pattern = "global_historical_combinedriverdischarge_{}_millionm3_30sPfaf06_1960_2014_I\d\d\dY{:04.0f}M{:02.0f}.pkl".format(temporal_resolution,year,month)
        demand_pattern = "global_historical_P....._{}_m_5min_1960_2014_I\d\d\dY{:04.0f}M{:02.0f}_reduced_06_30s_mean.pkl".format(temporal_resolution,year,month)

    for file_name in file_names:
        if re.search(riverdischarge_pattern,file_name):
            matching_file_names["riverdischarge"].append(file_name)
        elif re.search(demand_pattern,file_name):
            matching_file_names["demand"].append(file_name)
    return matching_file_names

def pre_process_df(df):
    """ rename dataframe column and drastically simplify dataframe.
    -------------------------------------------------------------------------------
    
    The column name will be in format: 
    domww_m_30spfaf06    
    {indicator}_{unit}_{spatial_aggregation}
    
    The temporal resolution is not added to the schema.   
        
    Args:
        df (pd.DataFrame) : input dataframe.
    
    Returns:
        df_out (pd.DataFrame) : 
    
    """
    
    df_in = df.copy()
    
    indicator = df_in.loc[0]["indicator"].lower()
    unit = df_in.loc[0]["unit"].lower()
    zones_spatial_resolution = df_in.loc[0]["zones_spatial_resolution"]
    zones_pfaf_level = df_in.loc[0]["zones_pfaf_level"]    
    
    new_indicator_name = "{}_{}_{}pfaf{:02.0f}".format(indicator,unit,zones_spatial_resolution,zones_pfaf_level)
    new_count_name = "{}_count_{}pfaf{:02.0f}".format(indicator,zones_spatial_resolution,zones_pfaf_level)
    new_zones_name = "pfafid_{}pfaf{:02.0f}".format(zones_spatial_resolution,zones_pfaf_level)
    
    df_out = df_in[["zones","count","mean","temporal_resolution","year","month"]]
    df_out.rename(columns={"mean":new_indicator_name,
                           "count":new_count_name,
                           "zones":new_zones_name},
                  inplace=True)
    
    df_out[new_zones_name] = df_out[new_zones_name].astype(np.int64)
    df_out.set_index(new_zones_name,inplace=True)
    df_out.sort_index(axis=1, inplace=True)
    return df_out



In [6]:
df_area = get_area_df()
df_area = pre_process_area(df_area)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [7]:
df_area.head()

Unnamed: 0_level_0,area_m2_30spfaf06,area_count_30spfaf06
pfafid_30spfaf06,Unnamed: 1_level_1,Unnamed: 2_level_1
111011,1885917000.0,2536
111012,2925797000.0,3921
111013,892422900.0,1194
111014,4206268000.0,5605
111015,16597060000.0,21873


In [8]:
file_names = os.listdir(ec2_input_path)

In [9]:
temporal_resolutions = ["year","month"]
years = range(1960,2014+1)
months = range(1,12+1)

In [10]:
if TESTING:
    temporal_resolutions = ["month"]
    years = range(1960,1961)
    months = range(1,3)
    

In [None]:
i = 0
start_time = time.time()
for temporal_resolution in temporal_resolutions:
    for year in years:
        for month in months:
            i = i + 1
            elapsed_time = time.time() - start_time 
            print("Index: {:03.0f} Elapsed: {}".format(i, timedelta(seconds=elapsed_time)))
            print(i,temporal_resolution,year,month)
            matching_file_names = get_file_names(file_names,temporal_resolution,year,month)            
            
            
            df_merged = df_area.copy()
            for indicator, matching_file_names in matching_file_names.items():   
                for matching_file_name in matching_file_names:    
                    file_path = "{}/{}".format(ec2_input_path,matching_file_name)
                    df = pd.read_pickle(file_path)   

                    if indicator == "riverdischarge":
                        df.rename(columns={"count_mainchannel":"count",
                                           "riverdischarge_millionm3":"mean",
                                           "year_mainchannel":"year",
                                           "month_mainchannel":"month",
                                           "temporal_resolution_mainchannel":"temporal_resolution",
                                           "indicator_mainchannel":"indicator",
                                           "unit_mainchannel":"unit",
                                           "zones_spatial_resolution_mainchannel":"zones_spatial_resolution",
                                           "zones_pfaf_level_mainchannel":"zones_pfaf_level"},
                                  inplace = True)  


                    elif indicator == "demand":
                        pass
                    df_cleaned = pre_process_df(df)
                    
                    
                    df_merged = df_merged.merge(right= df_cleaned,
                                                 how="left",
                                                 left_index =True,
                                                 right_index = True,
                                                 suffixes = ["","_duplicate"])
                    
                    try:
                        df_merged = df_merged.drop(columns = ["month_duplicate",
                                                              "temporal_resolution_duplicate",
                                                              "year_duplicate"] ) 
                    except:
                        pass

            
            df_merged["riverdischarge_m_30spfaf06"] = (df_merged["riverdischarge_millionm3_30spfaf06"] * 1e6) / df_merged["area_m2_30spfaf06"]
            df_merged.drop(columns=["riverdischarge_millionm3_30spfaf06"])
            df_merged.sort_index(axis=1, inplace=True)

            output_file_name = "global_historical_merged_{}_millionm3_30sPfaf06_1960_2014_Y{:04.0f}M{:02.0f}.pkl".format(temporal_resolution,year,month)
            output_path = "{}/{}".format(ec2_output_path,output_file_name)
            df_merged.to_pickle(output_path)
            
            

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Index: 001 Elapsed: 0:00:00.000286
1 year 1960 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Index: 002 Elapsed: 0:00:01.317194
2 year 1960 2
Index: 003 Elapsed: 0:00:02.626110
3 year 1960 3
Index: 004 Elapsed: 0:00:03.943249
4 year 1960 4
Index: 005 Elapsed: 0:00:05.264024
5 year 1960 5
Index: 006 Elapsed: 0:00:06.562026
6 year 1960 6
Index: 007 Elapsed: 0:00:07.885769
7 year 1960 7
Index: 008 Elapsed: 0:00:09.254746
8 year 1960 8
Index: 009 Elapsed: 0:00:10.524897
9 year 1960 9
Index: 010 Elapsed: 0:00:11.818077
10 year 1960 10
Index: 011 Elapsed: 0:00:13.168514
11 year 1960 11
Index: 012 Elapsed: 0:00:14.426456
12 year 1960 12
Index: 013 Elapsed: 0:00:15.719591
13 year 1961 1
Index: 014 Elapsed: 0:00:16.980692
14 year 1961 2
Index: 015 Elapsed: 0:00:18.331683
15 year 1961 3
Index: 016 Elapsed: 0:00:19.682685
16 year 1961 4
Index: 017 Elapsed: 0:00:20.951616
17 year 1961 5
Index: 018 Elapsed: 0:00:22.236320
18 year 1961 6
Index: 019 Elapsed: 0:00:23.545655
19 year 1961 7
Index: 020 Elapsed: 0:00:24.845431
20 year 1961 8
Index: 021 Elapsed: 0:00:26.117218
21 year 1961 9
Index

Index: 164 Elapsed: 0:03:33.401136
164 year 1973 8
Index: 165 Elapsed: 0:03:34.803535
165 year 1973 9
Index: 166 Elapsed: 0:03:36.209910
166 year 1973 10
Index: 167 Elapsed: 0:03:37.591347
167 year 1973 11
Index: 168 Elapsed: 0:03:39.016385
168 year 1973 12
Index: 169 Elapsed: 0:03:40.409502
169 year 1974 1
Index: 170 Elapsed: 0:03:41.786350
170 year 1974 2
Index: 171 Elapsed: 0:03:43.218795
171 year 1974 3
Index: 172 Elapsed: 0:03:44.578712
172 year 1974 4
Index: 173 Elapsed: 0:03:45.896891
173 year 1974 5
Index: 174 Elapsed: 0:03:47.173125
174 year 1974 6
Index: 175 Elapsed: 0:03:48.515253
175 year 1974 7
Index: 176 Elapsed: 0:03:49.864484
176 year 1974 8
Index: 177 Elapsed: 0:03:51.179421
177 year 1974 9
Index: 178 Elapsed: 0:03:52.477783
178 year 1974 10
Index: 179 Elapsed: 0:03:53.812634
179 year 1974 11
Index: 180 Elapsed: 0:03:55.159235
180 year 1974 12
Index: 181 Elapsed: 0:03:56.525813
181 year 1975 1
Index: 182 Elapsed: 0:03:57.913785
182 year 1975 2
Index: 183 Elapsed: 0:03:

In [None]:
!aws s3 cp {ec2_output_path} {s3_output_path} --recursive

In [None]:
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

Previous Runs:
