In [1]:
""" Cleanup, add category and label for icep at gadm level 1.
-------------------------------------------------------------------------------

Author: Rutger Hofste
Date: 20190107
Kernel: python35
Docker: rutgerhofste/gisdocker:ubuntu16.04

Args:
    TESTING (Boolean) : Toggle testing case.
    SCRIPT_
    NAME (string) : Script name.
    OUTPUT_VERSION (integer) : output version.
    DATABASE_ENDPOINT (string) : RDS or postGreSQL endpoint.
    DATABASE_NAME (string) : Database name.
    TABLE_NAME_AREA_30SPFAF06 (string) : Table name used for areas. Must exist
        on same database as used in rest of script.
    S3_INPUT_PATH_RIVERDISCHARGE (string) : AWS S3 input path for 
        riverdischarge.    
    S3_INPUT_PATH_DEMAND (string) : AWS S3 input path for 
        demand.     

"""

TESTING = 0
SCRIPT_NAME = "Y2019M01D07_RH_GA_CEP_GADM_Cat_Label_BQ_V01"
OUTPUT_VERSION = 1

COUNT_THRESHOLD = 1000 #(icepbasin cellsize 60km )

NODATA_VALUE = -9999

S3_INPUT_PATH = "s3://wri-projects/Aqueduct30/processData/Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01/output_V01/"

BQ_PROJECT_ID = "aqueduct30"
BQ_INPUT_LINK_TABLE_NAME = "y2018m11d12_rh_gadm36_level1_to_rds_v01_v04"
BQ_OUTPUT_DATASET_NAME = "aqueduct30v01"
BQ_OUTPUT_TABLE_NAME = "{}_v{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION).lower()

ec2_input_path = "/volumes/data/{}/input_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION) 
ec2_output_path = "/volumes/data/{}/output_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION) 

s3_output_path = "s3://wri-projects/Aqueduct30/processData/{}/output_V{:02.0f}/".format(SCRIPT_NAME,OUTPUT_VERSION)

print("S3_INPUT_PATH: ",S3_INPUT_PATH,
      "\nec2_input_path: ",ec2_input_path,
      "\nBQ_OUTPUT_DATASET_NAME: ", BQ_OUTPUT_DATASET_NAME,
      "\nBQ_OUTPUT_TABLE_NAME: ",BQ_OUTPUT_TABLE_NAME,
      "\ns3_output_path:",s3_output_path
      )

S3_INPUT_PATH:  s3://wri-projects/Aqueduct30/processData/Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01/output_V01/ 
ec2_input_path:  /volumes/data/Y2019M01D07_RH_GA_CEP_GADM_Cat_Label_BQ_V01/input_V01 
BQ_OUTPUT_DATASET_NAME:  aqueduct30v01 
BQ_OUTPUT_TABLE_NAME:  y2019m01d07_rh_ga_cep_gadm_cat_label_bq_v01_v01 
s3_output_path: s3://wri-projects/Aqueduct30/processData/Y2019M01D07_RH_GA_CEP_GADM_Cat_Label_BQ_V01/output_V01/


In [2]:
import time, datetime, sys
dateString = time.strftime("Y%YM%mD%d")
timeString = time.strftime("UTC %H:%M")
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2019M01D08 UTC 10:21


'3.5.4 |Anaconda, Inc.| (default, Nov 20 2017, 18:44:38) \n[GCC 7.2.0]'

In [3]:
!rm -r {ec2_input_path}
!rm -r {ec2_output_path}
!mkdir -p {ec2_input_path}
!mkdir -p {ec2_output_path}

In [4]:
!aws s3 cp {S3_INPUT_PATH} {ec2_input_path} --recursive

download: s3://wri-projects/Aqueduct30/processData/Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01/output_V01/df_gadm36_l1_30s.pkl to ../../../../data/Y2019M01D07_RH_GA_CEP_GADM_Cat_Label_BQ_V01/input_V01/df_gadm36_l1_30s.pkl
download: s3://wri-projects/Aqueduct30/processData/Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01/output_V01/df_gadm36_l1_30s.csv to ../../../../data/Y2019M01D07_RH_GA_CEP_GADM_Cat_Label_BQ_V01/input_V01/df_gadm36_l1_30s.csv


In [5]:
import os
import pandas as pd
import numpy as np
from google.cloud import bigquery

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/.google.json"
os.environ["GOOGLE_CLOUD_PROJECT"] = "aqueduct30"
client = bigquery.Client(project=BQ_PROJECT_ID)

In [6]:
def normalize_score(row):
    if row <= -5:
        minV, maxV, addV = icep_min, -5, 0
    elif row <= 0:
        minV, maxV, addV = -5, -1, 0
    elif row <= 1:
        minV, maxV, addV = 0, 1, 2
    elif row <= 5:
        minV, maxV, addV = 1, 5, 3
    else:
        minV, maxV, addV = 5, icep_max, 4

    # Normalize score base on class bounds
    score = (row - minV) / (maxV - minV) + addV
    # Fix scores less than 0 or great than 5
    final_score = np.where(score < 0, 0, np.where(score > 5, 5, score))
    return final_score

def category_to_label(row):
    if row < -9998:
        cat = "No Data"
    elif row < 1:
        cat = "Low (< -5)"
    elif row < 2:
        cat = "Low to medium (-5 to 0)"
    elif row < 3:
        cat = "Medium to high (0 to +1)"
    elif row < 4:
        cat = "High (+1 to +5)"
    elif row <= 5:
        cat = "Extremely High (> +5)"
    else:
        cat = "Error"
    return cat

def label_to_category(row):
    if row == "Low (< -5)":
        cat = 0
    elif row == "Low to medium (-5 to 0)":
        cat = 1
    elif row == "Medium to high (0 to +1)":
        cat = 2
    elif row == "High (+1 to +5)":
        cat = 3
    elif row == "Extremely High (> +5)":
        cat = 4
    else:
        cat = -9999
    return cat

In [7]:
files = os.listdir(ec2_input_path)

In [8]:
files

['df_gadm36_l1_30s.pkl', 'df_gadm36_l1_30s.csv']

In [9]:
input_file_path = "{}/df_gadm36_l1_30s.pkl".format(ec2_input_path)

In [10]:
df = pd.read_pickle(input_file_path)

In [11]:
df.head()

Unnamed: 0,count,mean,zones,output_version,parameter,reducer,script_used,spatial_aggregation,spatial_resolution,unit
0,65158,0.617027,0,1,icep_raw,mean,Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01,gadm_36_L01,30s,dimensionless
1,29861,-0.160649,1,1,icep_raw,mean,Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01,gadm_36_L01,30s,dimensionless
2,30924,0.685177,2,1,icep_raw,mean,Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01,gadm_36_L01,30s,dimensionless
3,25573,0.696274,3,1,icep_raw,mean,Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01,gadm_36_L01,30s,dimensionless
4,20405,0.42411,4,1,icep_raw,mean,Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01,gadm_36_L01,30s,dimensionless


In [12]:
df.zones = df.zones.astype(np.int64)

In [13]:
df = df.rename(columns={"mean":"cep_raw",
                        "zones":"gid_1_id"})

In [14]:
sql = """
SELECT
  gid_1_id,
  gid_1
FROM
  `{}.{}.{}`
""".format(BQ_PROJECT_ID,BQ_OUTPUT_DATASET_NAME,BQ_INPUT_LINK_TABLE_NAME)

In [15]:
df_link = pd.read_gbq(query=sql,
                      dialect="standard")

In [16]:
df_link.head()

Unnamed: 0,gid_1_id,gid_1
0,0,AFG.1_1
1,1,AFG.2_1
2,2,AFG.3_1
3,3,AFG.4_1
4,4,AFG.5_1


In [17]:
df.head()

Unnamed: 0,count,cep_raw,gid_1_id,output_version,parameter,reducer,script_used,spatial_aggregation,spatial_resolution,unit
0,65158,0.617027,0,1,icep_raw,mean,Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01,gadm_36_L01,30s,dimensionless
1,29861,-0.160649,1,1,icep_raw,mean,Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01,gadm_36_L01,30s,dimensionless
2,30924,0.685177,2,1,icep_raw,mean,Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01,gadm_36_L01,30s,dimensionless
3,25573,0.696274,3,1,icep_raw,mean,Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01,gadm_36_L01,30s,dimensionless
4,20405,0.42411,4,1,icep_raw,mean,Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01,gadm_36_L01,30s,dimensionless


In [18]:
df_link.shape

(3610, 2)

In [19]:
df.shape

(3603, 10)

In [20]:
df_merged = pd.merge(left=df,
                     right=df_link,
                     how="left",
                     left_on="gid_1_id",
                     right_on="gid_1_id")

In [21]:
df_merged.set_index(keys=["gid_1"],
                    drop = False,
                    inplace = True)

In [22]:
df_merged.head()

Unnamed: 0_level_0,count,cep_raw,gid_1_id,output_version,parameter,reducer,script_used,spatial_aggregation,spatial_resolution,unit,gid_1
gid_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AFG.1_1,65158,0.617027,0,1,icep_raw,mean,Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01,gadm_36_L01,30s,dimensionless,AFG.1_1
AFG.2_1,29861,-0.160649,1,1,icep_raw,mean,Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01,gadm_36_L01,30s,dimensionless,AFG.2_1
AFG.3_1,30924,0.685177,2,1,icep_raw,mean,Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01,gadm_36_L01,30s,dimensionless,AFG.3_1
AFG.4_1,25573,0.696274,3,1,icep_raw,mean,Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01,gadm_36_L01,30s,dimensionless,AFG.4_1
AFG.5_1,20405,0.42411,4,1,icep_raw,mean,Y2019M01D07_RH_GA_CEP_Zonal_Stats_GADM_EE_V01,gadm_36_L01,30s,dimensionless,AFG.5_1


In [23]:
icep_min = df["cep_raw"].min()
icep_max = df["cep_raw"].max()

In [24]:
icep_min

-167.17104857680263

In [25]:
icep_max

93.858630351800002

In [26]:
df_merged["cep_raw"] = df_merged["cep_raw"].fillna(-9999.0)

In [27]:
df_merged["cep_score"] = df_merged["cep_raw"].apply(lambda x: normalize_score(x))

In [28]:
# Replace nodata scores with NoData value
df_merged["cep_score"][df_merged["cep_raw"] <-9998 ] = NODATA_VALUE

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [29]:
df_merged["cep_label"] = df_merged["cep_score"].apply(lambda x: category_to_label(x))

In [30]:
df_merged["cep_cat"] = df_merged["cep_label"].apply(lambda x: label_to_category(x))

In [31]:
df_merged = df_merged.drop(columns=["output_version","reducer","script_used","spatial_aggregation","spatial_resolution","unit","parameter"])

In [32]:
df_merged.columns = df_merged.columns.str.lower()

In [33]:
df_merged.head()

Unnamed: 0_level_0,count,cep_raw,gid_1_id,gid_1,cep_score,cep_label,cep_cat
gid_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AFG.1_1,65158,0.617027,0,AFG.1_1,2.617027,Medium to high (0 to +1),2
AFG.2_1,29861,-0.160649,1,AFG.2_1,1.209838,Low to medium (-5 to 0),1
AFG.3_1,30924,0.685177,2,AFG.3_1,2.685177,Medium to high (0 to +1),2
AFG.4_1,25573,0.696274,3,AFG.4_1,2.696274,Medium to high (0 to +1),2
AFG.5_1,20405,0.42411,4,AFG.5_1,2.42411,Medium to high (0 to +1),2


In [34]:
df_merged.sort_index(axis=1,
                     inplace=True)

In [35]:
destination_table = "{}.{}".format(BQ_OUTPUT_DATASET_NAME,BQ_OUTPUT_TABLE_NAME)

In [36]:
df_merged.to_gbq(destination_table=destination_table,
                 project_id=BQ_PROJECT_ID,
                 chunksize=10000,
                 if_exists="replace")

1it [00:04,  4.70s/it]


In [37]:
output_file_path_pkl = "{}/cep_cat_label.pkl".format(ec2_output_path)
output_file_path_csv = "{}/cep_cat_label.csv".format(ec2_output_path)
df_merged.to_pickle(output_file_path_pkl)
df_merged.to_csv(output_file_path_csv,encoding='utf-8')

In [38]:
!aws s3 cp  {ec2_output_path} {s3_output_path} --recursive

upload: ../../../../data/Y2019M01D07_RH_GA_CEP_GADM_Cat_Label_BQ_V01/output_V01/cep_cat_label.csv to s3://wri-projects/Aqueduct30/processData/Y2019M01D07_RH_GA_CEP_GADM_Cat_Label_BQ_V01/output_V01/cep_cat_label.csv
upload: ../../../../data/Y2019M01D07_RH_GA_CEP_GADM_Cat_Label_BQ_V01/output_V01/cep_cat_label.pkl to s3://wri-projects/Aqueduct30/processData/Y2019M01D07_RH_GA_CEP_GADM_Cat_Label_BQ_V01/output_V01/cep_cat_label.pkl


In [39]:
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

0:00:19.170429


Previous runs:  
0:00:19.839925
