In [1]:
""" Merge, cleanup, add category and label for drought risk.
-------------------------------------------------------------------------------

Author: Rutger Hofste
Date: 201809028
Kernel: python35
Docker: rutgerhofste/gisdocker:ubuntu16.04

Args:

Result:
    Table on Google Bigquery.


"""

SCRIPT_NAME = "Y2018M09D28_RH_DR_Cat_Label_V01"
OUTPUT_VERSION = 2

GCS_INPUT_PATH = "gs://aqueduct30_v01/Y2018M09D28_RH_DR_Zonal_Stats_EE_V01/output_V01/"

BQ_PROJECT_ID = "aqueduct30"
BQ_OUTPUT_DATASET_NAME = "aqueduct30v01"
BQ_OUTPUT_TABLE_NAME = "{}_v{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION).lower()

ec2_input_path = "/volumes/data/{}/output_V{:02.0f}".format(SCRIPT_NAME,OUTPUT_VERSION) 

print("GCS_INPUT_PATH: ",GCS_INPUT_PATH,
      "\nec2_input_path: ",ec2_input_path,
      "\nBQ_OUTPUT_DATASET_NAME: ", BQ_OUTPUT_DATASET_NAME,
      "\nBQ_OUTPUT_TABLE_NAME: ",BQ_OUTPUT_TABLE_NAME
      )

GCS_INPUT_PATH:  gs://aqueduct30_v01/Y2018M09D28_RH_DR_Zonal_Stats_EE_V01/output_V01/ 
ec2_input_path:  /volumes/data/Y2018M09D28_RH_DR_Cat_Label_V01/output_V02 
BQ_OUTPUT_DATASET_NAME:  aqueduct30v01 
BQ_OUTPUT_TABLE_NAME:  y2018m09d28_rh_dr_cat_label_v01_v02


In [2]:
import time, datetime, sys
dateString = time.strftime("Y%YM%mD%d")
timeString = time.strftime("UTC %H:%M")
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2018M10D01 UTC 10:28


'3.5.4 |Anaconda, Inc.| (default, Nov 20 2017, 18:44:38) \n[GCC 7.2.0]'

In [3]:
!rm -r {ec2_input_path}
!mkdir -p {ec2_input_path}

rm: cannot remove '/volumes/data/Y2018M09D28_RH_DR_Cat_Label_V01/output_V02': No such file or directory


In [4]:
!gsutil -m cp {GCS_INPUT_PATH}* {ec2_input_path}

Copying gs://aqueduct30_v01/Y2018M09D28_RH_DR_Zonal_Stats_EE_V01/output_V01/exposureee_export.csv...
Copying gs://aqueduct30_v01/Y2018M09D28_RH_DR_Zonal_Stats_EE_V01/output_V01/hazardee_export.csv...
Copying gs://aqueduct30_v01/Y2018M09D28_RH_DR_Zonal_Stats_EE_V01/output_V01/riskee_export.csv...
Copying gs://aqueduct30_v01/Y2018M09D28_RH_DR_Zonal_Stats_EE_V01/output_V01/vulnerabilityee_export.csv...
- [4/4 files][  7.9 MiB/  7.9 MiB] 100% Done                                    
Operation completed over 4 objects/7.9 MiB.                                      


In [5]:
import os
import pandas as pd
import numpy as np
from google.cloud import bigquery

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/.google.json"
os.environ["GOOGLE_CLOUD_PROJECT"] = "aqueduct30"
client = bigquery.Client(project=BQ_PROJECT_ID)

In [6]:
files = os.listdir(ec2_input_path)

In [7]:
files

['hazardee_export.csv',
 'riskee_export.csv',
 'exposureee_export.csv',
 'vulnerabilityee_export.csv']

In [8]:
def raw_value_to_score(x):
    """ input is already [0-1]
    mapping to [0-5]
    
    """
    if x == -9999:
        y = -9999
    else:
        y = 5 * x
    return y


def raw_value_to_score_vulnerability(x):
    """ Applying quantile approach as suggested by email from Gustavo Naumann
    
    """
    if x == -9999:
        y = -9999
    elif x<0.45:
        y = max(x/0.16-(0.29/0.16),0)
    elif (x >= 0.45) and ( x < 0.72):
        y = (1/0.27)*x - (2/3)
    elif (x >= 0.72) and ( x < 0.75):
        y = (1/0.03)*x -22 
    elif (x >= 0.75) and ( x < 0.84):
        y = (1/0.09)*x -(16/3)
    elif (x >= 0.84):
        y = min(5, (1/0.16)*x-(10/8)) 
    return y


def score_to_category(score):
    if score != 5:
        cat = int(np.floor(score))
    else:
        cat = 4
    return cat

def category_to_label(cat):
    if cat == -9999:
        label = "NoData"
    elif cat == 0:
        label = "Low"
    elif cat == 1:
        label = "Low - Medium"
    elif cat == 2:
        label = "Medium"
    elif cat == 3:
        label = "Medium - High"
    elif cat == 4: 
        label = "High"
    else:
        label = "Error"
    return label

In [9]:
d_out = {}
df_merge = pd.DataFrame(columns=['PFAF_ID']) 
for one_file in files:
    print(one_file)
    file_name, extension = one_file.split(".")
    parameter = file_name[:-9] # remove ee_export
    
    
    input_file_path = "{}/{}".format(ec2_input_path,one_file)
    df = pd.read_csv(input_file_path)
    df.drop_duplicates(subset="PFAF_ID",
                       keep="first",
                       inplace=True)
    df = df.fillna(-9999)
    df_out = df[["PFAF_ID","mean","count"]]
    df_out = df_out.rename(columns={"mean":"drought{}_dimensionless".format(parameter),
                           "count":"drought{}_count".format(parameter)})
    if one_file == "vulnerabilityee_export.csv":
        df_out_valid = df_out.loc[df_out["drought{}_dimensionless".format(parameter)]>=0]
        q = df_out_valid["drought{}_dimensionless".format(parameter)].quantile(q=[0,0.2,0.4,0.6,0.8,1])
        df_out["drought{}_score".format(parameter)] = df_out["drought{}_dimensionless".format(parameter)].apply(raw_value_to_score_vulnerability)
        print(q)
    else:
        df_out["drought{}_score".format(parameter)] = df_out["drought{}_dimensionless".format(parameter)].apply(raw_value_to_score)
    
    df_out["drought{}_cat".format(parameter)] = df_out["drought{}_score".format(parameter)].apply(score_to_category)
    df_out["drought{}_label".format(parameter)] = df_out["drought{}_cat".format(parameter)].apply(category_to_label)
    
    df_merge = df_merge.merge(right=df_out,how="outer",on="PFAF_ID")


hazardee_export.csv
riskee_export.csv
exposureee_export.csv
vulnerabilityee_export.csv
0.0    0.298089
0.2    0.449033
0.4    0.721990
0.6    0.751741
0.8    0.843906
1.0    1.000000
Name: droughtvulnerability_dimensionless, dtype: float64


In [10]:
q

0.0    0.298089
0.2    0.449033
0.4    0.721990
0.6    0.751741
0.8    0.843906
1.0    1.000000
Name: droughtvulnerability_dimensionless, dtype: float64

In [11]:
df_merge.head()

Unnamed: 0,PFAF_ID,droughthazard_dimensionless,droughthazard_count,droughthazard_score,droughthazard_cat,droughthazard_label,droughtrisk_dimensionless,droughtrisk_count,droughtrisk_score,droughtrisk_cat,...,droughtexposure_dimensionless,droughtexposure_count,droughtexposure_score,droughtexposure_cat,droughtexposure_label,droughtvulnerability_dimensionless,droughtvulnerability_count,droughtvulnerability_score,droughtvulnerability_cat,droughtvulnerability_label
0,635900,0.356721,408,1.783604,1,Low - Medium,0.45932,320,2.296601,2,...,0.127156,320,0.63578,0,Low,0.73217,320,2.405674,2,Medium
1,635804,0.34713,58,1.735648,1,Low - Medium,0.447877,58,2.239387,2,...,0.113867,58,0.569333,0,Low,0.733205,58,2.440163,2,Medium
2,635807,0.359986,76,1.799929,1,Low - Medium,0.440631,76,2.203156,2,...,0.103049,76,0.515246,0,Low,0.736576,76,2.552544,2,Medium
3,635809,0.299776,341,1.498881,1,Low - Medium,0.492612,341,2.463059,2,...,0.19902,341,0.995099,0,Low,0.726616,341,2.220544,2,Medium
4,635808,0.305777,138,1.528887,1,Low - Medium,0.412117,138,2.060587,2,...,0.094684,138,0.473421,0,Low,0.739183,138,2.639447,2,Medium


In [12]:
destination_table = "{}.{}".format(BQ_OUTPUT_DATASET_NAME,BQ_OUTPUT_TABLE_NAME)

In [13]:
destination_table

'aqueduct30v01.y2018m09d28_rh_dr_cat_label_v01_v02'

In [14]:
df_merge.to_gbq(destination_table=destination_table,
                 project_id=BQ_PROJECT_ID,
                 chunksize=10000,
                 if_exists="replace")

2it [00:06,  3.46s/it]


In [15]:
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

0:00:21.379444


Previous runs:  
0:00:23.124523
