In [1]:
""" Process flood risk data and store on BigQuery. 
-------------------------------------------------------------------------------

Author: Rutger Hofste
Date: 20181204
Kernel: python35
Docker: rutgerhofste/gisdocker:ubuntu16.04

"""

# --- Identification of this script and its output version --------------------
SCRIPT_NAME = "Y2018M12D04_RH_RFR_CFR_BQ_V01"
OUTPUT_VERSION = 1

# --- Input: folder on S3 and the csv file that is read from it ---------------
S3_INPUT_PATH = "s3://wri-projects/Aqueduct30/finalData/Floods"
INPUT_FILE_NAME = "flood_results.csv"

# --- Output: BigQuery project and dataset ------------------------------------
BQ_PROJECT_ID = "aqueduct30"
BQ_OUTPUT_DATASET_NAME = "aqueduct30v01"

# Zero-padded two-digit version tag ("01" for version 1), shared by the table
# name and the local folder names below.
_version_tag = "{:02.0f}".format(OUTPUT_VERSION)

# Table name: script name plus version suffix, lower-cased as a whole.
BQ_OUTPUT_TABLE_NAME = (SCRIPT_NAME + "_v" + _version_tag).lower()

# Local working folders for this script and version.
_ec2_root = "/volumes/data/" + SCRIPT_NAME
ec2_input_path = _ec2_root + "/input_V" + _version_tag
ec2_output_path = _ec2_root + "/output_V" + _version_tag

# Echo the resolved settings so they are stored with the notebook output.
print(
    "S3_INPUT_PATH: ", S3_INPUT_PATH,
    "\nec2_input_path: ", ec2_input_path,
    "\nec2_output_path: ", ec2_output_path,
    "\nBQ_OUTPUT_DATASET_NAME: ", BQ_OUTPUT_DATASET_NAME,
    "\nBQ_OUTPUT_TABLE_NAME: ", BQ_OUTPUT_TABLE_NAME,
)


S3_INPUT_PATH:  s3://wri-projects/Aqueduct30/finalData/Floods 
ec2_input_path:  /volumes/data/Y2018M12D04_RH_RFR_CFR_BQ_V01/input_V01 
ec2_output_path:  /volumes/data/Y2018M12D04_RH_RFR_CFR_BQ_V01/output_V01 
BQ_OUTPUT_DATASET_NAME:  aqueduct30v01 
BQ_OUTPUT_TABLE_NAME:  y2018m12d04_rh_rfr_cfr_bq_v01_v01


In [2]:
import time, datetime, sys
# time.strftime() falls back to the machine's *local* time when no time tuple
# is passed, which made the hard-coded "UTC" label wrong on any host whose
# timezone is not UTC. Take one explicit UTC snapshot and format both stamps
# from it so the date and the time always agree.
utc_now = time.gmtime()
dateString = time.strftime("Y%YM%mD%d", utc_now)
timeString = time.strftime("UTC %H:%M", utc_now)
# Wall-clock start; the last cell subtracts it to report the elapsed run time.
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2018M12D05 UTC 12:53


'3.5.4 |Anaconda, Inc.| (default, Nov 20 2017, 18:44:38) \n[GCC 7.2.0]'

In [3]:
# Start from clean local folders: remove whatever a previous run left behind,
# then recreate the input and output directories.
# Note: `rm -r` prints an error when a folder does not exist yet (e.g. on the
# first run); each line is a separate shell call, so the mkdir calls still run.
!rm -r {ec2_input_path}
!rm -r {ec2_output_path}
!mkdir -p {ec2_input_path}
!mkdir -p {ec2_output_path}

In [4]:
# Copy everything under the S3 input prefix into the local input folder,
# skipping objects that match 'inundationMaps/*'.
!aws s3 cp {S3_INPUT_PATH} {ec2_input_path} --recursive --exclude 'inundationMaps/*'

download: s3://wri-projects/Aqueduct30/finalData/Floods/flood_results.CPG to ../../../../data/Y2018M12D04_RH_RFR_CFR_BQ_V01/input_V01/flood_results.CPG
download: s3://wri-projects/Aqueduct30/finalData/Floods/flood_results.prj to ../../../../data/Y2018M12D04_RH_RFR_CFR_BQ_V01/input_V01/flood_results.prj
download: s3://wri-projects/Aqueduct30/finalData/Floods/flood_results.shp.xml to ../../../../data/Y2018M12D04_RH_RFR_CFR_BQ_V01/input_V01/flood_results.shp.xml
download: s3://wri-projects/Aqueduct30/finalData/Floods/flood_results.sbx to ../../../../data/Y2018M12D04_RH_RFR_CFR_BQ_V01/input_V01/flood_results.sbx
download: s3://wri-projects/Aqueduct30/finalData/Floods/flood_results.sbn to ../../../../data/Y2018M12D04_RH_RFR_CFR_BQ_V01/input_V01/flood_results.sbn
download: s3://wri-projects/Aqueduct30/finalData/Floods/flood_results.shx to ../../../../data/Y2018M12D04_RH_RFR_CFR_BQ_V01/input_V01/flood_results.shx
download: s3://wri-projects/Aqueduct30/finalData/Floods/flood_results.csv to ../

In [5]:
import os
import pandas as pd
import numpy as np
from google.cloud import bigquery

# Point the Google client libraries at the credentials file.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/.google.json"
# Reuse BQ_PROJECT_ID instead of repeating the literal project id, so the
# environment default and the client below can never drift apart.
os.environ["GOOGLE_CLOUD_PROJECT"] = BQ_PROJECT_ID
client = bigquery.Client(project=BQ_PROJECT_ID)

In [6]:
# List what was downloaded into the local input folder.
# NOTE(review): `files` is not referenced by any later cell visible here.
files = os.listdir(ec2_input_path)

In [7]:
# Full local path of the csv file to read: <input folder>/<file name>.
input_path = "/".join((ec2_input_path, INPUT_FILE_NAME))

In [8]:
# Load the flood results csv into a DataFrame (default parsing options).
df = pd.read_csv(input_path)

In [9]:
# Inspect the column types inferred by read_csv.
df.dtypes

PFAF_ID                 int64
River_pop_impacted    float64
Coast_pop_impacted    float64
pop_total             float64
RVR_raw               float64
CST_raw               float64
RVR_s                 float64
CST_s                 float64
RVR_cat                object
CST_cat                object
dtype: object

In [10]:
# Preview the first rows of the raw input.
df.head()

Unnamed: 0,PFAF_ID,River_pop_impacted,Coast_pop_impacted,pop_total,RVR_raw,CST_raw,RVR_s,CST_s,RVR_cat,CST_cat
0,111011,33829.559052,0.0,454804.93145,0.074383,0.0,4.180674,0.0,Extremely High (more than 1 in 100),"Low (0 to 9 in 1,000,000)"
1,111012,0.0,0.0,28.227839,0.0,0.0,0.0,0.0,"Low (0 to 1 in 1,000)","Low (0 to 9 in 1,000,000)"
2,111013,188.250738,0.0,11524.015199,0.016336,0.0,4.008112,0.0,Extremely High (more than 1 in 100),"Low (0 to 9 in 1,000,000)"
3,111014,0.0,0.0,30.244114,0.0,0.0,0.0,0.0,"Low (0 to 1 in 1,000)","Low (0 to 9 in 1,000,000)"
4,111015,16815.152185,7.560877,249994.816238,0.067262,3e-05,4.159506,1.332959,Extremely High (more than 1 in 100),"Low to medium (9 in 1,000,000 to 7 in 100,000)"


In [11]:
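# Column naming scheme (source name -> output name).
#
# Indicator prefix:
#   RVR -> rfr
#   CST -> cfr
#
# Suffix:
#   raw    -> raw
#   s      -> score
#   (none) -> cat    (no source column; derived from the score)
#   cat    -> label  (the source "cat" columns hold text labels)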

In [12]:
# Source column name -> output column name. Columns that are not listed keep
# their original name.
column_renames = {
    "PFAF_ID": "pfaf_id",
    "RVR_raw": "rfr_raw",
    "CST_raw": "cfr_raw",
    "RVR_s": "rfr_score",
    "CST_s": "cfr_score",
    "RVR_cat": "rfr_label",
    "CST_cat": "cfr_label",
}
df_out = df.rename(columns=column_renames)

In [13]:
# The population columns are not written to the output table; drop them.
# Rebind the name instead of using inplace=True: in-place mutation across
# cells hides state, prevents chaining and brings no performance benefit.
df_out = df_out.drop(columns=["River_pop_impacted","Coast_pop_impacted","pop_total"])

In [14]:
def score_to_category(score):
    """Convert a numeric score into an integer category.

    A score of exactly 5 is assigned category 4; every other score is
    rounded down to the nearest integer.

    Args:
        score (float): score value.
            NOTE(review): presumably in the range [0, 5] - confirm upstream.

    Returns:
        int: category number.
    """
    # Exactly 5 would floor to 5, so it is folded into category 4 instead.
    if score == 5:
        return 4
    return int(np.floor(score))

In [15]:
# Derive each integer category column from its matching score column.
for cat_column, score_column in (("rfr_cat", "rfr_score"),
                                 ("cfr_cat", "cfr_score")):
    df_out[cat_column] = df_out[score_column].apply(score_to_category)

In [16]:
# Put the columns in alphabetical order before uploading.
sorted_columns = sorted(df_out.columns)
df_out = df_out.reindex(columns=sorted_columns)

In [17]:
# Destination table in "<dataset>.<table>" form.
destination_table = BQ_OUTPUT_DATASET_NAME + "." + BQ_OUTPUT_TABLE_NAME

In [18]:
# Upload the result table; an existing table with the same name is replaced.
gbq_upload_options = {
    "destination_table": destination_table,
    "project_id": BQ_PROJECT_ID,
    "chunksize": 10000,
    "if_exists": "replace",
}
df_out.to_gbq(**gbq_upload_options)

2it [00:07,  3.96s/it]


In [19]:
# Report the run time: difference between now and `start`, which was set in
# the timestamp cell near the top of the notebook.
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

0:00:23.705739


Previous runs:  
0:00:18.766466
