In [1]:
""" Process and upload industry weights to BigQuery.
-------------------------------------------------------------------------------

The notebook copies the weights CSV from S3 to a local working directory,
reads it with pandas and writes it to a BigQuery table, replacing any
existing table with the same name.

Author: Rutger Hofste
Date: 20181206
Kernel: python35
Docker: rutgerhofste/gisdocker:ubuntu16.04

"""

# --- Run identity -------------------------------------------------------------
SCRIPT_NAME = "Y2018M12D06_RH_Process_Weights_BQ_V01"
OUTPUT_VERSION = 1
# Shared positional arguments for every name/path template below.
_run_id = (SCRIPT_NAME, OUTPUT_VERSION)

# --- Source: weights CSV on S3 ------------------------------------------------
S3_INPUT_PATH = "s3://wri-projects/Aqueduct30/processData/Y2018M12D07_RH_Weights_V01/output"
INPUT_FILE_NAME = "aq30_weights_enhanced.csv"

# --- Destination: BigQuery ----------------------------------------------------
BQ_PROJECT_ID = "aqueduct30"
BQ_OUTPUT_DATASET_NAME = "aqueduct30v01"
# Lower-cased script name followed by a zero-padded version suffix.
BQ_OUTPUT_TABLE_NAME = "{}_v{:02.0f}".format(*_run_id).lower()

# --- Local working directories (versioned per script) -------------------------
ec2_input_path = "/volumes/data/{}/input_V{:02.0f}".format(*_run_id)
ec2_output_path = "/volumes/data/{}/output_V{:02.0f}".format(*_run_id)

# Echo the resolved configuration so it is recorded in the cell output.
print(
    "S3_INPUT_PATH: ", S3_INPUT_PATH,
    "\nec2_input_path: ", ec2_input_path,
    "\nec2_output_path: ", ec2_output_path,
    "\nBQ_OUTPUT_DATASET_NAME: ", BQ_OUTPUT_DATASET_NAME,
    "\nBQ_OUTPUT_TABLE_NAME: ", BQ_OUTPUT_TABLE_NAME,
)

S3_INPUT_PATH:  s3://wri-projects/Aqueduct30/processData/Y2018M12D07_RH_Weights_V01/output 
ec2_input_path:  /volumes/data/Y2018M12D06_RH_Process_Weights_BQ_V01/input_V01 
ec2_output_path:  /volumes/data/Y2018M12D06_RH_Process_Weights_BQ_V01/output_V01 
BQ_OUTPUT_DATASET_NAME:  aqueduct30v01 
BQ_OUTPUT_TABLE_NAME:  y2018m12d06_rh_process_weights_bq_v01_v01


In [2]:
import time, datetime, sys
# time.strftime() formats *local* time unless it is given an explicit time
# tuple, so take one UTC snapshot and use it for both strings; otherwise the
# "UTC" label is wrong on any host whose timezone is not UTC.
utc_now = time.gmtime()
dateString = time.strftime("Y%YM%mD%d", utc_now)
timeString = time.strftime("UTC %H:%M", utc_now)
# Reference point for the elapsed-time report in the last cell.
start = datetime.datetime.now()
print(dateString,timeString)
# Displayed as the cell result to record the interpreter version of this run.
sys.version

Y2018M12D07 UTC 15:23


'3.5.4 |Anaconda, Inc.| (default, Nov 20 2017, 18:44:38) \n[GCC 7.2.0]'

In [3]:
!rm -r {ec2_input_path}
!rm -r {ec2_output_path}
!mkdir -p {ec2_input_path}
!mkdir -p {ec2_output_path}

In [4]:
# Copy everything under the S3 input prefix into the local input directory
# (--recursive copies the whole prefix rather than a single object).
!aws s3 cp {S3_INPUT_PATH} {ec2_input_path} --recursive 

Completed 18.2 KiB/18.2 KiB (38.2 KiB/s) with 1 file(s) remainingdownload: s3://wri-projects/Aqueduct30/processData/Y2018M12D07_RH_Weights_V01/output/aq30_weights_enhanced.csv to ../../../../data/Y2018M12D06_RH_Process_Weights_BQ_V01/input_V01/aq30_weights_enhanced.csv


In [5]:
import os
import pandas as pd
import numpy as np
from google.cloud import bigquery

# Point Google's client libraries at the credentials JSON.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/.google.json"
# Use the single BQ_PROJECT_ID constant everywhere so the environment default,
# the client and the upload cell cannot drift apart.
os.environ["GOOGLE_CLOUD_PROJECT"] = BQ_PROJECT_ID
client = bigquery.Client(project=BQ_PROJECT_ID)

In [6]:
# Location of the downloaded weights CSV inside the local input directory.
# os.path.join handles the separator instead of hand-built string formatting.
input_path = os.path.join(ec2_input_path, INPUT_FILE_NAME)

In [7]:
# Load the downloaded weights CSV into a DataFrame (pandas default parsing options).
df = pd.read_csv(input_path)

In [8]:
# Show the column dtypes that read_csv inferred.
df.dtypes

id                         int64
category_full_name        object
category_short            object
indicator_name_full       object
indicator_short           object
sector_full               object
sector_short              object
weight_abs               float64
weight_label              object
weight_interpretation     object
weight_fraction          float64
dtype: object

In [9]:
# Preview the first rows before uploading.
df.head()

Unnamed: 0,id,category_full_name,category_short,indicator_name_full,indicator_short,sector_full,sector_short,weight_abs,weight_label,weight_interpretation,weight_fraction
0,1,Physical Risk Quantity,QAN,Baseline water stress,BWS,Default,DEF,4.0,Very High,Represents very high risk to the industry,0.163265
1,2,Physical Risk Quantity,QAN,Baseline water depletion,BWD,Default,DEF,4.0,Very High,Represents very high risk to the industry,0.163265
2,3,Physical Risk Quantity,QAN,Groundwater table decline,GTD,Default,DEF,4.0,Very High,Represents very high risk to the industry,0.163265
3,4,Physical Risk Quantity,QAN,Interannual variability,IAV,Default,DEF,0.5,Low,Represents low risk to the industry,0.020408
4,5,Physical Risk Quantity,QAN,Seasonal variability,SEV,Default,DEF,0.5,Low,Represents low risk to the industry,0.020408


In [10]:
# BigQuery destination written as "<dataset>.<table>".
destination_table = ".".join((BQ_OUTPUT_DATASET_NAME, BQ_OUTPUT_TABLE_NAME))

In [11]:
# Upload the DataFrame to BigQuery. if_exists="replace" overwrites any table
# already present under the destination name.
upload_options = {
    "destination_table": destination_table,
    "project_id": BQ_PROJECT_ID,
    "chunksize": 10000,
    "if_exists": "replace",
}
df.to_gbq(**upload_options)

1it [00:03,  3.58s/it]


In [12]:
# Report the wall-clock time elapsed since `start` was recorded in the timing cell.
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

0:00:11.529102


Previous runs:   
0:00:11.529102
