In [1]:
""" Process and upload areas of master shapefile to BigQuery.
-------------------------------------------------------------------------------

Areas in the previous step have been calculated in ArcMap in an Eckert IV
projection. 


Author: Rutger Hofste
Date: 20181206
Kernel: python35
Docker: rutgerhofste/gisdocker:ubuntu16.04

"""
# Identity of this step; both values feed every derived name below.
SCRIPT_NAME = "Y2018M12D07_RH_Process_Area_BQ_V01"
OUTPUT_VERSION = 1

# Location of the input on S3 and the file read from it.
S3_INPUT_PATH = "s3://wri-projects/Aqueduct30/processData/Y2018M12D07_RH_Master_Area_V01/steps/step02_area"
INPUT_FILE_NAME = "Y2018M12D06_RH_Master_Shape_Eckert4_V01.csv"

# BigQuery destination. The table name is the lower-cased script name plus a
# zero-padded version suffix.
BQ_PROJECT_ID = "aqueduct30"
BQ_OUTPUT_DATASET_NAME = "aqueduct30v01"
BQ_OUTPUT_TABLE_NAME = (SCRIPT_NAME + "_v%02d" % OUTPUT_VERSION).lower()

# Local working directories, one per script name and version.
ec2_input_path = "/volumes/data/%s/input_V%02d" % (SCRIPT_NAME, OUTPUT_VERSION)
ec2_output_path = "/volumes/data/%s/output_V%02d" % (SCRIPT_NAME, OUTPUT_VERSION)

# Echo the resolved settings so they are recorded in the cell output.
print(" ".join([
    "S3_INPUT_PATH: ", S3_INPUT_PATH,
    "\nec2_input_path: ", ec2_input_path,
    "\nec2_output_path: ", ec2_output_path,
    "\nBQ_OUTPUT_DATASET_NAME: ", BQ_OUTPUT_DATASET_NAME,
    "\nBQ_OUTPUT_TABLE_NAME: ", BQ_OUTPUT_TABLE_NAME,
]))

S3_INPUT_PATH:  s3://wri-projects/Aqueduct30/processData/Y2018M12D07_RH_Master_Area_V01/steps/step02_area 
ec2_input_path:  /volumes/data/Y2018M12D07_RH_Process_Area_BQ_V01/input_V01 
ec2_output_path:  /volumes/data/Y2018M12D07_RH_Process_Area_BQ_V01/output_V01 
BQ_OUTPUT_DATASET_NAME:  aqueduct30v01 
BQ_OUTPUT_TABLE_NAME:  y2018m12d07_rh_process_area_bq_v01_v01


In [2]:
import time, datetime, sys
# Stamp the run. The time string is labelled "UTC", so format an explicit UTC
# time tuple: time.strftime() without one formats the machine's local time.
# Both strings use the same snapshot so they cannot straddle midnight.
utc_now = time.gmtime()
dateString = time.strftime("Y%YM%mD%d", utc_now)
timeString = time.strftime("UTC %H:%M", utc_now)
# Reference point for the elapsed-time report at the end of the notebook.
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2018M12D07 UTC 16:20


'3.5.4 |Anaconda, Inc.| (default, Nov 20 2017, 18:44:38) \n[GCC 7.2.0]'

In [3]:
!rm -r {ec2_input_path}
!rm -r {ec2_output_path}
!mkdir -p {ec2_input_path}
!mkdir -p {ec2_output_path}

rm: cannot remove '/volumes/data/Y2018M12D07_RH_Process_Area_BQ_V01/input_V01': No such file or directory
rm: cannot remove '/volumes/data/Y2018M12D07_RH_Process_Area_BQ_V01/output_V01': No such file or directory


In [4]:
!aws s3 cp {S3_INPUT_PATH} {ec2_input_path} --recursive 

download: s3://wri-projects/Aqueduct30/processData/Y2018M12D07_RH_Master_Area_V01/steps/step02_area/Y2018M12D06_RH_Master_Shape_Eckert4_V01.cpg to ../../../../data/Y2018M12D07_RH_Process_Area_BQ_V01/input_V01/Y2018M12D06_RH_Master_Shape_Eckert4_V01.cpg
download: s3://wri-projects/Aqueduct30/processData/Y2018M12D07_RH_Master_Area_V01/steps/step02_area/Y2018M12D06_RH_Master_Shape_Eckert4_V01.prj to ../../../../data/Y2018M12D07_RH_Process_Area_BQ_V01/input_V01/Y2018M12D06_RH_Master_Shape_Eckert4_V01.prj
download: s3://wri-projects/Aqueduct30/processData/Y2018M12D07_RH_Master_Area_V01/steps/step02_area/Y2018M12D06_RH_Master_Shape_Eckert4_V01.shp.xml to ../../../../data/Y2018M12D07_RH_Process_Area_BQ_V01/input_V01/Y2018M12D06_RH_Master_Shape_Eckert4_V01.shp.xml
download: s3://wri-projects/Aqueduct30/processData/Y2018M12D07_RH_Master_Area_V01/steps/step02_area/Y2018M12D06_RH_Master_Shape_Eckert4_V01.sbx to ../../../../data/Y2018M12D07_RH_Process_Area_BQ_V01/input_V01/Y2018M12D06_RH_Master_Sh

In [5]:
import os
import pandas as pd
import numpy as np
from google.cloud import bigquery

# Point the Google client libraries at the service-account key file.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/.google.json"
# Reuse BQ_PROJECT_ID so the environment default and the client below always
# refer to the same project.
os.environ["GOOGLE_CLOUD_PROJECT"] = BQ_PROJECT_ID
client = bigquery.Client(project=BQ_PROJECT_ID)

In [6]:
# Full local path of the downloaded CSV.
input_path = "/".join((ec2_input_path, INPUT_FILE_NAME))

In [7]:
# Load the area table from the local CSV.
df = pd.read_csv(filepath_or_buffer=input_path)

In [8]:
# Show the column dtypes of the loaded frame before uploading it.
df.dtypes

string_id     object
area_km2     float64
dtype: object

In [9]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,string_id,area_km2
0,111011-EGY.11_1-3365,4.223754
1,111011-EGY.15_1-3365,1846.012343
2,111011-EGY.15_1-None,30.526067
3,111011-None-3365,0.742712
4,111011-None-None,13.430995


In [10]:
# Destination in "dataset.table" form, as passed to to_gbq below.
destination_table = ".".join((BQ_OUTPUT_DATASET_NAME, BQ_OUTPUT_TABLE_NAME))

In [11]:
# Upload the frame to BigQuery in chunks of 10000 rows; an existing table
# with the same name is replaced.
upload_options = {
    "destination_table": destination_table,
    "project_id": BQ_PROJECT_ID,
    "chunksize": 10000,
    "if_exists": "replace",
}
df.to_gbq(**upload_options)

7it [00:26,  3.80s/it]


In [12]:
# Report the wall-clock time since `start` was recorded.
end = datetime.datetime.now()
elapsed = end - start
print(str(elapsed))

0:01:40.075823


Previous runs:   
0:01:40.075823
