In [1]:
""" Ingest PCRGLOBWB timeseries data on Google Earth Engine
-------------------------------------------------------------------------------
This notebook will upload the geotiff files from the Google Cloud Storage to
the WRI/aqueduct earthengine bucket. An errorlog will be stored on Amazon S3.

Requirements:
    Authorize earthengine by running in your terminal: earthengine 
                                                       authenticate

    you need to have access to the WRI-Aquaduct (yep a Google employee made a
    typo) bucket to ingest the data. Rutger can grant access to write to this 
    folder. 

    Have access to the Google Cloud Storage Bucket

Make sure to set the project to Aqueduct30 by running
`gcloud config set project aqueduct30`

Code follows the Google Python Style Guide. Exceptions are the scripts that
use Earth Engine, since its API uses camelCase instead of underscores.

Author: Rutger Hofste
Date: 20170802
Kernel: python35
Docker: rutgerhofste/gisdocker:ubuntu16.04

Args:    
    TESTING (Boolean) : Toggle Testing Mode.
    OVERWRITE (Boolean) : Overwrite old folder !CAUTION!
    
    SCRIPT_NAME (string) : Script name.    
    PREVIOUS_SCRIPT_NAME (string) : Previous script name. 
    INPUT_VERSION (integer) : Input version.     
    OUTPUT_VERSION (integer) : Output version.     
    OUTPUT_FILE_NAME (string) : File Name for a csv file containing the failed tasks. 
    
    SEPARATOR (regex) : Regular expression of separators used in geotiff
      filenames.     
    SCHEMA (list) : A list of strings containing the schema. See 
      aqueduct3.split_key() for more info.
    EXTRA_PROPERTIES (Dictionary) : Extra properties to add to assets. nodata_value,
      script used are common properties.

Returns:


"""

# Input Parameters
# TESTING and OVERWRITE are used as truthy/falsy toggles further down.
TESTING = 0
OVERWRITE = 1 # !CAUTION!
SCRIPT_NAME = "Y2017M08D02_RH_Ingest_GCS_EE_V02"
PREVIOUS_SCRIPT_NAME = "Y2017M07D31_RH_Convert_NetCDF_Geotiff_V02"

INPUT_VERSION = 2
OUTPUT_VERSION = 9

OUTPUT_FILE_NAME = "df_errorsV01.csv"

# Filename parsing: SEPARATOR is the regex the geotiff names are split on,
# SCHEMA names the resulting parts in order.
SEPARATOR = "_|-"
SCHEMA = [
    "geographic_range",
    "temporal_range",
    "indicator",
    "temporal_resolution",
    "unit",
    "spatial_resolution",
    "temporal_range_min",
    "temporal_range_max",
]

# Properties attached to every row of the dataframe (and thereby every asset).
EXTRA_PROPERTIES = {
    "nodata_value": -9999,
    "ingested_by": "RutgerHofste",
    "script_used": SCRIPT_NAME,
}

# ETL
# Input and output locations, all derived from the script names and versions.
gcs_input_path = "gs://aqueduct30_v01/{}/output_V{:02.0f}/".format(
    PREVIOUS_SCRIPT_NAME, INPUT_VERSION
)
ee_output_path = "projects/WRI-Aquaduct/PCRGlobWB20V{:02.0f}".format(
    OUTPUT_VERSION
)
s3_output_path = "s3://wri-projects/Aqueduct30/processData/{}/output_V{:02.0f}".format(
    SCRIPT_NAME, OUTPUT_VERSION
)
ec2_output_path = "/volumes/data/{}/output_V{:02.0f}".format(
    SCRIPT_NAME, OUTPUT_VERSION
)

# Echo the resolved locations so a reader can verify them before ingesting.
print("Input gcs: ", gcs_input_path,
      "\nOutput ee: ", ee_output_path,
      "\nOutput S3: ", s3_output_path,
      "\nOutput ec2: ", ec2_output_path,
      sep="")


Input gcs: gs://aqueduct30_v01/Y2017M07D31_RH_Convert_NetCDF_Geotiff_V02/output_V02/
Output ee: projects/WRI-Aquaduct/PCRGlobWB20V09
Output S3: s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Ingest_GCS_EE_V02/output_V09
Output ec2: /volumes/data/Y2017M08D02_RH_Ingest_GCS_EE_V02/output_V09


In [2]:
import time, datetime, sys
# The stamps are labelled UTC, so format them from time.gmtime(): a bare
# time.strftime() formats the machine's *local* time instead. One snapshot is
# shared so the date and the time can never straddle midnight.
utc_now = time.gmtime()
dateString = time.strftime("Y%YM%mD%d", utc_now)
timeString = time.strftime("UTC %H:%M", utc_now)
# Wall-clock start of the run; the last cell of the notebook subtracts it to
# report the total runtime.
start = datetime.datetime.now()
print(dateString,timeString)
# Bare last expression: the notebook displays the interpreter version.
sys.version

Y2018M04D17 UTC 12:05


'3.5.4 |Anaconda, Inc.| (default, Nov 20 2017, 18:44:38) \n[GCC 7.2.0]'

In [None]:
# Imports
import subprocess
import datetime
import os
import time
import re
import pandas as pd
from datetime import timedelta
import aqueduct3


def _run_best_effort(command):
    """Run a shell command and report, rather than raise, a failure.

    Args:
        command (string) : Shell command to execute.

    Returns:
        success (Boolean) : True when the command exited with status 0.
    """
    try:
        subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        print("FAILED (exit code {}): {}".format(e.returncode, command))
        print(e.output)
        return False
    return True


def main():
    """Ingest the geotiffs found under gcs_input_path into Earth Engine.

    Steps:
        1. List the keys under gcs_input_path and turn them into a dataframe
           (one row per file) using SEPARATOR and SCHEMA, then add
           EXTRA_PROPERTIES to every row.
        2. (Re)create the Earth Engine folder ee_output_path. When OVERWRITE
           is truthy the existing folder is removed recursively first.
        3. Create one ImageCollection per unique value of df.parameter.
        4. Upload every file to its ImageCollection, using the row as the
           asset properties, and collect the dataframe returned per upload.
        5. Store the collected results as csv on ec2 and copy them to S3.
        6. Retry, once, every command whose 'error' value is not 0.

    When TESTING is truthy only rows 1 and 2 of the dataframe are uploaded.

    Returns:
        df (pandas dataframe) : The files that were processed, one per row.
        df_errors (pandas dataframe) : Concatenation of the dataframes
            returned by aqueduct3.upload_geotiff_to_EE_imageCollection.
            Empty dataframe when nothing was uploaded.
    """
    start_time = time.time()
    # Equivalent of `mkdir -p`, without depending on IPython shell magics.
    if not os.path.isdir(ec2_output_path):
        os.makedirs(ec2_output_path)

    keys = aqueduct3.get_GCS_keys(gcs_input_path)
    df = aqueduct3.keys_to_df(keys,SEPARATOR,SCHEMA)
    df = df.assign(**EXTRA_PROPERTIES)
    # NOTE(review): "year" and "month" are not part of SCHEMA; presumably
    # aqueduct3.keys_to_df adds them -- confirm.
    df["exportdescription"] = df["indicator"] + "_" + df["temporal_resolution"]+"Y"+df["year"]+"M"+df["month"]
    # Convert columns that are fully numeric; other columns are left as is.
    df = df.apply(pd.to_numeric, errors='ignore')

    # Earth Engine Preparations
    # Create folder
    if OVERWRITE:
        command = "earthengine rm -r {}".format(ee_output_path)
        print(command)
        subprocess.check_output(command,shell=True)

    command = "earthengine create folder {}".format(ee_output_path)
    print(command)
    subprocess.check_output(command,shell=True)

    # Create ImageCollections
    parameters = df.parameter.unique()
    for parameter in parameters:
        ic_id = ee_output_path + "/" + parameter
        command, result = aqueduct3.create_imageCollection(ic_id)
        print(command,result)

    if TESTING:
        df = df[1:3]

    # Collect the per-file results in a list and concatenate once after the
    # loop: growing a dataframe with DataFrame.append on every iteration
    # copies it each time, and DataFrame.append no longer exists in pandas 2.
    error_frames = []

    for index, row in df.iterrows():
        elapsed_time = time.time() - start_time
        print(index,"{:02.2f}".format((float(index)/df.shape[0])*100) + "elapsed: ", str(timedelta(seconds=elapsed_time)))

        geotiff_gcs_path = gcs_input_path + row.file_name + "." + row.extension
        output_ee_asset_id = ee_output_path +"/"+ row.parameter + "/" + row.file_name
        properties = row.to_dict()

        # NOTE(review): assumed to return a dataframe per upload -- confirm.
        error_frames.append(
            aqueduct3.upload_geotiff_to_EE_imageCollection(geotiff_gcs_path, output_ee_asset_id, properties,index))

    # pd.concat raises on an empty list, so fall back to an empty dataframe.
    df_errors = pd.concat(error_frames) if error_frames else pd.DataFrame()

    # Storing error dataframe on ec2 and S3
    df_errors.to_csv("{}/{}".format(ec2_output_path,OUTPUT_FILE_NAME))
    # Like the shell magic it replaces, a failed upload is reported but does
    # not abort the run.
    _run_best_effort("aws s3 cp  {} {} --recursive".format(ec2_output_path, s3_output_path))

    # Retry Failed Tasks Once
    # An empty error dataframe has no 'error' column, so guard the lookup.
    if "error" in df_errors.columns:
        df_retry = df_errors.loc[df_errors['error'] != 0]
        for index, row in df_retry.iterrows():
            # Best effort: one command failing again must not prevent the
            # remaining retries, nor the return of the results.
            _run_best_effort(row.command)

    return df,df_errors



# Run the ingestion and bind both returned dataframes at top level, so they
# stay available to later cells for inspection.
if __name__ == "__main__":
    df,df_errors = main()

earthengine rm -r projects/WRI-Aquaduct/PCRGlobWB20V09
earthengine create folder projects/WRI-Aquaduct/PCRGlobWB20V09
earthengine create collection projects/WRI-Aquaduct/PCRGlobWB20V09/global_historical_PDomWN_month_millionm3_5min_1960_2014 b''
earthengine create collection projects/WRI-Aquaduct/PCRGlobWB20V09/global_historical_PDomWN_year_millionm3_5min_1960_2014 b''
earthengine create collection projects/WRI-Aquaduct/PCRGlobWB20V09/global_historical_PDomWW_month_millionm3_5min_1960_2014 b''
earthengine create collection projects/WRI-Aquaduct/PCRGlobWB20V09/global_historical_PDomWW_year_millionm3_5min_1960_2014 b''
earthengine create collection projects/WRI-Aquaduct/PCRGlobWB20V09/global_historical_PIndWN_month_millionm3_5min_1960_2014 b''
earthengine create collection projects/WRI-Aquaduct/PCRGlobWB20V09/global_historical_PIndWN_year_millionm3_5min_1960_2014 b''
earthengine create collection projects/WRI-Aquaduct/PCRGlobWB20V09/global_historical_PIndWW_month_millionm3_5min_1960_2014 

151 1.63elapsed:  0:10:11.823550
152 1.64elapsed:  0:10:14.790312
153 1.65elapsed:  0:10:17.765038
154 1.66elapsed:  0:10:20.726679
155 1.67elapsed:  0:10:23.714217
156 1.68elapsed:  0:10:26.760054
157 1.69elapsed:  0:10:29.716276
158 1.70elapsed:  0:10:32.720433
159 1.71elapsed:  0:10:35.686680
160 1.72elapsed:  0:10:38.679578
161 1.73elapsed:  0:10:41.638911
162 1.74elapsed:  0:10:44.625741
163 1.75elapsed:  0:10:47.668477
164 1.77elapsed:  0:10:50.647356
165 1.78elapsed:  0:10:53.547140
166 1.79elapsed:  0:10:56.522378
167 1.80elapsed:  0:10:59.609974
168 1.81elapsed:  0:11:02.637070
169 1.82elapsed:  0:11:05.585962
170 1.83elapsed:  0:11:08.566226
171 1.84elapsed:  0:11:11.635405
172 1.85elapsed:  0:11:14.633631
173 1.86elapsed:  0:11:17.674287
174 1.87elapsed:  0:11:20.641876
175 1.88elapsed:  0:11:23.630862
176 1.89elapsed:  0:11:26.584303
177 1.91elapsed:  0:11:29.600856
178 1.92elapsed:  0:11:32.560139
179 1.93elapsed:  0:11:35.545442
180 1.94elapsed:  0:11:37.594592
181 1.95el

400 4.31elapsed:  0:22:37.418408
401 4.32elapsed:  0:22:40.443687
402 4.33elapsed:  0:22:43.522055
403 4.34elapsed:  0:22:46.537966
404 4.35elapsed:  0:22:49.497252
405 4.36elapsed:  0:22:52.475825
406 4.37elapsed:  0:22:55.503042
407 4.38elapsed:  0:22:58.021716
408 4.39elapsed:  0:23:01.027172
409 4.40elapsed:  0:23:03.597609
410 4.41elapsed:  0:23:06.615721
411 4.42elapsed:  0:23:09.714151
412 4.43elapsed:  0:23:12.741830
413 4.45elapsed:  0:23:15.787932
414 4.46elapsed:  0:23:18.806468
415 4.47elapsed:  0:23:22.403629
416 4.48elapsed:  0:23:25.473735
417 4.49elapsed:  0:23:28.554247
418 4.50elapsed:  0:23:31.628558
419 4.51elapsed:  0:23:34.705945
420 4.52elapsed:  0:23:37.732014
421 4.53elapsed:  0:23:40.762403
422 4.54elapsed:  0:23:43.823655
423 4.55elapsed:  0:23:46.837984
424 4.56elapsed:  0:23:49.869801
425 4.57elapsed:  0:23:52.858076
426 4.59elapsed:  0:23:55.897980
427 4.60elapsed:  0:23:58.929534
428 4.61elapsed:  0:24:01.959420
429 4.62elapsed:  0:24:05.018143
430 4.63el

In [None]:
# Report the total runtime of the notebook. `start` is set in the timestamp
# cell near the top, so that cell must have been run in this kernel session.
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

Previous Runs: