In [None]:
from dask.distributed import Client

# Connect to the dask.distributed scheduler listening at this host:port.
# NOTE(review): the address is hard-coded; the notebook only runs where this scheduler is reachable.
client = Client('10.65.18.58:8786')

In [None]:
# Show the connection twice: print() emits the plain-text repr, while the bare
# trailing expression lets the notebook render the client's rich repr.
print(client)
client

In [None]:
import dask.dataframe as dd
import dask.bytes as db
import datetime
import pandas as pd
import math
import boto3
import json

In [None]:
import os
import sys
# Resolve the sibling `flows` directory to an absolute path and make it
# importable, adding it to sys.path only once so re-running the cell is harmless.
module_path = os.path.abspath('../flows')
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
import tiles_pb2
from common import to_normalized_time, get_storage_option, extract_region_columns, join_region_columns, save_regional_aggregation

In [None]:
# Ship the local helper modules to the cluster so the workers can import them too.
for helper_module in ('../flows/tiles_pb2.py', '../flows/common.py'):
    client.upload_file(helper_module)

In [None]:
# Configs

def _s3_config(prefix, defaults):
    """Build an S3 connection config whose values can be overridden from the environment.

    For every key in `defaults`, the environment variable `<prefix>_<KEY>` (key
    upper-cased, e.g. `SOURCE_S3_SECRET`) is used when it is set; otherwise the
    default is kept. This lets real endpoints and credentials be supplied without
    editing the notebook, while the defaults keep the local test setup working.

    Returns a new dict with the same keys as `defaults`.
    """
    return {
        name: os.environ.get(f'{prefix}_{name.upper()}', default)
        for name, default in defaults.items()
    }

# Where the input parquet data is read from. Override with SOURCE_S3_ENDPOINT_URL,
# SOURCE_S3_REGION_NAME, SOURCE_S3_KEY, SOURCE_S3_SECRET, SOURCE_S3_BUCKET.
source = _s3_config('SOURCE_S3', {
    'endpoint_url': 'http://10.65.18.73:9000',
    'region_name': 'us-east-1',
    'key': 'foobar',
    'secret': 'foobarbaz',
    'bucket': 'test'
})

# Where the aggregation results are written to. Override with DEST_S3_ENDPOINT_URL,
# DEST_S3_REGION_NAME, DEST_S3_KEY, DEST_S3_SECRET, DEST_S3_BUCKET.
dest = _s3_config('DEST_S3', {
    'endpoint_url': 'http://10.65.18.73:9000',
    'region_name': 'us-east-1',
    'key': 'foobar',
    'secret': 'foobarbaz',
    'bucket': 'mass-upload-test'
})

# This determines the number of bins (subtiles) per tile. E.g. each tile has 4^6=4096 grid cells (subtiles) when LEVEL_DIFF is 6.
# Tile (z, x, y) will have subtiles whose zoom level is z + LEVEL_DIFF,
# e.g. tile (9, 0, 0) will have (15, 0, 0) as a subtile with LEVEL_DIFF = 6.
LEVEL_DIFF = 6
MIN_SUBTILE_PRECISION = LEVEL_DIFF # since the (0,0,0) main tile will have (LEVEL_DIFF, x, y) subtiles as its grid cells

# Note: We need to figure out the spatial resolution of a run output in advance. For some models, precision 15 is way too high.
# For example, the lpjml model covers the entire world at a very coarse resolution; with precision 15 it takes 1 hour to process
# and upload the tiles, resulting in 397395 tile files (uploading takes most of the time),
# whereas it takes only a minute with precision 10. Having high-precision tiles doesn't make a
# significant difference visually since the underlying data itself is very coarse.
MAX_SUBTILE_PRECISION = 14

# Maximum zoom level for a main tile
MAX_ZOOM = MAX_SUBTILE_PRECISION - LEVEL_DIFF

# TODO: provide these as input parameters
model_id = 'geo-test-data'
run_id = 'test-run'
s_bucket = source['bucket']
parquet_path = f's3://{s_bucket}/geo-test-data.parquet'



In [None]:
# Load the run output parquet from S3-compatible storage as a lazy dask dataframe.
s3_storage_options = {
    'anon': False,
    'use_ssl': False,
    'key': source['key'],
    'secret': source['secret'],
    'client_kwargs': {
        'region_name': source['region_name'],
        'endpoint_url': source['endpoint_url'],
    },
}
df = dd.read_parquet(parquet_path, storage_options=s3_storage_options)
# Spread the data over a fixed number of partitions before aggregating
df = df.repartition(npartitions=100)
df.dtypes

In [None]:
# Temporal aggregation (compute for both sum and mean)
time_res = 'month'

# Group on every column except the measured value itself
columns = df.columns.tolist()
columns.remove('value')

# Timestamps are epoch seconds; map each one through to_normalized_time at `time_res`
t = dd.to_datetime(df['timestamp'], unit='s').apply(
    lambda x: to_normalized_time(x, time_res),
    meta=(None, 'int'),
)
temporal_df = (
    df.assign(timestamp=t)
    .groupby(columns)['value']
    .agg(['sum', 'mean'])
    # Prefix the aggregate columns so they read as temporal aggregates
    .rename(columns={'sum': 't_sum', 'mean': 't_mean'})
    .reset_index()
)

In [None]:
# Preview the last rows of the temporal aggregation as a sanity check
# (temporal_df is a lazy dask frame, so this triggers actual computation).
temporal_df.tail()

In [None]:
from copy import copy
def compute_regional_aggregation(input_df, dest, time_res, model_id, run_id):
    """Sum the temporal aggregates per region and save one result set per region level.

    The input frame is first reduced to one row per (feature, timestamp, region
    columns), summing `t_sum`, `t_mean` (renamed to `t_sum_s_sum` / `t_mean_s_sum`)
    and a per-row count `s_count`. Then, for every region column reported by
    `extract_region_columns`, rows are tagged with a `region_id` built by
    `join_region_columns`, collected into lists per (feature, timestamp), and each
    resulting row is handed to `save_regional_aggregation`.

    Args:
        input_df: dask dataframe holding `feature`, `timestamp`, `t_sum`, `t_mean`
            and the region columns. It is copied, not modified.
        dest: destination config, forwarded to `save_regional_aggregation`.
        time_res: temporal resolution label, forwarded to `save_regional_aggregation`.
        model_id: model identifier, forwarded to `save_regional_aggregation`.
        run_id: run identifier, forwarded to `save_regional_aggregation`.

    Returns:
        None. The work is done for its side effects; `.compute()` is triggered once
        per region level.
    """
    # Work on a copy so the caller's frame keeps its original column names
    working_df = input_df.copy()
    # Rename columns
    working_df.columns = (
        working_df.columns
        .str.replace('t_sum', 't_sum_s_sum')
        .str.replace('t_mean', 't_mean_s_sum')
    )
    working_df['s_count'] = 1
    working_df = working_df.reset_index()

    regions_cols = extract_region_columns(working_df)

    time_keys = ['feature', 'timestamp']
    value_cols = ['t_sum_s_sum', 't_mean_s_sum', 's_count']

    # Region aggregation at the highest admin level
    finest_df = (
        working_df[time_keys + value_cols + regions_cols]
        .groupby(time_keys + regions_cols)
        .agg(['sum'])
    )
    # agg(['sum']) yields two-level column labels; keep only the value names
    finest_df.columns = finest_df.columns.droplevel(1)
    finest_df = finest_df.reset_index()
    # Keep the result in cluster memory: it is reused once for every regional level below
    finest_df = finest_df.persist()

    # Compute aggregation and save for all regional levels
    for level, region_level in enumerate(regions_cols):
        level_df = finest_df.copy()
        # Merge region columns to single region_id column. eg. ['Ethiopia', 'Afar'] -> ['Ethiopia|Afar']
        level_df['region_id'] = join_region_columns(level_df, level)

        # One row per (feature, timestamp), with the per-region values gathered into lists
        grouped = (
            level_df[time_keys + ['region_id'] + value_cols]
            .groupby(time_keys)
            .agg(list)
            .reset_index()
        )
        saved = grouped.apply(
            lambda row: save_regional_aggregation(
                row, dest, model_id, run_id, time_res, region_level=region_level
            ),
            axis=1,
            meta=(None, 'object'),
        )
        # Force execution; the saving happens inside save_regional_aggregation
        saved.compute()

In [None]:
# Run the regional aggregation over the temporally aggregated frame.
# The function calls .compute() once per region level, so this cell does the heavy work.
compute_regional_aggregation(temporal_df, dest, time_res, model_id, run_id)