In [102]:
from dask.distributed import Client

# Create the distributed client used by every dask computation in this notebook.
# No scheduler address is passed; the client repr shown further down reports a
# scheduler on 127.0.0.1, i.e. a cluster local to this machine.
client = Client()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 61693 instead


In [103]:
# A bare last expression already renders the client (scheduler address, dashboard
# link, worker / core / memory summary); an extra print() only repeats the same
# information as plain text.
client

<Client: 'tcp://127.0.0.1:61694' processes=4 threads=12, memory=16.00 GiB>


0,1
Client  Scheduler: tcp://127.0.0.1:61694  Dashboard: http://127.0.0.1:61693/status,Cluster  Workers: 4  Cores: 12  Memory: 16.00 GiB


In [105]:
import dask.dataframe as dd
import dask.bytes as db
import datetime
import pandas as pd
import math
import boto3
import json

In [106]:
import os
import sys
# Put the sibling ``flows`` directory on the import path (once) so that the helper
# modules living there can be imported by the following cells.
module_path = os.path.abspath(os.path.join(os.pardir, 'flows'))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [150]:
import tiles_pb2
from common import run_temporal_aggregation, deg2num, filter_by_min_zoom, ancestor_tiles, get_storage_options 

In [108]:
# Send the local helper modules to the workers so that code referencing them can run
# remotely. The displayed value is the result of the last call: a mapping from worker
# address to upload status.
client.upload_file('../flows/tiles_pb2.py')
client.upload_file('../flows/common.py')

{'tcp://127.0.0.1:61700': {'status': 'OK'},
 'tcp://127.0.0.1:61703': {'status': 'OK'},
 'tcp://127.0.0.1:61706': {'status': 'OK'},
 'tcp://127.0.0.1:61709': {'status': 'OK'}}

In [109]:
# Configs
#
# Connection settings for the S3-compatible stores. Every connection value can be
# overridden through an environment variable so that real credentials never have to
# be written into (or committed with) the notebook; the fallbacks are the development
# values this notebook was written against.

# Where the model run output (parquet) is read from.
source = {
    'endpoint_url': os.environ.get('SOURCE_S3_ENDPOINT_URL', 'http://10.65.18.73:9000'),
    'region_name': os.environ.get('SOURCE_S3_REGION', 'us-east-1'),
    'key': os.environ.get('SOURCE_S3_KEY', 'foobar'),
    'secret': os.environ.get('SOURCE_S3_SECRET', 'foobarbaz'),
    'bucket': os.environ.get('SOURCE_S3_BUCKET', 'test')
}

# Where the computed outputs are written to.
dest = {
    'endpoint_url': os.environ.get('DEST_S3_ENDPOINT_URL', 'http://10.65.18.73:9000'),
    'region_name': os.environ.get('DEST_S3_REGION', 'us-east-1'),
    'key': os.environ.get('DEST_S3_KEY', 'foobar'),
    'secret': os.environ.get('DEST_S3_SECRET', 'foobarbaz'),
    'bucket': os.environ.get('DEST_S3_BUCKET', 'experiments')
}

s_bucket = source['bucket']
# TODO: provide these as input parameters
model_id = '2fe40c11-8862-4ab4-b528-c85dacdc615e'
run_id = '04f97328-2c73-48ce-8020-d74632336670'
# Alternative single-file test dataset:
#parquet_path = f's3://{s_bucket}/geo-test-data.parquet'
# Glob matching every parquet file of the selected model run.
parquet_path = f's3://{s_bucket}/{model_id}/{run_id}/*.parquet'



In [110]:
# Display the resolved glob so the bucket / model id / run id can be checked before reading.
parquet_path

's3://test/2fe40c11-8862-4ab4-b528-c85dacdc615e/04f97328-2c73-48ce-8020-d74632336670/*.parquet'

In [111]:
# Load every parquet file matched by `parquet_path` into a single dask dataframe.
# The connection details all come from the `source` config.
source_client_kwargs = {
    'region_name': source['region_name'],
    'endpoint_url': source['endpoint_url']
}
source_storage_options = {
    'anon': False,
    'use_ssl': False,
    'key': source['key'],
    'secret': source['secret'],
    'client_kwargs': source_client_kwargs
}
df = dd.read_parquet(parquet_path, storage_options=source_storage_options)
# Use a fixed number of partitions regardless of how the files were laid out.
df = df.repartition(npartitions=100)
df.dtypes

timestamp    datetime64[ns]
lat                 float64
lng                 float64
feature              object
value               float64
country              object
admin1               object
admin2               object
admin3               object
dtype: object

In [112]:
# Temporal aggregation (compute for both sum and mean)
# `time_res` is the temporal resolution handed to the aggregation helper; it is also
# reused later in this notebook as a component of the output path.
time_res = 'month'
temporal_df = run_temporal_aggregation(df, time_res)
# NOTE(review): .compute() materialises the whole aggregated frame in the notebook
# process just to display it — consider .head() when inspecting large runs.
temporal_df.compute()

Unnamed: 0,timestamp,lat,lng,feature,country,admin1,admin2,admin3,t_sum,t_mean
0,499669200000,7.792,34.958,production,Ethiopia,Oromia,Ilubabor,Sale Nono,10.0,10.0
1,499669200000,7.875,34.958,production,Ethiopia,Oromia,Ilubabor,Sale Nono,26.0,26.0
2,499669200000,8.292,41.708,production,Ethiopia,Oromia,Misraq Harerge,Golo Oda,0.0,0.0
3,499669200000,8.375,40.542,production,Ethiopia,Oromia,Mirab Hararghe,Daro Lebu,0.0,0.0
4,499669200000,8.375,41.792,production,Ethiopia,Oromia,Misraq Harerge,Meyu,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...
77949,1583038800000,9.375,38.542,production,Ethiopia,Oromia,Mirab Shewa,Adda Berga,999.0,999.0
77950,1583038800000,9.375,38.625,production,Ethiopia,Oromia,North Shewa,Mulo,89.0,89.0
77951,1583038800000,9.375,38.708,production,Ethiopia,Oromia,North Shewa,Sululta,115.0,115.0
77952,1583038800000,9.458,38.542,production,Ethiopia,Oromia,Mirab Shewa,Adda Berga,313.0,313.0


In [113]:
# This determines the number of bins (subtiles) per tile, e.g. each tile has 4^6 = 4096 grid cells (subtiles) when LEVEL_DIFF is 6.
# Tile (z, x, y) has subtiles whose zoom level is z + LEVEL_DIFF,
# e.g. tile (9, 0, 0) has (15, 0, 0) as a subtile when LEVEL_DIFF = 6.
LEVEL_DIFF = 6

# Note: we need to figure out the spatial resolution of a run output in advance. For some models, a precision of 15 is far too high.
# For example, the lpjml model covers the entire world at a very coarse resolution; with a precision of 15 it takes 1 hour to process
# and upload the tiles, resulting in 397395 tile files (uploading takes most of the time),
# whereas it takes only about a minute with a precision of 10. High-precision tiles also make no
# significant visual difference, since the underlying data itself is very coarse.
MAX_SUBTILE_PRECISION = 14

MIN_SUBTILE_PRECISION = LEVEL_DIFF # since the (0, 0, 0) main tile will have (LEVEL_DIFF, x, y) subtiles as its grid cells

# Maximum zoom level for a main tile
MAX_ZOOM = MAX_SUBTILE_PRECISION - LEVEL_DIFF

In [114]:
%time

# Subtile aggregation
spatial_df = temporal_df.copy()
stile = spatial_df.apply(lambda x: deg2num(x.lat, x.lng, MAX_SUBTILE_PRECISION), axis=1, meta=(None, 'object'))
subtile_df = spatial_df.assign(subtile=stile)
subtile_df = subtile_df[['feature', 'timestamp', 'subtile', 't_sum', 't_mean']] \
    .groupby(['feature', 'timestamp', 'subtile']) \
    .agg(['sum', 'count'])

# Rename columns
spatial_lookup = {('t_sum', 'sum'): 's_sum_t_sum', ('t_sum', 'count'): 's_count_t_sum',
        ('t_mean', 'sum'): 's_sum_t_mean', ('t_mean', 'count'): 's_count'}
subtile_df.columns = subtile_df.columns.to_flat_index()
subtile_df = subtile_df.rename(columns=spatial_lookup).drop(columns='s_count_t_sum').reset_index()
subtile_df.compute()

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 11 µs


Unnamed: 0,feature,timestamp,subtile,s_sum_t_sum,s_sum_t_mean,s_count
0,production,496987200000,"(14, 10120, 7748)",0.0,0.0,1
1,production,499669200000,"(14, 9756, 7790)",10.0,10.0,1
2,production,499669200000,"(14, 9760, 7767)",579.0,579.0,1
3,production,499669200000,"(14, 9760, 7771)",1294.0,1294.0,1
4,production,499669200000,"(14, 9760, 7774)",1326.0,1326.0,1
...,...,...,...,...,...,...
77949,production,1593576000000,"(14, 10010, 7855)",154.0,154.0,1
77950,production,1593576000000,"(14, 10010, 7859)",220.0,220.0,1
77951,production,1593576000000,"(14, 10010, 7863)",277.0,277.0,1
77952,production,1593576000000,"(14, 10010, 7866)",294.0,294.0,1


In [115]:
%time

# Get a list of all acestor tiles for each subtile
stile = subtile_df.copy().apply(lambda x: filter_by_min_zoom(ancestor_tiles(x.subtile), MIN_SUBTILE_PRECISION), axis=1, meta=(None, 'object'))
explode_df = subtile_df.assign(subtile=stile)
# Explode data and duplicate data points for each zoom level (zoom level is defiend by subtile coordinates)
explode_df = explode_df.explode('subtile').repartition(npartitions = 12)

explode_df = explode_df.groupby(['feature', 'timestamp', 'subtile']).agg('sum')
explode_df = explode_df.reset_index()
explode_df.compute()

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.91 µs


Unnamed: 0,feature,timestamp,subtile,s_sum_t_sum,s_sum_t_mean,s_count
0,production,496987200000,"(6, 39, 30)",0.0,0.0,1
1,production,496987200000,"(7, 79, 60)",0.0,0.0,1
2,production,496987200000,"(8, 158, 121)",0.0,0.0,1
3,production,496987200000,"(9, 316, 242)",0.0,0.0,1
4,production,496987200000,"(10, 632, 484)",0.0,0.0,1
...,...,...,...,...,...,...
292506,production,1593576000000,"(14, 10010, 7855)",154.0,154.0,1
292507,production,1593576000000,"(14, 10010, 7859)",220.0,220.0,1
292508,production,1593576000000,"(14, 10010, 7863)",277.0,277.0,1
292509,production,1593576000000,"(14, 10010, 7866)",294.0,294.0,1


In [162]:
%time

stats_df = explode_df.assign(s_mean_t_sum=explode_df['s_sum_t_sum'] / explode_df['s_count'], s_mean_t_mean=explode_df['s_sum_t_mean'] / explode_df['s_count'])
zoom = stats_df['subtile'].apply(lambda x: x[0], meta=('subtile', 'object')) 
stats_df = stats_df.assign(zoom=zoom).drop(['subtile', 's_count'], axis=1)
stats_df = stats_df.groupby(['feature', 'timestamp', 'zoom']).agg(['min', 'max'])
# Flatten multi index columns to single index e.g (s_sum_t_sum, min) -> min_s_sum_t_sum
stats_df.columns = ["_".join(tuple(reversed(cols))) for cols in stats_df.columns.to_flat_index()]
stats_df = stats_df.reset_index()
stats_df.compute()

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 14.1 µs


Unnamed: 0,feature,timestamp,zoom,min_s_sum_t_sum,max_s_sum_t_sum,min_s_sum_t_mean,max_s_sum_t_mean,min_s_mean_t_sum,max_s_mean_t_sum,min_s_mean_t_mean,max_s_mean_t_mean
0,production,496987200000,6,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1,production,496987200000,7,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
2,production,496987200000,8,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
3,production,496987200000,9,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
4,production,496987200000,10,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
3568,production,1593576000000,10,98.0,1891.0,98.0,1891.0,32.666667,270.142857,32.666667,270.142857
3569,production,1593576000000,11,98.0,850.0,98.0,850.0,32.666667,283.333333,32.666667,283.333333
3570,production,1593576000000,12,11.0,345.0,11.0,345.0,11.000000,345.000000,11.000000,345.000000
3571,production,1593576000000,13,11.0,345.0,11.0,345.0,11.000000,345.000000,11.000000,345.000000


In [161]:
def save_subtile_stats(df, dest, model_id, run_id, time_res):
    """Write the stats of one (feature, timestamp) group to a CSV object.

    The feature and timestamp are read from the first row of ``df`` and become part of
    the object key. Every other column is written, without the index, to
    ``s3://<bucket>/<model_id>/<run_id>/<time_res>/<feature>/stats/grid/<timestamp>.csv``.

    Args:
        df: frame with ``feature`` and ``timestamp`` columns; it is meant to hold the
            rows of a single (feature, timestamp) group.
        dest: destination config; ``dest['bucket']`` names the target bucket and the
            whole dict is passed to ``get_storage_options``.
        model_id: model identifier used in the object key.
        run_id: run identifier used in the object key.
        time_res: temporal resolution used in the object key.

    Returns:
        None. The only effect is the written CSV.
    """
    target_bucket = dest['bucket']
    first_feature = df['feature'].values[0]
    first_timestamp = df['timestamp'].values[0]

    # Everything except the two grouping keys goes into the file.
    stat_columns = df.columns.tolist()
    for key_column in ('feature', 'timestamp'):
        stat_columns.remove(key_column)

    stats_only = df[stat_columns]
    csv_url = f's3://{target_bucket}/{model_id}/{run_id}/{time_res}/{first_feature}/stats/grid/{first_timestamp}.csv'
    stats_only.to_csv(csv_url, storage_options=get_storage_options(dest), index=False)


In [163]:
%time
stats_df = stats_df.groupby(['feature', 'timestamp']).apply(lambda x: save_subtile_stats(x, dest, model_id, run_id, time_res), meta=(None, 'object'))
stats_df.compute()

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 6.91 µs


Series([], dtype: object)