In [16]:
from dask.distributed import Client

# Start a Dask distributed client with default settings; the summary printed in
# the next cell shows it attached to a scheduler on 127.0.0.1.
client = Client()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 51587 instead


In [17]:
# print() emits the one-line text summary; the bare expression on the last line
# renders the client's rich summary (scheduler, dashboard, workers, memory).
print(client)
client

<Client: 'tcp://127.0.0.1:51588' processes=4 threads=12, memory=16.00 GiB>


0,1
Client  Scheduler: tcp://127.0.0.1:51588  Dashboard: http://127.0.0.1:51587/status,Cluster  Workers: 4  Cores: 12  Memory: 16.00 GiB


In [18]:
import dask.dataframe as dd
import dask.bytes as db
import datetime
import pandas as pd
import math
import boto3
import json

In [19]:
import os
import sys
# Make the sibling `flows` directory importable from this notebook.
module_path = os.path.abspath('../flows')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [20]:
import tiles_pb2
from common import to_normalized_time, get_storage_options, extract_region_columns, join_region_columns, save_regional_aggregation

In [21]:
# Send the local helper modules to the workers (presumably so that functions
# imported from them can be resolved when tasks run there).
# The value of the last call is shown as the cell output: one status per worker.
client.upload_file('../flows/tiles_pb2.py')
client.upload_file('../flows/common.py')

{'tcp://127.0.0.1:51594': {'status': 'OK'},
 'tcp://127.0.0.1:51596': {'status': 'OK'},
 'tcp://127.0.0.1:51600': {'status': 'OK'},
 'tcp://127.0.0.1:51601': {'status': 'OK'}}

In [22]:
# Configs
#
# Connection settings for the source store (input parquet files) and the
# destination store (generated timeseries). Every value can be overridden with
# an environment variable so that real endpoints and credentials never have to
# be edited into the notebook; the fallbacks are the development placeholders.

source = {
    'endpoint_url': os.environ.get('SOURCE_S3_ENDPOINT_URL', 'http://10.65.18.73:9000'),
    'region_name': os.environ.get('SOURCE_S3_REGION_NAME', 'us-east-1'),
    'key': os.environ.get('SOURCE_S3_KEY', 'foobar'),
    'secret': os.environ.get('SOURCE_S3_SECRET', 'foobarbaz'),
    'bucket': os.environ.get('SOURCE_S3_BUCKET', 'test')
}

dest = {
    'endpoint_url': os.environ.get('DEST_S3_ENDPOINT_URL', 'http://10.65.18.73:9000'),
    'region_name': os.environ.get('DEST_S3_REGION_NAME', 'us-east-1'),
    'key': os.environ.get('DEST_S3_KEY', 'foobar'),
    'secret': os.environ.get('DEST_S3_SECRET', 'foobarbaz'),
    'bucket': os.environ.get('DEST_S3_BUCKET', 'experiments')
}

s_bucket = source['bucket']
# Model and run to process. Set MODEL_ID / RUN_ID in the environment to pick a
# different run without editing this cell.
model_id = os.environ.get('MODEL_ID', '2fe40c11-8862-4ab4-b528-c85dacdc615e')
run_id = os.environ.get('RUN_ID', '04f97328-2c73-48ce-8020-d74632336670')
# Alternative single-file test dataset: f's3://{s_bucket}/geo-test-data.parquet'
# Glob matching every parquet file written for the selected model run.
parquet_path = f's3://{s_bucket}/{model_id}/{run_id}/*.parquet'



In [23]:
# Display the resolved input glob as a quick sanity check.
parquet_path

's3://test/2fe40c11-8862-4ab4-b528-c85dacdc615e/04f97328-2c73-48ce-8020-d74632336670/*.parquet'

In [24]:
# Load every parquet file matched by `parquet_path` into a single dask
# dataframe, passing the source store's connection settings through
# `storage_options`.
df = dd.read_parquet(
    parquet_path,
    storage_options=dict(
        anon=False,
        use_ssl=False,
        key=source['key'],
        secret=source['secret'],
        client_kwargs=dict(
            region_name=source['region_name'],
            endpoint_url=source['endpoint_url'],
        ),
    ),
)
# Spread the rows over a fixed number of partitions before aggregating.
df = df.repartition(npartitions=100)
df.dtypes

timestamp    datetime64[ns]
lat                 float64
lng                 float64
feature              object
value               float64
country              object
admin1               object
admin2               object
admin3               object
dtype: object

In [25]:
# Temporal aggregation: collapse values onto the chosen time resolution and
# keep both the sum and the mean of every group.
time_res = 'month'

# Group by every column except the measured value itself.
columns = list(df.columns)
columns.remove('value')

# Map each timestamp through to_normalized_time for the chosen resolution
# (declared to dask as an integer result via `meta`).
t = dd.to_datetime(df['timestamp'], unit='s').apply(
    lambda x: to_normalized_time(x, time_res),
    meta=(None, 'int'),
)
temporal_df = (
    df.assign(timestamp=t)
    .groupby(columns)['value']
    .agg(['sum', 'mean'])
)
# Rename the aggregate columns so they are marked as temporal aggregates, then
# turn the group keys back into ordinary columns.
temporal_df = temporal_df.rename(columns={'sum': 't_sum', 'mean': 't_mean'}).reset_index()

In [26]:
# save timeseries as json files, one file per aggregation column
def save_timeseries(df, dest, model_id, run_id, time_res, timeseries_agg_columns, admin_level=0):
    """Write one JSON timeseries file per aggregation column for a single group.

    Args:
        df: Frame for one (feature, region) group. Must contain 'timestamp',
            'feature', 'region_id' and every column in `timeseries_agg_columns`.
            The feature and region id are read from the first row.
        dest: Destination settings; 'bucket' is read here and the whole mapping
            is passed to `get_storage_options`.
        model_id: Model identifier used in the output path.
        run_id: Run identifier used in the output path.
        time_res: Temporal resolution label used in the output path.
        timeseries_agg_columns: Aggregation columns to export.
        admin_level: Index into ('country', 'admin1', 'admin2', 'admin3') that
            selects the region folder in the output path. Defaults to 0
            ('country'), the folder that was previously hard-coded.

    Returns:
        None.
    """
    if not timeseries_agg_columns:
        # Nothing to export; avoid touching the frame at all.
        return
    # The feature and region are constant within a group, so look them up once
    # rather than on every loop iteration.
    feature = df['feature'].values[0]
    region_id = df['region_id'].values[0]
    for col in timeseries_agg_columns:
        timeseries_to_json(df[['timestamp', col]], dest, model_id, run_id, feature, time_res, region_id, col, admin_level)

# write timeseries to json
def timeseries_to_json(df, dest, model_id, run_id, feature, time_res, region_id, column, admin_level=0):
    """Write a single aggregation column as a JSON list of records.

    The aggregation column is renamed to 'value' in the written records; the
    frame passed in is not modified.

    Args:
        df: Frame holding 'timestamp' and `column`.
        dest: Destination settings; 'bucket' is read here and the whole mapping
            is passed to `get_storage_options`.
        model_id: Model identifier used in the output path.
        run_id: Run identifier used in the output path.
        feature: Feature name used in the output path.
        time_res: Temporal resolution label used in the output path.
        region_id: Region identifier used in the output path.
        column: Aggregation column to export; also the output file name.
        admin_level: Index into ('country', 'admin1', 'admin2', 'admin3') that
            selects the region folder. Defaults to 0 ('country').

    Raises:
        IndexError: If `admin_level` is outside the known levels.
    """
    admin = ['country', 'admin1', 'admin2', 'admin3']
    admin_string = admin[admin_level]
    bucket = dest['bucket']
    path = f's3://{bucket}/{model_id}/{run_id}/{time_res}/{feature}/regional/{admin_string}/timeseries/{region_id}/{column}.json'
    df.rename(columns={column: 'value'}).to_json(path,
        orient='records',
        storage_options=get_storage_options(dest))

def save_regional_timeseries(df, dest, model_id, run_id, time_res, timeseries_agg_columns, admin_level):
    """Write all aggregation columns of one (feature, region) group to a single CSV.

    Args:
        df: Frame for one group; must contain 'timestamp', 'feature',
            'region_id' and every column in `timeseries_agg_columns`. The
            feature and region id are read from the first row.
        dest: Destination settings; 'bucket' is read here and the whole mapping
            is passed to `get_storage_options`.
        model_id: Model identifier used in the output path.
        run_id: Run identifier used in the output path.
        time_res: Temporal resolution label used in the output path.
        timeseries_agg_columns: List of aggregation columns to include.
        admin_level: Index into ('country', 'admin1', 'admin2', 'admin3') that
            selects the region folder in the output path.

    Returns:
        None.
    """
    admin_string = ('country', 'admin1', 'admin2', 'admin3')[admin_level]
    bucket = dest['bucket']
    # Feature and region are taken from the first row of the group.
    feature, region_id = df['feature'].values[0], df['region_id'].values[0]
    selected = df[['timestamp'] + timeseries_agg_columns]
    target = f's3://{bucket}/{model_id}/{run_id}/{time_res}/{feature}/regional/{admin_string}/timeseries/{region_id}.csv'
    selected.to_csv(target, storage_options=get_storage_options(dest))
    

In [27]:
%%time

## Option 1. Write individual file for each aggregation type

# For single admin level
# NOTE: regions_cols is not referenced again in this cell.
regions_cols = extract_region_columns(df)
level = 3
# do for all levels
timeseries_df = temporal_df.copy()
timeseries_df['region_id'] = join_region_columns(timeseries_df, level)

# Spatial aggregates applied on top of each temporal aggregate column.
timeseries_aggs = ['min', 'max', 'sum', 'mean', 'count']
# (temporal column, spatial aggregate) -> flat output column name.
timeseries_lookup = {
    ('t_sum', 'min'): 's_min_t_sum',
    ('t_sum', 'max'): 's_max_t_sum',
    ('t_sum', 'sum'): 's_sum_t_sum',
    ('t_sum', 'mean'): 's_mean_t_sum',
    ('t_mean', 'min'): 's_min_t_mean',
    ('t_mean', 'max'): 's_max_t_mean',
    ('t_mean', 'sum'): 's_sum_t_mean',
    ('t_mean', 'mean'): 's_mean_t_mean',
    ('t_mean', 'count'): 's_count',
}
timeseries_agg_columns = [
    's_min_t_sum',
    's_max_t_sum',
    's_sum_t_sum',
    's_mean_t_sum',
    's_min_t_mean',
    's_max_t_mean',
    's_sum_t_mean',
    's_mean_t_mean',
    's_count',
]

timeseries_df = timeseries_df.groupby(['feature', 'region_id', 'timestamp']).agg(
    dict.fromkeys(['t_sum', 't_mean'], timeseries_aggs)
)
# Flatten the two-level column index into tuples so the lookup can rename them.
timeseries_df.columns = timeseries_df.columns.to_flat_index()
# One group per (feature, region); each group is written out by save_timeseries.
timeseries_df = (
    timeseries_df.rename(columns=timeseries_lookup)
    .reset_index()
    .repartition(npartitions=12)
    .groupby(['feature', 'region_id'])
    .apply(
        lambda x: save_timeseries(x, dest, model_id, run_id, time_res, timeseries_agg_columns),
        meta=(None, 'object'),
    )
)
timeseries_df.compute()

CPU times: user 4.13 s, sys: 757 ms, total: 4.89 s
Wall time: 54.1 s


Series([], dtype: object)

In [28]:
%%time

## Option 2. Write a single file that combines all aggregation

# For single admin level
# NOTE: regions_cols is not referenced again in this cell.
regions_cols = extract_region_columns(df)
level = 3
# do for all levels
timeseries_df = temporal_df.copy()
timeseries_df['region_id'] = join_region_columns(timeseries_df, level)

# Spatial aggregates applied on top of each temporal aggregate column.
timeseries_aggs = ['min', 'max', 'sum', 'mean', 'count']
# (temporal column, spatial aggregate) -> flat output column name.
timeseries_lookup = {
    ('t_sum', 'min'): 's_min_t_sum',
    ('t_sum', 'max'): 's_max_t_sum',
    ('t_sum', 'sum'): 's_sum_t_sum',
    ('t_sum', 'mean'): 's_mean_t_sum',
    ('t_mean', 'min'): 's_min_t_mean',
    ('t_mean', 'max'): 's_max_t_mean',
    ('t_mean', 'sum'): 's_sum_t_mean',
    ('t_mean', 'mean'): 's_mean_t_mean',
    ('t_mean', 'count'): 's_count',
}
timeseries_agg_columns = [
    's_min_t_sum',
    's_max_t_sum',
    's_sum_t_sum',
    's_mean_t_sum',
    's_min_t_mean',
    's_max_t_mean',
    's_sum_t_mean',
    's_mean_t_mean',
    's_count',
]

timeseries_df = timeseries_df.groupby(['feature', 'region_id', 'timestamp']).agg(
    dict.fromkeys(['t_sum', 't_mean'], timeseries_aggs)
)
# Flatten the two-level column index into tuples so the lookup can rename them.
timeseries_df.columns = timeseries_df.columns.to_flat_index()
# One group per (feature, region); each group becomes one CSV holding every
# aggregation column, written by save_regional_timeseries.
timeseries_df = (
    timeseries_df.rename(columns=timeseries_lookup)
    .reset_index()
    .repartition(npartitions=12)
    .groupby(['feature', 'region_id'])
    .apply(
        lambda x: save_regional_timeseries(x, dest, model_id, run_id, time_res, timeseries_agg_columns, level),
        meta=(None, 'object'),
    )
)
timeseries_df.compute()

CPU times: user 1.86 s, sys: 306 ms, total: 2.17 s
Wall time: 18.8 s


Series([], dtype: object)

In [30]:
%%time
from common import compute_timeseries_by_region, REGION_LEVELS
# Call the shared helper from `common` once per entry in REGION_LEVELS.
# Note that this rebinds the notebook-level `level` variable set in earlier cells.
for level in REGION_LEVELS:
    compute_timeseries_by_region(temporal_df, dest, model_id, run_id, time_res, level)

CPU times: user 3.03 s, sys: 378 ms, total: 3.41 s
Wall time: 21.5 s
