In [1]:
import math
import os
import dask.dataframe as dd
import numpy as np
import pandas as pd

from dask.distributed import Client

In [2]:
# Connect this notebook to the Dask distributed scheduler listening at tcp://scheduler:8786.
client = Client('tcp://scheduler:8786')


+---------+--------+-----------+---------+
| Package | client | scheduler | workers |
+---------+--------+-----------+---------+
| blosc   | 1.9.1  | 1.7.0     | 1.7.0   |
+---------+--------+-----------+---------+


In [3]:
# Display the client summary (scheduler address, dashboard link, worker/core/memory totals).
client

0,1
Client  Scheduler: tcp://scheduler:8786  Dashboard: http://scheduler:8787/status,Cluster  Workers: 4  Cores: 16  Memory: 8.35 GB


In [None]:
# Environment for running against localstack: `run()` checks
# LOCALSTACK_S3_ENDPOINT_URL to decide whether to read S3 through it.
# The AWS_* values are placeholder credentials / region.
os.environ.update({
    'LOCALSTACK_S3_ENDPOINT_URL': 'http://localstack:4572',
    'AWS_ACCESS_KEY_ID': 'foo',
    'AWS_SECRET_ACCESS_KEY': 'bar',
    'AWS_DEFAULT_REGION': 'us-east-1',
})

In [4]:
def roundup(x, base: int = 5):
    """Round `x` up to the nearest multiple of `base`.

    Args:
        x: Number to round (int, float or numpy scalar).
        base: Step to round up to; defaults to 5.

    Returns:
        The smallest multiple of `base` that is >= `x`, as an ``int``
        multiple of `base`. If `x` is NaN it is returned unchanged, so a
        missing value does not abort an element-wise ``apply`` over a series.

    Raises:
        ZeroDivisionError: if `base` is 0.
    """
    # math.ceil(nan) raises ValueError; propagate missing values instead.
    if math.isnan(x):
        return x
    return int(math.ceil(x / float(base))) * base

In [5]:
def round_series_up(s: dd.Series) -> dd.Series:
    """Return `s` with `roundup` applied to each element.

    An empty float32 pandas Series is passed as `meta` to describe the
    output to Dask.
    """
    # NOTE(review): `roundup` returns ints while `meta` declares float32 —
    # confirm the intended output dtype.
    output_meta = pd.Series(data=[], dtype=np.float32)
    return s.apply(roundup, meta=output_meta)

In [6]:
def transform_dask_dataframe(df: dd.DataFrame) -> dd.DataFrame:
    """Aggregate NYC taxi trips into an average fare per (drive_time, trip_distance).

    Steps:
      1. Keep the pickup/dropoff timestamps, `trip_distance` and `total_amount`.
      2. Derive `drive_time` as the trip duration in whole 300-second blocks.
      3. Drop rows whose duration is negative or missing.
      4. Round `drive_time` and `trip_distance` up with `round_series_up`.
      5. Keep rows with `drive_time <= 120` and `trip_distance <= 50`.
      6. Group by both buckets and average `total_amount` into `avg_amount`.

    Args:
        df: Dask dataframe containing at least the four columns selected below.

    Returns:
        A lazy Dask dataframe indexed by (`drive_time`, `trip_distance`) with
        a single `avg_amount` column.
    """
    return (
        df[[
            'tpep_pickup_datetime', 'tpep_dropoff_datetime',
            'trip_distance', 'total_amount'
        ]]
        .astype({
            'tpep_pickup_datetime': 'datetime64[ms]',
            'tpep_dropoff_datetime': 'datetime64[ms]'
        })
        # Use total_seconds(), not `.dt.seconds`: `.seconds` is only the
        # sub-day component (0..86399), so trips longer than 24h and negative
        # durations would wrap around into plausible-looking values.
        .assign(drive_time=lambda frame: (
            (frame.tpep_dropoff_datetime - frame.tpep_pickup_datetime).dt.total_seconds() // 300
        ))
        # Filter before rounding: rounding up would turn small negative
        # durations into 0, and missing durations cannot be rounded.
        .query('drive_time >= 0')
        .assign(drive_time=lambda frame: round_series_up(frame.drive_time))
        .assign(trip_distance=lambda frame: round_series_up(frame.trip_distance))
        .query('drive_time <= 120 & trip_distance <= 50')
        .drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1)
        .round({'trip_distance': 0})
        .groupby(['drive_time', 'trip_distance'])
        .mean()
        .rename(columns={'total_amount': 'avg_amount'})
    )

In [7]:
def compute_final_dataframe(df: dd.DataFrame) -> pd.DataFrame:
    """Materialise the Dask result and reshape it into a fare grid.

    Calls `.compute()` on `df`, then pivots so that rows are `drive_time`,
    columns are `trip_distance` and cells hold `avg_amount`. Combinations
    with no value are filled with 0.
    """
    averages = df.compute().reset_index()
    fare_grid = averages.pivot(
        index='drive_time',
        columns='trip_distance',
        values='avg_amount',
    )
    return fare_grid.fillna(0)

In [8]:
def run():
    """Load the April 2018 yellow-taxi CSV from S3, aggregate it and print the result.

    If the `LOCALSTACK_S3_ENDPOINT_URL` environment variable is set and
    non-empty, S3 is read through that endpoint, with key, secret and region
    taken from `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and
    `AWS_DEFAULT_REGION` (falling back to 'foo', 'bar' and 'us-east-1').
    Otherwise `dd.read_csv` is called without storage options.

    Returns:
        None. The pivoted fare table is printed.
    """
    csv_path = 's3://nyc-tlc/trip data/yellow_tripdata_2018-04.csv'

    # Toggle localstack by changing where the S3 client points.
    localstack_endpoint = os.environ.get('LOCALSTACK_S3_ENDPOINT_URL')
    if localstack_endpoint:
        trips = dd.read_csv(
            csv_path,
            storage_options={
                'anon': False,
                'use_ssl': False,
                'key': os.environ.get('AWS_ACCESS_KEY_ID', 'foo'),
                'secret': os.environ.get('AWS_SECRET_ACCESS_KEY', 'bar'),
                'client_kwargs': {
                    # Honour the configured endpoint instead of a hard-coded URL.
                    'endpoint_url': localstack_endpoint,
                    'region_name': os.environ.get('AWS_DEFAULT_REGION', 'us-east-1'),
                },
            },
        )
    else:
        # This assumes you are using named profiles in the AWS CLI with a
        # default profile that can access the S3 bucket, or an EC2 instance
        # or ECS task role.
        trips = dd.read_csv(csv_path)

    fare_averages = transform_dask_dataframe(trips)

    fare_table = compute_final_dataframe(fare_averages)

    print(fare_table)