### Set up the environment:
1. Start the Dask scheduler:
   > dask-scheduler
2. Start a Dask worker on HOST 1, pointing it at the scheduler's address:
   > dask-worker scheduler_ip:8786

In [1]:
import os
import time

import dask.dataframe as dd
from dask.delayed import delayed
from dask.distributed import Client
from os.path import expanduser


In [2]:
# Change the following parameters to match your environment.
# Address ("host:port") of the running dask-scheduler; passed to Client() in main().
DASK_SCHEDULER = "192.168.1.109:8786"
# Root directory of the parquet data; load_data() reads "<directory>/<entry>/*.parquet"
# for every entry found in it. Defaults to a folder below the current user's home.
PARQUET_DIRECTORY = os.path.join(expanduser("~"), "Documents/taxi/parquet_files")

In [3]:
def load_data(parquet_directory, columns):
    """
    Load every parquet partition below ``parquet_directory`` into one Dask dataframe.

    The directory is expected to contain one sub-directory per partition, each
    holding ``*.parquet`` files. The parquet directory should be present on the
    worker executing this task, because the directory is listed where this
    function runs.

    Parameters
    ----------
    parquet_directory : str
        Root directory that contains the partition sub-directories.
    columns : list of str
        Columns to read from the parquet files.

    Returns
    -------
    dask.dataframe.DataFrame
        Lazy concatenation of all partitions, in sorted sub-directory order.

    Raises
    ------
    ValueError
        If ``parquet_directory`` contains no sub-directories.
    """
    # Sort for a deterministic partition order and skip stray files
    # (e.g. ``.DS_Store`` or ``_SUCCESS``) that are not partition directories;
    # globbing below them would make read_parquet fail.
    partition_names = sorted(
        name
        for name in os.listdir(parquet_directory)
        if os.path.isdir(os.path.join(parquet_directory, name))
    )
    if not partition_names:
        # Fail early with an actionable message instead of an opaque concat error.
        raise ValueError(f"no partition directories found in {parquet_directory!r}")
    return dd.concat([
        dd.read_parquet(f"{parquet_directory}/{name}/*.parquet", columns=columns, engine='pyarrow')
        for name in partition_names
    ])

def fn_count(ddf, columns):
    """
    Count the values of the selected columns.

    Selects ``columns`` from ``ddf``, applies ``count()`` to the selection and
    returns the materialised result of ``compute()``.
    """
    selection = ddf[columns]
    return selection.count().compute()


def fn_sum(ddf, columns):
    """
    Sum the values of the selected columns.

    Selects ``columns`` from ``ddf``, applies ``sum()`` to the selection and
    returns the materialised result of ``compute()``.
    """
    selection = ddf[columns]
    return selection.sum().compute()


def fn_mean(ddf, columns):
    """
    Average the values of the selected columns.

    Selects ``columns`` from ``ddf``, applies ``mean()`` to the selection and
    returns the materialised result of ``compute()``.
    """
    selection = ddf[columns]
    return selection.mean().compute()


def fn_median(ddf, columns):
    """
    Compute the median of the selected columns.

    Selects ``columns`` from ``ddf``, requests the 0.5 quantile of the
    selection and returns the materialised result of ``compute()``.
    """
    selection = ddf[columns]
    return selection.quantile(0.5).compute()

In [4]:
def run_tasks(task_list, columns, client, parquet_directory):
    """
    Execute every task of ``task_list`` on the cluster and collect the results.

    A single delayed ``load_data(parquet_directory, columns)`` call is built
    and handed, together with ``columns``, to each task wrapped in ``delayed``.
    The delayed tasks are submitted through ``client.compute`` and the
    function returns what ``client.gather`` yields for the resulting futures.
    """
    # One shared delayed load; each task receives it as its first argument.
    lazy_frame = delayed(load_data)(parquet_directory, columns)
    pending = [delayed(task)(lazy_frame, columns) for task in task_list]
    # Submit everything to the cluster, then wait for all the results.
    futures = client.compute(pending)
    return client.gather(futures)

In [8]:
def main():
    """Create a Dask client, execute the tasks on the cluster and print the outcome.

    Connects to the scheduler at ``DASK_SCHEDULER``, runs ``fn_count``,
    ``fn_sum`` and ``fn_mean`` on the ``nr_rides`` column of the data below
    ``PARQUET_DIRECTORY`` via ``run_tasks``, then prints the elapsed time and
    the list of results.
    """
    task_list = [fn_count, fn_sum, fn_mean]
    # Use the client as a context manager so the scheduler connection is
    # released even when a task raises.
    with Client(DASK_SCHEDULER) as client:
        # perf_counter is monotonic, so the duration is immune to clock adjustments.
        t = time.perf_counter()
        result_list = run_tasks(task_list, ['nr_rides'], client, PARQUET_DIRECTORY)
        print(f"duration: {time.perf_counter() - t}s")
    print("result: ")
    print(result_list)
    
# Run the benchmark as soon as this cell is executed.
main()

duration: 4.179220199584961s
result: 
[nr_rides    25199507
dtype: int64, nr_rides    25199507
dtype: int64, nr_rides    1.0
dtype: float64]
