# End to End Benchmark: Producer (Dask) - Kafka Cluster - Consumer (Spark)

In [3]:
%%capture
# System Libraries
import sys, os
sys.path.insert(0, "..")
import pandas as pd

## logging
import logging

# basicConfig() only installs the default root handler when the root logger
# has none yet; the explicit setLevel() calls below are what pin the levels.
logging.basicConfig(level=logging.DEBUG)

# Restrict the root logger (None) and the chattier library loggers to ERROR.
for noisy_logger in (None, "py4j", "radical.utils"):
    logging.getLogger(noisy_logger).setLevel(logging.ERROR)
 
# Pilot-Streaming
import pilot.streaming
import masa.spark
import mass.kafka

1. Set up the resources for the test
2. Start the Producer Mini App in Dask, producing synthetic data
3. Start the Consumer Mini App in Spark to process the data

In [None]:
# Node counts for the three tiers of the pipeline; each pilot description
# below multiplies its node count by 48 to obtain the requested core count.
num_broker_nodes = 1
num_producer_nodes = 1
number_spark_nodes = 1

# Degree of parallelism handed to the producer mini app.
number_parallel_tasks = 8

# The run id is appended to the topic name, so changing it yields a new topic.
run_id = "1"
topic_name = "test{}".format(run_id)

# Kafka tier: describe the allocation for the brokers, submit it through
# Pilot-Streaming and call wait() on the resulting pilot.
# NOTE(review): wait() presumably blocks until the Kafka cluster is up --
# confirm against the Pilot-Streaming documentation.
kafka_pilot_description1 = dict(
    resource="slurm+ssh://login1.wrangler.tacc.utexas.edu",
    working_directory=os.path.join('/work/01131/tg804093/wrangler/', "work"),
    number_cores=48 * num_broker_nodes,  # 48 cores requested per broker node
    project="TG-MCB090174",
    queue="normal",
    walltime=159,
    type="kafka",
)
kafka_pilot = pilot.streaming.PilotComputeService.create_pilot(
    kafka_pilot_description1
)
kafka_pilot.wait()

# Dask tier: describe the allocation for the producer nodes, submit it
# through Pilot-Streaming and call wait() on the resulting pilot.
# NOTE(review): wait() presumably blocks until the Dask cluster is up --
# confirm against the Pilot-Streaming documentation.
dask_pilot_description = dict(
    resource="slurm+ssh://login1.wrangler.tacc.utexas.edu",
    working_directory=os.path.join('/work/01131/tg804093/wrangler/', "work"),
    number_cores=48 * num_producer_nodes,  # 48 cores requested per producer node
    project="TG-MCB090174",
    queue="normal",
    walltime=159,
    type="dask",
)
dask_pilot = pilot.streaming.PilotComputeService.create_pilot(
    dask_pilot_description
)
dask_pilot.wait()

# Spark tier: describe the allocation for the consumer nodes, submit it
# through Pilot-Streaming and call wait() on the resulting pilot.
# NOTE(review): wait() presumably blocks until the Spark cluster is up --
# confirm against the Pilot-Streaming documentation.
spark_pilot_description = dict(
    resource="slurm+ssh://login1.wrangler.tacc.utexas.edu",
    working_directory=os.path.join('/work/01131/tg804093/wrangler/', "work"),
    number_cores=48 * number_spark_nodes,  # 48 cores requested per Spark node
    project="TG-MCB090174",
    queue="normal",
    walltime=159,
    type="spark",
)
spark_pilot = pilot.streaming.PilotComputeService.create_pilot(
    spark_pilot_description
)
spark_pilot.wait()

# Producer mini app: look up the Dask scheduler and Kafka endpoints from the
# running pilots, then start the synthetic-data producer in the background.
dask_scheduler_url = dask_pilot.get_details()['master_url']
producer_kafka_zk_hosts = kafka_pilot.get_details()["master_url"]

# The "# kmeans" / "# light" tags are carried over from the original cell;
# presumably they name the application type that consumes each setting.
prod = mass.kafka.MiniApp(
    dask_scheduler=dask_scheduler_url,
    kafka_zk_hosts=producer_kafka_zk_hosts,
    number_parallel_tasks=number_parallel_tasks,
    number_clusters=192,  # kmeans
    number_points_per_cluster=52084,  # kmeans
    number_points_per_message=5000,  # kmeans
    number_dim=3,  # kmeans
    number_messages=6400,  # light
    number_produces=2,
    number_partitions=num_broker_nodes * 12,  # 12 partitions per broker node
    topic_name=topic_name,
    application_type="kmeans",
)
prod.run_in_background()

# Consumer mini app: point Spark at the same Kafka endpoint and topic as the
# producer and start it in the background.
spark_master_url = spark_pilot.get_details()["master_url"]
consumer_kafka_zk_hosts = kafka_pilot.get_details()["master_url"]

# Scenario label encodes the node counts as "<producer>-<broker>-<spark>".
scenario_label = "{:d}-{:d}-{:d}".format(
    num_producer_nodes, num_broker_nodes, number_spark_nodes
)

consumer = masa.spark.MiniApp(
    spark_master=spark_master_url,
    kafka_zk_hosts=consumer_kafka_zk_hosts,
    topic_name=topic_name,
    scenario=scenario_label,
)
consumer.run_in_background()

# Wait for completion, then release every resource.
#
# The teardown steps are registered *before* waiting so that they still run
# when prod.wait() raises or the cell is interrupted; otherwise the consumer
# and the three cluster allocations would be left running. ExitStack also
# keeps going when one cancel() fails and re-raises the error afterwards.
import contextlib

with contextlib.ExitStack() as teardown:
    # Callbacks run in reverse registration order, so the effective sequence
    # is: consumer, kafka_pilot, dask_pilot, spark_pilot.
    teardown.callback(spark_pilot.cancel)
    teardown.callback(dask_pilot.cancel)
    teardown.callback(kafka_pilot.cancel)
    teardown.callback(consumer.cancel)

    prod.wait()