# End-to-End Benchmark: Producer (Dask) - Kafka Cluster - Consumer (Spark)

In [None]:
%%capture
# System Libraries
import sys, os
# Put the parent directory first on the module search path
# (presumably so the pilot/masa/mass packages resolve from the checkout -- TODO confirm).
sys.path.insert(0, "..")
import pandas as pd
import uuid
## logging
import logging
import time
# basicConfig is called with DEBUG, but the root logger's level is then
# immediately overridden to ERROR.
logging.basicConfig(level=logging.DEBUG)
logging.getLogger().setLevel(logging.ERROR)
# Also cap the "py4j" and "radical.utils" loggers at ERROR.
logging.getLogger("py4j").setLevel(logging.ERROR)
logging.getLogger("radical.utils").setLevel(logging.ERROR)
 
# Pilot-Streaming
import pilot.streaming
import masa.spark
import mass.kafka

1. Set up resources for the test
2. Start Producer Mini App in Dask producing synthetic data
3. Start Consumer Mini App in Spark for processing data

In [None]:
def _pilot_description(pilot_type, number_nodes):
    """Return the pilot description dict for one pilot of the given type.

    pilot_type   -- value stored under "type" ("kafka", "dask" or "spark").
    number_nodes -- node count; 48 cores are requested per node.
    """
    return {
        "resource": "slurm+ssh://login1.wrangler.tacc.utexas.edu",
        "working_directory": os.path.join('/work/01131/tg804093/wrangler/', "work"),
        "number_cores": 48 * number_nodes,
        "project": "TG-MCB090174",
        "queue": "normal",
        "walltime": 159,
        "type": pilot_type,
    }


def _start_pilot(pilot_description, started_pilots):
    """Create a pilot, record it for cleanup, then block on its wait().

    The pilot is appended to started_pilots *before* wait() is called so that
    an error or interrupt while waiting still leaves it registered for
    cancellation by the caller.
    """
    new_pilot = pilot.streaming.PilotComputeService.create_pilot(pilot_description)
    started_pilots.append(new_pilot)
    new_pilot.wait()
    return new_pilot


# Benchmark sweep: for every repeat / node-count / application combination,
# start a Kafka, a Dask and a Spark pilot, run the producer and the consumer
# mini apps against them, and tear everything down again.
for num_repeats in range(3):
    for num_producer_nodes in [1]:
        for num_broker_nodes in [1]:
            for number_spark_nodes in [1]:
                # NOTE(review): "kmeansstatic-1000" is listed twice; kept as in
                # the original -- confirm whether another cluster count was meant.
                for application_spec in ["kmeansstaticpred-100", "kmeansstatic-10",
                                         "kmeansstatic-100", "kmeansstatic-1000",
                                         "kmeansstatic-1000", "light"]:
                    run_id = str(uuid.uuid1())
                    topic_name = "test_" + run_id
                    number_parallel_tasks = 8

                    # "<name>-<clusters>" specs carry the cluster count handed
                    # to the consumer; everything else falls back to 100.
                    application = application_spec
                    number_clusters = 100
                    if application_spec.startswith("kmeans") and "-" in application_spec:
                        spec_parts = application_spec.split("-")
                        number_clusters = int(spec_parts[1])
                        application = spec_parts[0]

                    kafka_pilot_description1 = _pilot_description("kafka", num_broker_nodes)
                    dask_pilot_description = _pilot_description("dask", num_producer_nodes)
                    spark_pilot_description = _pilot_description("spark", number_spark_nodes)

                    # Every pilot that was submitted is cancelled in the
                    # finally clause, even if the run fails or is interrupted.
                    started_pilots = []
                    try:
                        kafka_pilot = _start_pilot(kafka_pilot_description1, started_pilots)
                        dask_pilot = _start_pilot(dask_pilot_description, started_pilots)
                        spark_pilot = _start_pilot(spark_pilot_description, started_pilots)

                        # Single-argument print(...) emits the same text on
                        # Python 2 and Python 3.
                        print("Application: %s, Number Clusters: %d" % (application, number_clusters))

                        # Scenario:
                        prod = mass.kafka.MiniApp(
                            dask_scheduler=dask_pilot.get_details()['master_url'],
                            kafka_zk_hosts=kafka_pilot.get_details()["master_url"],
                            number_parallel_tasks=number_parallel_tasks,
                            number_clusters=192, # kmeans
                            number_points_per_cluster=52084, # kmeans
                            number_points_per_message=5000, # kmeans
                            number_dim=3, # kmeans
                            number_messages=60000, # light
                            number_produces=80,
                            number_partitions=num_broker_nodes*12,
                            topic_name=topic_name,
                            application_type=application,
                            produce_interval=0
                        )
                        prod.run_in_background()

                        consumer = masa.spark.MiniApp(
                            spark_master=spark_pilot.get_details()["master_url"],
                            kafka_zk_hosts=kafka_pilot.get_details()["master_url"],
                            topic_name=topic_name,
                            number_clusters=number_clusters,
                            test_scenario="%s-%d-%d-%d-%d" % (application, num_producer_nodes, num_broker_nodes, number_spark_nodes, number_clusters),
                            application=application
                        )
                        consumer.run_in_background()

                        try:
                            # Wait for completion
                            prod.wait()
                            # Fixed 240 s pause after the producer returns and
                            # before the consumer is cancelled.
                            time.sleep(240)
                            print("******** Producer Wait RETURNED. Cancel Streaming App")
                        finally:
                            consumer.cancel()
                    finally:
                        # Same order as submission: kafka, dask, spark.
                        for started_pilot in started_pilots:
                            started_pilot.cancel()

**** Job: 61182 State : Pending
look for configs in: /work/01131/tg804093/wrangler/work/kafka-73e9d5d0-fe4b-11e7-955a-549f3509766c/config
['broker-0']
Kafka Config: /work/01131/tg804093/wrangler/work/kafka-73e9d5d0-fe4b-11e7-955a-549f3509766c/config (Sat Jan 20 19:36:07 2018)
{'zookeeper.connection.timeout.ms': '6000', 'broker.id': '0', 'listeners': 'PLAINTEXT://c251-133:9092', 'zookeeper.connect': 'c251-133:2181'}
**** Job: 61183 State : Pending
**** Job: 61184 State : Pending
Create Spark Context for URL: spark://129.114.58.136:7077
Create Spark Context for URL: spark://129.114.58.136:7077
Application: kmeansstaticpred, Number Clusters: 100
look for configs in: /work/01131/tg804093/wrangler/work/kafka-73e9d5d0-fe4b-11e7-955a-549f3509766c/config
['broker-0']
Kafka Config: /work/01131/tg804093/wrangler/work/kafka-73e9d5d0-fe4b-11e7-955a-549f3509766c/config (Sat Jan 20 19:36:07 2018)
{'zookeeper.connection.timeout.ms': '6000', 'broker.id': '0', 'listeners': 'PLAINTEXT://c251-133:9092', 

In [None]:
%%capture
# Manual cleanup: cancel the Kafka, Dask and Spark pilots created by the
# benchmark loop (e.g. after that cell was interrupted before its own cancel calls ran).
# The calls run in order, so a NameError on a pilot that was never created
# still leaves the earlier ones cancelled.
kafka_pilot.cancel()
dask_pilot.cancel()
spark_pilot.cancel()

In [2]:
# Stand-alone producer run against a fixed ZooKeeper endpoint, executed in the
# foreground with run().
# No dask_scheduler is passed here; the pilot-based value
# (dask_pilot.get_details()['master_url']) is intentionally left out.
producer_options = {
    "kafka_zk_hosts": "c251-133:2181",
    "number_parallel_tasks": 8,
    "number_clusters": 192,                # kmeans
    "number_points_per_cluster": 52084,    # kmeans
    "number_points_per_message": 5000,     # kmeans
    "number_dim": 3,                       # kmeans
    "number_messages": 60000,              # light
    "number_produces": 80,
    "number_partitions": 1 * 12,
    "topic_name": "test",
    "application_type": "kmeansstaticpred-100",
    "produce_interval": 0,
}
prod = mass.kafka.MiniApp(**producer_options)
prod.run()

2000
Kafka: c251-133:2181, Dask: inproc://129.114.58.132/144113/1, Number Dask Nodes: 1,  Number Parallel Producers: 8
/home/01131/tg804093/work/kafka_2.11-1.0.0/bin/kafka-topics.sh --delete --zookeeper c251-133:2181 --topic test
/home/01131/tg804093/work/kafka_2.11-1.0.0/bin/kafka-topics.sh --create --zookeeper c251-133:2181 --replication-factor 1 --partitions 12 --topic test
/home/01131/tg804093/work/kafka_2.11-1.0.0/bin/kafka-topics.sh --describe --zookeeper c251-133:2181 --topic test
Application: kmeansstaticpred-100, Generate Block ID: 0
Application: kmeansstaticpred-100, Generate Block ID: 1
Application: kmeansstaticpred-100, Generate Block ID: 2
Application: kmeansstaticpred-100, Generate Block ID: 3
Application: kmeansstaticpred-100, Generate Block ID: 4
Application: kmeansstaticpred-100, Generate Block ID: 5
Application: kmeansstaticpred-100, Generate Block ID: 6
Application: kmeansstaticpred-100, Generate Block ID: 7
Waiting for Dask Tasks to complete
Zookeeper: c251-133:2181

Exception AttributeError: "'NoneType' object has no attribute 'requests'" in <bound method RequestHandler.__del__ of <pykafka.handlers.RequestHandler object at 0x2ae5bfee9350>> ignored


[{'bytes_per_message': '323012', 'transmission_time': '3.82773', 'block_id': 0, 'points_per_message': 5000, 'data_generation_time': '0.299018', 'runtime': '4.12675', 'number_messages': 250}, {'bytes_per_message': '319611', 'transmission_time': '3.54085', 'block_id': 1, 'points_per_message': 5000, 'data_generation_time': '0.276037', 'runtime': '3.81689', 'number_messages': 250}, {'bytes_per_message': '322912', 'transmission_time': '4.01133', 'block_id': 2, 'points_per_message': 5000, 'data_generation_time': '0.378483', 'runtime': '4.38981', 'number_messages': 250}, {'bytes_per_message': '315971', 'transmission_time': '3.83083', 'block_id': 3, 'points_per_message': 5000, 'data_generation_time': '0.262731', 'runtime': '4.09356', 'number_messages': 250}, {'bytes_per_message': '312411', 'transmission_time': '3.44670', 'block_id': 4, 'points_per_message': 5000, 'data_generation_time': '0.454906', 'runtime': '3.90160', 'number_messages': 250}, {'bytes_per_message': '324778', 'transmission_tim

KeyboardInterrupt: 