# Using Producer Mini-App

In [2]:
%%capture
# System Libraries
import sys, os
sys.path.append("..")
import pandas as pd
import numpy as np
import ast
import pykafka
import mass.kafka


## logging
import logging

# Configure the root logger first, then restrict it to errors only.
logging.basicConfig(level=logging.DEBUG)
logging.getLogger().setLevel(logging.ERROR)

# Per-library thresholds, applied in one pass.
for _logger_name, _threshold in (
    ("py4j", logging.ERROR),
    ("tornado.application", logging.CRITICAL),
    ("distributed.utils", logging.CRITICAL),
):
    logging.getLogger(_logger_name).setLevel(_threshold)


# Pilot-Streaming
import pilot.streaming

import uuid
import time

# 1. Benchmark Loop

In [None]:
# Benchmark sweep: 6 repetitions x producer-node counts x broker-node counts x
# application types. Each combination starts a Kafka pilot and a Dask pilot,
# runs the producer mini-app against them, and then cancels both pilots.
for num_repeats in range(6):
    for num_producer_nodes in [1, 2, 4, 8]:
        for num_broker_nodes in [1, 2, 4, 8]:
            for application in ["kmeans-static", "kmeans", "light"]:
                # Reset the handles so the cleanup below only touches pilots
                # that were actually created in this iteration.
                kafka_pilot = None
                dask_pilot = None
                try:
                    # Fields shared by both pilot descriptions.
                    common_pilot_description = {
                        "resource": "slurm+ssh://login1.wrangler.tacc.utexas.edu",
                        "working_directory": os.path.join('/work/01131/tg804093/wrangler/', "work"),
                        "project": "TG-MCB090174",
                        "queue": "normal",
                        "walltime": 159,
                    }

                    # Kafka pilot: core count scales with the number of broker nodes.
                    kafka_pilot_description1 = dict(
                        common_pilot_description,
                        number_cores=48 * num_broker_nodes,
                        type="kafka",
                    )
                    kafka_pilot = pilot.streaming.PilotComputeService.create_pilot(kafka_pilot_description1)
                    kafka_pilot.wait()

                    # Dask pilot: core count scales with the number of producer nodes.
                    dask_pilot_description = dict(
                        common_pilot_description,
                        number_cores=48 * num_producer_nodes,
                        type="dask",
                    )
                    dask_pilot = pilot.streaming.PilotComputeService.create_pilot(dask_pilot_description)
                    dask_pilot.wait()
                    # NOTE(review): presumably a grace period for the Dask
                    # cluster to finish starting -- confirm it is still needed.
                    time.sleep(5)

                    for tasks_per_node in [8]:
                        # Total producer tasks = per-node tasks x producer nodes.
                        number_parallel_tasks = num_producer_nodes * tasks_per_node
                        run_id = str(uuid.uuid1())
                        miniapp = mass.kafka.MiniApp(
                            dask_scheduler=dask_pilot.get_details()['master_url'],
                            kafka_zk_hosts=kafka_pilot.get_details()["master_url"],
                            number_parallel_tasks=number_parallel_tasks,
                            number_clusters=192,  # kmeans
                            number_points_per_cluster=52084,  # kmeans
                            number_points_per_message=10000,  # kmeans
                            number_dim=3,  # kmeans
                            number_messages=6400,  # light
                            number_produces=8,
                            number_partitions=num_broker_nodes * 12,
                            topic_name="test-" + run_id,
                            application_type=application,
                        )
                        miniapp.run()
                finally:
                    # Always release the allocations, even when pilot start-up
                    # or the mini-app run fails. Each pilot is cancelled
                    # independently so one failing cancel() cannot prevent the
                    # other; failures are logged instead of silently dropped.
                    for started_pilot in (kafka_pilot, dask_pilot):
                        if started_pilot is None:
                            continue
                        try:
                            started_pilot.cancel()
                        except Exception:
                            logging.exception("Failed to cancel pilot %r", started_pilot)

**** Job: 60653 State : Pending
look for configs in: /work/01131/tg804093/wrangler/work/kafka-1307492a-fcaa-11e7-9011-b083fed168bf/config
['broker-0']
Kafka Config: /work/01131/tg804093/wrangler/work/kafka-1307492a-fcaa-11e7-9011-b083fed168bf/config (Thu Jan 18 17:49:08 2018)
{'zookeeper.connection.timeout.ms': '6000', 'broker.id': '0', 'listeners': 'PLAINTEXT://c251-136:9092', 'zookeeper.connect': 'c251-136:2181'}
**** Job: 60654 State : Pending
init distributed client
init distributed client
look for configs in: /work/01131/tg804093/wrangler/work/kafka-1307492a-fcaa-11e7-9011-b083fed168bf/config
['broker-0']
Kafka Config: /work/01131/tg804093/wrangler/work/kafka-1307492a-fcaa-11e7-9011-b083fed168bf/config (Thu Jan 18 17:49:08 2018)
{'zookeeper.connection.timeout.ms': '6000', 'broker.id': '0', 'listeners': 'PLAINTEXT://c251-136:9092', 'zookeeper.connect': 'c251-136:2181'}
Kafka: c251-136:2181, Dask: tcp://129.114.58.137:8786, Number Dask Nodes: 1,  Number Parallel Producers: 8
/home/0

In [3]:
# Display the Dask pilot's connection details (the output below shows the
# 'master_url' and 'web_ui_url' entries).
dask_pilot.get_details()

{'master_url': 'tcp://c251-132:8786', 'web_ui_url': 'http://c251-132:8787'}

In [5]:
import distributed

# Connect to the scheduler of the current Dask pilot. The address is looked up
# from the pilot rather than pasted from an earlier output, so the cell keeps
# working when a new allocation lands on a different node.
c = distributed.Client(dask_pilot.get_details()["master_url"])

In [None]:
# Query the connected Dask scheduler for its current state.
c.scheduler_info()

# 2. Mini App Test

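Check the Kafka broker: connect to it through ZooKeeper, open a producer and a consumer on the `test` topic, and count the messages and points that can be consumed.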

In [None]:
# Look up the Kafka pilot's connection details in this cell; `kafka_details`
# is not defined anywhere else in the notebook, so the cell would otherwise
# fail with a NameError on a fresh kernel. The 'master_url' entry is the same
# value the benchmark loop passes to the mini-app as its ZooKeeper host.
kafka_details = kafka_pilot.get_details()

client = pykafka.KafkaClient(zookeeper_hosts=kafka_details["master_url"])
topic = client.topics['test']
producer = topic.get_sync_producer()
consumer = topic.get_simple_consumer()

In [None]:
# Poll the consumer up to 100 times without blocking and tally how many
# messages arrived, how many points they carried in total, and the
# dimensionality reported by the most recent message.
count = 0
number_total_points = 0
number_dimensions = 0
for i in range(100):
    message = consumer.consume(block=False)
    if message is None:
        # Nothing available on this poll; try again.
        continue

    payload = message.value
    # ast.literal_eval only parses text: on Python 3 a bytes payload would
    # raise ValueError, so decode it first (str payloads pass through as-is).
    if isinstance(payload, bytes):
        payload = payload.decode("utf-8")

    # Each message is expected to hold a Python-literal 2-D list:
    # rows are points, columns are dimensions.
    data_np = np.array(ast.literal_eval(payload))
    num_points = data_np.shape[0]
    number_dimensions = data_np.shape[1]
    count += 1
    number_total_points += num_points

print("Total Messages: %d, Total Points: %d, Number Dimensions: %d"%(count, number_total_points, number_dimensions))