# Using Producer Mini-App

In [1]:
%%capture
# System Libraries
import sys, os
sys.path.append("..")
import pandas as pd
import numpy as np
import ast
import pykafka
import mass.kafka
 

## logging
import logging
# Set up root logging first, then raise the thresholds: the root logger and
# py4j only report errors, while the tornado/distributed loggers listed
# below only report critical messages.
logging.basicConfig(level=logging.DEBUG)
root_logger = logging.getLogger()
root_logger.setLevel(logging.ERROR)
for logger_name, threshold in (
    ("py4j", logging.ERROR),
    ("tornado.application", logging.CRITICAL),
    ("distributed.utils", logging.CRITICAL),
):
    logging.getLogger(logger_name).setLevel(threshold)


# Pilot-Streaming
import pilot.streaming
import uuid 
import time

# 1. Benchmark Loop

In [None]:
def _pilot_description(pilot_type, number_nodes):
    """Build the pilot description for a ``pilot_type`` pilot on ``number_nodes`` nodes.

    All pilots in this benchmark share the same resource, working directory,
    project, queue and walltime; only the framework type and the core count
    (48 cores requested per node) differ.
    """
    return {
        "resource": "slurm+ssh://login1.wrangler.tacc.utexas.edu",
        "working_directory": os.path.join('/work/01131/tg804093/wrangler/', "work"),
        "number_cores": 48*number_nodes,
        "project": "TG-MCB090174",
        "queue": "normal",
        "walltime": 300,
        "type": pilot_type
    }


def _parse_scenario(application_scenario):
    """Translate a scenario label into ``(application, number_points_per_message)``.

    Labels starting with "kmeans" carry the message size after a dash
    (e.g. "kmeans-5000"); every other label is used unchanged as the
    application name with one point per message.
    """
    if application_scenario.startswith("kmeans"):
        parts = application_scenario.split("-")
        return parts[0], int(parts[1])
    return application_scenario, 1


def _cancel_pilot(pilot_job, label):
    """Cancel ``pilot_job`` on a best-effort basis.

    Returns True when ``cancel()`` completed and False when it raised. A
    failure is reported but never propagated, so clean-up of one pilot cannot
    mask the original error or prevent clean-up of the other pilot.
    """
    try:
        pilot_job.cancel()
    except Exception as error:
        print("Could not cancel %s pilot: %r" % (label, error))
        return False
    return True


for num_repeats in range(1):
    for num_producer_nodes in [1]:

        dask_pilot_description = _pilot_description("dask", num_producer_nodes)
        dask_pilot = pilot.streaming.PilotComputeService.create_pilot(dask_pilot_description)
        # From here on the Dask job exists on the cluster: make sure it is
        # cancelled even if a later step fails or the run is interrupted.
        try:
            dask_pilot.wait()
            dask_details = dask_pilot.get_details()
            # To reuse an already running scheduler instead, set e.g.
            # dask_details = {'master_url': 'tcp://c251-101:8786'}

            for num_broker_nodes in [1]:  # other sizes tried: 2, 4
                for num_partitions_per_node in [1, 2, 4, 8, 16]:
                    # Other scenarios: "kmeans-5000", "kmeansstatic-5000",
                    # "kmeansstatic-10000", "kmeansstatic-20000"
                    for application_scenario in ["light"]:
                        application, number_points_per_message = _parse_scenario(application_scenario)

                        # A fresh Kafka pilot is started for every scenario/partition combination.
                        kafka_pilot_description1 = _pilot_description("kafka", num_broker_nodes)
                        kafka_pilot = pilot.streaming.PilotComputeService.create_pilot(kafka_pilot_description1)
                        try:
                            kafka_pilot.wait()
                            kafka_details = kafka_pilot.get_details()

                            time.sleep(5)
                            for tasks_per_node in [1, 2, 4, 8, 16, 32, 48]:
                                number_parallel_tasks = num_producer_nodes*tasks_per_node

                                print("Run Application: %s, Number Points per Messages: %d"%(application, number_points_per_message))
                                # Unique id so every run writes to its own topic.
                                run_id = str(uuid.uuid1())
                                miniapp = mass.kafka.MiniApp(
                                    dask_scheduler=dask_details['master_url'],
                                    resource_url=kafka_details["master_url"],
                                    broker_service="kafka",
                                    number_parallel_tasks=number_parallel_tasks,
                                    number_clusters=10, # kmeans
                                    number_points_per_cluster=10000, # kmeans
                                    number_points_per_message=number_points_per_message, # kmeans
                                    number_dim=3, # kmeans
                                    number_messages=192, # light
                                    number_produces=1,
                                    number_partitions=num_broker_nodes*num_partitions_per_node,
                                    topic_name="test-"+run_id,
                                    application_type=application
                                )
                                miniapp.run()
                        finally:
                            _cancel_pilot(kafka_pilot, "kafka")
        finally:
            # Pause after a successful cancel before the next iteration
            # submits a new Dask pilot.
            if _cancel_pilot(dask_pilot, "dask"):
                time.sleep(60)

/tmp/tmpkaz188w9
Submission of Job Command: ssh login1.wrangler.tacc.utexas.edu sbatch  tmpkaz188w9
Cleanup: ssh login1.wrangler.tacc.utexas.edu rm tmpkaz188w9
**** Job: 105894 State : Running
/tmp/tmp24icjacu
Submission of Job Command: ssh login1.wrangler.tacc.utexas.edu sbatch  tmp24icjacu
Cleanup: ssh login1.wrangler.tacc.utexas.edu rm tmp24icjacu
**** Job: 105895 State : Running
look for configs in: /work/01131/tg804093/wrangler/work/kafka-fe54cbf4-0236-11e9-a8d3-0b861b3f4538/config
['broker-0']
Kafka Config: /work/01131/tg804093/wrangler/work/kafka-fe54cbf4-0236-11e9-a8d3-0b861b3f4538/config (Mon Dec 17 14:05:03 2018)
{'broker.id': '0', 'listeners': 'PLAINTEXT://c251-119:9092', 'zookeeper.connect': 'c251-119:2181', 'zookeeper.connection.timeout.ms': '6000'}
look for configs in: /work/01131/tg804093/wrangler/work/kafka-fe54cbf4-0236-11e9-a8d3-0b861b3f4538/config
['broker-0']
Kafka Config: /work/01131/tg804093/wrangler/work/kafka-fe54cbf4-0236-11e9-a8d3-0b861b3f4538/config (Mon Dec 

## Scratch

In [None]:
# Display the details reported by the Dask pilot; the benchmark loop reads
# the 'master_url' entry of this result to reach the scheduler.
dask_pilot.get_details()

In [None]:
import distributed
# Connect a client directly to a Dask scheduler.
# NOTE(review): the address is hard-coded and presumably belongs to one
# specific allocation -- confirm it matches
# dask_pilot.get_details()['master_url'] before running this cell.
c=distributed.Client("tcp://c251-101:8786")

In [None]:
# Display what the scheduler reports about itself through the client
# (presumably including the connected workers -- verify against the output).
c.scheduler_info()

In [None]:
def inc(x):
    """Return the hostname of the machine on which this call runs.

    The argument ``x`` is accepted but never used; despite the name, nothing
    is incremented.
    """
    from socket import gethostname
    return gethostname()

In [None]:
from dask.delayed import delayed
# Wrap inc in a delayed task; the argument 20 is ignored by inc, and the
# task is only evaluated once compute() is called on t.
t = delayed(inc)(20)

In [None]:
# Evaluate the delayed task; the displayed value is the hostname returned by inc.
t.compute()

# 2. Mini App Test

Check Kafka Broker

In [None]:
# Connect to Kafka using the 'master_url' reported by the Kafka pilot as the
# ZooKeeper host list.
client = pykafka.KafkaClient(zookeeper_hosts=kafka_details["master_url"])
# NOTE(review): the benchmark loop passes topic_name="test-"+run_id to the
# mini-app, not "test" -- confirm this is the topic you intend to inspect.
topic = client.topics['test']
# The producer is created here but not used by the cells visible in this notebook.
producer = topic.get_sync_producer()
consumer = topic.get_simple_consumer()

In [None]:
# Poll the consumer a bounded number of times without blocking and tally
# how many messages and points were received.
MAX_POLLS = 100

count = 0
number_total_points = 0
number_dimensions = 0
for _poll in range(MAX_POLLS):
    message = consumer.consume(block=False)
    if message is None:
        continue  # nothing available on this poll

    payload = message.value
    # ast.literal_eval only parses text: a bytes payload would raise
    # ValueError, so decode it first.
    if isinstance(payload, bytes):
        payload = payload.decode("utf-8")

    # Each message is expected to hold the literal of a 2-D list of points
    # (rows = points, columns = dimensions), as implied by the shape reads below.
    data_np = np.array(ast.literal_eval(payload))
    num_points = data_np.shape[0]
    number_dimensions = data_np.shape[1]
    count += 1
    number_total_points += num_points

print("Total Messages: %d, Total Points: %d, Number Dimensions: %d"%(count, number_total_points, number_dimensions))