# MASS: Howto Mini App Streaming Source (MASS)

This notebook demonstrates the usage of data source apps that can be used for the development and characterization of streaming applications.

In [1]:
%%capture
# System Libraries
import sys, os
sys.path.append("..")
import pandas as pd

## logging
# Install the default root handler, then cap the root logger and the listed
# third-party loggers at ERROR.
import logging
logging.basicConfig(level=logging.DEBUG)
for quiet_logger in (None, "py4j", "radical.utils"):  # None selects the root logger
    logging.getLogger(quiet_logger).setLevel(logging.ERROR)
 
# Pilot-Streaming
import pilot.streaming

# 1. Resource Setup
## 1.1 Kafka

In [7]:
# Pilot description for a Kafka cluster submitted through SLURM over SSH.
kafka_pilot_description1 = dict(
    resource="slurm+ssh://login1.wrangler.tacc.utexas.edu",
    working_directory=os.path.join('/work/01131/tg804093/wrangler/', "work"),
    number_cores=48,
    project="TG-MCB090174",
    queue="normal",
    walltime=159,
    type="kafka",
)
kafka_pilot1 = pilot.streaming.PilotComputeService.create_pilot(kafka_pilot_description1)
# Presumably blocks until the submitted job is up -- confirm against pilot.streaming.
kafka_pilot1.wait()

/tmp/tmp1z79vifg
Submission of Job Command: ssh login1.wrangler.tacc.utexas.edu sbatch  tmp1z79vifg
Cleanup: ssh login1.wrangler.tacc.utexas.edu rm tmp1z79vifg
**** Job: 97044 State : Queue
look for configs in: /work/01131/tg804093/wrangler/work/kafka-0f154f68-f58a-11e8-a016-fd41899827ce/config
['broker-0']
Kafka Config: /work/01131/tg804093/wrangler/work/kafka-0f154f68-f58a-11e8-a016-fd41899827ce/config (Sat Dec  1 11:35:23 2018)
{'broker.id': '0', 'listeners': 'PLAINTEXT://c251-119:9092', 'zookeeper.connect': 'c251-119:2181', 'zookeeper.connection.timeout.ms': '6000'}


In [8]:
# Fetch the connection details of the Kafka pilot; the bare last expression
# displays them as the cell output.
kafka_details = kafka_pilot1.get_details()
kafka_details

look for configs in: /work/01131/tg804093/wrangler/work/kafka-0f154f68-f58a-11e8-a016-fd41899827ce/config
['broker-0']
Kafka Config: /work/01131/tg804093/wrangler/work/kafka-0f154f68-f58a-11e8-a016-fd41899827ce/config (Sat Dec  1 11:35:23 2018)
{'broker.id': '0', 'listeners': 'PLAINTEXT://c251-119:9092', 'zookeeper.connect': 'c251-119:2181', 'zookeeper.connection.timeout.ms': '6000'}


{'master_url': 'c251-119:2181',
 'details': {'broker.id': '0',
  'listeners': 'PLAINTEXT://c251-119:9092',
  'zookeeper.connect': 'c251-119:2181',
  'zookeeper.connection.timeout.ms': '6000'}}

In [4]:
# Cancel the Kafka pilot (presumably releases the underlying batch job -- confirm).
kafka_pilot1.cancel()

## 1.2. Dask

In [9]:
# Pilot description for a Dask cluster submitted through SLURM over SSH.
dask_pilot_description = dict(
    resource="slurm+ssh://login1.wrangler.tacc.utexas.edu",
    working_directory=os.path.join('/work/01131/tg804093/wrangler/', "work"),
    number_cores=48,
    project="TG-MCB090174",
    queue="normal",
    walltime=159,
    type="dask",
)
dask_pilot = pilot.streaming.PilotComputeService.create_pilot(dask_pilot_description)
# Presumably blocks until the submitted job is up -- confirm against pilot.streaming.
dask_pilot.wait()

/tmp/tmpxvj5vkku
Submission of Job Command: ssh login1.wrangler.tacc.utexas.edu sbatch  tmpxvj5vkku
Cleanup: ssh login1.wrangler.tacc.utexas.edu rm tmpxvj5vkku
**** Job: 97063 State : Queue


In [10]:
# Fetch the connection details of the Dask pilot; the bare last expression
# displays them as the cell output.
dask_details = dask_pilot.get_details()
dask_details

{'master_url': 'tcp://c251-112:8786', 'web_ui_url': 'http://c251-112:8787'}

In [None]:
# Cancel the Dask pilot (presumably releases the underlying batch job -- confirm).
dask_pilot.cancel()

# 2. Mini App Test
## 2.1 KMeans
### 2.1.1 Run App

In [6]:
# System Libraries
import sys, os
sys.path.insert(0, "..")
import pandas as pd
import numpy as np
import ast
import mass.kafka
import pykafka

# Endpoints of the Kafka (ZooKeeper) and Dask services used by this run.
# NOTE(review): these are hard-coded and replace any values obtained from the
# pilots in section 1 -- update them to match the running services.
kafka_details = {'master_url': 'c251-103:2181'}
dask_details = {'master_url': 'tcp://c251-102:8786'}

kmeans_settings = dict(
    # Pass dask_details['master_url'] instead of None to use the remote scheduler.
    dask_scheduler=None,
    kafka_zk_hosts=kafka_details["master_url"],
    number_parallel_tasks=2,
    number_clusters=10,
    number_points_per_cluster=10000,
    number_points_per_message=1000,
    number_messages=1,
    number_dim=3,
    number_produces=1,
    number_partitions=4,
    topic_name="test",
    application_type="kmeans",
)
miniapp = mass.kafka.MiniApp(**kmeans_settings)

100.0


KeyboardInterrupt: 

In [2]:
# Run the KMeans mini app configured in the previous cell.
miniapp.run()

/home/01131/tg804093/work/kafka_2.11-2.1.0/bin/kafka-topics.sh --delete --zookeeper c251-103:2181 --topic test
/home/01131/tg804093/work/kafka_2.11-2.1.0/bin/kafka-topics.sh --create --zookeeper c251-103:2181 --replication-factor 1 --partitions 4 --topic test
/home/01131/tg804093/work/kafka_2.11-2.1.0/bin/kafka-topics.sh --describe --zookeeper c251-103:2181 --topic test
Waiting for Dask Tasks to complete
Zookeeper: c251-103:2181, Block Id: 0, Num Cluster: 100
Zookeeper: c251-103:2181, Block Id: 1, Num Cluster: 100
Points Array Shape: (50000, 3), Number Batches: 50.0
Points Array Shape: (50000, 3), Number Batches: 50.0
[{'block_id': 0, 'number_messages': 50, 'points_per_message': 1000, 'bytes_per_message': '63438', 'data_generation_time': '0.062169', 'transmission_time': '0.33351', 'runtime': '0.39568'}, {'block_id': 1, 'number_messages': 50, 'points_per_message': 1000, 'bytes_per_message': '64583', 'data_generation_time': '0.051659', 'transmission_time': '0.33457', 'runtime': '0.38623'

### 2.1.2 Check Kafka Broker

Ensure that the correct amount of data was successfully written to Kafka.

In [3]:
# Connect to Kafka through the ZooKeeper address and open a synchronous
# producer and a simple consumer on the 'test' topic.
client = pykafka.KafkaClient(zookeeper_hosts=kafka_details["master_url"])
topic = client.topics['test']
producer = topic.get_sync_producer()
consumer = topic.get_simple_consumer()

In [4]:
# Number of brokers known to the client.
len(client.brokers)

1

In [5]:
# Poll the consumer up to 100 times without blocking and tally what arrived.
count = 0
number_total_points = 0
number_dimensions = 0
for attempt in range(100):
    message = consumer.consume(block=False)
    if message is None:
        # Nothing available on this poll.
        continue
    # The payload is the text form of a nested list; parse it into an array.
    data_np = np.array(ast.literal_eval(message.value.decode("utf-8")))
    num_points, number_dimensions = data_np.shape[0], data_np.shape[1]
    count += 1
    number_total_points += num_points

print("Total Messages: %d, Total Points: %d, Number Dimensions: %d"%(count, number_total_points, number_dimensions))

Total Messages: 100, Total Points: 100000, Number Dimensions: 3


## 2.2 Light Source
### 2.2.1 Run Mini App

In [1]:
%%time
import sys, os
sys.path.insert(0, "..")
import pandas as pd
import numpy as np
import ast
import mass.kafka
import pykafka
# Endpoints of the Kafka (ZooKeeper) and Dask services used by this run.
kafka_details = {'master_url': 'c251-103:2181'}
dask_details = {'master_url': 'tcp://c251-102:8786'}

light_settings = dict(
    # Pass dask_details['master_url'] instead of None to use the remote scheduler.
    dask_scheduler=None,
    kafka_zk_hosts=kafka_details["master_url"],
    number_parallel_tasks=1,
    number_messages=1,
    number_produces=2,
    number_partitions=24,
    topic_name="light_test8",
    application_type="light",
)
miniapp = mass.kafka.MiniApp(**light_settings)
miniapp.run()

1
Kafka: c251-103:2181, Dask: inproc://129.114.58.101/41998/1, Number Dask Nodes: 1,  Number Parallel Producers: 1
/home/01131/tg804093/work/kafka_2.11-2.1.0/bin/kafka-topics.sh --delete --zookeeper c251-103:2181 --topic light_test8
/home/01131/tg804093/work/kafka_2.11-2.1.0/bin/kafka-topics.sh --create --zookeeper c251-103:2181 --replication-factor 1 --partitions 24 --topic light_test8
/home/01131/tg804093/work/kafka_2.11-2.1.0/bin/kafka-topics.sh --describe --zookeeper c251-103:2181 --topic light_test8


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Waiting for Dask Tasks to complete


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte

### 2.2.2 Check if Light Source Data is correct

In [8]:
%matplotlib inline
import os, sys, time
import tomopy
import pandas as pd
import dxchange
import tempfile
import pykafka
import base64
import io
import binascii

In [6]:
def reconstruct(message):
    """Reconstruct tomography slices from a Kafka message carrying an HDF5 file.

    The message payload is URL-safe base64 decoded, written to a temporary
    file, read back with ``dxchange.read_aps_32id`` (sinograms 0 to 2) and
    reconstructed with tomopy's ``gridrec`` algorithm.

    Args:
        message: object whose ``value`` attribute holds the base64-encoded
            file content (e.g. a consumed pykafka message).

    Returns:
        The masked reconstruction produced by ``tomopy.circ_mask``.
    """
    start = 0
    end = 2
    msg_bin = base64.urlsafe_b64decode(message.value)
    # The context manager closes (and thereby deletes) the temporary file even
    # if reading fails; previously the handle was never closed.
    with tempfile.NamedTemporaryFile(delete=True) as tf:
        tf.write(msg_bin)
        tf.flush()  # make the bytes visible to the reader opening tf.name
        proj, flat, dark, theta = dxchange.read_aps_32id(tf.name, sino=(start, end))
    # The angles stored in the file are replaced by angles generated for the
    # number of projections.
    theta = tomopy.angles(proj.shape[0])
    proj = tomopy.normalize(proj, flat, dark)
    rot_center = tomopy.find_center(proj, theta, init=290, ind=0, tol=0.5)
    proj = tomopy.minus_log(proj)
    recon = tomopy.recon(proj, theta, center=rot_center, algorithm='gridrec')
    recon = tomopy.circ_mask(recon, axis=0, ratio=0.95)
    # Hand the result back to the caller instead of discarding it.
    return recon

In [13]:
# Open a consumer on the topic written by the light-source mini app in
# section 2.2.1 ("light_test8"); the previous 'light_test6' did not match it.
# The fetch size is raised to 10,000,000 bytes for the large messages.
client = pykafka.KafkaClient(zookeeper_hosts=kafka_details["master_url"])
topic = client.topics['light_test8']
consumer = topic.get_simple_consumer(fetch_message_max_bytes=10000000)

In [14]:
# Fetch one message without blocking and show its offset.
# NOTE(review): with block=False the result may be None when nothing is
# available, in which case the attribute access below fails -- confirm.
message = consumer.consume(block=False)
message.offset

0

In [15]:
# Run the reconstruction on the message consumed above.
reconstruct(message)

For testing serialization

In [14]:
import pkg_resources, base64
import binascii
import tempfile

# Read the sample HDF5 file as raw bytes. Text mode ("r") would attempt to
# decode the binary content under Python 3.
with open("../mass/tooth.h5", "rb") as f:
    data = f.read()

# Round-trip the content through a hex encoding.
data_enc = binascii.hexlify(data)
print(data_enc[:20])
print(type(data_enc))

# Write the decoded bytes to a temporary file and read it back with dxchange;
# the context manager closes (and deletes) the file afterwards.
with tempfile.NamedTemporaryFile(delete=True) as tf:
    tf.write(binascii.unhexlify(data_enc))
    tf.flush()  # make the bytes visible to the reader opening tf.name
    proj, flat, dark, theta = dxchange.read_aps_32id(tf.name, sino=(0, 2))

894844460d0a1a0a0000
<type 'str'>


In [15]:
# Inspect the dtype of the angles array read from the file.
theta.dtype

dtype('float64')

In [7]:
# Poll the consumer up to 100 times without blocking, reconstruct every
# message received and tally the number of messages and bytes read.
count = 0
number_total_points = 0
read_bytes = 0
for i in range(100):
    message = consumer.consume(block=False)
    if message is not None:
        # print() call form works on Python 2 and 3 (the statement form is a
        # SyntaxError on Python 3).
        print("Message %d, Bytes: %d" % (count, len(message.value)))
        reconstruct(message)
        read_bytes = read_bytes + len(message.value)
        count = count + 1

print("Total Messages: %d, Read Bytes: %d"%(count, read_bytes))

Total Messages: 0, Read Bytes: 0


# Scratch Pad

General Kafka Test

In [8]:
# pykafka expects a bytes payload; a bytes literal works on Python 2 and 3.
producer.produce(b"hello")

<pykafka.protocol.Message at 0x2b267f61db50>

In [9]:
# Fetch one message without blocking and print its payload.
message = consumer.consume(block=False)
print(message.value)

hello


In [11]:
# Show the partitions the consumer is reading from.
consumer.partitions

{0: <pykafka.partition.Partition at 0x2b267f6b4710 (id=0)>}

In [12]:
from distributed import Client
dask_distributed_client = Client('tcp://c251-136:8786')


def map_test():
    """Trivial task for smoke-testing Dask task submission; always returns 1."""
    return 1


class DaskTest():
    """Smoke test that submits trivial tasks to a Dask scheduler from a class.

    NOTE(review): ``run`` submits the bound method ``self.map_test``, so the
    instance (including its client) presumably has to be serialized for the
    workers -- confirm this works with the installed ``distributed`` version.
    """

    def __init__(self, scheduler_address='tcp://c251-136:8786'):
        """Connect to the Dask scheduler.

        Args:
            scheduler_address: address passed to ``distributed.Client``;
                defaults to the scheduler used previously.
        """
        self.dask_distributed_client = Client(scheduler_address)

    def map_test(self):
        """Trivial task body; always returns 1."""
        return 1

    def run(self, number_tasks=3):
        """Submit ``number_tasks`` copies of ``map_test`` and gather the results.

        Args:
            number_tasks: how many tasks to submit (default 3).

        Returns:
            Whatever the client's ``gather`` returns for the submitted tasks.
        """
        tasks = [
            self.dask_distributed_client.submit(self.map_test)
            for _ in range(number_tasks)
        ]
        return self.dask_distributed_client.gather(tasks)
        

In [None]:
# Instantiate the class-based test and run it; the gathered results are the
# cell output.
t = DaskTest()
t.run()

In [10]:
# Submit map_test three times through the module-level client and collect
# the results.
tasks = [dask_distributed_client.submit(map_test) for block_id in range(3)]

dask_distributed_client.gather(tasks)
#f = dask_distributed_client.submit(map_test)

[1, 1, 1]