# MASS: Howto Mini App Streaming Source (MASS)

This notebooks demonstrates the usage data source apps that can be used for the development and characterizing of streaming application.

In [1]:
%%capture
# System Libraries
import sys, os
sys.path.append("..")
import pandas as pd

## logging
import logging
logging.basicConfig(level=logging.DEBUG)
logging.getLogger().setLevel(logging.ERROR)
logging.getLogger("py4j").setLevel(logging.ERROR)
logging.getLogger("radical.utils").setLevel(logging.ERROR)
 
# Pilot-Streaming
import pilot.streaming

# 1. Resource Setup
## 1.1 Kafka

In [2]:
kafka_pilot_description1 = {
    "resource":"slurm+ssh://login1.wrangler.tacc.utexas.edu",
    "working_directory": os.path.join('/work/01131/tg804093/wrangler/', "work"),
    "number_cores": 48,
    "project": "TG-MCB090174",
    "queue": "normal",
    "walltime": 159,
    "type":"kafka"
}
kafka_pilot1 = pilot.streaming.PilotComputeService.create_pilot(kafka_pilot_description1)
kafka_pilot1.wait()

**** Job: 60576 State : Pending
look for configs in: /work/01131/tg804093/wrangler/work/kafka-48551a46-fbfe-11e7-b2d9-b083fed043f0/config
['broker-0']
Kafka Config: /work/01131/tg804093/wrangler/work/kafka-48551a46-fbfe-11e7-b2d9-b083fed043f0/config (Wed Jan 17 21:19:22 2018)
{'zookeeper.connection.timeout.ms': '6000', 'broker.id': '0', 'listeners': 'PLAINTEXT://c251-132:9092', 'zookeeper.connect': 'c251-132:2181'}


In [3]:
kafka_details = kafka_pilot1.get_details()
kafka_details

look for configs in: /work/01131/tg804093/wrangler/work/kafka-48551a46-fbfe-11e7-b2d9-b083fed043f0/config
['broker-0']
Kafka Config: /work/01131/tg804093/wrangler/work/kafka-48551a46-fbfe-11e7-b2d9-b083fed043f0/config (Wed Jan 17 21:19:22 2018)
{'zookeeper.connection.timeout.ms': '6000', 'broker.id': '0', 'listeners': 'PLAINTEXT://c251-132:9092', 'zookeeper.connect': 'c251-132:2181'}


{'details': {'broker.id': '0',
  'listeners': 'PLAINTEXT://c251-132:9092',
  'zookeeper.connect': 'c251-132:2181',
  'zookeeper.connection.timeout.ms': '6000'},
 'master_url': 'c251-132:2181'}

In [7]:
kafka_pilot1.cancel()

## 1.2. Dask

In [4]:
dask_pilot_description = {
    "resource":"slurm+ssh://login1.wrangler.tacc.utexas.edu",
    "working_directory": os.path.join('/work/01131/tg804093/wrangler/', "work"),
    "number_cores": 48,
    "project": "TG-MCB090174",
    "queue": "normal",
    "walltime": 159,
    "type":"dask"
}
dask_pilot = pilot.streaming.PilotComputeService.create_pilot(dask_pilot_description)
dask_pilot.wait()

**** Job: 60577 State : Pending
init distributed client
init distributed client


In [5]:
dask_details = dask_pilot.get_details()
dask_details

{'master_url': 'tcp://c251-135:8786', 'web_ui_url': 'http://c251-135:8787'}

In [8]:
dask_pilot.cancel()

# 2. Mini App Test
## 2.1 KMeans
### 2.1.1 Run App

In [30]:
# System Libraries
import sys, os
sys.path.insert(0, "..")
import pandas as pd
import numpy as np
import ast
import mass.kafka
import pykafka

kafka_details= {'master_url': 'c251-135:2181'}
dask_details = {'master_url': 'tcp://c251-136:8786'} 

miniapp=mass.kafka.MiniApp(
                            #dask_scheduler=dask_details['master_url'],
                            dask_scheduler=None,
                            kafka_zk_hosts=kafka_details["master_url"],
                            number_parallel_tasks=2,
                            number_clusters=10,
                            number_points_per_cluster=10000,
                            number_points_per_message=1000,
                            number_messages=1,
                            number_dim=3,
                            number_produces=1,
                            number_partitions=4,
                            topic_name="test",
                            application_type="kmeans"
                           )

Kafka: c251-135:2181, Dask: inproc://129.114.58.137/72918/1, Number Dask Nodes: 1,  Number Parallel Producers: 2


In [31]:
miniapp.run()

/home/01131/tg804093/work/kafka_2.11-1.0.0/bin/kafka-topics.sh --delete --zookeeper c251-135:2181 --topic test
/home/01131/tg804093/work/kafka_2.11-1.0.0/bin/kafka-topics.sh --create --zookeeper c251-135:2181 --replication-factor 1 --partitions 4 --topic test
/home/01131/tg804093/work/kafka_2.11-1.0.0/bin/kafka-topics.sh --describe --zookeeper c251-135:2181 --topic test
Waiting for Dask Tasks to complete
Access sample data: mass; File: tooth.h5; Size: 1043312
Access sample data: mass; File: tooth.h5; Size: 1043312
[{'bytes_per_message': '1043312', 'transmission_time': '0.00000', 'block_id': 0, 'data_generation_time': '0.052927', 'runtime': '0.05293', 'number_messages': 0}, {'bytes_per_message': '1043312', 'transmission_time': '0.00000', 'block_id': 1, 'data_generation_time': '0.054766', 'runtime': '0.05477', 'number_messages': 0}]
End Produce via Dask
Number: 0, Number Parallel Tasks: 2, Runtime: 5.1


### 2.1.2 Check Kafka Broker

Ensure that the correct amount of data was successfully written to Kafka

In [32]:
client = pykafka.KafkaClient(zookeeper_hosts=kafka_details["master_url"])
topic = client.topics['test']
producer = topic.get_sync_producer()
consumer = topic.get_simple_consumer()

In [33]:
len(client.brokers)

1

In [10]:
count = 0
number_total_points = 0
number_dimensions = 0
for i in range(100):
    message = consumer.consume(block=False)
    if message is not None:
        data_np = np.array(ast.literal_eval(message.value))
        num_points = data_np.shape[0]
        number_dimensions = data_np.shape[1]
        count =  count + 1
        number_total_points = number_total_points + num_points
    #print "Consumed message: %d, Number Points: %d, Number Dimensions: %d"%\
    #        (count, num_points, number_dimensions)   
        
print("Total Messages: %d, Total Points: %d, Number Dimensions: %d"%(count, number_total_points, number_dimensions))

Total Messages: 100, Total Points: 100000, Number Dimensions: 3


## 2.2 Light Source
### 2.2.1 Run Mini App

In [1]:
%%time
import sys, os
sys.path.insert(0, "..")
import pandas as pd
import numpy as np
import ast
import mass.kafka
import pykafka
kafka_details= {'master_url': 'c251-132:2181'}
dask_details = {'master_url': 'tcp://c251-135:8786'} 

miniapp=mass.kafka.MiniApp(
                                   #dask_scheduler=dask_details['master_url'],
                                   dask_scheduler=None,
                                   kafka_zk_hosts=kafka_details["master_url"],
                                   number_parallel_tasks=1,
                                   number_messages=10,
                                   number_produces=1,
                                   number_partitions=24,
                                   topic_name="light_test12",
                                   application_type = "light"
                                  )
miniapp.run()

Kafka: c251-132:2181, Dask: inproc://129.114.58.137/98200/1, Number Dask Nodes: 1,  Number Parallel Producers: 1
/home/01131/tg804093/work/kafka_2.11-1.0.0/bin/kafka-topics.sh --delete --zookeeper c251-132:2181 --topic light_test12
/home/01131/tg804093/work/kafka_2.11-1.0.0/bin/kafka-topics.sh --create --zookeeper c251-132:2181 --replication-factor 1 --partitions 24 --topic light_test12
/home/01131/tg804093/work/kafka_2.11-1.0.0/bin/kafka-topics.sh --describe --zookeeper c251-132:2181 --topic light_test12
Waiting for Dask Tasks to complete
Access sample data: mass; File: tooth.h5; Size: 1043312
Base64 Len: 1391084
[{'transmission_time': '0.67697', 'block_id': 0, 'bytes_per_message_bin': '1043312', 'bytes_per_message_b64': '1391084', 'data_generation_time': '0.073477', 'runtime': '0.75045', 'number_messages': 10}]
End Produce via Dask
Number: 0, Number Parallel Tasks: 1, Runtime: 5.8
CPU times: user 1.38 s, sys: 420 ms, total: 1.8 s
Wall time: 9.7 s


### 2.2.2 Check if Light Source Data is correct

In [14]:
%matplotlib inline
import os, sys, time
import tomopy
import pandas as pd
import dxchange
import tempfile
import pykafka

In [15]:
def reconstruct(message):
    start = 0
    end = 2
    msg_bin = base64.b64decode(message.value)
    print len(msg_bin)
    
    tf = tempfile.NamedTemporaryFile(delete=True)
    tf.write()
    tf.flush()
    proj, flat, dark, theta = dxchange.read_aps_32id(tf.name, sino=(start, end))
    theta = tomopy.angles(proj.shape[0])
    proj = tomopy.normalize(proj, flat, dark)
    rot_center = tomopy.find_center(proj, theta, init=290, ind=0, tol=0.5)
    proj = tomopy.minus_log(proj)
    recon = tomopy.recon(proj, theta, center=rot_center, algorithm='gridrec')
    recon = tomopy.circ_mask(recon, axis=0, ratio=0.95)
    tf.close()

In [2]:
client = pykafka.KafkaClient(zookeeper_hosts=kafka_details["master_url"])
topic = client.topics['light_test12']
consumer = topic.get_simple_consumer()

In [3]:
message = consumer.consume(block=False)

MessageSetDecodeFailure: 1391147

In [10]:
count = 0
number_total_points = 0
read_bytes = 0
for i in range(100):
    message = consumer.consume(block=False)
    if message is not None:
        print "Message %d, Bytes: %d"%(count, len(message.value))
        reconstruct(message)
        read_bytes = read_bytes + len(message.value)
        count =  count + 1
   
        
print("Total Messages: %d, Read Bytes: %d"%(count, read_bytes))

MessageSetDecodeFailure: 1391147

# Scratch Pad

General Kafka Test

In [8]:
producer.produce("hello")

<pykafka.protocol.Message at 0x2b267f61db50>

In [9]:
message = consumer.consume(block=False)
print message.value

hello


In [11]:
consumer.partitions

{0: <pykafka.partition.Partition at 0x2b267f6b4710 (id=0)>}

In [12]:
from distributed import Client
dask_distributed_client = Client('tcp://c251-136:8786')

#def map_test():
#    return 1


class DaskTest():
    
    def __init__(self):
        self.dask_distributed_client = Client('tcp://c251-136:8786')


    def map_test(self):
        return 1
    
    def run(self):
        tasks = []
        for block_id in range(3):
            tasks.append(self.dask_distributed_client.submit(self.map_test))
           
        return self.dask_distributed_client.gather(tasks)
        

In [None]:
t = DaskTest()
t.run()

In [10]:
tasks = []
for block_id in range(3):
    tasks.append(dask_distributed_client.submit(map_test))
           
dask_distributed_client.gather(tasks)
#f = dask_distributed_client.submit(map_test)

[1, 1, 1]