# Howto Mini App Streaming Source (MASS)

This notebook demonstrates the usage of data source apps that can be used for the development and characterization of streaming applications.

In [1]:
# System Libraries
import sys, os
sys.path.append("..")
import pandas as pd

## logging
# Configure the root logger once, then raise the threshold to ERROR on the
# root logger and on the py4j / radical.utils loggers.
import logging
logging.basicConfig(level=logging.DEBUG)
for _logger_name in (None, "py4j", "radical.utils"):
    # getLogger(None) returns the root logger, same as getLogger()
    logging.getLogger(_logger_name).setLevel(logging.ERROR)
 
# Pilot-Streaming
import pilot.streaming



# 1. Resource Setup
## 1.1 Kafka

In [None]:
# Description of the Kafka pilot: submitted via SLURM over SSH to the
# Wrangler login node, using the "normal" queue.
# NOTE(review): walltime is presumably in minutes — confirm against the
# Pilot-Streaming documentation.
kafka_pilot_description1 = dict(
    resource="slurm+ssh://login1.wrangler.tacc.utexas.edu",
    working_directory=os.path.join('/work/01131/tg804093/wrangler/', "work"),
    number_cores=48,
    project="TG-MCB090174",
    queue="normal",
    walltime=159,
    type="kafka",
)

# Submit the pilot, then wait on it (the recorded output shows the job
# state being reported while it is pending).
kafka_pilot1 = pilot.streaming.PilotComputeService.create_pilot(kafka_pilot_description1)
kafka_pilot1.wait()

**** Job: 60291 State : Pending


In [None]:
# Fetch the details of the Kafka pilot and display them as the cell output.
# NOTE(review): section 2 reads kafka_details["master_url"], so the result
# presumably carries that key — confirm.
kafka_details = kafka_pilot1.get_details()
kafka_details

In [20]:
# Cancel the Kafka pilot.
# NOTE(review): on a top-to-bottom "Run All" this executes before section 2
# uses the Kafka cluster; presumably meant to be run last — confirm.
kafka_pilot1.cancel()

## 1.2. Dask

In [None]:
# Description of the Dask pilot: same resource, project and queue as the
# Kafka pilot, but with type "dask".
# NOTE(review): walltime is presumably in minutes — confirm against the
# Pilot-Streaming documentation.
dask_pilot_description = dict(
    resource="slurm+ssh://login1.wrangler.tacc.utexas.edu",
    working_directory=os.path.join('/work/01131/tg804093/wrangler/', "work"),
    number_cores=48,
    project="TG-MCB090174",
    queue="normal",
    walltime=159,
    type="dask",
)

# Submit the pilot, then wait on it.
dask_pilot = pilot.streaming.PilotComputeService.create_pilot(dask_pilot_description)
dask_pilot.wait()

In [None]:
# Fetch the details of the Dask pilot and display them as the cell output.
# NOTE(review): section 2 reads dask_details['master_url'], so the result
# presumably carries that key — confirm.
dask_details = dask_pilot.get_details()
dask_details

In [19]:
# Cancel the Dask pilot.
# NOTE(review): on a top-to-bottom "Run All" this executes before section 2
# uses the Dask cluster; presumably meant to be run last — confirm.
dask_pilot.cancel()

# 2. Mini App Test
## 2.1 KMeans
### 2.1.1 Run App

In [1]:
# System Libraries
import sys, os
sys.path.insert(0, "..")
import pandas as pd
import numpy as np
import ast
import mass.KafkaProducer
import pykafka

# Hard-coded endpoints of the Kafka (ZooKeeper) and Dask clusters.
# NOTE(review): presumably copied from the get_details() output of the
# pilots started in section 1 — update before re-running.
kafka_details = {'master_url': 'c251-133:2181'}
dask_details = {'master_url': 'tcp://c251-135:8786'}

# All tunable settings of the KMeans mini app, collected in one place.
# Pass dask_scheduler=None instead of the URL to run without the remote
# scheduler (the Light Source cell in this notebook does so).
kmeans_settings = dict(
    dask_scheduler=dask_details['master_url'],
    kafka_zk_hosts=kafka_details["master_url"],
    number_parallel_tasks=2,
    number_clusters=10,
    number_points_per_cluster=10000,
    number_points_per_message=1000,
    number_dim=3,
    number_produces=1,
    number_partitions=4,
    topic_name="test",
)

# NOTE(review): this cell imports mass.KafkaProducer but instantiates
# mass.kafka.MiniApp — confirm that mass.kafka is importable here.
miniapp = mass.kafka.MiniApp(**kmeans_settings)

Kafka: c251-133:2181, Dask: tcp://129.114.58.135:8786, Number Dask Nodes: 1,  Number Parallel Producers: 2


In [2]:
# Run the KMeans mini app. The recorded output below shows the topic being
# deleted and re-created, the data blocks being generated, and per-block
# timing statistics.
miniapp.run()

/home/01131/tg804093/work/kafka_2.11-1.0.0/bin/kafka-topics.sh --delete --zookeeper c251-133:2181 --topic test
/home/01131/tg804093/work/kafka_2.11-1.0.0/bin/kafka-topics.sh --create --zookeeper c251-133:2181 --replication-factor 1 --partitions 4 --topic test
/home/01131/tg804093/work/kafka_2.11-1.0.0/bin/kafka-topics.sh --describe --zookeeper c251-133:2181 --topic test
Generate Block ID: 0
Generate Block ID: 1
Waiting for Dask Tasks to complete
[{'bytes_per_message': '62303', 'transmission_time': '1.51162', 'block_id': 0, 'points_per_message': 1000, 'data_generation_time': '0.120591', 'runtime': '1.63222', 'number_messages': 50}, {'bytes_per_message': '64305', 'transmission_time': '1.23478', 'block_id': 1, 'points_per_message': 1000, 'data_generation_time': '0.135225', 'runtime': '1.37001', 'number_messages': 50}]
End Produce via Dask
Number: 0, Number Parallel Tasks: 2, Time to produce 100000 points: 6.8


### 2.1.2 Check Kafka Broker

Ensure that the expected number of messages and points was successfully written to Kafka.

In [18]:
# Connect to Kafka through ZooKeeper and open the 'test' topic that the
# KMeans mini app wrote to.
client = pykafka.KafkaClient(zookeeper_hosts=kafka_details["master_url"])
topic = client.topics['test']
# Synchronous producer; referenced by the scratch-pad cell (producer.produce).
producer = topic.get_sync_producer()
# Simple consumer; read by the message-counting cell below.
consumer = topic.get_simple_consumer()

In [19]:
# Number of brokers known to the client (displayed as the cell output).
len(client.brokers)

2

In [4]:
# Poll the consumer 100 times without blocking and tally how many messages
# and points were received. Each message value is parsed as a Python literal
# and turned into a 2-D array (rows = points, columns = dimensions).
count = 0
number_total_points = 0
number_dimensions = 0
for i in range(100):
    message = consumer.consume(block=False)
    if message is None:
        # nothing available on this poll
        continue
    data_np = np.array(ast.literal_eval(message.value))
    num_points = data_np.shape[0]
    number_dimensions = data_np.shape[1]
    count += 1
    number_total_points += num_points

print("Total Messages: %d, Total Points: %d, Number Dimensions: %d"%(count, number_total_points, number_dimensions))

Total Messages: 100, Total Points: 100000, Number Dimensions: 3


## 2.2 Light Source
### 2.2.1 Run Mini App

In [18]:
%%time
import sys, os
sys.path.insert(0, "..")
import pandas as pd
import numpy as np
import ast
import mass.KafkaProducer
import pykafka
import mass.LightSourceKafkaProducer
# Hard-coded endpoints of the Kafka (ZooKeeper) and Dask clusters.
kafka_details = {'master_url': 'c251-133:2181'}
dask_details = {'master_url': 'tcp://c251-135:8786'}

# Settings of the Light Source mini app.
# dask_scheduler=None: the recorded output reports an "inproc://" Dask
# address for this run. Pass dask_details['master_url'] instead to use the
# remote scheduler.
light_source_settings = dict(
    dask_scheduler=None,
    kafka_zk_hosts=kafka_details["master_url"],
    number_parallel_tasks=1,
    number_messages=6400,
    number_produces=1,
    number_partitions=24,
    topic_name="light_test6",
    application_type="light",
)

miniapp = mass.LightSourceKafkaProducer.MiniApp(**light_source_settings)
miniapp.run()

Kafka: c251-133:2181, Dask: inproc://129.114.58.132/87079/36, Number Dask Nodes: 1,  Number Parallel Producers: 1
/home/01131/tg804093/work/kafka_2.11-1.0.0/bin/kafka-topics.sh --delete --zookeeper c251-133:2181 --topic light_test6
/home/01131/tg804093/work/kafka_2.11-1.0.0/bin/kafka-topics.sh --create --zookeeper c251-133:2181 --replication-factor 1 --partitions 24 --topic light_test6
/home/01131/tg804093/work/kafka_2.11-1.0.0/bin/kafka-topics.sh --describe --zookeeper c251-133:2181 --topic light_test6
Waiting for Dask Tasks to complete
Access sample data: mass; File: tooth.h5; Size: 1043312
[{'bytes_per_message': '1043312', 'transmission_time': '81.74284', 'block_id': 0, 'data_generation_time': '0.041020', 'runtime': '81.78386', 'number_messages': 6400}]
End Produce via Dask
Number: 0, Number Parallel Tasks: 1, Runtime: 86.8
CPU times: user 44.8 s, sys: 12.4 s, total: 57.2 s
Wall time: 1min 29s


### 2.2.2 Check Whether Data is correct

In [7]:
# Connect to Kafka through ZooKeeper and open the 'light_test6' topic that
# the Light Source mini app wrote to. This rebinds `client`, `topic` and
# `consumer` from the KMeans check above.
client = pykafka.KafkaClient(zookeeper_hosts=kafka_details["master_url"])
topic = client.topics['light_test6']
consumer = topic.get_simple_consumer()

In [8]:
# Poll the consumer 100 times without blocking; report the size of every
# message received and the running totals.
# The per-message print uses the function-call form so the cell is valid on
# both Python 2 and Python 3 and matches the summary print below.
count = 0
number_total_points = 0
read_bytes = 0
for i in range(100):
    message = consumer.consume(block=False)
    if message is not None:
        message_bytes = len(message.value)
        print("Message %d, Bytes: %d" % (count, message_bytes))
        read_bytes += message_bytes
        count += 1

print("Total Messages: %d, Read Bytes: %d"%(count, read_bytes))

Message 0, Bytes: 1043312
Message 1, Bytes: 1043312
Message 2, Bytes: 1043312
Message 3, Bytes: 1043312
Total Messages: 4, Read Bytes: 4173248


No handlers could be found for logger "kazoo.client"


# Scratch Pad

General Kafka Test

In [8]:
# pykafka producers take a bytes payload; the b"" literal is identical to
# "hello" on Python 2 and keeps the call valid on Python 3.
producer.produce(b"hello")

<pykafka.protocol.Message at 0x2b267f61db50>

In [9]:
# Fetch one message without blocking and show its payload.
# print is written as a function call so the cell parses on Python 3 as well.
message = consumer.consume(block=False)
print(message.value)

hello


In [11]:
# Show the partitions the consumer reads from (displayed as the cell output).
consumer.partitions

{0: <pykafka.partition.Partition at 0x2b267f6b4710 (id=0)>}

In [12]:
from distributed import Client
# Module-level Dask client used by the scratch-pad submit loop further down.
# NOTE(review): this scheduler host (c251-136) differs from the one in
# dask_details (c251-135) — confirm which one is current.
dask_distributed_client = Client('tcp://c251-136:8786')

#def map_test():
#    return 1


class DaskTest():
    """Smoke test that submits a bound method to a Dask scheduler.

    The scheduler address and the number of submitted tasks are parameters
    whose defaults reproduce the previously hard-coded values.
    """

    def __init__(self, scheduler_url='tcp://c251-136:8786'):
        """Connect a Dask client to the scheduler at ``scheduler_url``."""
        self.dask_distributed_client = Client(scheduler_url)

    def map_test(self):
        """Trivial task body; always returns 1."""
        return 1

    def run(self, number_tasks=3):
        """Submit ``map_test`` ``number_tasks`` times and gather the results.

        Returns whatever the client's ``gather`` yields for the submitted
        tasks, in submission order.
        """
        tasks = [
            self.dask_distributed_client.submit(self.map_test)
            for _ in range(number_tasks)
        ]
        return self.dask_distributed_client.gather(tasks)
        

In [None]:
# Instantiate the class-based test (its constructor opens a Dask client)
# and run it; the result of run() is the cell output.
t = DaskTest()
t.run()

In [10]:
def map_test():
    """Trivial task body used to smoke-test the Dask connection; returns 1."""
    return 1


# map_test is defined in this cell because its only other definition in the
# notebook is commented out; without it the submit call raises NameError on
# a fresh kernel.
# Submit the function three times through the module-level client and
# gather the results (recorded output: [1, 1, 1]).
tasks = [dask_distributed_client.submit(map_test) for block_id in range(3)]

dask_distributed_client.gather(tasks)

[1, 1, 1]