From d27ab74e5cc05c78441b374de479a60c77dc9d4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Thu, 4 Oct 2012 12:02:38 -0300 Subject: [PATCH 01/37] First commit --- README.markdown | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 README.markdown diff --git a/README.markdown b/README.markdown new file mode 100644 index 0000000..e69de29 From f72a4eb48c494dbd76dbef84546b31f3072d2d87 Mon Sep 17 00:00:00 2001 From: Felipe cruz Date: Thu, 4 Oct 2012 12:20:03 -0300 Subject: [PATCH 02/37] Add Initial description and examples --- README.markdown | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/README.markdown b/README.markdown index e69de29..4b79c3e 100644 --- a/README.markdown +++ b/README.markdown @@ -0,0 +1,29 @@ +pypelinin +========= + +Python library to distribute jobs and pipelines among a cluster. + +Usage +===== + +Pypelinin will provide a high level python dsl to describe your workflow. + +Example 1: + +```python +pipeline = Worker('do a task') | [Worker('parallel task 1'), + Worker('parallel task 2')] | Worker('finalizer') +``` + +Example 2: + +```python +pipeline = Worker('do a task') | [Worker('parallel task 1') | Worker('after 1'), + Worker('parallel task 2')] | Worker('finalizer') +``` + +After defined, you just have to start your pipeline. + +``` +Pipeliner.start(pipeline) +``` From 43d3ce234a1b10ea15bf0cf83b97c58829a8b7ca Mon Sep 17 00:00:00 2001 From: Felipe cruz Date: Thu, 4 Oct 2012 23:40:48 -0300 Subject: [PATCH 03/37] add first pipeline api implementation --- pypelinin/__init__.py | 0 pypelinin/worker.py | 65 +++++++++++++++++++++++++++++++++++++++++++ tests/test_worker.py | 48 ++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+) create mode 100644 pypelinin/__init__.py create mode 100644 pypelinin/worker.py create mode 100644 tests/test_worker.py diff --git a/pypelinin/__init__.py b/pypelinin/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pypelinin/worker.py b/pypelinin/worker.py new file mode 100644 index 0000000..0942263 --- /dev/null +++ b/pypelinin/worker.py @@ -0,0 +1,65 @@ +import json +import cPickle as pickle + +def todict(obj, classkey=None): + if isinstance(obj, dict): + for k in obj.keys(): + obj[k] = todict(obj[k], classkey) + return obj + elif hasattr(obj, "__iter__"): + return [todict(v, classkey) for v in obj] + elif hasattr(obj, "__dict__"): + data = dict([(key, todict(value, classkey)) + for key, value in obj.__dict__.iteritems() + if not callable(value) and not key.startswith('_')]) + if classkey is not None and hasattr(obj, "__class__"): + data[classkey] = obj.__class__.__name__ + return data + else: + return obj + +class Worker(object): + def __init__(self, worker_name): + self.name = worker_name + self.after = [] + + def then(self, *after): + self.after.extend(list(after)) + return self + + def __or__(self, after): + self.then(*[after]) + return self + + def __eq__(self, other): + return self.name == other.name + + def __repr__(self): + return "Worker({name})".format(**self.__dict__) + + def serialize(self): + if not self.after: + return "worker: {name}".format(name=self.name) + else: + data = "main: worker: {name}".format(name=self.name) + for node in self.after: + data += " " + node.serialize() + return data + + @staticmethod + def from_json(value): + temp_after = [] + data = json.loads(value) + + if isinstance(data, list): + for node in data: + temp_after.append(Worker.from_json(json.dumps(node))) + return temp_after + + 
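+        # a dict describes a single worker node: rebuild it and recursively
+        # deserialize everything chained in its 'after' list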
worker = Worker(data['name']) + worker.after = data['after'] + for node in worker.after: + temp_after.append(Worker.from_json(json.dumps(node))) + + worker.after = temp_after + return worker diff --git a/tests/test_worker.py b/tests/test_worker.py new file mode 100644 index 0000000..2600e53 --- /dev/null +++ b/tests/test_worker.py @@ -0,0 +1,48 @@ +import unittest + +import json +from pypelinin.worker import Worker, todict + +class WorkerTest(unittest.TestCase): + def test_pipeline_init(self): + pipeline = Worker('worker_id') + + self.assertEquals(pipeline.name, 'worker_id') + self.assertEquals(pipeline.after, []) + self.assertEquals(pipeline.serialize(), "worker: worker_id") + + def test_pipeline_worker_pipe_pipeline(self): + pipeline = Worker('w1') | Worker('w2') + + self.assertEquals(pipeline.name, "w1") + self.assertEquals(pipeline.after, [Worker('w2')]) + + def test_pipeline_worker_pipe_parallel_pipelines_pipe_worker(self): + pipeline = Worker('V1') | [Worker('V2'), Worker('V3')] | Worker('V4') + self.assertEquals(pipeline.after, + [[Worker('V2'), Worker('V3')], Worker('V4')]) + + def test_pipeline_worker_pipe_nested_pipe_in_parallel_pipe_worker(self): + pipeline = Worker('V1') | [ Worker('V2') | Worker('A2'), + Worker('V3') + ] | Worker('V4') + + self.assertEquals(pipeline.after, + [[Worker('V2') | Worker('A2'), + Worker('V3')], Worker('V4')]) + + def test_complex_pipeline_to_json_and_from_json(self): + pipeline = Worker('V1') | [ Worker('V2') | Worker('A2'), + Worker('V3') + ] | Worker('V4') + + jdata = json.dumps(todict(pipeline), indent=4) + pipeline_from_json = Worker.from_json(jdata) + + self.assertEquals(pipeline, pipeline_from_json) + self.assertEquals(pipeline_from_json.after, + [[Worker('V2') | Worker('A2'), + Worker('V3')], Worker('V4')]) + self.assertEquals(json.dumps(todict(pipeline)), + json.dumps(todict(pipeline_from_json))) + From 53a4014590f2d5fea812477b518e496183c932ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Sat, 6 Oct 2012 14:31:41 -0300 Subject: [PATCH 04/37] Add .gitignore --- .gitignore | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c7f3e6a --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +.*.sw? 
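+# byte-compiled files, packaging/build output, editor and IDE leftovers,
+# coverage data and local settings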
+*.pyc +*~ +.idea/* +dist/* +build/* +pypln.egg-info/* +.idea/* +.coverage +reg_settings.py +MANIFEST +.directory +*.db +.env From 8da71def717227a804b35f8b47b50038fe40ffd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Sat, 6 Oct 2012 15:35:54 -0300 Subject: [PATCH 05/37] Add Manager (and its tests), setup.py and Makefile --- Makefile | 15 ++++ example/my_manager.py | 24 +++++ pypelinin/__init__.py | 7 ++ pypelinin/manager.py | 135 ++++++++++++++++++++++++++++ pypelinin/setup.py | 25 ++++++ requirements/development.txt | 4 + requirements/production.txt | 2 + tests/test_manager.py | 166 +++++++++++++++++++++++++++++++++++ tests/utils.py | 3 + 9 files changed, 381 insertions(+) create mode 100644 Makefile create mode 100644 example/my_manager.py create mode 100644 pypelinin/__init__.py create mode 100755 pypelinin/manager.py create mode 100644 pypelinin/setup.py create mode 100644 requirements/development.txt create mode 100644 requirements/production.txt create mode 100644 tests/test_manager.py create mode 100644 tests/utils.py diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..6e0d986 --- /dev/null +++ b/Makefile @@ -0,0 +1,15 @@ +TEST_RUNNER=nosetests -dsv --with-yanc + +bootstrap-environment: + pip install -r requirements/development.txt + +bootstrap-tests: + python pypelinin/setup.py install + +test: bootstrap-tests + ${TEST_RUNNER} tests/ + +test-manager: bootstrap-tests + ${TEST_RUNNER} tests/test_manager.py + +.PHONY: bootstrap-environment bootstrap-tests test test-manager diff --git a/example/my_manager.py b/example/my_manager.py new file mode 100644 index 0000000..f17ea79 --- /dev/null +++ b/example/my_manager.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python2 +# coding: utf-8 + +from sys import stdout +from logging import Logger, StreamHandler, Formatter +from pypelinin import Manager + + +def main(): + logger = Logger('Manager') + handler = StreamHandler(stdout) + formatter = Formatter('%(asctime)s - %(name)s - %(levelname)s - ' + '%(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + api_host_port = ('*', 5555) + broadcast_host_port = ('*', 5556) + default_config = {'db': {'data': 'test'}, 'monitoring interval': 60, } + manager = Manager(api_host_port, broadcast_host_port, default_config, + logger) + manager.start() + +if __name__ == '__main__': + main() diff --git a/pypelinin/__init__.py b/pypelinin/__init__.py new file mode 100644 index 0000000..124f483 --- /dev/null +++ b/pypelinin/__init__.py @@ -0,0 +1,7 @@ +# coding: utf-8 + +from .manager import Manager +from .client import Client +from .broker import Broker +#from .pipeline import PipelineManager, Worker +#from .pipeliner import Pipeliner diff --git a/pypelinin/manager.py b/pypelinin/manager.py new file mode 100755 index 0000000..9a412c2 --- /dev/null +++ b/pypelinin/manager.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +# coding: utf-8 + +import uuid +from Queue import Queue +from logging import Logger, NullHandler +import zmq + + +class Manager(object): + #TODO: add another queue for processing jobs + #TODO: add a timeout for processing jobs (default or get it from client) + #TODO: if processing job have timeout, remove from processing queue, add + # again in job_queue and announce pending job + #TODO: validate all received data (types, keys etc.) 
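+    # The Manager serves a REQ/REP API socket ('add job', 'get job',
+    # 'job finished', 'add pipeline', ...) and announces events such as
+    # 'new job' on a PUB broadcast socket; see run() for the full command set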
+ #TODO: handle 'job failed' messages + def __init__(self, api_host_port, broadcast_host_port, config, logger=None, + logger_name='Manager'): + self.job_queue = Queue() + self.pipeline_queue = Queue() + #TODO: should persist jobs and recover in case of failure + self.pending_job_ids = [] + self.pending_pipeline_ids = [] + self.context = zmq.Context() + self.config = config + if logger is None: + self.logger = Logger(logger_name) + self.logger.addHandler(NullHandler()) + else: + self.logger = logger + self.api_host_port = api_host_port + self.broadcast_host_port = broadcast_host_port + + def bind(self): + self.api = self.context.socket(zmq.REP) + self.broadcast = self.context.socket(zmq.PUB) + self.api.linger = 0 + self.broadcast.linger = 0 + self.api.bind('tcp://{}:{}'.format(*self.api_host_port)) + self.broadcast.bind('tcp://{}:{}'.format(*self.broadcast_host_port)) + + def close_sockets(self): + self.api.close() + self.broadcast.close() + + def get_request(self): + message = self.api.recv_json() + self.logger.info('[API] Request: {}'.format(message)) + return message + + def reply(self, message): + self.api.send_json(message) + self.logger.info('[API] Reply: {}'.format(message)) + + def start(self): + try: + self.bind() + self.run() + except KeyboardInterrupt: + self.logger.info('Got SIGNINT (KeyboardInterrupt), exiting.') + self.close_sockets() + + def run(self): + self.logger.info('Entering main loop') + while True: + message = self.get_request() + if 'command' not in message: + self.reply({'answer': 'undefined command'}) + continue + command = message['command'] + if command == 'get configuration': + self.reply(self.config) + elif command == 'add job': + message['job id'] = uuid.uuid4().hex + del message['command'] + self.job_queue.put(message) + self.pending_job_ids.append(message['job id']) + self.reply({'answer': 'job accepted', + 'job id': message['job id']}) + self.broadcast.send('new job') + self.logger.info('[Broadcast] Sent "new job"') + elif command == 'get job': + if self.job_queue.empty(): + self.reply({'worker': None}) + else: + job = self.job_queue.get() + self.reply(job) + elif command == 'job finished': + if 'job id' not in message or 'duration' not in message: + self.reply({'answer': 'syntax error'}) + else: + job_id = message['job id'] + if job_id not in self.pending_job_ids: + self.reply({'answer': 'unknown job id'}) + else: + self.pending_job_ids.remove(job_id) + self.reply({'answer': 'good job!'}) + new_message = 'job finished: {} duration: {}'\ + .format(job_id, message['duration']) + self.broadcast.send(new_message) + self.logger.info('[Broadcast] Sent: {}'\ + .format(new_message)) + elif command == 'add pipeline': + pipeline_id = uuid.uuid4().hex + message['pipeline id'] = pipeline_id + del message['command'] + self.pipeline_queue.put(message) + self.pending_pipeline_ids.append(pipeline_id) + self.reply({'answer': 'pipeline accepted', + 'pipeline id': pipeline_id}) + self.broadcast.send('new pipeline') + self.logger.info('[Broadcast] Sent "new pipeline"') + elif command == 'get pipeline': + if self.pipeline_queue.empty(): + self.reply({'pipeline': None}) + else: + pipeline = self.pipeline_queue.get() + self.reply(pipeline) + elif command == 'pipeline finished': + if 'pipeline id' not in message: + self.reply({'answer': 'syntax error'}) + else: + pipeline_id = message['pipeline id'] + if pipeline_id not in self.pending_pipeline_ids: + self.reply({'answer': 'unknown pipeline id'}) + else: + self.pending_pipeline_ids.remove(pipeline_id) + self.reply({'answer': 
'good job!'}) + new_message = 'pipeline finished: {}'\ + .format(pipeline_id) + self.broadcast.send(new_message) + self.logger.info('[Broadcast] Sent: {}'\ + .format(new_message)) + else: + self.reply({'answer': 'unknown command'}) diff --git a/pypelinin/setup.py b/pypelinin/setup.py new file mode 100644 index 0000000..7416ac8 --- /dev/null +++ b/pypelinin/setup.py @@ -0,0 +1,25 @@ +# coding: utf-8 + +from distutils.core import setup + + +setup(name='pypelinin', + version='0.1.0-dev', + author=u'Álvaro Justen', + author_email='alvarojusten@gmail.com', + url='https://github.com/turicas/pypelinin/', + description='Easily distribute and process jobs and pipelines among a cluster', + packages=['pypelinin'], + install_requires=['pyzmq', 'psutil'], + license='GPL3', + keywords=['jobs', 'tasks', 'distributed', 'pipelines', 'cluster'], + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: GNU General Public License (GPL)', + 'Natural Language :: English', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 2.7', + 'Topic :: Software Development :: Libraries :: Python Modules', + ], +) diff --git a/requirements/development.txt b/requirements/development.txt new file mode 100644 index 0000000..dfe412d --- /dev/null +++ b/requirements/development.txt @@ -0,0 +1,4 @@ +--requirement=production.txt + +nose +yanc diff --git a/requirements/production.txt b/requirements/production.txt new file mode 100644 index 0000000..d4da163 --- /dev/null +++ b/requirements/production.txt @@ -0,0 +1,2 @@ +pyzmq +psutil diff --git a/tests/test_manager.py b/tests/test_manager.py new file mode 100644 index 0000000..9c6cef1 --- /dev/null +++ b/tests/test_manager.py @@ -0,0 +1,166 @@ +# coding: utf-8 + +import unittest +from signal import SIGINT, SIGKILL +from time import sleep +from subprocess import Popen, PIPE +from .utils import default_config +import shlex +import zmq + + +time_to_wait = 150 + +class TestManager(unittest.TestCase): + def setUp(self): + self.context = zmq.Context() + self.start_manager_process() + self.api = self.context.socket(zmq.REQ) + self.api.connect('tcp://localhost:5555') + self.broadcast = self.context.socket(zmq.SUB) + self.broadcast.connect('tcp://localhost:5556') + self.broadcast.setsockopt(zmq.SUBSCRIBE, 'new job') + + def tearDown(self): + self.end_manager_process() + self.close_sockets() + self.context.term() + + def start_manager_process(self): + self.manager = Popen(shlex.split('python ./example/my_manager.py'), + stdin=PIPE, stdout=PIPE, stderr=PIPE) + for line in self.manager.stdout.readline(): + if 'main loop' in line: + break + + def end_manager_process(self): + self.manager.send_signal(SIGINT) + sleep(time_to_wait / 1000.0) + self.manager.send_signal(SIGKILL) + self.manager.wait() + + def close_sockets(self): + self.api.close() + self.broadcast.close() + + def test_connect_to_manager_api_zmq_socket_and_execute_undefined_command(self): + self.api.send_json({'spam': 'eggs'}) + if not self.api.poll(time_to_wait): + self.fail("Didn't receive 'undefined command' from manager") + message = self.api.recv_json() + self.assertEqual(message, {'answer': 'undefined command'}) + + def test_should_connect_to_manager_api_zmq_socket(self): + self.api.send_json({'command': 'hello'}) + if not self.api.poll(time_to_wait): + self.fail("Didn't receive 'unknown command' from manager") + message = self.api.recv_json() + self.assertEqual(message, {'answer': 'unknown command'}) + + def 
test_should_receive_new_job_from_broadcast_when_a_job_is_submitted(self): + self.api.send_json({'command': 'add job', 'worker': 'x', + 'document': 'y'}) + if not self.api.poll(time_to_wait): + self.fail("Didn't receive 'add job' reply") + self.api.recv_json() + if not self.broadcast.poll(time_to_wait): + self.fail("Didn't receive 'new job' from broadcast") + message = self.broadcast.recv() + self.assertEqual(message, 'new job') + + def test_command_get_configuration_should_return_dict_passed_on_setUp(self): + self.api.send_json({'command': 'get configuration'}) + if not self.api.poll(time_to_wait): + self.fail("Didn't receive configuration from manager") + message = self.api.recv_json() + self.assertEqual(message, default_config) + + def test_command_add_job_should_return_a_job_id(self): + cmd = {'command': 'add job', 'worker': 'test', 'document': 'eggs'} + self.api.send_json(cmd) + if not self.api.poll(time_to_wait): + self.fail("Didn't receive 'job accepted' from manager") + message = self.api.recv_json() + self.assertEqual(message['answer'], 'job accepted') + self.assertIn('job id', message) + self.assertEqual(len(message['job id']), 32) + + def test_command_get_job_should_return_empty_if_no_job(self): + self.api.send_json({'command': 'get job'}) + if not self.api.poll(time_to_wait): + self.fail("Didn't receive job (None) from manager") + message = self.api.recv_json() + self.assertEqual(message['worker'], None) + + def test_command_get_job_should_return_a_job_after_adding_one(self): + self.api.send_json({'command': 'add job', 'worker': 'spam', + 'document': 'eggs'}) + if not self.api.poll(time_to_wait): + self.fail("Didn't receive 'add job' reply") + job = self.api.recv_json() + self.api.send_json({'command': 'get job'}) + if not self.api.poll(time_to_wait): + self.fail("Didn't receive job from manager") + message = self.api.recv_json() + self.assertEqual(message['worker'], 'spam') + self.assertEqual(message['document'], 'eggs') + self.assertIn('job id', message) + self.assertEqual(len(message['job id']), 32) + + def test_finished_job_without_job_id_should_return_error(self): + self.api.send_json({'command': 'job finished'}) + if not self.api.poll(time_to_wait): + self.fail("Didn't receive 'syntax error' from manager") + message = self.api.recv_json() + self.assertEqual(message['answer'], 'syntax error') + + def test_finished_job_with_unknown_job_id_should_return_error(self): + self.api.send_json({'command': 'job finished', 'job id': 'python rlz', + 'duration': 0.1}) + if not self.api.poll(time_to_wait): + self.fail("Didn't receive 'unknown job id' from manager") + message = self.api.recv_json() + self.assertEqual(message['answer'], 'unknown job id') + + def test_finished_job_with_correct_job_id_should_return_good_job(self): + self.api.send_json({'command': 'add job', 'worker': 'a', + 'document': 'b'}) + if not self.api.poll(time_to_wait): + self.fail("Didn't receive 'add job' reply") + message = self.api.recv_json() + self.api.send_json({'command': 'job finished', + 'job id': message['job id'], + 'duration': 0.1}) + if not self.api.poll(time_to_wait): + self.fail("Didn't receive 'good job!' from manager. 
" + "#foreveralone :-(") + message = self.api.recv_json() + self.assertEqual(message['answer'], 'good job!') + + def test_should_receive_job_finished_message_with_job_id_and_duration_when_a_job_finishes(self): + self.api.send_json({'command': 'add job', 'worker': 'x', + 'document': 'y'}) + if not self.api.poll(time_to_wait): + self.fail("Didn't receive 'add job' reply") + self.api.recv_json() + if not self.broadcast.poll(time_to_wait): + self.fail("Didn't receive 'new job' from broadcast") + message = self.broadcast.recv() + self.assertEqual(message, 'new job') + self.api.send_json({'command': 'get job'}) + if not self.api.poll(time_to_wait): + self.fail("Didn't receive 'get job' reply") + job = self.api.recv_json() + self.broadcast.setsockopt(zmq.SUBSCRIBE, + 'job finished: {}'.format(job['job id'])) + del job['worker'] + job['command'] = 'job finished' + job['duration'] = 0.1 + self.api.send_json(job) + if not self.broadcast.poll(time_to_wait): + self.fail("Didn't receive 'new job' from broadcast") + message = self.broadcast.recv() + expected = 'job finished: {} duration: 0.1'.format(job['job id']) + self.assertEqual(message, expected) + + #TODO: create tests for pipelines diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..a6027c2 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,3 @@ +# coding: utf-8 + +default_config = {'db': {'data': 'test'}, 'monitoring interval': 60} From 677ebfbeb60ba0762b1123fcf799a0c6e3e9b20e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Sun, 7 Oct 2012 11:24:05 -0300 Subject: [PATCH 06/37] Add Client and its tests --- Makefile | 3 + pypelinin/__init__.py | 2 +- pypelinin/client.py | 140 ++++++++++++++++++++++ pypelinin/manager.py | 1 + requirements/development.txt | 1 + tests/test_client.py | 220 +++++++++++++++++++++++++++++++++++ 6 files changed, 366 insertions(+), 1 deletion(-) create mode 100755 pypelinin/client.py create mode 100644 tests/test_client.py diff --git a/Makefile b/Makefile index 6e0d986..bc48857 100644 --- a/Makefile +++ b/Makefile @@ -12,4 +12,7 @@ test: bootstrap-tests test-manager: bootstrap-tests ${TEST_RUNNER} tests/test_manager.py +test-client: bootstrap-tests + ${TEST_RUNNER} --with-coverage --cover-package=pypelinin.client tests/test_client.py + .PHONY: bootstrap-environment bootstrap-tests test test-manager diff --git a/pypelinin/__init__.py b/pypelinin/__init__.py index 124f483..d4eaffb 100644 --- a/pypelinin/__init__.py +++ b/pypelinin/__init__.py @@ -2,6 +2,6 @@ from .manager import Manager from .client import Client -from .broker import Broker +#from .broker import Broker #from .pipeline import PipelineManager, Worker #from .pipeliner import Pipeliner diff --git a/pypelinin/client.py b/pypelinin/client.py new file mode 100755 index 0000000..1b8add1 --- /dev/null +++ b/pypelinin/client.py @@ -0,0 +1,140 @@ +# coding: utf-8 + +from zmq import Context, REQ, SUB, SUBSCRIBE, UNSUBSCRIBE + + +class Client(object): + '''Base class to communicate with pypelinin's Manager + + Probably you don't want to use this class by hand since it does not + implement Manager's protocol. Use one of the class that subclass `Client`, + as `pypelinin.Broker` and `pypelinin.Pipeliner`. + ''' + #TODO: validate all received data (types, keys etc.) + #TODO: use some kind of encryption? 
+ + def __init__(self): + self.context = Context() + self.api_address = None + self.broadcast_address = None + self._manager_api = None + self._manager_broadcast = None + + def __del__(self): + self.disconnect_api(silent=True) + self.disconnect_broadcast(silent=True) + + def connect(self, api=None, broadcast=None): + '''Connect to Manager's API and/or broadcast channel(s) + + API and broadcast addresses should be specified in this form: + `tcp://ip-address-or-host:port`, like in `tcp://127.0.0.1:5555`. + ''' + if api is broadcast is None: + raise ValueError("At least one of the Manager's communication " + "channels (broadcast or API) need to be specified") + else: + if api is not None: + self.api_address = api + self._manager_api = self.context.socket(REQ) + self._manager_api.connect(api) + self._manager_api.linger = 0 + if broadcast is not None: + self.broadcast_address = broadcast + self._manager_broadcast = self.context.socket(SUB) + self._manager_broadcast.connect(broadcast) + self._manager_broadcast.linger = 0 + + def send_api_request(self, data): + '''Send an API request to Manager + + `data` needs to be a pickleable `dict`. + ''' + if self._manager_api is None: + raise RuntimeError("Not connected to Manager's API channel") + else: + return self._manager_api.send_json(data) + + def get_api_reply(self): + '''Receive an API reply from Manager + + It'll hang if you didn't send a request (using `send_api_request`). + The return data is a `dict`. + ''' + if self._manager_api is None: + raise RuntimeError("Not connected to Manager's API channel") + else: + return self._manager_api.recv_json() + + def api_poll(self, timeout=0): + '''Poll API channel until `timeout` (in milliseconds) + + Return `True`/`False` if there is any to be received (or not). If it + returns `True` so you can use `get_api_reply` and it won't hang. + ''' + if self._manager_api is None: + raise RuntimeError("Not connected to Manager's API channel") + else: + return self._manager_api.poll(timeout) + + def broadcast_subscribe(self, subscribe_to): + '''Subscribe to a Manager's broadcast type + + `subscribe_to` needs to be a string. 
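+
+        An empty string subscribes to every broadcast message (standard zmq
+        prefix matching).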
+ ''' + if self._manager_broadcast is None: + raise RuntimeError("Not connected to Manager's broadcast channel") + else: + return self._manager_broadcast.setsockopt(SUBSCRIBE, subscribe_to) + + def broadcast_unsubscribe(self, unsubscribe_to): + if self._manager_broadcast is None: + raise RuntimeError("Not connected to Manager's broadcast channel") + else: + return self._manager_broadcast.setsockopt(UNSUBSCRIBE, + unsubscribe_to) + + def broadcast_poll(self, timeout=0): + if self._manager_broadcast is None: + raise RuntimeError("Not connected to Manager's broadcast channel") + else: + return self._manager_broadcast.poll(timeout) + + def broadcast_receive(self): + if self._manager_broadcast is None: + raise RuntimeError("Not connected to Manager's broadcast channel") + else: + return self._manager_broadcast.recv() + + def disconnect_api(self, silent=False): + '''Disconnect from Manager's API channel + + Raise RuntimeError if not connected to API channel and `silent=False` + ''' + if self._manager_api is None and not silent: + raise RuntimeError("Not connected to Manager's API channel") + elif self._manager_api is not None: + self._manager_api.close() + self._manager_api = None + + def disconnect_broadcast(self, silent=False): + '''Disconnect from Manager's broadcast channel + + Raise RuntimeError if not connected to broadcast channel + ''' + if self._manager_broadcast is None and not silent: + raise RuntimeError("Not connected to Manager's broadcast channel") + elif self._manager_broadcast is not None: + self._manager_broadcast.close() + self._manager_broadcast = None + + def disconnect(self, silent=False): + '''Disconnect from both Manager's API and broadcast channels + + Raise RuntimeError if not connected to at least one of both channels + ''' + if self._manager_broadcast is self._manager_api is None and not silent: + raise RuntimeError("Not connected") + else: + self.disconnect_api(silent=True) + self.disconnect_broadcast(silent=True) diff --git a/pypelinin/manager.py b/pypelinin/manager.py index 9a412c2..fc33dfb 100755 --- a/pypelinin/manager.py +++ b/pypelinin/manager.py @@ -14,6 +14,7 @@ class Manager(object): # again in job_queue and announce pending job #TODO: validate all received data (types, keys etc.) #TODO: handle 'job failed' messages + #TODO: some attributes should start with '_' def __init__(self, api_host_port, broadcast_host_port, config, logger=None, logger_name='Manager'): self.job_queue = Queue() diff --git a/requirements/development.txt b/requirements/development.txt index dfe412d..ed809a5 100644 --- a/requirements/development.txt +++ b/requirements/development.txt @@ -2,3 +2,4 @@ nose yanc +coverage diff --git a/tests/test_client.py b/tests/test_client.py new file mode 100644 index 0000000..f19f700 --- /dev/null +++ b/tests/test_client.py @@ -0,0 +1,220 @@ +# coding: utf-8 + +import time +import unittest +from zmq import Context, REP, PUB +from pypelinin import Client + + +TIMEOUT = 150 +API_ADDRESS = 'tcp://127.0.0.1:5555' +API_BIND_ADDRESS = 'tcp://*:5555' # WTF zmq? Why can't I use address above? +BROADCAST_ADDRESS = 'tcp://127.0.0.1:5556' +BROADCAST_BIND_ADDRESS = 'tcp://*:5556' # WTF zmq? Why can't I use address above? 
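+# The fake Manager sockets created in setUp() bind on all interfaces ('*'),
+# while the Client under test connects to them via the loopback addresses.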
+ +class TestClient(unittest.TestCase): + def setUp(self): + self.context = Context() + self.start_manager_sockets() + + def tearDown(self): + self.close_sockets() + self.context.term() + + def start_manager_sockets(self): + self.api = self.context.socket(REP) + self.broadcast = self.context.socket(PUB) + self.api.bind(API_BIND_ADDRESS) + self.broadcast.bind(BROADCAST_BIND_ADDRESS) + + def close_sockets(self): + self.api.close() + self.broadcast.close() + + def test_connect_raises_ValueError_when_no_communication_channel_is_specified(self): + client = Client() + with self.assertRaises(ValueError): + client.connect() + + def test_api_methods_should_raise_RuntimeError_if_not_connected_to_api(self): + client = Client() + client.connect(broadcast=BROADCAST_ADDRESS) + with self.assertRaises(RuntimeError): + client.send_api_request({'command': 'get configuration'}) + with self.assertRaises(RuntimeError): + client.get_api_reply() + with self.assertRaises(RuntimeError): + client.api_poll(timeout=1) # milliseconds + with self.assertRaises(RuntimeError): + client.disconnect_api() + + def test_broadcast_methods_should_raise_RuntimeError_if_not_connected_to_broadcast(self): + client = Client() + client.connect(api=API_ADDRESS) + with self.assertRaises(RuntimeError): + client.broadcast_subscribe('42') + with self.assertRaises(RuntimeError): + client.broadcast_unsubscribe('42') + with self.assertRaises(RuntimeError): + client.broadcast_poll(timeout=1) # milliseconds + with self.assertRaises(RuntimeError): + client.broadcast_receive() + with self.assertRaises(RuntimeError): + client.disconnect_broadcast() + + def test_send_api_request(self): + client = Client() + client.connect(api=API_ADDRESS) + client.send_api_request({'command': 'get configuration'}) + if not self.api.poll(TIMEOUT): + self.fail('Timeout wainting for API command') + message = self.api.recv_json() + self.assertEqual(message, {'command': 'get configuration'}) + + def test_get_api_reply(self): + client = Client() + client.connect(api=API_ADDRESS) + client.send_api_request({'command': 'get configuration'}) + if not self.api.poll(TIMEOUT): + self.fail('Timeout wainting for API command') + self.api.recv_json() + self.api.send_json({'configuration': 'spam eggs ham'}) + message = client.get_api_reply() # what if it hangs? 
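+        # (polling with client.api_poll(TIMEOUT) first would avoid blocking
+        # forever if no reply ever arrives)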
+ self.assertEqual(message, {'configuration': 'spam eggs ham'}) + + def test_api_poll(self, timeout=0): + client = Client() + client.connect(api=API_ADDRESS) + client.send_api_request({'command': 'get configuration'}) + if not self.api.poll(TIMEOUT): + self.fail('Timeout waiting for API message') + self.api.recv_json() + start_time = time.time() + result = client.api_poll(TIMEOUT) + end_time = time.time() + self.assertFalse(result) + # there is no message, should wait for the entire TIMEOUT + total_time = (end_time - start_time) * 1000 # milliseconds + self.assertTrue(TIMEOUT <= total_time <= 1.1 * TIMEOUT) + + self.api.send_json({'configuration': 'spam eggs ham'}) + start_time = time.time() + result = client.api_poll(TIMEOUT) + end_time = time.time() + self.assertTrue(result) + # poll should return almost immediatly (there is a message) + total_time = (end_time - start_time) * 1000 # milliseconds + self.assertTrue(total_time < TIMEOUT) + + def test_broadcast_subscribe_poll_and_receive(self): + client = Client() + client.connect(broadcast=BROADCAST_ADDRESS) + client.broadcast_subscribe('spam') + time.sleep(TIMEOUT / 1000.0) # wait for subscribe to take effect + + self.broadcast.send('spam eggs ham') + start_time = time.time() + poll_result = client.broadcast_poll(TIMEOUT) + end_time = time.time() + self.assertTrue(poll_result) + total_time = (end_time - start_time) * 1000 + self.assertTrue(total_time < TIMEOUT) + message = client.broadcast_receive() # what if it hangs? + self.assertEqual(message, 'spam eggs ham') + + self.broadcast.send('eggs ham') + start_time = time.time() + poll_result = client.broadcast_poll(TIMEOUT) + end_time = time.time() + self.assertFalse(poll_result) + total_time = (end_time - start_time) * 1000 + self.assertTrue(TIMEOUT <= total_time <= 1.1 * TIMEOUT) + + def test_broadcast_unsubscribe(self): + client = Client() + client.connect(broadcast=BROADCAST_ADDRESS) + client.broadcast_subscribe('spam') + time.sleep(TIMEOUT / 1000.0) # wait for subscribe to take effect + + self.broadcast.send('spam eggs ham') + self.assertTrue(client.broadcast_poll(TIMEOUT)) + self.assertEqual(client.broadcast_receive(), 'spam eggs ham') + + client.broadcast_unsubscribe('spam') + self.broadcast.send('spam eggs ham') + self.assertFalse(client.broadcast_poll(TIMEOUT)) + + def test_disconnect(self): + client = Client() + client.connect(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS) + + #connected we can communicate... + client.send_api_request({'command': 'get configuration'}) + self.assertTrue(self.api.poll(TIMEOUT)) + self.api.recv_json() + self.api.send_json({'command': 'ok'}) + self.assertTrue(client.api_poll(TIMEOUT)) + self.assertEqual(client.get_api_reply(), {'command': 'ok'}) + client.broadcast_subscribe('spam') + time.sleep(TIMEOUT / 1000.0) # wait for subscribe to take effect + self.broadcast.send('spam eggs ham') + self.assertTrue(client.broadcast_poll(TIMEOUT)) + self.assertEqual(client.broadcast_receive(), 'spam eggs ham') + + #disconnected not! 
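+        # once the API channel is disconnected every API call must raise
+        # RuntimeError; the broadcast channel keeps working until it is
+        # disconnected as well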
+ client.disconnect_api() + with self.assertRaises(RuntimeError): + client.send_api_request({'command': 'get configuration'}) + with self.assertRaises(RuntimeError): + client.api_poll(TIMEOUT) + with self.assertRaises(RuntimeError): + client.get_api_reply() + client.broadcast_subscribe('spam') + time.sleep(TIMEOUT / 1000.0) # wait for subscribe to take effect + self.broadcast.send('spam eggs ham') + self.assertTrue(client.broadcast_poll(TIMEOUT)) + self.assertEqual(client.broadcast_receive(), 'spam eggs ham') + client.disconnect_broadcast() + with self.assertRaises(RuntimeError): + client.broadcast_subscribe('spam') + with self.assertRaises(RuntimeError): + client.broadcast_poll(TIMEOUT) + with self.assertRaises(RuntimeError): + client.broadcast_receive() + + #connect again... + client.connect(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS) + client.send_api_request({'command': 'get configuration'}) + self.assertTrue(self.api.poll(TIMEOUT)) + self.api.recv_json() + self.api.send_json({'command': 'ok'}) + self.assertTrue(client.api_poll(TIMEOUT)) + self.assertEqual(client.get_api_reply(), {'command': 'ok'}) + client.broadcast_subscribe('spam') + time.sleep(TIMEOUT / 1000.0) # wait for subscribe to take effect + self.broadcast.send('spam eggs ham') + self.assertTrue(client.broadcast_poll(TIMEOUT)) + self.assertEqual(client.broadcast_receive(), 'spam eggs ham') + + #disconnected everything + client.disconnect() + with self.assertRaises(RuntimeError): + client.send_api_request({'command': 'get configuration'}) + with self.assertRaises(RuntimeError): + client.api_poll(TIMEOUT) + with self.assertRaises(RuntimeError): + client.get_api_reply() + with self.assertRaises(RuntimeError): + client.broadcast_subscribe('spam') + with self.assertRaises(RuntimeError): + client.broadcast_poll(TIMEOUT) + with self.assertRaises(RuntimeError): + client.broadcast_receive() + with self.assertRaises(RuntimeError): + client.disconnect() + # Should not raises: + client.disconnect(silent=True) + client.disconnect_api(silent=True) + client.disconnect_broadcast(silent=True) + + #TODO: should we test linger? 
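The two patches above already define the request/reply protocol the `Client` speaks to the `Manager`. Below is a minimal sketch of one job's life cycle, assuming a Manager such as `example/my_manager.py` is already running locally; the addresses, worker name and payload are illustrative, not taken from the patches:

```python
from pypelinin import Client

client = Client()
client.connect(api='tcp://127.0.0.1:5555', broadcast='tcp://127.0.0.1:5556')
client.broadcast_subscribe('new job')
# (a real subscriber would connect and subscribe well before jobs are added)

# submit a job; the Manager replies with a generated job id
client.send_api_request({'command': 'add job', 'worker': 'dummy',
                         'document': '42'})
reply = client.get_api_reply()   # {'answer': 'job accepted', 'job id': ...}

# the same announcement every broker sees on the broadcast channel
if client.broadcast_poll(1000):
    print(client.broadcast_receive())    # 'new job'

# a broker-like client fetches the job and reports it as finished
client.send_api_request({'command': 'get job'})
job = client.get_api_reply()
client.send_api_request({'command': 'job finished',
                         'job id': job['job id'], 'duration': 0.1})
print(client.get_api_reply())             # {'answer': 'good job!'}
client.disconnect()
```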
From 6cb54ff0bf7829b3c8f14cf00cd41f50c383c3f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Mon, 8 Oct 2012 22:25:20 -0300 Subject: [PATCH 07/37] Add Broker and its tests (need more tests) Thanks, @andrebco for the pair programming session --- Makefile | 6 +- example/my_broker.py | 40 +++++ example/my_manager.py | 2 +- example/workers.py | 18 +++ pypelinin/__init__.py | 2 +- pypelinin/broker.py | 315 ++++++++++++++++++++++++++++++++++++ pypelinin/monitoring.py | 103 ++++++++++++ tests/test_broker.py | 346 ++++++++++++++++++++++++++++++++++++++++ tests/utils.py | 2 +- 9 files changed, 830 insertions(+), 4 deletions(-) create mode 100644 example/my_broker.py create mode 100644 example/workers.py create mode 100755 pypelinin/broker.py create mode 100755 pypelinin/monitoring.py create mode 100644 tests/test_broker.py diff --git a/Makefile b/Makefile index bc48857..edebba8 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,7 @@ bootstrap-environment: pip install -r requirements/development.txt bootstrap-tests: + clear python pypelinin/setup.py install test: bootstrap-tests @@ -15,4 +16,7 @@ test-manager: bootstrap-tests test-client: bootstrap-tests ${TEST_RUNNER} --with-coverage --cover-package=pypelinin.client tests/test_client.py -.PHONY: bootstrap-environment bootstrap-tests test test-manager +test-broker: bootstrap-tests + ${TEST_RUNNER} -x tests/test_broker.py + +.PHONY: bootstrap-environment bootstrap-tests test test-manager test-client test-broker diff --git a/example/my_broker.py b/example/my_broker.py new file mode 100644 index 0000000..6645d7d --- /dev/null +++ b/example/my_broker.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# coding: utf-8 + +import json +from sys import stdout +from logging import Logger, StreamHandler, Formatter, NullHandler +from pypelinin import Broker + + +class MyStore(object): + def __init__(self, **configuration): + self.monitoring = open('/tmp/broker-monitoring', 'w') + + def retrieve(self, data): + #data = {'worker_name': ..., 'data': ..., 'worker_meta': ...} + return data['data'] + + def save(self, data): + #data = {'worker_name': ..., 'worker_result': ..., 'worker_meta': ...} + pass + + def save_monitoring(self, data): + data_as_json_string = json.dumps(data) + self.monitoring.write(data_as_json_string + "\n") + self.monitoring.flush() + +def main(): + logger = Logger('Broker') + handler = StreamHandler(stdout) + formatter = Formatter('%(asctime)s - %(name)s - %(levelname)s - ' + '%(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + broker = Broker(api='tcp://localhost:5555', + broadcast='tcp://localhost:5556', store_class=MyStore, + logger=logger, workers='workers') + broker.start() + +if __name__ == '__main__': + main() diff --git a/example/my_manager.py b/example/my_manager.py index f17ea79..b8f43bb 100644 --- a/example/my_manager.py +++ b/example/my_manager.py @@ -15,7 +15,7 @@ def main(): logger.addHandler(handler) api_host_port = ('*', 5555) broadcast_host_port = ('*', 5556) - default_config = {'db': {'data': 'test'}, 'monitoring interval': 60, } + default_config = {'store': {'data': 'test'}, 'monitoring interval': 60, } manager = Manager(api_host_port, broadcast_host_port, default_config, logger) manager.start() diff --git a/example/workers.py b/example/workers.py new file mode 100644 index 0000000..4cf0377 --- /dev/null +++ b/example/workers.py @@ -0,0 +1,18 @@ +# coding: utf-8 + +__all__ = ['dummy', 'echo', 'snorlax'] + + +def dummy(document): + return {} +dummy.__meta__ = {} + +def 
echo(document): + return {'key-c': document['key-a'], 'key-d': document['key-b']} +echo.__meta__ = {} + +def snorlax(document): + import time + time.sleep(document['sleep-for']) + return {} +snorlax.__meta__ = {} diff --git a/pypelinin/__init__.py b/pypelinin/__init__.py index d4eaffb..124f483 100644 --- a/pypelinin/__init__.py +++ b/pypelinin/__init__.py @@ -2,6 +2,6 @@ from .manager import Manager from .client import Client -#from .broker import Broker +from .broker import Broker #from .pipeline import PipelineManager, Worker #from .pipeliner import Pipeliner diff --git a/pypelinin/broker.py b/pypelinin/broker.py new file mode 100755 index 0000000..6bf913e --- /dev/null +++ b/pypelinin/broker.py @@ -0,0 +1,315 @@ +#!/usr/bin/env python +# coding: utf-8 + +from importlib import import_module +from multiprocessing import Process, Pipe, cpu_count +from os import kill, getpid +from time import sleep, time +from signal import SIGKILL +from pypelinin import Client +from pypelinin.monitoring import (get_host_info, get_outgoing_ip, + get_process_info) + + +def worker_wrapper(pipe, workers_module_name): + #TODO: should receive the document or database's configuration? + # Note that if a worker should process a big document or an entire + # corpus, it's better to received database's configuration and pass to + # worker only an lazy iterator for the collection (pymongo's cursor) + #TODO: create documentation about object type returned by worker (strings + # must be unicode) + #TODO: add the possibility to create workers that are executables (they + # receive data as JSON in stdin and put the result as JSON in stdout), + # so we can create workers in C, Perl, Ruby etc. + #TODO: should get any exception information and send it to broker signaling + # 'job failed' and sending the traceback + + + try: + workers_module = import_module(workers_module_name) + except: + pipe.send({'command': 'error'}) + #TODO: handle this on broker + else: + worker_functions = {} + for worker_function_name in workers_module.__all__: + worker_functions[worker_function_name] = getattr(workers_module, + worker_function_name) + try: + while True: + message = pipe.recv() + if message['command'] == 'exit': + break + elif message['command'] == 'execute job': + worker_function_name = message['worker'] + data = message['data'] + try: + result = worker_functions[worker_function_name](data) + except Exception as e: + result = {'_error': True, '_exception': e} + #TODO: handle this on broker + finally: + pipe.send(result) + except KeyboardInterrupt: + pass + +class WorkerPool(object): + #TODO: test it! + + def __init__(self, number_of_workers, workers): + self.workers = [] + self.number_of_workers = number_of_workers + for i in range(number_of_workers): + self.workers.append(Worker(workers)) + + def __len__(self): + return len(self.workers) + + def available(self): + return [worker.working for worker in self.workers].count(False) + + def working(self): + return [worker.working for worker in self.workers].count(True) + + def start_job(self, job_description, data): + for worker in self.workers: + if not worker.working: + break + else: + return False + return worker.start_job(job_description, data) + + def end_processes(self): + for worker in self.workers: + worker.end() + + def kill_processes(self): + for worker in self.workers: + try: + kill(worker.pid, SIGKILL) + except OSError: + pass + for worker in self.workers: + worker.process.join() + +class Worker(object): + #TODO: test it! 
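+    # Each instance owns one child process running worker_wrapper(); job
+    # requests and results travel over a multiprocessing Pipe.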
+ + def __init__(self, workers): + parent_connection, child_connection = Pipe() + self.parent_connection = parent_connection + self.child_connection = child_connection + self.start_time = time() + #TODO: is there any way to *do not* connect stdout/stderr? + self.process = Process(target=worker_wrapper, + args=(child_connection, workers)) + self.process.start() + self.pid = self.process.pid + self.working = False + self.job_info = None + + def start_job(self, job_description, data): + if self.working: + return False + message = {'command': 'execute job', + 'worker': job_description['worker'], + 'data': data,} + self.parent_connection.send(message) + self.job_info = job_description + self.job_info['start time'] = time() + self.working = True + return True + + def __repr__(self): + return (''.format(self.pid, + self.start_time)) + + def get_result(self): + if not self.finished_job(): + return None + result = self.parent_connection.recv() + self.job_info = None + self.working = False + return result + + def finished_job(self): + return self.parent_connection.poll() + + def end(self): + self.parent_connection.send({'command': 'exit'}) + self.parent_connection.close() + +class Broker(Client): + #TODO: validate all received data (types, keys etc.) + #TODO: use log4mongo (?) + + def __init__(self, api, broadcast, store_class, workers, logger, + logger_name='Broker', poll_time=50, + number_of_workers=cpu_count()): + super(Broker, self).__init__() + self._api_address = api + self._broadcast_address = broadcast + self.logger = logger + self.poll_time = poll_time + self.last_time_saved_monitoring_information = 0 + self.StoreClass = store_class + + self.number_of_workers = number_of_workers + self.pid = getpid() + self.logger.info('Starting worker processes') + self.workers = workers + self.workers_module = import_module(workers) + self.available_workers = self.workers_module.__all__ + self.worker_pool = WorkerPool(self.number_of_workers, self.workers) + self.logger.info('Broker started') + + def request(self, message): + self.send_api_request(message) + self.logger.info('[API] Request to manager: {}'.format(message)) + + def get_reply(self): + message = self.get_api_reply() + self.logger.info('[API] Reply from manager: {}'.format(message)) + return message + + def get_configuration(self): + self.request({'command': 'get configuration'}) + self._config = self.get_reply() + + def connect_to_manager(self): + self.logger.info('Trying to connect to manager...') + self.connect(api=self._api_address, broadcast=self._broadcast_address) + api_host, api_port = self._api_address.split('//')[1].split(':') + self.ip = get_outgoing_ip((api_host, int(api_port))) + + def save_monitoring_information(self): + #TODO: should we send average measures insted of instant measures of + # some measured variables? 
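+        # collects host- and per-process statistics (see pypelinin.monitoring)
+        # and hands them to the store's save_monitoring()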
+ #TODO: timestamp sent should be UTC + host_info = get_host_info() + host_info['network']['cluster ip'] = self.ip + broker_process = get_process_info(self.pid) + broker_process['type'] = 'broker' + broker_process['number of workers'] = len(self.worker_pool) + broker_process['active workers'] = self.worker_pool.working() + processes = [broker_process] + for worker in self.worker_pool.workers: + process_info = get_process_info(worker.pid) + process_info['type'] = 'worker' + if worker.working: + process_info['worker'] = worker.job_info['worker'] + process_info['data'] = worker.job_info['data'] + processes.append(process_info) + data = {'host': host_info, 'timestamp': time(), 'processes': processes} + self._store.save_monitoring(data) + self.last_time_saved_monitoring_information = time() + self.logger.info('Saved monitoring information in MongoDB') + self.logger.debug(' Information: {}'.format(data)) + + def start(self): + try: + self.started_at = time() + self.connect_to_manager() + self.broadcast_subscribe('new job') + self.get_configuration() + self._store = self.StoreClass(**self._config['store']) + self.save_monitoring_information() + self.run() + except KeyboardInterrupt: + self.logger.info('Got SIGNINT (KeyboardInterrupt), exiting.') + self.worker_pool.end_processes() + self.worker_pool.kill_processes() + self.disconnect() + + def start_job(self, job_description): + #info = getattr(self.workers[job_description['worker']], '__meta__', {}) + info = {} + info.update(job_description) + data = self._store.retrieve(info) + self.worker_pool.start_job(job_description, data) + self.logger.debug('Started job "{}" for document "{}"'\ + .format(job_description['worker'], job_description['data'])) + + def get_a_job(self): + self.logger.debug('Available workers: {}'.format(self.worker_pool.available())) + for i in range(self.worker_pool.available()): + self.request({'command': 'get job'}) + message = self.get_reply() + #TODO: if manager stops and doesn't answer, broker will stop here + if 'worker' in message and message['worker'] is None: + break # Don't have a job, stop asking + elif 'worker' in message and 'data' in message and \ + message['worker'] in self.available_workers: + self.start_job(message) + else: + self.logger.info('Ignoring malformed job: {}'.format(message)) + #TODO: send a 'rejecting job' request to Manager + + def manager_has_job(self): + if self.broadcast_poll(self.poll_time): + message = self.broadcast_receive() + self.logger.info('[Broadcast] Received from manager: {}'\ + .format(message)) + #TODO: what if broker subscribe to another thing? 
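+            # for now the broker only subscribes to 'new job' (see start()),
+            # so any received broadcast means the manager has work queued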
+ return True + else: + return False + + def full_of_jobs(self): + return len(self.worker_pool) == self.worker_pool.working() + + def should_save_monitoring_information_now(self): + time_difference = time() - self.last_time_saved_monitoring_information + return time_difference >= self._config['monitoring interval'] + + def check_if_some_job_finished_and_do_what_you_need_to(self): + for worker in self.worker_pool.workers: + if not worker.finished_job(): + continue + job_id = worker.job_info['job id'] + job_data = worker.job_info['data'] + worker_function = worker.job_info['worker'] + start_time = worker.job_info['start time'] + result = worker.get_result() + end_time = time() + self.logger.info('Job finished: job id={}, worker={}, ' + 'data={}, start time={}, result={}'.format(job_id, + worker_function, job_data, start_time, result)) + + #worker_info = getattr(self.workers[worker_function], '__meta__', + # {}) + worker_info = {} + job_data = { + 'worker': worker_function, + 'data': job_data, + 'result': result, + } + job_data.update(worker_info) + try: + #TODO: what if I want to the caller to receive job information + # as a "return" from a function call? Should use a store? + self._store.save(job_data) + except ValueError: + self.request({'command': 'job failed', + 'job id': job_id, + 'duration': end_time - start_time, + 'message': "Can't save information on store"}) + else: + self.request({'command': 'job finished', + 'job id': job_id, + 'duration': end_time - start_time}) + result = self.get_reply() + self.get_a_job() + + def run(self): + self.get_a_job() + self.logger.info('Entering main loop') + while True: + if self.should_save_monitoring_information_now(): + self.save_monitoring_information() + if not self.full_of_jobs() and self.manager_has_job(): + self.get_a_job() + self.check_if_some_job_finished_and_do_what_you_need_to() + +#TODO: reject jobs if can't get information from store or something like +# that diff --git a/pypelinin/monitoring.py b/pypelinin/monitoring.py new file mode 100755 index 0000000..a98634a --- /dev/null +++ b/pypelinin/monitoring.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python +# coding: utf-8 + +import socket +from time import time +import psutil + + +#TODO: test this module! 
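+# Helpers used by the Broker to collect host- and per-process metrics with
+# psutil; the results are persisted through the store's save_monitoring().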
+ +def get_outgoing_ip((host, port)): + """Connect to remote host/port, return local IP used by OS""" + #TODO: handle exception if can't connect and add a timeout + raw_socket = socket.socket(socket.AF_INET) + raw_socket.connect((host, port)) + data = raw_socket.getsockname() + raw_socket.close() + return data[0] + +def get_host_info(): + """Return a ``dict`` with system's information + + `Example of its output `_ + """ + memory_usage = psutil.phymem_usage() + cached_memory = psutil.cached_phymem() + buffered_memory = psutil.phymem_buffers() + real_used = memory_usage.used - buffered_memory - cached_memory + real_free = memory_usage.total - real_used + percent = 100 * (float(memory_usage.used) / memory_usage.total) + real_percent = 100 * (float(real_used) / memory_usage.total) + virtual_used = psutil.used_virtmem() + virtual_free = psutil.avail_virtmem() + virtual_total = virtual_used + virtual_free + info_per_nic = psutil.network_io_counters(pernic=True) + network_info = {} + for key, value in info_per_nic.iteritems(): + network_info[key] = {'bytes sent': value.bytes_sent, + 'bytes received': value.bytes_recv, + 'packets sent': value.packets_sent, + 'packets received': value.packets_recv,} + partitions = psutil.disk_partitions() + storage_info = {} + for partition in partitions: + disk_usage = psutil.disk_usage(partition.mountpoint) + storage_info[partition.device] = {'mount point': partition.mountpoint, + 'file system': partition.fstype, + 'total bytes': disk_usage.total, + 'total used bytes': disk_usage.used, + 'total free bytes': disk_usage.free, + 'percent used': disk_usage.percent,} + return {'memory': {'free': memory_usage.free, + 'total': memory_usage.total, + 'used': memory_usage.used, + 'cached': cached_memory, + 'buffers': buffered_memory, + 'real used': real_used, + 'real free': real_free, + 'percent': percent, + 'real percent': real_percent, + 'total virtual': virtual_total, + 'used virtual': virtual_used, + 'free virtual': virtual_free,}, + 'cpu': {'number of cpus': psutil.NUM_CPUS, + 'cpu percent': psutil.cpu_percent(),}, + 'network': {'interfaces': network_info,}, + 'storage': storage_info, + 'uptime': time() - psutil.BOOT_TIME,} + +def get_process_info(process_id): + """Return CPU and memory information for a given PID""" + try: + process = psutil.Process(process_id) + except psutil.error.NoSuchProcess: + return None + memory_info = process.get_memory_info() + return {'cpu percent': process.get_cpu_percent(), + 'resident memory': memory_info.rss, + 'virtual memory': memory_info.vms, + 'pid': process.pid, + 'started at': process.create_time,} + + +if __name__ == '__main__': + from pprint import pprint + from time import time + from os import getpid + + host_info = get_host_info() + host_info['network']['cluster ip'] = get_outgoing_ip(('localhost', 80)) + broker_info = get_process_info(getpid()) + broker_info['type'] = 'broker' + broker_info['active workers'] = 4 + processes = [broker_info] + for job in range(broker_info['active workers']): + worker_info = get_process_info(getpid()) + if worker_info is not None: + worker_info['worker'] = 'the worker is a lie' + worker_info['data'] = '...' 
+ worker_info['type'] = 'worker' + processes.append(worker_info) + data = {'host': host_info, 'processes': processes, 'timestamp': time()} + pprint(data) diff --git a/tests/test_broker.py b/tests/test_broker.py new file mode 100644 index 0000000..4ba09a3 --- /dev/null +++ b/tests/test_broker.py @@ -0,0 +1,346 @@ +# coding: utf-8 + +from __future__ import print_function +import unittest +import shlex +import select +import json +from signal import SIGINT, SIGKILL +from time import sleep, time +from subprocess import Popen, PIPE +from multiprocessing import cpu_count +from md5 import md5 +import zmq +from psutil import Process, NoSuchProcess +from .utils import default_config + + +TIMEOUT = 1500 +DEBUG_STDOUT = False +DEBUG_STDERR = True + +def _print_debug(name, message): + print() + print('----- {} BEGIN -----'.format(name)) + print(message) + print('----- {} END -----'.format(name)) + +def _kill(pid, timeout=1.5): + try: + process = Process(pid) + except NoSuchProcess: + return + try: + process.send_signal(SIGINT) + sleep(timeout) + except OSError: + pass + finally: + try: + process.send_signal(SIGKILL) + except (OSError, NoSuchProcess): + pass + process.wait() + +class TestBroker(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.cpus = cpu_count() + cls.monitoring_interval = 60 + cls.config = default_config + + @classmethod + def tearDownClass(cls): + pass + + def setUp(self): + self.context = zmq.Context() + self.start_manager_sockets() + self.start_broker_process() + + def tearDown(self): + self.end_broker_process() + self.close_sockets() + self.context.term() + + def start_broker_process(self): + #TODO: call process passing a configuration file + self.broker = Popen(shlex.split('python ./example/my_broker.py'), + stdin=PIPE, stdout=PIPE, stderr=PIPE) + #TODO: use select and self.fail + for line in self.broker.stdout.readline(): + if 'main loop' in line: + break + + def end_broker_process(self): + try: + broker_process = Process(self.broker.pid) + except NoSuchProcess: + return # was killed + # get stdout and stderr + select_config = [self.broker.stdout, self.broker.stderr], [], [], 0.1 + stdout, stderr = [], [] + result = select.select(*select_config) + while any(result): + if result[0]: + stdout.append(result[0][0].readline()) + if result[1]: + stderr.append(result[1][0].readline()) + result = select.select(*select_config) + if stdout and DEBUG_STDOUT: + _print_debug('STDOUT', ''.join(stdout)) + if stderr and DEBUG_STDERR: + _print_debug('STDERR', ''.join(stderr)) + + # kill main process and its children + children = [process.pid for process in broker_process.get_children()] + _kill(self.broker.pid, timeout=TIMEOUT / 1000.0) + for child_pid in children: + _kill(child_pid, timeout=TIMEOUT / 1000.0) + + def start_manager_sockets(self): + self.api = self.context.socket(zmq.REP) + self.broadcast = self.context.socket(zmq.PUB) + self.api.bind('tcp://*:5555') + self.broadcast.bind('tcp://*:5556') + + def close_sockets(self): + self.api.close() + self.broadcast.close() + + def receive_get_configuration_and_send_it_to_broker(self): + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'get configuration' from broker") + message = self.api.recv_json() + self.config['monitoring interval'] = self.monitoring_interval + self.api.send_json(self.config) + self.assertEqual(message, {'command': 'get configuration'}) + + def receive_get_job_and_send_it_to_broker(self, job=None): + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'get job' from broker") + message = 
self.api.recv_json() + if job is None: + job = {'worker': 'dummy', 'data': {'id': '1'}, 'job id': '2'} + self.api.send_json(job) + self.assertEqual(message, {'command': 'get job'}) + + def broker_should_be_quiet(self): + sleep(TIMEOUT / 1000.0) + with self.assertRaises(zmq.ZMQError): + self.api.recv_json(zmq.NOBLOCK) + + def send_and_receive_jobs(self, jobs, wait_finished_job=False): + my_jobs = list(jobs) + finished_jobs = [True for job in my_jobs] + messages = [] + condition = True + while condition: + if not self.api.poll(3 * TIMEOUT): + self.fail("Didn't receive 'get job' from broker") + msg = self.api.recv_json() + messages.append(msg) + if msg['command'] == 'get job': + if len(my_jobs): + job = my_jobs.pop(0) + else: + job = {'worker': None} + if 'job id' not in job: + job['job id'] = md5().hexdigest() + self.api.send_json(job) + elif msg['command'] == 'job finished': + self.api.send_json({'answer': 'good job!'}) + finished_jobs.pop() + condition = len(my_jobs) or \ + (wait_finished_job and len(finished_jobs)) + return messages + + def test_should_ask_for_configuration_on_start(self): + self.receive_get_configuration_and_send_it_to_broker() + self.send_and_receive_jobs([{'worker': None}]) + # it's necessary to send a job to wait for broker enter on run() + + def test_should_ask_for_a_job_after_configuration(self): + self.receive_get_configuration_and_send_it_to_broker() + job = {'worker': 'dummy', 'data': {'id': '1'}, 'job id': '2'} + self.send_and_receive_jobs([job]) + + def test_should_send_get_job_just_after_manager_broadcast_new_job(self): + self.receive_get_configuration_and_send_it_to_broker() + self.send_and_receive_jobs([{'worker': None}]) + self.broker_should_be_quiet() + self.broadcast.send('new job') + self.send_and_receive_jobs([{'worker': None}]) # just kidding! 
:D + + def test_should_send_finished_job_when_asked_to_run_dummy_worker(self): + jobs = [] + for i in range(self.cpus): + jobs.append({'worker': 'dummy', 'data': {'id': 'xpto'}, + 'job id': i}) + self.receive_get_configuration_and_send_it_to_broker() + messages = self.send_and_receive_jobs(jobs, wait_finished_job=True) + finished_jobs = 0 + for message in messages: + if message['command'] == 'job finished': + finished_jobs += 1 + self.assertEqual(finished_jobs, self.cpus) + self.assertEqual(self.api.recv_json(), {'command': 'get job'}) + self.api.send_json({'worker': None}) + self.broker_should_be_quiet() + + def test_should_start_worker_process_even_if_no_job(self): + self.receive_get_configuration_and_send_it_to_broker() + broker_pid = self.broker.pid + children_pid = [process.pid for process in \ + Process(broker_pid).get_children()] + self.assertEqual(len(children_pid), self.cpus) + + def test_should_kill_workers_processes_when_receive_SIGINT(self): + self.receive_get_configuration_and_send_it_to_broker() + self.send_and_receive_jobs([{'worker': None}]) + broker_pid = self.broker.pid + children_pid = [process.pid for process in \ + Process(broker_pid).get_children()] + self.end_broker_process() + sleep(0.5 * (self.cpus + 1)) # cpu_count + 1 processes + for child_pid in children_pid: + with self.assertRaises(NoSuchProcess): + worker_process = Process(child_pid) + with self.assertRaises(NoSuchProcess): + broker_process = Process(broker_pid) + + def test_should_reuse_the_same_workers_processes_for_all_jobs(self): + #TODO: maybe it can use new or different worker processes depending on + #broker's configuration + self.receive_get_configuration_and_send_it_to_broker() + broker_pid = self.broker.pid + children_pid_before = [process.pid for process in \ + Process(broker_pid).get_children()] + job = {'worker': 'dummy', 'data': {'data': {}},} + jobs = [job] * self.cpus + self.send_and_receive_jobs(jobs, wait_finished_job=True) + children_pid_after = [process.pid for process in \ + Process(broker_pid).get_children()] + self.broadcast.send('new job') + self.send_and_receive_jobs(jobs, wait_finished_job=True) + children_pid_after_2 = [process.pid for process in \ + Process(broker_pid).get_children()] + self.assertEqual(children_pid_before, children_pid_after) + self.assertEqual(children_pid_before, children_pid_after_2) + + def test_should_return_time_spent_by_each_job(self): + sleep_time = 1.43 + job = {'worker': 'snorlax', 'data': {'sleep-for': sleep_time},} + jobs = [job] * self.cpus + self.receive_get_configuration_and_send_it_to_broker() + start_time = time() + messages = self.send_and_receive_jobs(jobs, wait_finished_job=True) + end_time = time() + total_time = end_time - start_time + counter = 0 + for message in messages: + if message['command'] == 'job finished': + counter += 1 + self.assertIn('duration', message) + self.assertTrue(sleep_time < message['duration']) + self.assertEqual(len(jobs), counter) + self.assertTrue(total_time > sleep_time) + + def test_should_save_monitoring_information_regularly(self): + self.monitoring_interval = 0.5 + self.receive_get_configuration_and_send_it_to_broker() + self.send_and_receive_jobs([{'worker': None}]) + sleep((self.monitoring_interval + 0.05 + 0.2) * 3) + # 0.05 = default broker poll time, 0.2 = some overhead + monitoring_file = open('/tmp/broker-monitoring') + self.assertEqual(monitoring_file.read().count('\n'), 3) + + def test_should_save_monitoring_information(self): + self.monitoring_interval = 0.5 + 
self.receive_get_configuration_and_send_it_to_broker() + self.send_and_receive_jobs([{'worker': None}]) + sleep(self.monitoring_interval + 0.05 + 0.2) + # 0.05 = default broker poll time, 0.2 = some overhead + monitoring_file = open('/tmp/broker-monitoring') + info = json.loads(monitoring_file.readline().strip()) + + self.assertIn('host', info) + self.assertIn('processes', info) + + needed_host_keys = ['cpu', 'memory', 'network', 'storage', 'uptime'] + for key in needed_host_keys: + self.assertIn(key, info['host']) + + needed_cpu_keys = ['cpu percent', 'number of cpus'] + for key in needed_cpu_keys: + self.assertIn(key, info['host']['cpu']) + + needed_memory_keys = ['buffers', 'cached', 'free', 'free virtual', + 'percent', 'real free', 'real percent', + 'real used', 'total', 'total virtual', 'used', + 'used virtual'] + for key in needed_memory_keys: + self.assertIn(key, info['host']['memory']) + + self.assertIn('cluster ip', info['host']['network']) + self.assertIn('interfaces', info['host']['network']) + first_interface = info['host']['network']['interfaces'].keys()[0] + interface_info = info['host']['network']['interfaces'][first_interface] + needed_interface_keys = ['bytes received', 'bytes sent', + 'packets received', 'packets sent'] + for key in needed_interface_keys: + self.assertIn(key, interface_info) + + first_partition = info['host']['storage'].keys()[0] + partition_info = info['host']['storage'][first_partition] + needed_storage_keys = ['file system', 'mount point', 'percent used', + 'total bytes', 'total free bytes', + 'total used bytes'] + for key in needed_storage_keys: + self.assertIn(key, partition_info) + + self.assertEqual(len(info['processes']), self.cpus + 1) + needed_process_keys = ['cpu percent', 'pid', 'resident memory', + 'virtual memory', 'type', 'started at'] + process_info = info['processes'][0] + for key in needed_process_keys: + self.assertIn(key, process_info) + + def test_should_insert_monitoring_information_about_workers(self): + self.monitoring_interval = 0.5 + self.receive_get_configuration_and_send_it_to_broker() + jobs = [] + start_time = time() + for i in range(self.cpus): + jobs.append({'worker': 'snorlax', 'data': {'sleep-for': 100, + 'data': {'id': 143}}}) + self.send_and_receive_jobs(jobs) + end_time = time() + sleep(self.monitoring_interval * 3) # wait for broker to save info + monitoring_file = open('/tmp/broker-monitoring') + last_line = monitoring_file.read().split('\n')[-2].strip() + monitoring_info = json.loads(last_line) + self.assertEqual(len(monitoring_info['processes']), self.cpus + 1) + + needed_process_keys = ['cpu percent', 'pid', 'resident memory', 'type', + 'virtual memory', 'started at'] + for process in monitoring_info['processes']: + for key in needed_process_keys: + self.assertIn(key, process) + + broker_process = monitoring_info['processes'][0] + self.assertEqual(broker_process['number of workers'], self.cpus) + self.assertEqual(broker_process['active workers'], self.cpus) + self.assertEqual(broker_process['type'], 'broker') + self.assertTrue(start_time - 3 < broker_process['started at'] < \ + end_time + 3) + for process in monitoring_info['processes'][1:]: + self.assertEqual(process['data'], + {'sleep-for': 100, 'data': {'id': 143}}) + self.assertTrue(start_time - 3 < process['started at'] < \ + end_time + 3) + self.assertEqual(process['type'], 'worker') + self.assertEqual(process['worker'], 'snorlax') + + #TODO: test usage of store methods + #TODO: test __meta__ diff --git a/tests/utils.py b/tests/utils.py index 
a6027c2..5887931 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,3 +1,3 @@ # coding: utf-8 -default_config = {'db': {'data': 'test'}, 'monitoring interval': 60} +default_config = {'store': {'data': 'test'}, 'monitoring interval': 60} From eb59b485bfc118968cb765ebecec75e12dd691e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Tue, 9 Oct 2012 19:11:52 -0300 Subject: [PATCH 08/37] Replace 'document' with 'data' in Manager tests --- pypelinin/manager.py | 1 + tests/test_manager.py | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pypelinin/manager.py b/pypelinin/manager.py index fc33dfb..2bba4eb 100755 --- a/pypelinin/manager.py +++ b/pypelinin/manager.py @@ -72,6 +72,7 @@ def run(self): if command == 'get configuration': self.reply(self.config) elif command == 'add job': + #TODO: needs to validate a job (have keys worker, data, ...?) message['job id'] = uuid.uuid4().hex del message['command'] self.job_queue.put(message) diff --git a/tests/test_manager.py b/tests/test_manager.py index 9c6cef1..ca5a4cc 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -59,7 +59,7 @@ def test_should_connect_to_manager_api_zmq_socket(self): def test_should_receive_new_job_from_broadcast_when_a_job_is_submitted(self): self.api.send_json({'command': 'add job', 'worker': 'x', - 'document': 'y'}) + 'data': 'y'}) if not self.api.poll(time_to_wait): self.fail("Didn't receive 'add job' reply") self.api.recv_json() @@ -76,7 +76,7 @@ def test_command_get_configuration_should_return_dict_passed_on_setUp(self): self.assertEqual(message, default_config) def test_command_add_job_should_return_a_job_id(self): - cmd = {'command': 'add job', 'worker': 'test', 'document': 'eggs'} + cmd = {'command': 'add job', 'worker': 'test', 'data': 'eggs'} self.api.send_json(cmd) if not self.api.poll(time_to_wait): self.fail("Didn't receive 'job accepted' from manager") @@ -94,7 +94,7 @@ def test_command_get_job_should_return_empty_if_no_job(self): def test_command_get_job_should_return_a_job_after_adding_one(self): self.api.send_json({'command': 'add job', 'worker': 'spam', - 'document': 'eggs'}) + 'data': 'eggs'}) if not self.api.poll(time_to_wait): self.fail("Didn't receive 'add job' reply") job = self.api.recv_json() @@ -103,7 +103,7 @@ def test_command_get_job_should_return_a_job_after_adding_one(self): self.fail("Didn't receive job from manager") message = self.api.recv_json() self.assertEqual(message['worker'], 'spam') - self.assertEqual(message['document'], 'eggs') + self.assertEqual(message['data'], 'eggs') self.assertIn('job id', message) self.assertEqual(len(message['job id']), 32) @@ -124,7 +124,7 @@ def test_finished_job_with_unknown_job_id_should_return_error(self): def test_finished_job_with_correct_job_id_should_return_good_job(self): self.api.send_json({'command': 'add job', 'worker': 'a', - 'document': 'b'}) + 'data': 'b'}) if not self.api.poll(time_to_wait): self.fail("Didn't receive 'add job' reply") message = self.api.recv_json() @@ -139,7 +139,7 @@ def test_finished_job_with_correct_job_id_should_return_good_job(self): def test_should_receive_job_finished_message_with_job_id_and_duration_when_a_job_finishes(self): self.api.send_json({'command': 'add job', 'worker': 'x', - 'document': 'y'}) + 'data': 'y'}) if not self.api.poll(time_to_wait): self.fail("Didn't receive 'add job' reply") self.api.recv_json() From 49ea8a986455961d7ce038f686694faae63d835a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= 
Date: Tue, 9 Oct 2012 19:34:42 -0300 Subject: [PATCH 09/37] Refactor job description, data and worker input There were two "data": job data (sent from "client" to Manager and then to Broker) and data retrieved from store. It was changed to: `data` is the first one and `worker_input` is the last. --- pypelinin/broker.py | 61 +++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/pypelinin/broker.py b/pypelinin/broker.py index 6bf913e..d471ff9 100755 --- a/pypelinin/broker.py +++ b/pypelinin/broker.py @@ -42,7 +42,7 @@ def worker_wrapper(pipe, workers_module_name): break elif message['command'] == 'execute job': worker_function_name = message['worker'] - data = message['data'] + data = message['worker_input'] try: result = worker_functions[worker_function_name](data) except Exception as e: @@ -71,13 +71,17 @@ def available(self): def working(self): return [worker.working for worker in self.workers].count(True) - def start_job(self, job_description, data): + def start_job(self, job_info): + ''' + job_info = {'worker': 'name as string', 'worker_input': {...data...}, + 'data': {...data...}} + ''' for worker in self.workers: if not worker.working: break else: return False - return worker.start_job(job_description, data) + return worker.start_job(job_info) def end_processes(self): for worker in self.workers: @@ -108,14 +112,14 @@ def __init__(self, workers): self.working = False self.job_info = None - def start_job(self, job_description, data): + def start_job(self, job_info): if self.working: return False message = {'command': 'execute job', - 'worker': job_description['worker'], - 'data': data,} + 'worker': job_info['worker'], + 'worker_input': job_info['worker_input'],} self.parent_connection.send(message) - self.job_info = job_description + self.job_info = job_info self.job_info['start time'] = time() self.working = True return True @@ -160,6 +164,9 @@ def __init__(self, api, broadcast, store_class, workers, logger, self.workers = workers self.workers_module = import_module(workers) self.available_workers = self.workers_module.__all__ + self.worker_function = {} + for worker in self.available_workers: + self.worker_function[worker] = getattr(self.workers_module, worker) self.worker_pool = WorkerPool(self.number_of_workers, self.workers) self.logger.info('Broker started') @@ -222,12 +229,19 @@ def start(self): self.disconnect() def start_job(self, job_description): - #info = getattr(self.workers[job_description['worker']], '__meta__', {}) - info = {} - info.update(job_description) - data = self._store.retrieve(info) - self.worker_pool.start_job(job_description, data) - self.logger.debug('Started job "{}" for document "{}"'\ + worker_meta = getattr(self.worker_function[job_description['worker']], + '__meta__', {}) + info = {'worker': job_description['worker'], + 'worker_meta': worker_meta, + 'data': job_description['data']} + worker_input = self._store.retrieve(info) + job_info = {'worker': job_description['worker'], + 'worker_input': worker_input, + 'data': job_description['data'], + 'job id': job_description['job id'], + 'worker_meta': worker_meta,} + self.worker_pool.start_job(job_info) + self.logger.debug('Started job: worker="{}", data="{}"'\ .format(job_description['worker'], job_description['data'])) def get_a_job(self): @@ -268,32 +282,31 @@ def check_if_some_job_finished_and_do_what_you_need_to(self): continue job_id = worker.job_info['job id'] job_data = worker.job_info['data'] - worker_function = worker.job_info['worker'] + 
worker_name = worker.job_info['worker'] + worker_meta = worker.job_info['worker_meta'] start_time = worker.job_info['start time'] result = worker.get_result() end_time = time() - self.logger.info('Job finished: job id={}, worker={}, ' + self.logger.info('Job finished: id={}, worker={}, ' 'data={}, start time={}, result={}'.format(job_id, - worker_function, job_data, start_time, result)) + worker_name, job_data, start_time, result)) - #worker_info = getattr(self.workers[worker_function], '__meta__', - # {}) - worker_info = {} - job_data = { - 'worker': worker_function, + job_information = { + 'worker': worker_name, + 'worker_meta': worker_meta, + 'worker_result': result, 'data': job_data, - 'result': result, } - job_data.update(worker_info) try: #TODO: what if I want to the caller to receive job information # as a "return" from a function call? Should use a store? - self._store.save(job_data) + self._store.save(job_information) except ValueError: self.request({'command': 'job failed', 'job id': job_id, 'duration': end_time - start_time, 'message': "Can't save information on store"}) + #TODO: handle this on Manager else: self.request({'command': 'job finished', 'job id': job_id, From 9c337d3d03a0c52a8b3773d77cb81ad8a13ea99e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Tue, 9 Oct 2012 22:33:17 -0300 Subject: [PATCH 10/37] Add tests for Broker's StoreClass - Test `save`/`retrieve` methods - Change interface of workers --- example/my_broker.py | 27 +++++++++++++++++------- example/workers.py | 38 +++++++++++++++++++++++----------- pypelinin/broker.py | 49 ++++++++++++++++++++++++++------------------ tests/test_broker.py | 38 +++++++++++++++++++++++++--------- 4 files changed, 103 insertions(+), 49 deletions(-) diff --git a/example/my_broker.py b/example/my_broker.py index 6645d7d..e11f178 100644 --- a/example/my_broker.py +++ b/example/my_broker.py @@ -11,13 +11,26 @@ class MyStore(object): def __init__(self, **configuration): self.monitoring = open('/tmp/broker-monitoring', 'w') - def retrieve(self, data): - #data = {'worker_name': ..., 'data': ..., 'worker_meta': ...} - return data['data'] - - def save(self, data): - #data = {'worker_name': ..., 'worker_result': ..., 'worker_meta': ...} - pass + def retrieve(self, info): + #info = {'worker': ..., 'data': ..., 'worker_requires': ...} + if info['worker'] == 'Upper': + filename = info['data']['filename'] + with open(filename, 'r') as fp: + contents = fp.read() + with open(filename + '.requires', 'w') as fp: + fp.write(str(info['worker_requires'])) + return {'text': contents} + else: + return info['data'] + + def save(self, info): + #info = {'worker': ..., 'worker_result': ..., 'worker_requires': ..., + # 'data': ...} + if info['worker'] == 'Upper': + filename = info['data']['filename'] + '.result' + contents = info['worker_result']['upper_text'] + with open(filename, 'w') as fp: + fp.write(contents) def save_monitoring(self, data): data_as_json_string = json.dumps(data) diff --git a/example/workers.py b/example/workers.py index 4cf0377..8db62d4 100644 --- a/example/workers.py +++ b/example/workers.py @@ -1,18 +1,32 @@ # coding: utf-8 -__all__ = ['dummy', 'echo', 'snorlax'] +import time -def dummy(document): - return {} -dummy.__meta__ = {} +__all__ = ['Dummy', 'Echo', 'Upper', 'Snorlax'] -def echo(document): - return {'key-c': document['key-a'], 'key-d': document['key-b']} -echo.__meta__ = {} -def snorlax(document): - import time - time.sleep(document['sleep-for']) - return {} -snorlax.__meta__ = {} 
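The function-based workers removed above are replaced by the class-based interface added just below: a worker is a class with a `requires` list naming the keys it needs and a `process(data)` method returning a plain `dict`, while the store's `retrieve`/`save` pair supplies that input and persists the result. A minimal sketch of the contract, assuming the `requires`/`process` convention from the new `workers.py` and the `retrieve`/`save` signatures from `my_broker.py` (the `WordCount` worker and `DictStore` store are illustrative names, not part of this patch):

```python
class WordCount(object):
    # illustrative worker: declares which keys it needs via `requires`
    requires = ['text']

    def process(self, data):
        # receives only the keys listed in `requires`, returns a plain dict
        return {'number_of_words': len(data['text'].split())}


class DictStore(object):
    '''Illustrative in-memory store following the retrieve/save contract'''

    def __init__(self, **configuration):
        self.documents = configuration.get('documents', {})
        self.results = {}

    def retrieve(self, info):
        # info = {'worker': ..., 'data': ..., 'worker_requires': [...]}
        document = self.documents[info['data']['id']]
        return dict((key, document[key]) for key in info['worker_requires'])

    def save(self, info):
        # info = {'worker': ..., 'data': ..., 'worker_requires': [...],
        #         'worker_result': ...}
        self.results[info['data']['id']] = info['worker_result']
```

Under this contract the broker side boils down to `worker_input = store.retrieve(info)`, `result = worker.process(worker_input)`, then `store.save({..., 'worker_result': result})`, which is what the new `Upper` test below exercises end to end.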
+class Dummy(object): + requires = [] + + def process(self, data): + return {} + +class Echo(object): + requires = ['key-a', 'key-b'] + + def process(self, data): + return {'key-c': data['key-a'], 'key-d': data['key-b']} + +class Upper(object): + requires = ['text'] + + def process(self, data): + return {'upper_text': data['text'].upper()} + +class Snorlax(object): + requires = ['sleep-for'] + + def process(self, data): + time.sleep(data['sleep-for']) + return {} diff --git a/pypelinin/broker.py b/pypelinin/broker.py index d471ff9..9558f43 100755 --- a/pypelinin/broker.py +++ b/pypelinin/broker.py @@ -31,20 +31,19 @@ def worker_wrapper(pipe, workers_module_name): pipe.send({'command': 'error'}) #TODO: handle this on broker else: - worker_functions = {} - for worker_function_name in workers_module.__all__: - worker_functions[worker_function_name] = getattr(workers_module, - worker_function_name) + workers = {} + for worker in workers_module.__all__: + workers[worker] = getattr(workers_module, worker)() try: while True: message = pipe.recv() if message['command'] == 'exit': break elif message['command'] == 'execute job': - worker_function_name = message['worker'] + worker_name = message['worker'] data = message['worker_input'] try: - result = worker_functions[worker_function_name](data) + result = workers[worker_name].process(data) except Exception as e: result = {'_error': True, '_exception': e} #TODO: handle this on broker @@ -164,9 +163,14 @@ def __init__(self, api, broadcast, store_class, workers, logger, self.workers = workers self.workers_module = import_module(workers) self.available_workers = self.workers_module.__all__ - self.worker_function = {} + self.worker_requirements = {} for worker in self.available_workers: - self.worker_function[worker] = getattr(self.workers_module, worker) + try: + WorkerClass = getattr(self.workers_module, worker) + except AttributeError: + raise RuntimeError("Could not find worker '{}'".format(worker)) + self.worker_requirements[worker] = getattr(WorkerClass, 'requires', + []) self.worker_pool = WorkerPool(self.number_of_workers, self.workers) self.logger.info('Broker started') @@ -229,20 +233,21 @@ def start(self): self.disconnect() def start_job(self, job_description): - worker_meta = getattr(self.worker_function[job_description['worker']], - '__meta__', {}) - info = {'worker': job_description['worker'], - 'worker_meta': worker_meta, + worker = job_description['worker'] + worker_requires = self.worker_requirements[worker] + info = {'worker': worker, + 'worker_requires': worker_requires, 'data': job_description['data']} + #TODO: handle if retrieve raises exception worker_input = self._store.retrieve(info) - job_info = {'worker': job_description['worker'], + job_info = {'worker': worker, 'worker_input': worker_input, 'data': job_description['data'], 'job id': job_description['job id'], - 'worker_meta': worker_meta,} + 'worker_requires': worker_requires,} self.worker_pool.start_job(job_info) self.logger.debug('Started job: worker="{}", data="{}"'\ - .format(job_description['worker'], job_description['data'])) + .format(worker, job_description['data'])) def get_a_job(self): self.logger.debug('Available workers: {}'.format(self.worker_pool.available())) @@ -252,9 +257,12 @@ def get_a_job(self): #TODO: if manager stops and doesn't answer, broker will stop here if 'worker' in message and message['worker'] is None: break # Don't have a job, stop asking - elif 'worker' in message and 'data' in message and \ - message['worker'] in self.available_workers: - 
self.start_job(message) + elif 'worker' in message and 'data' in message: + if message['worker'] not in self.available_workers: + self.logger.info('Ignoring job (inexistent worker): {}'.format(message)) + #TODO: send a 'rejecting job' request to Manager + else: + self.start_job(message) else: self.logger.info('Ignoring malformed job: {}'.format(message)) #TODO: send a 'rejecting job' request to Manager @@ -283,7 +291,7 @@ def check_if_some_job_finished_and_do_what_you_need_to(self): job_id = worker.job_info['job id'] job_data = worker.job_info['data'] worker_name = worker.job_info['worker'] - worker_meta = worker.job_info['worker_meta'] + worker_requires = worker.job_info['worker_requires'] start_time = worker.job_info['start time'] result = worker.get_result() end_time = time() @@ -293,13 +301,14 @@ def check_if_some_job_finished_and_do_what_you_need_to(self): job_information = { 'worker': worker_name, - 'worker_meta': worker_meta, + 'worker_requires': worker_requires, 'worker_result': result, 'data': job_data, } try: #TODO: what if I want to the caller to receive job information # as a "return" from a function call? Should use a store? + #TODO: handle if retrieve raises exception self._store.save(job_information) except ValueError: self.request({'command': 'job failed', diff --git a/tests/test_broker.py b/tests/test_broker.py index 4ba09a3..8bca6d3 100644 --- a/tests/test_broker.py +++ b/tests/test_broker.py @@ -1,10 +1,12 @@ # coding: utf-8 from __future__ import print_function +import json +import os import unittest import shlex import select -import json +import tempfile from signal import SIGINT, SIGKILL from time import sleep, time from subprocess import Popen, PIPE @@ -121,7 +123,7 @@ def receive_get_job_and_send_it_to_broker(self, job=None): self.fail("Didn't receive 'get job' from broker") message = self.api.recv_json() if job is None: - job = {'worker': 'dummy', 'data': {'id': '1'}, 'job id': '2'} + job = {'worker': 'Dummy', 'data': {'id': '1'}, 'job id': '2'} self.api.send_json(job) self.assertEqual(message, {'command': 'get job'}) @@ -162,7 +164,7 @@ def test_should_ask_for_configuration_on_start(self): def test_should_ask_for_a_job_after_configuration(self): self.receive_get_configuration_and_send_it_to_broker() - job = {'worker': 'dummy', 'data': {'id': '1'}, 'job id': '2'} + job = {'worker': 'Dummy', 'data': {'id': '1'}, 'job id': '2'} self.send_and_receive_jobs([job]) def test_should_send_get_job_just_after_manager_broadcast_new_job(self): @@ -175,7 +177,7 @@ def test_should_send_get_job_just_after_manager_broadcast_new_job(self): def test_should_send_finished_job_when_asked_to_run_dummy_worker(self): jobs = [] for i in range(self.cpus): - jobs.append({'worker': 'dummy', 'data': {'id': 'xpto'}, + jobs.append({'worker': 'Dummy', 'data': {'id': 'xpto'}, 'job id': i}) self.receive_get_configuration_and_send_it_to_broker() messages = self.send_and_receive_jobs(jobs, wait_finished_job=True) @@ -216,7 +218,7 @@ def test_should_reuse_the_same_workers_processes_for_all_jobs(self): broker_pid = self.broker.pid children_pid_before = [process.pid for process in \ Process(broker_pid).get_children()] - job = {'worker': 'dummy', 'data': {'data': {}},} + job = {'worker': 'Dummy', 'data': {'data': {}},} jobs = [job] * self.cpus self.send_and_receive_jobs(jobs, wait_finished_job=True) children_pid_after = [process.pid for process in \ @@ -230,7 +232,7 @@ def test_should_reuse_the_same_workers_processes_for_all_jobs(self): def test_should_return_time_spent_by_each_job(self): sleep_time = 
1.43 - job = {'worker': 'snorlax', 'data': {'sleep-for': sleep_time},} + job = {'worker': 'Snorlax', 'data': {'sleep-for': sleep_time},} jobs = [job] * self.cpus self.receive_get_configuration_and_send_it_to_broker() start_time = time() @@ -312,7 +314,7 @@ def test_should_insert_monitoring_information_about_workers(self): jobs = [] start_time = time() for i in range(self.cpus): - jobs.append({'worker': 'snorlax', 'data': {'sleep-for': 100, + jobs.append({'worker': 'Snorlax', 'data': {'sleep-for': 100, 'data': {'id': 143}}}) self.send_and_receive_jobs(jobs) end_time = time() @@ -340,7 +342,23 @@ def test_should_insert_monitoring_information_about_workers(self): self.assertTrue(start_time - 3 < process['started at'] < \ end_time + 3) self.assertEqual(process['type'], 'worker') - self.assertEqual(process['worker'], 'snorlax') + self.assertEqual(process['worker'], 'Snorlax') - #TODO: test usage of store methods - #TODO: test __meta__ + def test_if_broker_calls_save_and_retrieve_methods_from_store(self): + input_file = tempfile.NamedTemporaryFile(delete=False) + input_file.write('Answer: 42.') + input_file.close() + self.receive_get_configuration_and_send_it_to_broker() + jobs = [{'worker': 'Upper', 'data': {'filename': input_file.name}}] + messages = self.send_and_receive_jobs(jobs, wait_finished_job=True) + result_filename = input_file.name + '.result' + requires_filename = input_file.name + '.requires' + with open(result_filename, 'r') as fp: + contents = fp.read() + self.assertEqual(contents, 'ANSWER: 42.') + with open(requires_filename, 'r') as fp: + contents = fp.read() + self.assertEqual(contents, "['text']") + os.unlink(input_file.name) + os.unlink(result_filename) + os.unlink(requires_filename) From ca48500d877eccee949f40cdf8348c2f0b59fe10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Wed, 10 Oct 2012 00:44:19 -0300 Subject: [PATCH 11/37] Move Manager and Broker daemons to tests/ --- Makefile | 8 ++++++-- {example => tests}/my_broker.py | 0 {example => tests}/my_manager.py | 0 tests/test_broker.py | 4 ++-- tests/test_manager.py | 4 ++-- {example => tests}/workers.py | 0 6 files changed, 10 insertions(+), 6 deletions(-) rename {example => tests}/my_broker.py (100%) rename {example => tests}/my_manager.py (100%) rename {example => tests}/workers.py (100%) diff --git a/Makefile b/Makefile index edebba8..10ab1b4 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,13 @@ TEST_RUNNER=nosetests -dsv --with-yanc +clean: + find -regex '.*\.pyc' -exec rm {} \; + find -regex '.*~' -exec rm {} \; + bootstrap-environment: pip install -r requirements/development.txt -bootstrap-tests: +bootstrap-tests: clean clear python pypelinin/setup.py install @@ -19,4 +23,4 @@ test-client: bootstrap-tests test-broker: bootstrap-tests ${TEST_RUNNER} -x tests/test_broker.py -.PHONY: bootstrap-environment bootstrap-tests test test-manager test-client test-broker +.PHONY: clean bootstrap-environment bootstrap-tests test test-manager test-client test-broker diff --git a/example/my_broker.py b/tests/my_broker.py similarity index 100% rename from example/my_broker.py rename to tests/my_broker.py diff --git a/example/my_manager.py b/tests/my_manager.py similarity index 100% rename from example/my_manager.py rename to tests/my_manager.py diff --git a/tests/test_broker.py b/tests/test_broker.py index 8bca6d3..54a6221 100644 --- a/tests/test_broker.py +++ b/tests/test_broker.py @@ -14,7 +14,7 @@ from md5 import md5 import zmq from psutil import Process, NoSuchProcess -from .utils import 
default_config +from utils import default_config TIMEOUT = 1500 @@ -67,7 +67,7 @@ def tearDown(self): def start_broker_process(self): #TODO: call process passing a configuration file - self.broker = Popen(shlex.split('python ./example/my_broker.py'), + self.broker = Popen(shlex.split('python ./tests/my_broker.py'), stdin=PIPE, stdout=PIPE, stderr=PIPE) #TODO: use select and self.fail for line in self.broker.stdout.readline(): diff --git a/tests/test_manager.py b/tests/test_manager.py index ca5a4cc..1bbca8a 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -4,7 +4,7 @@ from signal import SIGINT, SIGKILL from time import sleep from subprocess import Popen, PIPE -from .utils import default_config +from utils import default_config import shlex import zmq @@ -27,7 +27,7 @@ def tearDown(self): self.context.term() def start_manager_process(self): - self.manager = Popen(shlex.split('python ./example/my_manager.py'), + self.manager = Popen(shlex.split('python ./tests/my_manager.py'), stdin=PIPE, stdout=PIPE, stderr=PIPE) for line in self.manager.stdout.readline(): if 'main loop' in line: diff --git a/example/workers.py b/tests/workers.py similarity index 100% rename from example/workers.py rename to tests/workers.py From f48bbe789fa96b54b1389e2f5485b74d413c1e7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Wed, 10 Oct 2012 00:51:34 -0300 Subject: [PATCH 12/37] Manager is now Broker! Thanks @flavioamieiro for the suggestion --- Makefile | 6 +- pypelinin/__init__.py | 5 +- pypelinin/broker.py | 24 +++--- pypelinin/client.py | 100 +++++++++++----------- pypelinin/{manager.py => router.py} | 4 +- tests/{my_manager.py => my_router.py} | 9 +- tests/test_broker.py | 6 +- tests/test_client.py | 4 +- tests/{test_manager.py => test_router.py} | 42 ++++----- 9 files changed, 99 insertions(+), 101 deletions(-) rename pypelinin/{manager.py => router.py} (98%) rename tests/{my_manager.py => my_router.py} (74%) rename tests/{test_manager.py => test_router.py} (83%) diff --git a/Makefile b/Makefile index 10ab1b4..e16fd2e 100644 --- a/Makefile +++ b/Makefile @@ -14,8 +14,8 @@ bootstrap-tests: clean test: bootstrap-tests ${TEST_RUNNER} tests/ -test-manager: bootstrap-tests - ${TEST_RUNNER} tests/test_manager.py +test-router: bootstrap-tests + ${TEST_RUNNER} tests/test_router.py test-client: bootstrap-tests ${TEST_RUNNER} --with-coverage --cover-package=pypelinin.client tests/test_client.py @@ -23,4 +23,4 @@ test-client: bootstrap-tests test-broker: bootstrap-tests ${TEST_RUNNER} -x tests/test_broker.py -.PHONY: clean bootstrap-environment bootstrap-tests test test-manager test-client test-broker +.PHONY: clean bootstrap-environment bootstrap-tests test test-router test-client test-broker diff --git a/pypelinin/__init__.py b/pypelinin/__init__.py index 124f483..2ff24e5 100644 --- a/pypelinin/__init__.py +++ b/pypelinin/__init__.py @@ -1,7 +1,6 @@ # coding: utf-8 -from .manager import Manager +from .router import Router from .client import Client from .broker import Broker -#from .pipeline import PipelineManager, Worker -#from .pipeliner import Pipeliner +#from .pipeline import Pipeliner, PipelineManager, Worker diff --git a/pypelinin/broker.py b/pypelinin/broker.py index 9558f43..ea14812 100755 --- a/pypelinin/broker.py +++ b/pypelinin/broker.py @@ -176,19 +176,19 @@ def __init__(self, api, broadcast, store_class, workers, logger, def request(self, message): self.send_api_request(message) - self.logger.info('[API] Request to manager: {}'.format(message)) + 
self.logger.info('[API] Request to router: {}'.format(message)) def get_reply(self): message = self.get_api_reply() - self.logger.info('[API] Reply from manager: {}'.format(message)) + self.logger.info('[API] Reply from router: {}'.format(message)) return message def get_configuration(self): self.request({'command': 'get configuration'}) self._config = self.get_reply() - def connect_to_manager(self): - self.logger.info('Trying to connect to manager...') + def connect_to_router(self): + self.logger.info('Trying to connect to router...') self.connect(api=self._api_address, broadcast=self._broadcast_address) api_host, api_port = self._api_address.split('//')[1].split(':') self.ip = get_outgoing_ip((api_host, int(api_port))) @@ -220,7 +220,7 @@ def save_monitoring_information(self): def start(self): try: self.started_at = time() - self.connect_to_manager() + self.connect_to_router() self.broadcast_subscribe('new job') self.get_configuration() self._store = self.StoreClass(**self._config['store']) @@ -254,23 +254,23 @@ def get_a_job(self): for i in range(self.worker_pool.available()): self.request({'command': 'get job'}) message = self.get_reply() - #TODO: if manager stops and doesn't answer, broker will stop here + #TODO: if router stops and doesn't answer, broker will stop here if 'worker' in message and message['worker'] is None: break # Don't have a job, stop asking elif 'worker' in message and 'data' in message: if message['worker'] not in self.available_workers: self.logger.info('Ignoring job (inexistent worker): {}'.format(message)) - #TODO: send a 'rejecting job' request to Manager + #TODO: send a 'rejecting job' request to router else: self.start_job(message) else: self.logger.info('Ignoring malformed job: {}'.format(message)) - #TODO: send a 'rejecting job' request to Manager + #TODO: send a 'rejecting job' request to router - def manager_has_job(self): + def router_has_job(self): if self.broadcast_poll(self.poll_time): message = self.broadcast_receive() - self.logger.info('[Broadcast] Received from manager: {}'\ + self.logger.info('[Broadcast] Received from router: {}'\ .format(message)) #TODO: what if broker subscribe to another thing? return True @@ -315,7 +315,7 @@ def check_if_some_job_finished_and_do_what_you_need_to(self): 'job id': job_id, 'duration': end_time - start_time, 'message': "Can't save information on store"}) - #TODO: handle this on Manager + #TODO: handle this on router else: self.request({'command': 'job finished', 'job id': job_id, @@ -329,7 +329,7 @@ def run(self): while True: if self.should_save_monitoring_information_now(): self.save_monitoring_information() - if not self.full_of_jobs() and self.manager_has_job(): + if not self.full_of_jobs() and self.router_has_job(): self.get_a_job() self.check_if_some_job_finished_and_do_what_you_need_to() diff --git a/pypelinin/client.py b/pypelinin/client.py index 1b8add1..72e65dd 100755 --- a/pypelinin/client.py +++ b/pypelinin/client.py @@ -4,10 +4,10 @@ class Client(object): - '''Base class to communicate with pypelinin's Manager + '''Base class to communicate with pypelinin's Router Probably you don't want to use this class by hand since it does not - implement Manager's protocol. Use one of the class that subclass `Client`, + implement Router's protocol. Use one of the class that subclass `Client`, as `pypelinin.Broker` and `pypelinin.Pipeliner`. ''' #TODO: validate all received data (types, keys etc.) 
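The `Client` docstring just above describes the two channels to the Router: the API side is a zmq REQ socket paired with the Router's REP socket, and the broadcast side is a SUB socket listening to the Router's PUB socket. A minimal usage sketch, assuming a Router bound to the addresses used in the tests (`tcp://*:5555` for the API and `tcp://*:5556` for broadcast):

```python
from pypelinin import Client

client = Client()
client.connect(api='tcp://localhost:5555', broadcast='tcp://localhost:5556')
client.broadcast_subscribe('new job')

# REQ/REP round-trip on the API channel
client.send_api_request({'command': 'get configuration'})
if client.api_poll(1000):  # timeout in milliseconds
    configuration = client.get_api_reply()

# PUB/SUB side: check whether the Router announced new work
if client.broadcast_poll(1000):
    announcement = client.broadcast_receive()  # e.g. 'new job'

client.disconnect()
```

`Broker` drives essentially this sequence in `connect_to_router`, `get_configuration` and `router_has_job`.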
@@ -17,54 +17,54 @@ def __init__(self): self.context = Context() self.api_address = None self.broadcast_address = None - self._manager_api = None - self._manager_broadcast = None + self._router_api = None + self._router_broadcast = None def __del__(self): self.disconnect_api(silent=True) self.disconnect_broadcast(silent=True) def connect(self, api=None, broadcast=None): - '''Connect to Manager's API and/or broadcast channel(s) + '''Connect to Router's API and/or broadcast channel(s) API and broadcast addresses should be specified in this form: `tcp://ip-address-or-host:port`, like in `tcp://127.0.0.1:5555`. ''' if api is broadcast is None: - raise ValueError("At least one of the Manager's communication " + raise ValueError("At least one of the Router's communication " "channels (broadcast or API) need to be specified") else: if api is not None: self.api_address = api - self._manager_api = self.context.socket(REQ) - self._manager_api.connect(api) - self._manager_api.linger = 0 + self._router_api = self.context.socket(REQ) + self._router_api.connect(api) + self._router_api.linger = 0 if broadcast is not None: self.broadcast_address = broadcast - self._manager_broadcast = self.context.socket(SUB) - self._manager_broadcast.connect(broadcast) - self._manager_broadcast.linger = 0 + self._router_broadcast = self.context.socket(SUB) + self._router_broadcast.connect(broadcast) + self._router_broadcast.linger = 0 def send_api_request(self, data): - '''Send an API request to Manager + '''Send an API request to Router `data` needs to be a pickleable `dict`. ''' - if self._manager_api is None: - raise RuntimeError("Not connected to Manager's API channel") + if self._router_api is None: + raise RuntimeError("Not connected to Router's API channel") else: - return self._manager_api.send_json(data) + return self._router_api.send_json(data) def get_api_reply(self): - '''Receive an API reply from Manager + '''Receive an API reply from Router It'll hang if you didn't send a request (using `send_api_request`). The return data is a `dict`. ''' - if self._manager_api is None: - raise RuntimeError("Not connected to Manager's API channel") + if self._router_api is None: + raise RuntimeError("Not connected to Router's API channel") else: - return self._manager_api.recv_json() + return self._router_api.recv_json() def api_poll(self, timeout=0): '''Poll API channel until `timeout` (in milliseconds) @@ -72,68 +72,68 @@ def api_poll(self, timeout=0): Return `True`/`False` if there is any to be received (or not). If it returns `True` so you can use `get_api_reply` and it won't hang. ''' - if self._manager_api is None: - raise RuntimeError("Not connected to Manager's API channel") + if self._router_api is None: + raise RuntimeError("Not connected to Router's API channel") else: - return self._manager_api.poll(timeout) + return self._router_api.poll(timeout) def broadcast_subscribe(self, subscribe_to): - '''Subscribe to a Manager's broadcast type + '''Subscribe to a Router's broadcast type `subscribe_to` needs to be a string. 
''' - if self._manager_broadcast is None: - raise RuntimeError("Not connected to Manager's broadcast channel") + if self._router_broadcast is None: + raise RuntimeError("Not connected to Router's broadcast channel") else: - return self._manager_broadcast.setsockopt(SUBSCRIBE, subscribe_to) + return self._router_broadcast.setsockopt(SUBSCRIBE, subscribe_to) def broadcast_unsubscribe(self, unsubscribe_to): - if self._manager_broadcast is None: - raise RuntimeError("Not connected to Manager's broadcast channel") + if self._router_broadcast is None: + raise RuntimeError("Not connected to Router's broadcast channel") else: - return self._manager_broadcast.setsockopt(UNSUBSCRIBE, + return self._router_broadcast.setsockopt(UNSUBSCRIBE, unsubscribe_to) def broadcast_poll(self, timeout=0): - if self._manager_broadcast is None: - raise RuntimeError("Not connected to Manager's broadcast channel") + if self._router_broadcast is None: + raise RuntimeError("Not connected to Router's broadcast channel") else: - return self._manager_broadcast.poll(timeout) + return self._router_broadcast.poll(timeout) def broadcast_receive(self): - if self._manager_broadcast is None: - raise RuntimeError("Not connected to Manager's broadcast channel") + if self._router_broadcast is None: + raise RuntimeError("Not connected to Router's broadcast channel") else: - return self._manager_broadcast.recv() + return self._router_broadcast.recv() def disconnect_api(self, silent=False): - '''Disconnect from Manager's API channel + '''Disconnect from Router's API channel Raise RuntimeError if not connected to API channel and `silent=False` ''' - if self._manager_api is None and not silent: - raise RuntimeError("Not connected to Manager's API channel") - elif self._manager_api is not None: - self._manager_api.close() - self._manager_api = None + if self._router_api is None and not silent: + raise RuntimeError("Not connected to Router's API channel") + elif self._router_api is not None: + self._router_api.close() + self._router_api = None def disconnect_broadcast(self, silent=False): - '''Disconnect from Manager's broadcast channel + '''Disconnect from Router's broadcast channel Raise RuntimeError if not connected to broadcast channel ''' - if self._manager_broadcast is None and not silent: - raise RuntimeError("Not connected to Manager's broadcast channel") - elif self._manager_broadcast is not None: - self._manager_broadcast.close() - self._manager_broadcast = None + if self._router_broadcast is None and not silent: + raise RuntimeError("Not connected to Router's broadcast channel") + elif self._router_broadcast is not None: + self._router_broadcast.close() + self._router_broadcast = None def disconnect(self, silent=False): - '''Disconnect from both Manager's API and broadcast channels + '''Disconnect from both Router's API and broadcast channels Raise RuntimeError if not connected to at least one of both channels ''' - if self._manager_broadcast is self._manager_api is None and not silent: + if self._router_broadcast is self._router_api is None and not silent: raise RuntimeError("Not connected") else: self.disconnect_api(silent=True) diff --git a/pypelinin/manager.py b/pypelinin/router.py similarity index 98% rename from pypelinin/manager.py rename to pypelinin/router.py index 2bba4eb..9bc56db 100755 --- a/pypelinin/manager.py +++ b/pypelinin/router.py @@ -7,7 +7,7 @@ import zmq -class Manager(object): +class Router(object): #TODO: add another queue for processing jobs #TODO: add a timeout for processing jobs (default or get it 
from client) #TODO: if processing job have timeout, remove from processing queue, add @@ -16,7 +16,7 @@ class Manager(object): #TODO: handle 'job failed' messages #TODO: some attributes should start with '_' def __init__(self, api_host_port, broadcast_host_port, config, logger=None, - logger_name='Manager'): + logger_name='Router'): self.job_queue = Queue() self.pipeline_queue = Queue() #TODO: should persist jobs and recover in case of failure diff --git a/tests/my_manager.py b/tests/my_router.py similarity index 74% rename from tests/my_manager.py rename to tests/my_router.py index b8f43bb..b2dc1fc 100644 --- a/tests/my_manager.py +++ b/tests/my_router.py @@ -3,11 +3,11 @@ from sys import stdout from logging import Logger, StreamHandler, Formatter -from pypelinin import Manager +from pypelinin import Router def main(): - logger = Logger('Manager') + logger = Logger('Test Router') handler = StreamHandler(stdout) formatter = Formatter('%(asctime)s - %(name)s - %(levelname)s - ' '%(message)s') @@ -16,9 +16,8 @@ def main(): api_host_port = ('*', 5555) broadcast_host_port = ('*', 5556) default_config = {'store': {'data': 'test'}, 'monitoring interval': 60, } - manager = Manager(api_host_port, broadcast_host_port, default_config, - logger) - manager.start() + router = Router(api_host_port, broadcast_host_port, default_config, logger) + router.start() if __name__ == '__main__': main() diff --git a/tests/test_broker.py b/tests/test_broker.py index 54a6221..ad51ca5 100644 --- a/tests/test_broker.py +++ b/tests/test_broker.py @@ -57,7 +57,7 @@ def tearDownClass(cls): def setUp(self): self.context = zmq.Context() - self.start_manager_sockets() + self.start_router_sockets() self.start_broker_process() def tearDown(self): @@ -100,7 +100,7 @@ def end_broker_process(self): for child_pid in children: _kill(child_pid, timeout=TIMEOUT / 1000.0) - def start_manager_sockets(self): + def start_router_sockets(self): self.api = self.context.socket(zmq.REP) self.broadcast = self.context.socket(zmq.PUB) self.api.bind('tcp://*:5555') @@ -167,7 +167,7 @@ def test_should_ask_for_a_job_after_configuration(self): job = {'worker': 'Dummy', 'data': {'id': '1'}, 'job id': '2'} self.send_and_receive_jobs([job]) - def test_should_send_get_job_just_after_manager_broadcast_new_job(self): + def test_should_send_get_job_just_after_router_broadcast_new_job(self): self.receive_get_configuration_and_send_it_to_broker() self.send_and_receive_jobs([{'worker': None}]) self.broker_should_be_quiet() diff --git a/tests/test_client.py b/tests/test_client.py index f19f700..1c828f0 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -15,13 +15,13 @@ class TestClient(unittest.TestCase): def setUp(self): self.context = Context() - self.start_manager_sockets() + self.start_router_sockets() def tearDown(self): self.close_sockets() self.context.term() - def start_manager_sockets(self): + def start_router_sockets(self): self.api = self.context.socket(REP) self.broadcast = self.context.socket(PUB) self.api.bind(API_BIND_ADDRESS) diff --git a/tests/test_manager.py b/tests/test_router.py similarity index 83% rename from tests/test_manager.py rename to tests/test_router.py index 1bbca8a..1d1623b 100644 --- a/tests/test_manager.py +++ b/tests/test_router.py @@ -11,10 +11,10 @@ time_to_wait = 150 -class TestManager(unittest.TestCase): +class TestRouter(unittest.TestCase): def setUp(self): self.context = zmq.Context() - self.start_manager_process() + self.start_router_process() self.api = self.context.socket(zmq.REQ) 
self.api.connect('tcp://localhost:5555') self.broadcast = self.context.socket(zmq.SUB) @@ -22,38 +22,38 @@ def setUp(self): self.broadcast.setsockopt(zmq.SUBSCRIBE, 'new job') def tearDown(self): - self.end_manager_process() + self.end_router_process() self.close_sockets() self.context.term() - def start_manager_process(self): - self.manager = Popen(shlex.split('python ./tests/my_manager.py'), + def start_router_process(self): + self.router = Popen(shlex.split('python ./tests/my_router.py'), stdin=PIPE, stdout=PIPE, stderr=PIPE) - for line in self.manager.stdout.readline(): + for line in self.router.stdout.readline(): if 'main loop' in line: break - def end_manager_process(self): - self.manager.send_signal(SIGINT) + def end_router_process(self): + self.router.send_signal(SIGINT) sleep(time_to_wait / 1000.0) - self.manager.send_signal(SIGKILL) - self.manager.wait() + self.router.send_signal(SIGKILL) + self.router.wait() def close_sockets(self): self.api.close() self.broadcast.close() - def test_connect_to_manager_api_zmq_socket_and_execute_undefined_command(self): + def test_connect_to_router_api_zmq_socket_and_execute_undefined_command(self): self.api.send_json({'spam': 'eggs'}) if not self.api.poll(time_to_wait): - self.fail("Didn't receive 'undefined command' from manager") + self.fail("Didn't receive 'undefined command' from router") message = self.api.recv_json() self.assertEqual(message, {'answer': 'undefined command'}) - def test_should_connect_to_manager_api_zmq_socket(self): + def test_should_connect_to_router_api_zmq_socket(self): self.api.send_json({'command': 'hello'}) if not self.api.poll(time_to_wait): - self.fail("Didn't receive 'unknown command' from manager") + self.fail("Didn't receive 'unknown command' from router") message = self.api.recv_json() self.assertEqual(message, {'answer': 'unknown command'}) @@ -71,7 +71,7 @@ def test_should_receive_new_job_from_broadcast_when_a_job_is_submitted(self): def test_command_get_configuration_should_return_dict_passed_on_setUp(self): self.api.send_json({'command': 'get configuration'}) if not self.api.poll(time_to_wait): - self.fail("Didn't receive configuration from manager") + self.fail("Didn't receive configuration from router") message = self.api.recv_json() self.assertEqual(message, default_config) @@ -79,7 +79,7 @@ def test_command_add_job_should_return_a_job_id(self): cmd = {'command': 'add job', 'worker': 'test', 'data': 'eggs'} self.api.send_json(cmd) if not self.api.poll(time_to_wait): - self.fail("Didn't receive 'job accepted' from manager") + self.fail("Didn't receive 'job accepted' from router") message = self.api.recv_json() self.assertEqual(message['answer'], 'job accepted') self.assertIn('job id', message) @@ -88,7 +88,7 @@ def test_command_add_job_should_return_a_job_id(self): def test_command_get_job_should_return_empty_if_no_job(self): self.api.send_json({'command': 'get job'}) if not self.api.poll(time_to_wait): - self.fail("Didn't receive job (None) from manager") + self.fail("Didn't receive job (None) from router") message = self.api.recv_json() self.assertEqual(message['worker'], None) @@ -100,7 +100,7 @@ def test_command_get_job_should_return_a_job_after_adding_one(self): job = self.api.recv_json() self.api.send_json({'command': 'get job'}) if not self.api.poll(time_to_wait): - self.fail("Didn't receive job from manager") + self.fail("Didn't receive job from router") message = self.api.recv_json() self.assertEqual(message['worker'], 'spam') self.assertEqual(message['data'], 'eggs') @@ -110,7 +110,7 @@ def 
test_command_get_job_should_return_a_job_after_adding_one(self): def test_finished_job_without_job_id_should_return_error(self): self.api.send_json({'command': 'job finished'}) if not self.api.poll(time_to_wait): - self.fail("Didn't receive 'syntax error' from manager") + self.fail("Didn't receive 'syntax error' from router") message = self.api.recv_json() self.assertEqual(message['answer'], 'syntax error') @@ -118,7 +118,7 @@ def test_finished_job_with_unknown_job_id_should_return_error(self): self.api.send_json({'command': 'job finished', 'job id': 'python rlz', 'duration': 0.1}) if not self.api.poll(time_to_wait): - self.fail("Didn't receive 'unknown job id' from manager") + self.fail("Didn't receive 'unknown job id' from router") message = self.api.recv_json() self.assertEqual(message['answer'], 'unknown job id') @@ -132,7 +132,7 @@ def test_finished_job_with_correct_job_id_should_return_good_job(self): 'job id': message['job id'], 'duration': 0.1}) if not self.api.poll(time_to_wait): - self.fail("Didn't receive 'good job!' from manager. " + self.fail("Didn't receive 'good job!' from router. " "#foreveralone :-(") message = self.api.recv_json() self.assertEqual(message['answer'], 'good job!') From f136db4bc498821c38cc9dfa0024e94d2aecfba6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Wed, 10 Oct 2012 01:16:12 -0300 Subject: [PATCH 13/37] Fix local package imports --- pypelinin/__init__.py | 2 +- pypelinin/broker.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pypelinin/__init__.py b/pypelinin/__init__.py index 2ff24e5..51f6748 100644 --- a/pypelinin/__init__.py +++ b/pypelinin/__init__.py @@ -3,4 +3,4 @@ from .router import Router from .client import Client from .broker import Broker -#from .pipeline import Pipeliner, PipelineManager, Worker +from .pipeline import Pipeliner#, PipelineManager, Worker diff --git a/pypelinin/broker.py b/pypelinin/broker.py index ea14812..4da14ce 100755 --- a/pypelinin/broker.py +++ b/pypelinin/broker.py @@ -6,9 +6,8 @@ from os import kill, getpid from time import sleep, time from signal import SIGKILL -from pypelinin import Client -from pypelinin.monitoring import (get_host_info, get_outgoing_ip, - get_process_info) +from . import Client +from .monitoring import (get_host_info, get_outgoing_ip, get_process_info) def worker_wrapper(pipe, workers_module_name): From 2671957dce6b2445ff93151837c4af8d706f18b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Wed, 10 Oct 2012 14:11:36 -0300 Subject: [PATCH 14/37] Replace assertEquals (deprecated) with assertEqual --- Makefile | 1 + tests/test_worker.py | 27 ++++++++++++++------------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index e16fd2e..c994564 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ TEST_RUNNER=nosetests -dsv --with-yanc clean: find -regex '.*\.pyc' -exec rm {} \; find -regex '.*~' -exec rm {} \; + rm -rf build/ reg_settings.py* bootstrap-environment: pip install -r requirements/development.txt diff --git a/tests/test_worker.py b/tests/test_worker.py index 2600e53..0996a24 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -1,25 +1,27 @@ -import unittest +# coding: utf-8 +import unittest import json -from pypelinin.worker import Worker, todict +from . 
import Worker, todict + class WorkerTest(unittest.TestCase): def test_pipeline_init(self): pipeline = Worker('worker_id') - self.assertEquals(pipeline.name, 'worker_id') - self.assertEquals(pipeline.after, []) - self.assertEquals(pipeline.serialize(), "worker: worker_id") + self.assertEqual(pipeline.name, 'worker_id') + self.assertEqual(pipeline.after, []) + self.assertEqual(pipeline.serialize(), "worker: worker_id") def test_pipeline_worker_pipe_pipeline(self): pipeline = Worker('w1') | Worker('w2') - self.assertEquals(pipeline.name, "w1") - self.assertEquals(pipeline.after, [Worker('w2')]) + self.assertEqual(pipeline.name, "w1") + self.assertEqual(pipeline.after, [Worker('w2')]) def test_pipeline_worker_pipe_parallel_pipelines_pipe_worker(self): pipeline = Worker('V1') | [Worker('V2'), Worker('V3')] | Worker('V4') - self.assertEquals(pipeline.after, + self.assertEqual(pipeline.after, [[Worker('V2'), Worker('V3')], Worker('V4')]) def test_pipeline_worker_pipe_nested_pipe_in_parallel_pipe_worker(self): @@ -27,7 +29,7 @@ def test_pipeline_worker_pipe_nested_pipe_in_parallel_pipe_worker(self): Worker('V3') ] | Worker('V4') - self.assertEquals(pipeline.after, + self.assertEqual(pipeline.after, [[Worker('V2') | Worker('A2'), Worker('V3')], Worker('V4')]) @@ -39,10 +41,9 @@ def test_complex_pipeline_to_json_and_from_json(self): jdata = json.dumps(todict(pipeline), indent=4) pipeline_from_json = Worker.from_json(jdata) - self.assertEquals(pipeline, pipeline_from_json) - self.assertEquals(pipeline_from_json.after, + self.assertEqual(pipeline, pipeline_from_json) + self.assertEqual(pipeline_from_json.after, [[Worker('V2') | Worker('A2'), Worker('V3')], Worker('V4')]) - self.assertEquals(json.dumps(todict(pipeline)), + self.assertEqual(json.dumps(todict(pipeline)), json.dumps(todict(pipeline_from_json))) - From b6005a714118b189b4614f926a67d4d7d4ce3ce5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Wed, 10 Oct 2012 20:18:39 -0300 Subject: [PATCH 15/37] Add Worker tests to Makefile --- Makefile | 5 ++++- pypelinin/__init__.py | 1 + pypelinin/worker.py | 4 +++- tests/test_worker.py | 2 +- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index c994564..f24942a 100644 --- a/Makefile +++ b/Makefile @@ -24,4 +24,7 @@ test-client: bootstrap-tests test-broker: bootstrap-tests ${TEST_RUNNER} -x tests/test_broker.py -.PHONY: clean bootstrap-environment bootstrap-tests test test-router test-client test-broker +test-pipeline: bootstrap-tests + ${TEST_RUNNER} -x tests/test_worker.py + +.PHONY: clean bootstrap-environment bootstrap-tests test test-router test-client test-broker test-pipeline diff --git a/pypelinin/__init__.py b/pypelinin/__init__.py index 51f6748..a7fdbaf 100644 --- a/pypelinin/__init__.py +++ b/pypelinin/__init__.py @@ -3,4 +3,5 @@ from .router import Router from .client import Client from .broker import Broker +from .worker import Worker, todict from .pipeline import Pipeliner#, PipelineManager, Worker diff --git a/pypelinin/worker.py b/pypelinin/worker.py index 0942263..d89e526 100644 --- a/pypelinin/worker.py +++ b/pypelinin/worker.py @@ -1,5 +1,7 @@ +# coding: utf-8 + import json -import cPickle as pickle + def todict(obj, classkey=None): if isinstance(obj, dict): diff --git a/tests/test_worker.py b/tests/test_worker.py index 0996a24..07aabdd 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -2,7 +2,7 @@ import unittest import json -from . 
import Worker, todict +from pypelinin import Worker, todict class WorkerTest(unittest.TestCase): From 4a00f53e33332c5ba57b76c3b9e0794c83906b06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Wed, 10 Oct 2012 20:24:31 -0300 Subject: [PATCH 16/37] `worker.todict` moved to `worker.Worker.to_dict` --- pypelinin/__init__.py | 2 +- pypelinin/worker.py | 12 +++++++----- tests/test_worker.py | 8 ++++---- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pypelinin/__init__.py b/pypelinin/__init__.py index a7fdbaf..c33c26d 100644 --- a/pypelinin/__init__.py +++ b/pypelinin/__init__.py @@ -3,5 +3,5 @@ from .router import Router from .client import Client from .broker import Broker -from .worker import Worker, todict +from .worker import Worker from .pipeline import Pipeliner#, PipelineManager, Worker diff --git a/pypelinin/worker.py b/pypelinin/worker.py index d89e526..9e52993 100644 --- a/pypelinin/worker.py +++ b/pypelinin/worker.py @@ -2,16 +2,15 @@ import json - -def todict(obj, classkey=None): +def _to_dict(obj, classkey=None): if isinstance(obj, dict): for k in obj.keys(): - obj[k] = todict(obj[k], classkey) + obj[k] = _to_dict(obj[k], classkey) return obj elif hasattr(obj, "__iter__"): - return [todict(v, classkey) for v in obj] + return [_to_dict(v, classkey) for v in obj] elif hasattr(obj, "__dict__"): - data = dict([(key, todict(value, classkey)) + data = dict([(key, _to_dict(value, classkey)) for key, value in obj.__dict__.iteritems() if not callable(value) and not key.startswith('_')]) if classkey is not None and hasattr(obj, "__class__"): @@ -65,3 +64,6 @@ def from_json(value): worker.after = temp_after return worker + + def to_dict(self, classkey=None): + return _to_dict(self, classkey) diff --git a/tests/test_worker.py b/tests/test_worker.py index 07aabdd..3501cd7 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -2,7 +2,7 @@ import unittest import json -from pypelinin import Worker, todict +from pypelinin import Worker class WorkerTest(unittest.TestCase): @@ -38,12 +38,12 @@ def test_complex_pipeline_to_json_and_from_json(self): Worker('V3') ] | Worker('V4') - jdata = json.dumps(todict(pipeline), indent=4) + jdata = json.dumps(pipeline.to_dict(), indent=4) pipeline_from_json = Worker.from_json(jdata) self.assertEqual(pipeline, pipeline_from_json) self.assertEqual(pipeline_from_json.after, [[Worker('V2') | Worker('A2'), Worker('V3')], Worker('V4')]) - self.assertEqual(json.dumps(todict(pipeline)), - json.dumps(todict(pipeline_from_json))) + self.assertEqual(json.dumps(pipeline.to_dict()), + json.dumps(pipeline_from_json.to_dict())) From 8886d07f4131f4a7ebfb243775a10fb4f2342874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Wed, 10 Oct 2012 20:27:31 -0300 Subject: [PATCH 17/37] Move worker.py to pipeline.py; add Pipeliner code --- pypelinin/__init__.py | 3 +- pypelinin/pipeline.py | 195 ++++++++++++++++++++++++++++++++++++++++++ pypelinin/worker.py | 69 --------------- 3 files changed, 196 insertions(+), 71 deletions(-) create mode 100755 pypelinin/pipeline.py delete mode 100644 pypelinin/worker.py diff --git a/pypelinin/__init__.py b/pypelinin/__init__.py index c33c26d..cb15ed5 100644 --- a/pypelinin/__init__.py +++ b/pypelinin/__init__.py @@ -3,5 +3,4 @@ from .router import Router from .client import Client from .broker import Broker -from .worker import Worker -from .pipeline import Pipeliner#, PipelineManager, Worker +from .pipeline import Pipeliner, Worker#, PipelineManager diff 
--git a/pypelinin/pipeline.py b/pypelinin/pipeline.py new file mode 100755 index 0000000..6e6c3e7 --- /dev/null +++ b/pypelinin/pipeline.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python +# coding: utf-8 + +#TODO: in future, pipeliner could be a worker in a broker tagged as pipeliner, +# but router needs to support broker tags + +from uuid import uuid4 +import json +from . import Client + + +def _to_dict(obj, classkey=None): + if isinstance(obj, dict): + for k in obj.keys(): + obj[k] = _to_dict(obj[k], classkey) + return obj + elif hasattr(obj, "__iter__"): + return [_to_dict(v, classkey) for v in obj] + elif hasattr(obj, "__dict__"): + data = dict([(key, _to_dict(value, classkey)) + for key, value in obj.__dict__.iteritems() + if not callable(value) and not key.startswith('_')]) + if classkey is not None and hasattr(obj, "__class__"): + data[classkey] = obj.__class__.__name__ + return data + else: + return obj + +class Worker(object): + def __init__(self, worker_name): + self.name = worker_name + self.after = [] + + def then(self, *after): + self.after.extend(list(after)) + return self + + def __or__(self, after): + self.then(*[after]) + return self + + def __eq__(self, other): + return self.name == other.name + + def __repr__(self): + return "Worker({name})".format(**self.__dict__) + + def serialize(self): + if not self.after: + return "worker: {name}".format(name=self.name) + else: + data = "main: worker: {name}".format(name=self.name) + for node in self.after: + data += " " + node.serialize() + return data + + @staticmethod + def from_json(value): + temp_after = [] + data = json.loads(value) + + if isinstance(data, list): + for node in data: + temp_after.append(Worker.from_json(json.dumps(node))) + return temp_after + + worker = Worker(data['name']) + worker.after = data['after'] + for node in worker.after: + temp_after.append(Worker.from_json(json.dumps(node))) + + worker.after = temp_after + return worker + + def to_dict(self, classkey=None): + return _to_dict(self, classkey) + +class Pipeliner(Client): + #TODO: should send monitoring information? + #TODO: should receive and handle a 'job error' from router when some job + # could not be processed (timeout, worker not found etc.) 
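The `Pipeliner` added in this patch is a long-running client of the router. A minimal launch sketch follows; the `tcp://` addresses and the logger wiring are assumptions, not taken from the patch — only the constructor signature and `start()` are:

```python
import logging

from pypelinin import Pipeliner

logging.basicConfig(level=logging.INFO)

# A logger is effectively required: __init__ logs right away even though
# the signature defaults `logger` to None.
pipeliner = Pipeliner('tcp://localhost:5555',  # router API address (assumed format)
                      'tcp://localhost:5556',  # router broadcast address (assumed format)
                      logger=logging.getLogger('pipeliner'))
pipeliner.start()  # connects, subscribes to 'new pipeline' and loops forever
```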
+ + def __init__(self, api_host_port, broadcast_host_port, logger=None, + poll_time=50): + super(Pipeliner, self).__init__() + self.api_host_port = api_host_port + self.broadcast_host_port = broadcast_host_port + self.logger = logger + self.poll_time = poll_time + self._new_pipelines = 0 + self._messages = [] + self._pipelines = {} + self._jobs = {} + self.logger.info('Pipeliner started') + + def start(self): + try: + self.connect(self.api_host_port, self.broadcast_host_port) + self.broadcast_subscribe('new pipeline') + self.run() + except KeyboardInterrupt: + self.logger.info('Got SIGNINT (KeyboardInterrupt), exiting.') + self.close_sockets() + + def _update_broadcast(self): + if self.broadcast_poll(self.poll_time): + message = self.broadcast_receive() + self.logger.info('Received from broadcast: {}'.format(message)) + if message.startswith('new pipeline'): + self._new_pipelines += 1 + else: + self._messages.append(message) + + def router_has_new_pipeline(self): + self._update_broadcast() + return self._new_pipelines > 0 + + def ask_for_a_pipeline(self): + self.send_api_request({'command': 'get pipeline'}) + message = self.get_api_reply() + #TODO: if router stops and doesn't answer, pipeliner will stop here + if 'data' in message: + if message['data'] is not None: + self.logger.info('Got this pipeline: {}'.format(message)) + self._new_pipelines -= 1 + return message + elif 'pipeline' in message and message['pipeline'] is None: + self.logger.info('Bad bad router, no pipeline for me.') + return None + else: + self.logger.info('Ignoring malformed pipeline: {}'.format(message)) + #TODO: send a 'rejecting pipeline' request to router + return None + + def get_a_pipeline(self): + data = self.ask_for_a_pipeline() + if data is not None: + self.start_pipeline(data) + + def _send_job(self, worker): + job = {'command': 'add job', 'worker': worker.name, + 'data': worker.data} + self.logger.info('Sending new job: {}'.format(job)) + self.send_api_request(job) + self.logger.info('Sent job: {}'.format(job)) + message = self.get_api_reply() + self.logger.info('Received from router API: {}'.format(message)) + self._jobs[message['job id']] = worker + subscribe_message = 'job finished: {}'.format(message['job id']) + self.broadcast_subscribe(subscribe_message) + self.logger.info('Subscribed on router Broadcast to: {}'\ + .format(subscribe_message)) + + def start_pipeline(self, data): + pipeline_id = data['pipeline id'] + workers = Worker('downloader') + workers.pipeline = pipeline_id + workers.data = data['data'] + self._pipelines[pipeline_id] = [workers] + self._send_job(workers) + + def verify_jobs(self): + self._update_broadcast() + new_messages = [] + for message in self._messages: + if message.startswith('job finished: '): + job_id = message.split(': ')[1].split(' ')[0] + self.logger.info('Processing finished job id {}.'.format(job_id)) + worker = self._jobs[job_id] + self._pipelines[worker.pipeline].remove(worker) + for next_worker in worker.after: + next_worker.data = worker.data + next_worker.pipeline = worker.pipeline + self._pipelines[worker.pipeline].append(next_worker) + self._send_job(next_worker) + del self._jobs[job_id] + if not self._pipelines[worker.pipeline]: + self.send_api_request({'command': 'pipeline finished', + 'pipeline id': worker.pipeline}) + self.get_api_reply() + #TODO: check reply + del self._pipelines[worker.pipeline] + self.logger.info('Finished pipeline {}'\ + .format(worker.pipeline)) + self.get_a_pipeline() + self.broadcast_unsubscribe(message) + self._messages = [] + + def 
run(self): + self.logger.info('Entering main loop') + self.get_a_pipeline() + while True: + if self.router_has_new_pipeline(): + self.get_a_pipeline() + self.verify_jobs() diff --git a/pypelinin/worker.py b/pypelinin/worker.py deleted file mode 100644 index 9e52993..0000000 --- a/pypelinin/worker.py +++ /dev/null @@ -1,69 +0,0 @@ -# coding: utf-8 - -import json - -def _to_dict(obj, classkey=None): - if isinstance(obj, dict): - for k in obj.keys(): - obj[k] = _to_dict(obj[k], classkey) - return obj - elif hasattr(obj, "__iter__"): - return [_to_dict(v, classkey) for v in obj] - elif hasattr(obj, "__dict__"): - data = dict([(key, _to_dict(value, classkey)) - for key, value in obj.__dict__.iteritems() - if not callable(value) and not key.startswith('_')]) - if classkey is not None and hasattr(obj, "__class__"): - data[classkey] = obj.__class__.__name__ - return data - else: - return obj - -class Worker(object): - def __init__(self, worker_name): - self.name = worker_name - self.after = [] - - def then(self, *after): - self.after.extend(list(after)) - return self - - def __or__(self, after): - self.then(*[after]) - return self - - def __eq__(self, other): - return self.name == other.name - - def __repr__(self): - return "Worker({name})".format(**self.__dict__) - - def serialize(self): - if not self.after: - return "worker: {name}".format(name=self.name) - else: - data = "main: worker: {name}".format(name=self.name) - for node in self.after: - data += " " + node.serialize() - return data - - @staticmethod - def from_json(value): - temp_after = [] - data = json.loads(value) - - if isinstance(data, list): - for node in data: - temp_after.append(Worker.from_json(json.dumps(node))) - return temp_after - - worker = Worker(data['name']) - worker.after = data['after'] - for node in worker.after: - temp_after.append(Worker.from_json(json.dumps(node))) - - worker.after = temp_after - return worker - - def to_dict(self, classkey=None): - return _to_dict(self, classkey) From 5b67f77e52787c34f4834b7c408dc5646d7df14a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Wed, 10 Oct 2012 21:07:18 -0300 Subject: [PATCH 18/37] Move setup.py to root directory --- Makefile | 2 +- pypelinin/setup.py => setup.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename pypelinin/setup.py => setup.py (100%) diff --git a/Makefile b/Makefile index f24942a..bc2b754 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ bootstrap-environment: bootstrap-tests: clean clear - python pypelinin/setup.py install + python setup.py install test: bootstrap-tests ${TEST_RUNNER} tests/ diff --git a/pypelinin/setup.py b/setup.py similarity index 100% rename from pypelinin/setup.py rename to setup.py From 4dc8e298975cdee9f288fee9a86c3849387fe1df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Thu, 11 Oct 2012 16:20:00 -0300 Subject: [PATCH 19/37] Change/simplify Worker API - Remove unused code - Removed Worker.then (now only Worker.__or__) - Removed Worker.serialize (not needed) - Replaced all `json` methods with `dict` --- Makefile | 4 ++-- pypelinin/pipeline.py | 45 ++++++++++++------------------------------- tests/test_worker.py | 17 ++++++++-------- 3 files changed, 22 insertions(+), 44 deletions(-) diff --git a/Makefile b/Makefile index bc2b754..50f3686 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ test: bootstrap-tests ${TEST_RUNNER} tests/ test-router: bootstrap-tests - ${TEST_RUNNER} tests/test_router.py + ${TEST_RUNNER} -x tests/test_router.py 
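A small round-trip sketch of the simplified `Worker` API described in this commit (worker names are illustrative; the dict layout follows the `to_dict`/`from_dict` code in the hunk below):

```python
from pypelinin import Worker

# Composition is now done only with `|` (Worker.then was removed) and
# serialization only with plain dicts (the JSON helpers are gone).
pipeline = Worker('downloader') | Worker('extractor')

as_dict = pipeline.to_dict()
# -> {'name': 'downloader', 'after': [{'name': 'extractor', 'after': []}]}

restored = Worker.from_dict(as_dict)
assert restored == pipeline                      # __eq__ compares worker names
assert restored.after == [Worker('extractor')]
```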
test-client: bootstrap-tests ${TEST_RUNNER} --with-coverage --cover-package=pypelinin.client tests/test_client.py @@ -25,6 +25,6 @@ test-broker: bootstrap-tests ${TEST_RUNNER} -x tests/test_broker.py test-pipeline: bootstrap-tests - ${TEST_RUNNER} -x tests/test_worker.py + ${TEST_RUNNER} --with-coverage --cover-package=pypelinin.pipeline tests/test_worker.py .PHONY: clean bootstrap-environment bootstrap-tests test test-router test-client test-broker test-pipeline diff --git a/pypelinin/pipeline.py b/pypelinin/pipeline.py index 6e6c3e7..e7aa843 100755 --- a/pypelinin/pipeline.py +++ b/pypelinin/pipeline.py @@ -5,16 +5,11 @@ # but router needs to support broker tags from uuid import uuid4 -import json from . import Client def _to_dict(obj, classkey=None): - if isinstance(obj, dict): - for k in obj.keys(): - obj[k] = _to_dict(obj[k], classkey) - return obj - elif hasattr(obj, "__iter__"): + if hasattr(obj, "__iter__"): return [_to_dict(v, classkey) for v in obj] elif hasattr(obj, "__dict__"): data = dict([(key, _to_dict(value, classkey)) @@ -31,46 +26,30 @@ def __init__(self, worker_name): self.name = worker_name self.after = [] - def then(self, *after): - self.after.extend(list(after)) - return self - - def __or__(self, after): - self.then(*[after]) + def __or__(self, *after): + self.after.extend(after) return self def __eq__(self, other): return self.name == other.name def __repr__(self): - return "Worker({name})".format(**self.__dict__) - - def serialize(self): - if not self.after: - return "worker: {name}".format(name=self.name) - else: - data = "main: worker: {name}".format(name=self.name) - for node in self.after: - data += " " + node.serialize() - return data + return "Worker({})".format(repr(self.name)) @staticmethod - def from_json(value): + def from_dict(data): temp_after = [] - data = json.loads(value) if isinstance(data, list): for node in data: - temp_after.append(Worker.from_json(json.dumps(node))) + temp_after.append(Worker.from_dict(node)) return temp_after - - worker = Worker(data['name']) - worker.after = data['after'] - for node in worker.after: - temp_after.append(Worker.from_json(json.dumps(node))) - - worker.after = temp_after - return worker + else: + worker = Worker(data['name']) + if data['after']: + temp_after = Worker.from_dict(data['after']) + worker.after = temp_after + return worker def to_dict(self, classkey=None): return _to_dict(self, classkey) diff --git a/tests/test_worker.py b/tests/test_worker.py index 3501cd7..7d54f83 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -1,7 +1,6 @@ # coding: utf-8 import unittest -import json from pypelinin import Worker @@ -11,7 +10,7 @@ def test_pipeline_init(self): self.assertEqual(pipeline.name, 'worker_id') self.assertEqual(pipeline.after, []) - self.assertEqual(pipeline.serialize(), "worker: worker_id") + self.assertEqual(repr(pipeline), "Worker('worker_id')") def test_pipeline_worker_pipe_pipeline(self): pipeline = Worker('w1') | Worker('w2') @@ -33,17 +32,17 @@ def test_pipeline_worker_pipe_nested_pipe_in_parallel_pipe_worker(self): [[Worker('V2') | Worker('A2'), Worker('V3')], Worker('V4')]) - def test_complex_pipeline_to_json_and_from_json(self): + def test_complex_pipeline_to_dict_and_from_dict(self): pipeline = Worker('V1') | [ Worker('V2') | Worker('A2'), Worker('V3') ] | Worker('V4') - jdata = json.dumps(pipeline.to_dict(), indent=4) - pipeline_from_json = Worker.from_json(jdata) + serialized = pipeline.to_dict() + pipeline_from_dict = Worker.from_dict(serialized) - self.assertEqual(pipeline, 
pipeline_from_json) - self.assertEqual(pipeline_from_json.after, + self.assertEqual(pipeline, pipeline_from_dict) + self.assertEqual(pipeline_from_dict.after, [[Worker('V2') | Worker('A2'), Worker('V3')], Worker('V4')]) - self.assertEqual(json.dumps(pipeline.to_dict()), - json.dumps(pipeline_from_json.to_dict())) + self.assertEqual(pipeline.to_dict(), + pipeline_from_dict.to_dict()) From 08687670bb35c39699536e0c78e7cb6299f0621f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Sat, 13 Oct 2012 01:58:34 -0300 Subject: [PATCH 20/37] Create new way to describe pipelines - Use `Pipeline` instead of `Worker` -- more robust implementation - Update README.markdown about `Pipeline` (replace `Worker` with `Pipeline`) - Add the future API of `PipelineManager` to README.markdown - Split `Pipeliner` and `Pipeline` code into 2 files --- Makefile | 2 +- README.markdown | 50 +++++++--- pypelinin/__init__.py | 3 +- pypelinin/pipeline.py | 219 ++++++++++------------------------------- pypelinin/pipeliner.py | 142 ++++++++++++++++++++++++++ tests/test_pipeline.py | 103 +++++++++++++++++++ tests/test_worker.py | 48 --------- 7 files changed, 338 insertions(+), 229 deletions(-) create mode 100755 pypelinin/pipeliner.py create mode 100644 tests/test_pipeline.py delete mode 100644 tests/test_worker.py diff --git a/Makefile b/Makefile index 50f3686..dc09915 100644 --- a/Makefile +++ b/Makefile @@ -25,6 +25,6 @@ test-broker: bootstrap-tests ${TEST_RUNNER} -x tests/test_broker.py test-pipeline: bootstrap-tests - ${TEST_RUNNER} --with-coverage --cover-package=pypelinin.pipeline tests/test_worker.py + ${TEST_RUNNER} --with-coverage --cover-package=pypelinin.pipeline tests/test_pipeline.py .PHONY: clean bootstrap-environment bootstrap-tests test test-router test-client test-broker test-pipeline diff --git a/README.markdown b/README.markdown index 4b79c3e..988f97a 100644 --- a/README.markdown +++ b/README.markdown @@ -1,29 +1,51 @@ -pypelinin -========= +pypelinin' +========== + +`pypelinin` is a python library to distribute jobs and pipelines among a +cluster. It uses ZeroMQ as its foundation framework for communication between +the daemons. + + +Architecture +------------ + +TODO: talk about Router, Broker and Pipeliner -Python library to distribute jobs and pipelines among a cluster. Usage -===== +----- + +### Daemons + +TODO: talk about starting daemons + + +### Client Pypelinin will provide a high level python dsl to describe your workflow. -Example 1: +#### Example 1 - Creating a pipeline ```python -pipeline = Worker('do a task') | [Worker('parallel task 1'), - Worker('parallel task 2')] | Worker('finalizer') +from pypelinin import Pipeline + +pipeline = Pipeline('task1') | Pipeline('parallel_1', 'parallel_2') | Pipeline('last_task') ``` -Example 2: +#### Example 2 - Submitting a pipeline to be executed -```python -pipeline = Worker('do a task') | [Worker('parallel task 1') | Worker('after 1'), - Worker('parallel task 2')] | Worker('finalizer') -``` +TODO: PipelineManager is not implemented After defined, you just have to start your pipeline. -``` -Pipeliner.start(pipeline) +```python +from pypelinin import Pipeline, PipelineManager + +manager = PipelineManager(api='tcp://localhost:5555', + broadcast='tcp://localhost:5556') +pipeline = Pipeline('task1') | Pipeline('task2') +print 'starting executing tasks...' 
+manager.start(pipeline) +pipeline.wait_finish() +print 'done' ``` diff --git a/pypelinin/__init__.py b/pypelinin/__init__.py index cb15ed5..c04425f 100644 --- a/pypelinin/__init__.py +++ b/pypelinin/__init__.py @@ -3,4 +3,5 @@ from .router import Router from .client import Client from .broker import Broker -from .pipeline import Pipeliner, Worker#, PipelineManager +#from .pipeliner import Pipeliner +from .pipeline import Pipeline#, PipelineManager diff --git a/pypelinin/pipeline.py b/pypelinin/pipeline.py index e7aa843..abc260a 100755 --- a/pypelinin/pipeline.py +++ b/pypelinin/pipeline.py @@ -1,174 +1,63 @@ -#!/usr/bin/env python # coding: utf-8 -#TODO: in future, pipeliner could be a worker in a broker tagged as pipeliner, -# but router needs to support broker tags - -from uuid import uuid4 -from . import Client - - -def _to_dict(obj, classkey=None): - if hasattr(obj, "__iter__"): - return [_to_dict(v, classkey) for v in obj] - elif hasattr(obj, "__dict__"): - data = dict([(key, _to_dict(value, classkey)) - for key, value in obj.__dict__.iteritems() - if not callable(value) and not key.startswith('_')]) - if classkey is not None and hasattr(obj, "__class__"): - data[classkey] = obj.__class__.__name__ - return data - else: - return obj - -class Worker(object): - def __init__(self, worker_name): - self.name = worker_name - self.after = [] - - def __or__(self, *after): - self.after.extend(after) - return self - - def __eq__(self, other): - return self.name == other.name +class Pipeline(object): + '''Representation of a series of parallel workers + + A `Pipeline` is basically a list of workers that will execute in parallel. + For example: + >>> my_pipeline = Pipeline('w1', 'w2') + + This represents that worker 'w1' will run in parallel to worker 'w2'. + You can combine pipelines using the `|` operator, as follows: + my_pipeline = Pipeline('s1') | Pipeline('s2') + In the example above, worker 's1' will be executed and then, sequentially, + 's2'. + The `|` operation returns a new `Pipeline` object, so you can combine your + pipeline objects easily as: + >>> p1 = Pipeline('w1.1', 'w1.2') + >>> p2 = Pipeline('s2.1') | Pipeline('s2.2') + >>> p3 = p1 | p2 + >>> print repr(p3) + Pipeline('w1.1', 'w1.2') | Pipeline('s2.1') | Pipeline('s2.2') + ''' + def __init__(self, *workers): + self._workers = [] + for worker in workers: + self._workers.append(worker) + self._after = None def __repr__(self): - return "Worker({})".format(repr(self.name)) - - @staticmethod - def from_dict(data): - temp_after = [] - - if isinstance(data, list): - for node in data: - temp_after.append(Worker.from_dict(node)) - return temp_after + workers_representation = [repr(w) for w in self._workers] + if self._after is None: + return 'Pipeline({})'.format(', '.join(workers_representation)) else: - worker = Worker(data['name']) - if data['after']: - temp_after = Worker.from_dict(data['after']) - worker.after = temp_after - return worker - - def to_dict(self, classkey=None): - return _to_dict(self, classkey) - -class Pipeliner(Client): - #TODO: should send monitoring information? - #TODO: should receive and handle a 'job error' from router when some job - # could not be processed (timeout, worker not found etc.) 
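Since `|` now returns a new `Pipeline` and serialization goes through plain dicts, here is a short sketch of how both behave (worker names are illustrative; the dict layout follows the `to_dict`/`from_dict` methods introduced in this patch):

```python
from pypelinin import Pipeline

p1 = Pipeline('task1')
p2 = Pipeline('parallel_1', 'parallel_2')
combined = p1 | p2 | Pipeline('last_task')
assert p1 == Pipeline('task1')   # `|` builds a new object, operands are untouched

data = combined.to_dict()
# {'workers': ['task1'],
#  'after': {'workers': ['parallel_1', 'parallel_2'],
#            'after': {'workers': ['last_task'], 'after': None}}}
assert Pipeline.from_dict(data) == combined
```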
- - def __init__(self, api_host_port, broadcast_host_port, logger=None, - poll_time=50): - super(Pipeliner, self).__init__() - self.api_host_port = api_host_port - self.broadcast_host_port = broadcast_host_port - self.logger = logger - self.poll_time = poll_time - self._new_pipelines = 0 - self._messages = [] - self._pipelines = {} - self._jobs = {} - self.logger.info('Pipeliner started') - - def start(self): - try: - self.connect(self.api_host_port, self.broadcast_host_port) - self.broadcast_subscribe('new pipeline') - self.run() - except KeyboardInterrupt: - self.logger.info('Got SIGNINT (KeyboardInterrupt), exiting.') - self.close_sockets() - - def _update_broadcast(self): - if self.broadcast_poll(self.poll_time): - message = self.broadcast_receive() - self.logger.info('Received from broadcast: {}'.format(message)) - if message.startswith('new pipeline'): - self._new_pipelines += 1 - else: - self._messages.append(message) - - def router_has_new_pipeline(self): - self._update_broadcast() - return self._new_pipelines > 0 - - def ask_for_a_pipeline(self): - self.send_api_request({'command': 'get pipeline'}) - message = self.get_api_reply() - #TODO: if router stops and doesn't answer, pipeliner will stop here - if 'data' in message: - if message['data'] is not None: - self.logger.info('Got this pipeline: {}'.format(message)) - self._new_pipelines -= 1 - return message - elif 'pipeline' in message and message['pipeline'] is None: - self.logger.info('Bad bad router, no pipeline for me.') - return None + workers = ', '.join(workers_representation) + return 'Pipeline({}) | {}'.format(workers, repr(self._after)) + + def __or__(self, other): + if type(self) != type(other): + raise TypeError('You can only use "|" between Pipeline objects') + p = Pipeline(*self._workers) + if self._after is None: + p._after = other else: - self.logger.info('Ignoring malformed pipeline: {}'.format(message)) - #TODO: send a 'rejecting pipeline' request to router - return None + p._after = self._after | other + return p - def get_a_pipeline(self): - data = self.ask_for_a_pipeline() - if data is not None: - self.start_pipeline(data) - - def _send_job(self, worker): - job = {'command': 'add job', 'worker': worker.name, - 'data': worker.data} - self.logger.info('Sending new job: {}'.format(job)) - self.send_api_request(job) - self.logger.info('Sent job: {}'.format(job)) - message = self.get_api_reply() - self.logger.info('Received from router API: {}'.format(message)) - self._jobs[message['job id']] = worker - subscribe_message = 'job finished: {}'.format(message['job id']) - self.broadcast_subscribe(subscribe_message) - self.logger.info('Subscribed on router Broadcast to: {}'\ - .format(subscribe_message)) - - def start_pipeline(self, data): - pipeline_id = data['pipeline id'] - workers = Worker('downloader') - workers.pipeline = pipeline_id - workers.data = data['data'] - self._pipelines[pipeline_id] = [workers] - self._send_job(workers) + def __eq__(self, other): + return type(self) == type(other) and \ + self._workers == other._workers and \ + self._after == other._after - def verify_jobs(self): - self._update_broadcast() - new_messages = [] - for message in self._messages: - if message.startswith('job finished: '): - job_id = message.split(': ')[1].split(' ')[0] - self.logger.info('Processing finished job id {}.'.format(job_id)) - worker = self._jobs[job_id] - self._pipelines[worker.pipeline].remove(worker) - for next_worker in worker.after: - next_worker.data = worker.data - next_worker.pipeline = worker.pipeline - 
self._pipelines[worker.pipeline].append(next_worker) - self._send_job(next_worker) - del self._jobs[job_id] - if not self._pipelines[worker.pipeline]: - self.send_api_request({'command': 'pipeline finished', - 'pipeline id': worker.pipeline}) - self.get_api_reply() - #TODO: check reply - del self._pipelines[worker.pipeline] - self.logger.info('Finished pipeline {}'\ - .format(worker.pipeline)) - self.get_a_pipeline() - self.broadcast_unsubscribe(message) - self._messages = [] + def to_dict(self): + after = None + if self._after is not None: + after = self._after.to_dict() + return {'workers': self._workers, 'after': after} - def run(self): - self.logger.info('Entering main loop') - self.get_a_pipeline() - while True: - if self.router_has_new_pipeline(): - self.get_a_pipeline() - self.verify_jobs() + @staticmethod + def from_dict(data): + p = Pipeline(*data['workers']) + if data['after'] is not None: + p = p | Pipeline.from_dict(data['after']) + return p diff --git a/pypelinin/pipeliner.py b/pypelinin/pipeliner.py new file mode 100755 index 0000000..8869aea --- /dev/null +++ b/pypelinin/pipeliner.py @@ -0,0 +1,142 @@ +# coding: utf-8 + +#TODO: in future, pipeliner could be a worker in a broker tagged as pipeliner, +# but router needs to support broker tags + +from time import time +from uuid import uuid4 +from . import Client + + +class Pipeliner(Client): + #TODO: should send monitoring information? + #TODO: should receive and handle a 'job error' from router when some job + # could not be processed (timeout, worker not found etc.) + + def __init__(self, api, broadcast, logger=None, poll_time=50): + super(Pipeliner, self).__init__() + self._api_address = api + self._broadcast_address = broadcast + self.logger = logger + self.poll_time = poll_time + self._new_pipelines = None + self._messages = [] + self._pipelines = {} + self._jobs = {} + self.logger.info('Pipeliner started') + + def start(self): + try: + self.connect(self._api_address, self._broadcast_address) + self.broadcast_subscribe('new pipeline') + self.run() + except KeyboardInterrupt: + self.logger.info('Got SIGNINT (KeyboardInterrupt), exiting.') + self.disconnect() + + def _update_broadcast(self): + if self.broadcast_poll(self.poll_time): + message = self.broadcast_receive() + self.logger.info('Received from broadcast: {}'.format(message)) + if message.startswith('new pipeline'): + if self._new_pipelines is None: + self._new_pipelines = 1 + else: + self._new_pipelines += 1 + else: + self._messages.append(message) + + def router_has_new_pipeline(self): + self._update_broadcast() + return self._new_pipelines > 0 + + def ask_for_a_pipeline(self): + self.send_api_request({'command': 'get pipeline'}) + message = self.get_api_reply() + #TODO: if router stops and doesn't answer, pipeliner will stop here + if 'workers' in message and 'data' in message: + if message['data'] is not None: + self.logger.info('Got this pipeline: {}'.format(message)) + if self._new_pipelines is None: + self._new_pipelines = 0 + else: + self._new_pipelines -= 1 + return message + else: + self._new_pipelines = 0 + elif 'pipeline' in message and message['pipeline'] is None: + self.logger.info('Bad bad router, no pipeline for me.') + return None + else: + self.logger.info('Ignoring malformed pipeline: {}'.format(message)) + #TODO: send a 'rejecting pipeline' request to router + return None + + def get_a_pipeline(self): + pipeline_definition = 42 + while pipeline_definition is not None: + pipeline_definition = self.ask_for_a_pipeline() + if pipeline_definition is 
not None: + self.start_pipeline(pipeline_definition) + + def _send_job(self, worker): + job_request = {'command': 'add job', 'worker': worker.name, + 'data': worker.data} + self.send_api_request(job_request) + self.logger.info('Sent job request: {}'.format(job_request)) + message = self.get_api_reply() + self.logger.info('Received from router API: {}'.format(message)) + self._jobs[message['job id']] = worker + subscribe_message = 'job finished: {}'.format(message['job id']) + self.broadcast_subscribe(subscribe_message) + self.logger.info('Subscribed on router broadcast to: {}'\ + .format(subscribe_message)) + + def start_pipeline(self, pipeline_definition): + pipeline = Worker.from_dict(pipeline_definition['workers']) + pipeline.pipeline_id = pipeline_definition['pipeline id'] + pipeline.data = pipeline_definition['data'] + pipeline.pipeline_started_at = time() + self._pipelines[pipeline.pipeline_id] = [pipeline] + self._send_job(pipeline) + + def verify_jobs(self): + self._update_broadcast() + new_messages = [] + for message in self._messages: + if message.startswith('job finished: '): + job_id = message.split(': ')[1].split(' ')[0] + if job_id in self._jobs: + self.logger.info('Processing finished job id {}.'.format(job_id)) + worker = self._jobs[job_id] + self._pipelines[worker.pipeline_id].remove(worker) + next_workers = worker.after + for next_worker in next_workers: + self.logger.info(' worker after: {}'.format(next_worker.name)) + next_worker.data = worker.data + next_worker.pipeline_id = worker.pipeline_id + next_worker.pipeline_started_at = worker.pipeline_started_at + self._pipelines[worker.pipeline_id].append(next_worker) + self._send_job(next_worker) + del self._jobs[job_id] + if not self._pipelines[worker.pipeline_id]: + total_time = time() - worker.pipeline_started_at + self.logger.info('Finished pipeline {}'\ + .format(worker.pipeline_id)) + self.send_api_request({'command': 'pipeline finished', + 'pipeline id': worker.pipeline_id, + 'duration': total_time}) + self.get_api_reply() + #TODO: check reply + del self._pipelines[worker.pipeline_id] + self.get_a_pipeline() + self.broadcast_unsubscribe(message) + self._messages = [] + + def run(self): + self.logger.info('Entering main loop') + self.get_a_pipeline() + while True: + if self.router_has_new_pipeline(): + self.get_a_pipeline() + self.verify_jobs() diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 0000000..b829942 --- /dev/null +++ b/tests/test_pipeline.py @@ -0,0 +1,103 @@ +# coding: utf-8 + +import unittest +from pypelinin import Pipeline + + +class PipelineTest(unittest.TestCase): + def test_pipeline_init(self): + pipeline = Pipeline('worker_name') + + self.assertEqual(pipeline._workers, ['worker_name']) + self.assertEqual(pipeline._after, None) + self.assertEqual(repr(pipeline), "Pipeline('worker_name')") + + def test_pipeline_with_more_than_one_worker(self): + pipeline = Pipeline('worker_1', 'worker_2', 'worker_3') + + self.assertEqual(pipeline._workers, ['worker_1', 'worker_2', + 'worker_3']) + self.assertEqual(pipeline._after, None) + expected = "Pipeline('worker_1', 'worker_2', 'worker_3')" + self.assertEqual(repr(pipeline), expected) + + def test_pipelines_with_same_workers_should_be_equal(self): + p1 = Pipeline('w1') + p2 = Pipeline('w1') + self.assertEqual(p1, p2) + p3 = Pipeline('w1', 'w2') + p4 = Pipeline('w1', 'w2') + self.assertEqual(p3, p4) + + def test_pipeline_after(self): + pipeline = Pipeline('w1') | Pipeline('w2') + + self.assertEqual(pipeline._workers, ['w1']) + 
self.assertEqual(pipeline._after, Pipeline('w2')) + + def test_pipeline_after_should_not_change_current_object(self): + p1 = Pipeline('w1') + p2 = Pipeline('w2') + p3 = Pipeline('w3.1', 'w3.2') + p4 = p1 | p2 | p3 + self.assertEqual(p1, Pipeline('w1')) + self.assertEqual(p2, Pipeline('w2')) + self.assertEqual(p3, Pipeline('w3.1', 'w3.2')) + self.assertEqual(p4, p1 | p2 | p3) + + def test_pipeline_after_repr(self): + self.assertEqual(repr(Pipeline('w1') | Pipeline('w2')), + "Pipeline('w1') | Pipeline('w2')") + self.assertEqual(repr(Pipeline('w1') | Pipeline('w2') | Pipeline('w3')), + "Pipeline('w1') | Pipeline('w2') | Pipeline('w3')") + + def test_after_should_raise_TypeError_if_right_object_is_not_Pipeline(self): + with self.assertRaises(TypeError): + Pipeline('w1') | 42 + with self.assertRaises(TypeError): + Pipeline('w1') | 'answer' + with self.assertRaises(TypeError): + Pipeline('w1') | 3.14 + with self.assertRaises(TypeError): + Pipeline('w1') | Exception + with self.assertRaises(TypeError): + Pipeline('w1') | Pipeline # class, not instance + + def test_pipelines_with_same_workers_and_after_should_be_equal(self): + self.assertNotEqual(Pipeline('w1') | Pipeline('w2'), Pipeline('w1')) + self.assertNotEqual(Pipeline('w1') | Pipeline('w2'), Pipeline('w2')) + self.assertEqual(Pipeline('w1') | Pipeline('w2'), + Pipeline('w1') | Pipeline('w2')) + + def test_pipeline_to_dict_should_serialize_it(self): + self.assertEqual(Pipeline('w1').to_dict(), {'workers': ['w1'], + 'after': None}) + self.assertEqual(Pipeline('w1', 'w2', 'w3').to_dict(), + {'workers': ['w1', 'w2', 'w3'], 'after': None}) + + p2 = Pipeline('w1') | Pipeline('w2') + self.assertEqual(p2.to_dict(), {'workers': ['w1'], + 'after': {'workers': ['w2'], + 'after': None}}) + p3 = Pipeline('a', 'b') | Pipeline('c', 'd') | Pipeline('e', 'f', 'g') + p3_3 = {'workers': ['e', 'f', 'g'], 'after': None} + p3_2 = {'workers': ['c', 'd'], 'after': p3_3} + p3_1 = {'workers': ['a', 'b'], 'after': p3_2} + self.assertEqual(p3.to_dict(), p3_1) + + def test_pipeline_from_dict_should_deserialize_it(self): + d1 = {'workers': ['w1'], 'after': None} + self.assertEqual(Pipeline.from_dict(d1), Pipeline('w1')) + + d2 = {'workers': ['w1', 'w2', 'w3'], 'after': None} + self.assertEqual(Pipeline.from_dict(d2), Pipeline('w1', 'w2', 'w3')) + + d3 = {'workers': ['w1'], 'after': {'workers': ['w2'], 'after': None}} + self.assertEqual(Pipeline.from_dict(d3), + Pipeline('w1') | Pipeline('w2')) + + p3 = Pipeline('a', 'b') | Pipeline('c', 'd') | Pipeline('e', 'f', 'g') + p3_3 = {'workers': ['e', 'f', 'g'], 'after': None} + p3_2 = {'workers': ['c', 'd'], 'after': p3_3} + p3_1 = {'workers': ['a', 'b'], 'after': p3_2} + self.assertEqual(Pipeline.from_dict(p3_1), p3) diff --git a/tests/test_worker.py b/tests/test_worker.py deleted file mode 100644 index 7d54f83..0000000 --- a/tests/test_worker.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding: utf-8 - -import unittest -from pypelinin import Worker - - -class WorkerTest(unittest.TestCase): - def test_pipeline_init(self): - pipeline = Worker('worker_id') - - self.assertEqual(pipeline.name, 'worker_id') - self.assertEqual(pipeline.after, []) - self.assertEqual(repr(pipeline), "Worker('worker_id')") - - def test_pipeline_worker_pipe_pipeline(self): - pipeline = Worker('w1') | Worker('w2') - - self.assertEqual(pipeline.name, "w1") - self.assertEqual(pipeline.after, [Worker('w2')]) - - def test_pipeline_worker_pipe_parallel_pipelines_pipe_worker(self): - pipeline = Worker('V1') | [Worker('V2'), Worker('V3')] | Worker('V4') - 
self.assertEqual(pipeline.after, - [[Worker('V2'), Worker('V3')], Worker('V4')]) - - def test_pipeline_worker_pipe_nested_pipe_in_parallel_pipe_worker(self): - pipeline = Worker('V1') | [ Worker('V2') | Worker('A2'), - Worker('V3') - ] | Worker('V4') - - self.assertEqual(pipeline.after, - [[Worker('V2') | Worker('A2'), - Worker('V3')], Worker('V4')]) - - def test_complex_pipeline_to_dict_and_from_dict(self): - pipeline = Worker('V1') | [ Worker('V2') | Worker('A2'), - Worker('V3') - ] | Worker('V4') - - serialized = pipeline.to_dict() - pipeline_from_dict = Worker.from_dict(serialized) - - self.assertEqual(pipeline, pipeline_from_dict) - self.assertEqual(pipeline_from_dict.after, - [[Worker('V2') | Worker('A2'), - Worker('V3')], Worker('V4')]) - self.assertEqual(pipeline.to_dict(), - pipeline_from_dict.to_dict()) From 0fae399bc076fa4d35d1ad5b09608e21ce52f0c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Sat, 13 Oct 2012 01:59:07 -0300 Subject: [PATCH 21/37] Fix misusing of `md5.md5`, correct is `uuid.uuid4` --- tests/test_broker.py | 4 ++-- tests/test_router.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_broker.py b/tests/test_broker.py index ad51ca5..b4c304a 100644 --- a/tests/test_broker.py +++ b/tests/test_broker.py @@ -11,7 +11,7 @@ from time import sleep, time from subprocess import Popen, PIPE from multiprocessing import cpu_count -from md5 import md5 +from uuid import uuid4 import zmq from psutil import Process, NoSuchProcess from utils import default_config @@ -148,7 +148,7 @@ def send_and_receive_jobs(self, jobs, wait_finished_job=False): else: job = {'worker': None} if 'job id' not in job: - job['job id'] = md5().hexdigest() + job['job id'] = uuid4().hex self.api.send_json(job) elif msg['command'] == 'job finished': self.api.send_json({'answer': 'good job!'}) diff --git a/tests/test_router.py b/tests/test_router.py index 1d1623b..43f80e3 100644 --- a/tests/test_router.py +++ b/tests/test_router.py @@ -9,6 +9,7 @@ import zmq +#TODO: validate pipeline data when requested time_to_wait = 150 class TestRouter(unittest.TestCase): From a7d6c5f4828f67800ff7b1c44db95723acf4bc9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Sun, 14 Oct 2012 15:16:00 -0300 Subject: [PATCH 22/37] New way to represent pipeline: DAGs Now using directed acyclic graphs to represent a pipeline - it's more complete than older representations. 
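As a hedged illustration of the new notation (job names are made up; the dependency direction follows the tests added in this patch, and the cycle/starter validation relies on the new python-graph dependency):

```python
from pypelinin import Job, Pipeline

# Keys are jobs (or tuples of jobs); each value lists the jobs that run
# after the key has finished.
pipeline = Pipeline({
    Job('download'): (Job('extract_text'), Job('extract_links')),
    (Job('extract_text'), Job('extract_links')): Job('index'),
})

assert pipeline.starters == (Job('download'),)   # nodes nothing depends on
assert sorted(job.name for job in pipeline.jobs) == \
        ['download', 'extract_links', 'extract_text', 'index']

# Graphs with a cycle or without a starter job are rejected:
# Pipeline({Job('A'): Job('B'), Job('B'): Job('A')})  -> raises ValueError
```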
The graph is represented using a simple `dict`, as proposed on http://www.python.org/doc/essays/graphs/ --- pypelinin/__init__.py | 2 +- pypelinin/pipeline.py | 134 ++++++++++++--------- requirements/production.txt | 2 + tests/test_pipeline.py | 231 ++++++++++++++++++++---------------- 4 files changed, 211 insertions(+), 158 deletions(-) mode change 100755 => 100644 pypelinin/pipeline.py diff --git a/pypelinin/__init__.py b/pypelinin/__init__.py index c04425f..d940218 100644 --- a/pypelinin/__init__.py +++ b/pypelinin/__init__.py @@ -3,5 +3,5 @@ from .router import Router from .client import Client from .broker import Broker +from .pipeline import Job, Pipeline#, PipelineManager #from .pipeliner import Pipeliner -from .pipeline import Pipeline#, PipelineManager diff --git a/pypelinin/pipeline.py b/pypelinin/pipeline.py old mode 100755 new mode 100644 index abc260a..d3e4a36 --- a/pypelinin/pipeline.py +++ b/pypelinin/pipeline.py @@ -1,63 +1,83 @@ # coding: utf-8 -class Pipeline(object): - '''Representation of a series of parallel workers - - A `Pipeline` is basically a list of workers that will execute in parallel. - For example: - >>> my_pipeline = Pipeline('w1', 'w2') - - This represents that worker 'w1' will run in parallel to worker 'w2'. - You can combine pipelines using the `|` operator, as follows: - my_pipeline = Pipeline('s1') | Pipeline('s2') - In the example above, worker 's1' will be executed and then, sequentially, - 's2'. - The `|` operation returns a new `Pipeline` object, so you can combine your - pipeline objects easily as: - >>> p1 = Pipeline('w1.1', 'w1.2') - >>> p2 = Pipeline('s2.1') | Pipeline('s2.2') - >>> p3 = p1 | p2 - >>> print repr(p3) - Pipeline('w1.1', 'w1.2') | Pipeline('s2.1') | Pipeline('s2.2') - ''' - def __init__(self, *workers): - self._workers = [] - for worker in workers: - self._workers.append(worker) - self._after = None +from itertools import product + +from pygraph.classes.digraph import digraph as DiGraph +from pygraph.algorithms.cycles import find_cycle +from pygraph.readwrite.dot import write + + +class Job(object): + def __init__(self, name): + self.name = name def __repr__(self): - workers_representation = [repr(w) for w in self._workers] - if self._after is None: - return 'Pipeline({})'.format(', '.join(workers_representation)) - else: - workers = ', '.join(workers_representation) - return 'Pipeline({}) | {}'.format(workers, repr(self._after)) - - def __or__(self, other): - if type(self) != type(other): - raise TypeError('You can only use "|" between Pipeline objects') - p = Pipeline(*self._workers) - if self._after is None: - p._after = other - else: - p._after = self._after | other - return p + return 'Job({})'.format(repr(self.name)) def __eq__(self, other): - return type(self) == type(other) and \ - self._workers == other._workers and \ - self._after == other._after - - def to_dict(self): - after = None - if self._after is not None: - after = self._after.to_dict() - return {'workers': self._workers, 'after': after} - - @staticmethod - def from_dict(data): - p = Pipeline(*data['workers']) - if data['after'] is not None: - p = p | Pipeline.from_dict(data['after']) - return p + return type(self) == type(other) and self.name == other.name + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return hash(self.name) + + +class Pipeline(object): + def __init__(self, pipeline): + self._original_graph = pipeline + self._normalize() + nodes = set() + for key, value in self._graph: + nodes.add(key) + nodes.add(value) + 
nodes.discard(None) + self.jobs = tuple(nodes) + self._define_starters() + self._create_digraph() + if not self._validate(): + raise ValueError('The pipeline graph have cycles or do not have a ' + 'starter job') + + def _normalize(self): + new_graph = [] + for keys, values in self._original_graph.items(): + if type(keys) is Job: + keys = [keys] + if type(values) not in (tuple, list): + values = [values] + for key in keys: + if not values: + new_graph.append((key, None)) + else: + for value in values: + new_graph.append((key, value)) + self._graph = new_graph + + def _define_starters(self): + possible_starters = set() + others = set() + for key, value in self._graph: + others.add(value) + possible_starters.add(key) + self.starters = tuple(possible_starters - others) + + def _create_digraph(self): + digraph = DiGraph() + digraph.add_nodes(self.jobs) + for edge in self._graph: + if edge[1] is not None: + digraph.add_edge(edge) + self._digraph = digraph + + def _validate(self): + #TODO: A -> B, A -> C, B -> C + if len(self.starters) == 0: + return False + if find_cycle(self._digraph): + return False + return True + + def to_dot(self): + return write(self._digraph) diff --git a/requirements/production.txt b/requirements/production.txt index d4da163..b13d6c3 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -1,2 +1,4 @@ pyzmq psutil +python-graph-core +python-graph-dot diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index b829942..7f1580c 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,103 +1,134 @@ # coding: utf-8 import unittest -from pypelinin import Pipeline - - -class PipelineTest(unittest.TestCase): - def test_pipeline_init(self): - pipeline = Pipeline('worker_name') - - self.assertEqual(pipeline._workers, ['worker_name']) - self.assertEqual(pipeline._after, None) - self.assertEqual(repr(pipeline), "Pipeline('worker_name')") - - def test_pipeline_with_more_than_one_worker(self): - pipeline = Pipeline('worker_1', 'worker_2', 'worker_3') - - self.assertEqual(pipeline._workers, ['worker_1', 'worker_2', - 'worker_3']) - self.assertEqual(pipeline._after, None) - expected = "Pipeline('worker_1', 'worker_2', 'worker_3')" - self.assertEqual(repr(pipeline), expected) - - def test_pipelines_with_same_workers_should_be_equal(self): - p1 = Pipeline('w1') - p2 = Pipeline('w1') - self.assertEqual(p1, p2) - p3 = Pipeline('w1', 'w2') - p4 = Pipeline('w1', 'w2') - self.assertEqual(p3, p4) - - def test_pipeline_after(self): - pipeline = Pipeline('w1') | Pipeline('w2') - - self.assertEqual(pipeline._workers, ['w1']) - self.assertEqual(pipeline._after, Pipeline('w2')) - - def test_pipeline_after_should_not_change_current_object(self): - p1 = Pipeline('w1') - p2 = Pipeline('w2') - p3 = Pipeline('w3.1', 'w3.2') - p4 = p1 | p2 | p3 - self.assertEqual(p1, Pipeline('w1')) - self.assertEqual(p2, Pipeline('w2')) - self.assertEqual(p3, Pipeline('w3.1', 'w3.2')) - self.assertEqual(p4, p1 | p2 | p3) - - def test_pipeline_after_repr(self): - self.assertEqual(repr(Pipeline('w1') | Pipeline('w2')), - "Pipeline('w1') | Pipeline('w2')") - self.assertEqual(repr(Pipeline('w1') | Pipeline('w2') | Pipeline('w3')), - "Pipeline('w1') | Pipeline('w2') | Pipeline('w3')") - - def test_after_should_raise_TypeError_if_right_object_is_not_Pipeline(self): - with self.assertRaises(TypeError): - Pipeline('w1') | 42 - with self.assertRaises(TypeError): - Pipeline('w1') | 'answer' - with self.assertRaises(TypeError): - Pipeline('w1') | 3.14 - with self.assertRaises(TypeError): 
- Pipeline('w1') | Exception - with self.assertRaises(TypeError): - Pipeline('w1') | Pipeline # class, not instance - - def test_pipelines_with_same_workers_and_after_should_be_equal(self): - self.assertNotEqual(Pipeline('w1') | Pipeline('w2'), Pipeline('w1')) - self.assertNotEqual(Pipeline('w1') | Pipeline('w2'), Pipeline('w2')) - self.assertEqual(Pipeline('w1') | Pipeline('w2'), - Pipeline('w1') | Pipeline('w2')) - - def test_pipeline_to_dict_should_serialize_it(self): - self.assertEqual(Pipeline('w1').to_dict(), {'workers': ['w1'], - 'after': None}) - self.assertEqual(Pipeline('w1', 'w2', 'w3').to_dict(), - {'workers': ['w1', 'w2', 'w3'], 'after': None}) - - p2 = Pipeline('w1') | Pipeline('w2') - self.assertEqual(p2.to_dict(), {'workers': ['w1'], - 'after': {'workers': ['w2'], - 'after': None}}) - p3 = Pipeline('a', 'b') | Pipeline('c', 'd') | Pipeline('e', 'f', 'g') - p3_3 = {'workers': ['e', 'f', 'g'], 'after': None} - p3_2 = {'workers': ['c', 'd'], 'after': p3_3} - p3_1 = {'workers': ['a', 'b'], 'after': p3_2} - self.assertEqual(p3.to_dict(), p3_1) - - def test_pipeline_from_dict_should_deserialize_it(self): - d1 = {'workers': ['w1'], 'after': None} - self.assertEqual(Pipeline.from_dict(d1), Pipeline('w1')) - - d2 = {'workers': ['w1', 'w2', 'w3'], 'after': None} - self.assertEqual(Pipeline.from_dict(d2), Pipeline('w1', 'w2', 'w3')) - - d3 = {'workers': ['w1'], 'after': {'workers': ['w2'], 'after': None}} - self.assertEqual(Pipeline.from_dict(d3), - Pipeline('w1') | Pipeline('w2')) - - p3 = Pipeline('a', 'b') | Pipeline('c', 'd') | Pipeline('e', 'f', 'g') - p3_3 = {'workers': ['e', 'f', 'g'], 'after': None} - p3_2 = {'workers': ['c', 'd'], 'after': p3_3} - p3_1 = {'workers': ['a', 'b'], 'after': p3_2} - self.assertEqual(Pipeline.from_dict(p3_1), p3) +from textwrap import dedent +from pypelinin import Job, Pipeline + + +class GraphTest(unittest.TestCase): + def test_jobs(self): + result = Pipeline({Job('A'): [Job('B')], + Job('B'): [Job('C'), Job('D'), Job('E')], + Job('Z'): [Job('W')], + Job('W'): Job('A')}).jobs + expected = (Job('A'), Job('B'), Job('C'), Job('D'), Job('E'), Job('W'), + Job('Z')) + self.assertEqual(set(result), set(expected)) + + def test_get_starters(self): + result = Pipeline({Job('A'): []}).starters + expected = (Job('A'),) + self.assertEqual(set(result), set(expected)) + + result = Pipeline({Job('A'): [], Job('B'): []}).starters + expected = (Job('A'), Job('B')) + self.assertEqual(set(result), set(expected)) + + result = Pipeline({Job('A'): [Job('B')], Job('B'): []}).starters + expected = (Job('A'),) + self.assertEqual(set(result), set(expected)) + + result = Pipeline({Job('A'): [Job('B')], + Job('B'): [Job('C'), Job('D'), Job('E')], + Job('Z'): [Job('W')]}).starters + expected = (Job('A'), Job('Z')) + self.assertEqual(set(result), set(expected)) + + result = Pipeline({('A', 'B', 'C'): ['D']}).starters + expected = ['A', 'B', 'C'] + self.assertEqual(set(result), set(expected)) + + result = Pipeline({(Job('A'), Job('B'), Job('C')): [Job('D')], + Job('E'): (Job('B'), Job('F'))}).starters + expected = (Job('A'), Job('C'), Job('E')) + self.assertEqual(set(result), set(expected)) + + def test_normalize(self): + result = Pipeline({Job('A'): Job('B')})._graph + expected = [(Job('A'), Job('B'))] + self.assertEqual(set(result), set(expected)) + + result = Pipeline({Job('A'): [Job('B')]})._graph + expected = [(Job('A'), Job('B'))] + self.assertEqual(set(result), set(expected)) + + result = Pipeline({(Job('A'),): (Job('B'),)})._graph + expected = [(Job('A'), Job('B'))] + 
self.assertEqual(set(result), set(expected)) + + result = Pipeline({(Job('A'), Job('C')): Job('B')})._graph + expected = [(Job('A'), Job('B')), (Job('C'), Job('B'))] + self.assertEqual(set(result), set(expected)) + + result = Pipeline({('A', 'C'): ['B', 'D', 'E']})._graph + expected = [('A', 'B'), ('A', 'D'), ('A', 'E'), ('C', 'B'), ('C', 'D'), + ('C', 'E')] + self.assertEqual(set(result), set(expected)) + + result = Pipeline({Job('ABC'): []})._graph # problem here if use string + expected = [(Job('ABC'), None)] + self.assertEqual(set(result), set(expected)) + + result = Pipeline({Job('A'): [], Job('B'): []})._graph + expected = [(Job('A'), None), (Job('B'), None)] + self.assertEqual(set(result), set(expected)) + + result = Pipeline({Job('A'): [Job('B')], Job('B'): []})._graph + expected = [(Job('A'), Job('B')), (Job('B'), None)] + self.assertEqual(set(result), set(expected)) + + result = Pipeline({Job('QWE'): [Job('B')], + Job('B'): [Job('C'), Job('D'), Job('E')], + Job('Z'): [Job('W')]})._graph + expected = [(Job('QWE'), Job('B')), (Job('B'), Job('C')), + (Job('B'), Job('D')), (Job('B'), Job('E')), + (Job('Z'), Job('W'))] + self.assertEqual(set(result), set(expected)) + + result = Pipeline({(Job('A'), Job('B'), Job('C')): [Job('D')]})._graph + expected = [(Job('A'), Job('D')), (Job('B'), Job('D')), + (Job('C'), Job('D'))] + self.assertEqual(set(result), set(expected)) + + result = Pipeline({(Job('A'), Job('B'), Job('C')): [Job('D')], + Job('E'): (Job('B'), Job('F'))})._graph + expected = [(Job('A'), Job('D')), (Job('B'), Job('D')), + (Job('C'), Job('D')), (Job('E'), Job('B')), + (Job('E'), Job('F'))] + self.assertEqual(set(result), set(expected)) + + def test_validate_graph(self): + #should have at least one starter node + with self.assertRaises(ValueError): + Pipeline({Job('A'): Job('A')}) + with self.assertRaises(ValueError): + Pipeline({Job('A'): [Job('B')], Job('B'): [Job('A')]}) + + #should not have cycles + with self.assertRaises(ValueError): + print Pipeline({Job('A'): [Job('B')], Job('B'): [Job('C')], + Job('C'): [Job('B')]})._graph + with self.assertRaises(ValueError): + Pipeline({Job('A'): [Job('B')], Job('B'): [Job('C')], + Job('C'): [Job('D')], Job('D'): [Job('B')]}) + + def test_dot(self): + result = Pipeline({(Job('A'), Job('B'), Job('C')): [Job('D')], + Job('E'): (Job('B'), Job('F'))}).to_dot().strip() + expected = dedent(''' + digraph graphname { + "Job('A')"; + "Job('C')"; + "Job('B')"; + "Job('E')"; + "Job('D')"; + "Job('F')"; + "Job('A')" -> "Job('D')"; + "Job('C')" -> "Job('D')"; + "Job('B')" -> "Job('D')"; + "Job('E')" -> "Job('B')"; + "Job('E')" -> "Job('F')"; + } + ''').strip() + + self.assertEqual(result, expected) From aa04ad0506f8cd139e44488670c67b4c98720390 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Sun, 14 Oct 2012 19:01:36 -0300 Subject: [PATCH 23/37] Improve `Pipeline` and `Job` to use on `Pipeliner` --- pypelinin/pipeline.py | 40 +++++++++- tests/test_pipeline.py | 173 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 211 insertions(+), 2 deletions(-) diff --git a/pypelinin/pipeline.py b/pypelinin/pipeline.py index d3e4a36..a6d7cae 100644 --- a/pypelinin/pipeline.py +++ b/pypelinin/pipeline.py @@ -8,8 +8,9 @@ class Job(object): - def __init__(self, name): + def __init__(self, name, data=None): self.name = name + self.data = data def __repr__(self): return 'Job({})'.format(repr(self.name)) @@ -25,7 +26,9 @@ def __hash__(self): class Pipeline(object): - def __init__(self, pipeline): + def __init__(self, 
pipeline, data=None): + self.data = data + self._finished_jobs = set() self._original_graph = pipeline self._normalize() nodes = set() @@ -39,6 +42,17 @@ def __init__(self, pipeline): if not self._validate(): raise ValueError('The pipeline graph have cycles or do not have a ' 'starter job') + if data is not None: + for job in self.jobs: + job.data = data + job.pipeline = self + + self._dependencies = {job: set() for job in self.jobs} + for job_1, job_2 in self._graph: + if job_2 is None: + continue + self._dependencies[job_2].add(job_1) + self.sent_jobs = set() def _normalize(self): new_graph = [] @@ -81,3 +95,25 @@ def _validate(self): def to_dot(self): return write(self._digraph) + + def add_finished_job(self, job): + if job not in self.jobs: + raise ValueError('Job {} not in pipeline'.format(job)) + elif job in self._finished_jobs: + raise RuntimeError('Job {} was already declared as ' + 'finished'.format(job)) + self._finished_jobs.add(job) + + def finished_job(self, job): + return job in self._finished_jobs + + def finished(self): + return set(self.jobs) == self._finished_jobs + + def available_jobs(self): + available = set() + for job in self.jobs: + if self._dependencies[job].issubset(self._finished_jobs) and \ + job not in self._finished_jobs: + available.add(job) + return available diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 7f1580c..6b992fa 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -132,3 +132,176 @@ def test_dot(self): ''').strip() self.assertEqual(result, expected) + + def test_pipeline_should_propagate_data_among_jobs(self): + job_1 = Job('w1') + job_2 = Job('w2') + job_3 = Job('w3') + pipeline_data = {'python': 42} + pipeline = Pipeline({job_1: job_2, job_2: job_3}, data=pipeline_data) + self.assertEqual(pipeline.data, pipeline_data) + self.assertEqual(job_1.data, pipeline_data) + self.assertEqual(job_2.data, pipeline_data) + self.assertEqual(job_3.data, pipeline_data) + self.assertEqual(job_1.pipeline, pipeline) + self.assertEqual(job_2.pipeline, pipeline) + self.assertEqual(job_3.pipeline, pipeline) + + def test_pipeline_add_finished_job(self): + job_1 = Job('w1') + job_2 = Job('w2') + job_3 = Job('w3') + pipeline_data = {'python': 42} + pipeline = Pipeline({job_1: job_2, job_2: job_3}, data=pipeline_data) + job_4 = Job('w4') + + self.assertFalse(pipeline.finished_job(job_1)) + self.assertFalse(pipeline.finished_job(job_2)) + self.assertFalse(pipeline.finished_job(job_3)) + + pipeline.add_finished_job(job_1) + self.assertTrue(pipeline.finished_job(job_1)) + self.assertFalse(pipeline.finished_job(job_2)) + self.assertFalse(pipeline.finished_job(job_3)) + + pipeline.add_finished_job(job_2) + self.assertTrue(pipeline.finished_job(job_1)) + self.assertTrue(pipeline.finished_job(job_2)) + self.assertFalse(pipeline.finished_job(job_3)) + + pipeline.add_finished_job(job_3) + self.assertTrue(pipeline.finished_job(job_1)) + self.assertTrue(pipeline.finished_job(job_2)) + self.assertTrue(pipeline.finished_job(job_3)) + + with self.assertRaises(ValueError): + pipeline.add_finished_job(job_4) # job not in pipeline + with self.assertRaises(RuntimeError): + pipeline.add_finished_job(job_3) # already finished + + def test_pipeline_finished(self): + job_1 = Job('w1') + job_2 = Job('w2') + job_3 = Job('w3') + pipeline_data = {'python': 42} + pipeline = Pipeline({job_1: job_2, job_2: job_3}, data=pipeline_data) + + self.assertFalse(pipeline.finished()) + pipeline.add_finished_job(job_1) + self.assertFalse(pipeline.finished()) + 
pipeline.add_finished_job(job_2) + self.assertFalse(pipeline.finished()) + pipeline.add_finished_job(job_3) + self.assertTrue(pipeline.finished()) + + def test_default_attributes(self): + pipeline = Pipeline({Job('test'): None}) + self.assertEqual(pipeline.data, None) + self.assertEqual(pipeline.jobs, (Job('test'),)) + self.assertEqual(pipeline.sent_jobs, set()) + + def test_available_jobs(self): + job_1 = Job('w1') + job_2 = Job('w2') + job_3 = Job('w3') + pipeline_data = {'python': 42} + pipeline = Pipeline({job_1: job_2, job_2: job_3}, data=pipeline_data) + + expected = [job_1] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_1) + expected = [job_2] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_2) + expected = [job_3] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_3) + self.assertEqual(pipeline.available_jobs(), set()) + + job_4, job_5, job_6, job_7 = Job('w4'), Job('w5'), Job('w6'), Job('w7') + job_8, job_9, job_10 = Job('8'), Job('9'), Job('10') + job_11, job_12, job_13 = Job('11'), Job('12'), Job('13') + job_14, job_15, job_16 = Job('14'), Job('15'), Job('16') + pipeline_data = {'python': 42} + pipeline = Pipeline({job_1: (job_2, job_3), + job_2: (job_4, job_16), + job_3: job_4, + job_4: job_5, + job_5: (job_6, job_7, job_8, job_9), + (job_6, job_7, job_8): job_10, + (job_10, job_11): (job_12, job_13, job_14), + job_15: None}, + data=pipeline_data) + + expected = [job_1, job_11, job_15] + self.assertEqual(pipeline.available_jobs(), set(expected)) + self.assertEqual(pipeline.available_jobs(), set(pipeline.starters)) + + pipeline.add_finished_job(job_1) + expected = [job_11, job_15, job_2, job_3] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_2) + expected = [job_11, job_15, job_3, job_16] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_3) + expected = [job_11, job_15, job_4, job_16] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_16) + expected = [job_11, job_15, job_4] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_4) + expected = [job_11, job_15, job_5] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_11) + expected = [job_15, job_5] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_5) + expected = [job_15, job_6, job_7, job_8, job_9] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_6) + expected = [job_15, job_7, job_8, job_9] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_15) + expected = [job_7, job_8, job_9] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_7) + expected = [job_8, job_9] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_9) + expected = [job_8] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_8) + expected = [job_10] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_10) + expected = [job_12, job_13, job_14] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_13) + expected = [job_12, job_14] + 
self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_12) + expected = [job_14] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + pipeline.add_finished_job(job_14) + expected = [] + self.assertEqual(pipeline.available_jobs(), set(expected)) + + self.assertTrue(pipeline.finished()) From 5c54d4b9654ec5994ddd4e646ff1bf3fb5d3ebb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Sun, 14 Oct 2012 20:00:09 -0300 Subject: [PATCH 24/37] Add tests for `Job` --- pypelinin/pipeline.py | 15 ++++++++++----- tests/test_pipeline.py | 24 ++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/pypelinin/pipeline.py b/pypelinin/pipeline.py index a6d7cae..acf0906 100644 --- a/pypelinin/pipeline.py +++ b/pypelinin/pipeline.py @@ -8,25 +8,30 @@ class Job(object): - def __init__(self, name, data=None): - self.name = name + def __init__(self, worker_name, data=None): + self.worker_name = worker_name self.data = data def __repr__(self): - return 'Job({})'.format(repr(self.name)) + #TODO: change this when add `input` + return 'Job({})'.format(repr(self.worker_name)) def __eq__(self, other): - return type(self) == type(other) and self.name == other.name + #TODO: change this when add `input` + return type(self) == type(other) and \ + self.worker_name == other.worker_name def __ne__(self, other): return not self.__eq__(other) def __hash__(self): - return hash(self.name) + #TODO: change this when add `input` + return hash(self.worker_name) #TODO: change this when add `input` class Pipeline(object): def __init__(self, pipeline, data=None): + #TODO: should raise if pipeline is not composed of `Job`s? self.data = data self._finished_jobs = set() self._original_graph = pipeline diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 6b992fa..99145bd 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -4,8 +4,28 @@ from textwrap import dedent from pypelinin import Job, Pipeline - -class GraphTest(unittest.TestCase): +class JobTest(unittest.TestCase): + def test_worker_name(self): + self.assertEqual(Job('ABC').worker_name, 'ABC') + + def test_should_start_with_no_data(self): + self.assertEqual(Job('ABC').data, None) + + def test_repr(self): + self.assertEqual(repr(Job('ABC')), "Job('ABC')") + + def test_equal_not_equal_and_hash(self): + job_1 = Job('qwe') + job_2 = Job('qwe') + job_3 = Job('bla') + self.assertTrue(job_1 == job_2) + self.assertTrue(job_2 == job_1) + self.assertTrue(job_1 != job_3) + self.assertTrue(job_3 != job_1) + self.assertEqual(hash(job_1), hash(job_2)) + self.assertNotEqual(hash(job_1), hash(job_3)) + +class PipelineTest(unittest.TestCase): def test_jobs(self): result = Pipeline({Job('A'): [Job('B')], Job('B'): [Job('C'), Job('D'), Job('E')], From 73711d4d2e480ca5cec14b498457fa8ef432166b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Sun, 14 Oct 2012 22:40:08 -0300 Subject: [PATCH 25/37] Improve `Job` and `Pipeline` (see below) - Add `serialize` and `deserialize` to `Pipeline` and `Job` - Add `__eq__`, `__ne__` and `__hash__` to `Pipeline` - `Pipeline` only accept `Job` objects in graph --- pypelinin/pipeline.py | 77 +++++++++++++++++++++++++++++++++++++----- tests/test_pipeline.py | 77 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 139 insertions(+), 15 deletions(-) diff --git a/pypelinin/pipeline.py b/pypelinin/pipeline.py index acf0906..0a047c2 100644 --- a/pypelinin/pipeline.py +++ 
b/pypelinin/pipeline.py @@ -28,20 +28,35 @@ def __hash__(self): #TODO: change this when add `input` return hash(self.worker_name) #TODO: change this when add `input` + def serialize(self): + #TODO: change this when add `input` + if self.data is not None: + return tuple({'worker_name': self.worker_name, + 'data': self.data}.items()) + else: + return tuple({'worker_name': self.worker_name}.items()) + + @staticmethod + def deserialize(information): + information = dict(information) + if 'worker_name' not in information: + raise ValueError('`worker_name` was not specified') + elif 'data' not in information: + return Job(information['worker_name']) + else: + return Job(information['worker_name'], information['data']) + class Pipeline(object): def __init__(self, pipeline, data=None): #TODO: should raise if pipeline is not composed of `Job`s? self.data = data self._finished_jobs = set() - self._original_graph = pipeline - self._normalize() - nodes = set() - for key, value in self._graph: - nodes.add(key) - nodes.add(value) - nodes.discard(None) - self.jobs = tuple(nodes) + self._original_graph = self._graph = pipeline + if type(pipeline) == dict: + self._normalize() + self._check_types() + self._define_jobs() self._define_starters() self._create_digraph() if not self._validate(): @@ -74,6 +89,19 @@ def _normalize(self): new_graph.append((key, value)) self._graph = new_graph + def _check_types(self): + for key, value in self._graph: + if type(key) is not Job or type(value) not in (Job, type(None)): + raise ValueError('Only `Job` objects are accepted') + + def _define_jobs(self): + nodes = set() + for key, value in self._graph: + nodes.add(key) + nodes.add(value) + nodes.discard(None) + self.jobs = tuple(nodes) + def _define_starters(self): possible_starters = set() others = set() @@ -91,7 +119,7 @@ def _create_digraph(self): self._digraph = digraph def _validate(self): - #TODO: A -> B, A -> C, B -> C + #TODO: test A -> B, A -> C, B -> C if len(self.starters) == 0: return False if find_cycle(self._digraph): @@ -101,6 +129,28 @@ def _validate(self): def to_dot(self): return write(self._digraph) + def serialize(self): + result = [] + for key, value in self._graph: + serialized_key = key.serialize() + serialized_value = None + if value is not None: + serialized_value = value.serialize() + result.append((serialized_key, serialized_value)) + return tuple({'graph': tuple(result), 'data': self.data}.items()) + + @staticmethod + def deserialize(info): + info = dict(info) + new_graph = [] + for key, value in info['graph']: + deserialized_key = Job.deserialize(key) + deserialized_value = None + if value is not None: + deserialized_value = Job.deserialize(value) + new_graph.append((deserialized_key, deserialized_value)) + return Pipeline(new_graph, data=info['data']) + def add_finished_job(self, job): if job not in self.jobs: raise ValueError('Job {} not in pipeline'.format(job)) @@ -122,3 +172,12 @@ def available_jobs(self): job not in self._finished_jobs: available.add(job) return available + + def __eq__(self, other): + return self._graph == other._graph + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return hash(self.serialize()) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 99145bd..1f2369f 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -25,7 +25,27 @@ def test_equal_not_equal_and_hash(self): self.assertEqual(hash(job_1), hash(job_2)) self.assertNotEqual(hash(job_1), hash(job_3)) + def test_serialize_and_deserialize(self): + 
with self.assertRaises(ValueError): + Job.deserialize({}) # no key 'worker_name' + + job = Job('test') + expected = tuple({'worker_name': 'test'}.items()) + self.assertEqual(job.serialize(), expected) + self.assertEqual(Job.deserialize(expected), job) + + job_with_data = Job('testing', data={'python': 42, 'spam': 'eggs'}) + expected_with_data = {'worker_name': 'testing', + 'data': {'python': 42, 'spam': 'eggs'}} + expected_with_data = tuple(expected_with_data.items()) + self.assertEqual(job_with_data.serialize(), expected_with_data) + self.assertEqual(Job.deserialize(expected_with_data), job_with_data) + class PipelineTest(unittest.TestCase): + def test_only_accept_Job_objects(self): + with self.assertRaises(ValueError): + Pipeline({'test': 123}) + def test_jobs(self): result = Pipeline({Job('A'): [Job('B')], Job('B'): [Job('C'), Job('D'), Job('E')], @@ -54,8 +74,8 @@ def test_get_starters(self): expected = (Job('A'), Job('Z')) self.assertEqual(set(result), set(expected)) - result = Pipeline({('A', 'B', 'C'): ['D']}).starters - expected = ['A', 'B', 'C'] + result = Pipeline({(Job('A'), Job('B'), Job('C')): Job('D')}).starters + expected = [Job('A'), Job('B'), Job('C')] self.assertEqual(set(result), set(expected)) result = Pipeline({(Job('A'), Job('B'), Job('C')): [Job('D')], @@ -80,9 +100,12 @@ def test_normalize(self): expected = [(Job('A'), Job('B')), (Job('C'), Job('B'))] self.assertEqual(set(result), set(expected)) - result = Pipeline({('A', 'C'): ['B', 'D', 'E']})._graph - expected = [('A', 'B'), ('A', 'D'), ('A', 'E'), ('C', 'B'), ('C', 'D'), - ('C', 'E')] + graph = {(Job('A'), Job('C')): [Job('B'), Job('D'), Job('E')]} + result = Pipeline(graph)._graph + expected = [(Job('A'), Job('B')), (Job('A'), Job('D')), + (Job('A'), Job('E')), (Job('C'), Job('B')), + (Job('C'), Job('D')), + (Job('C'), Job('E'))] self.assertEqual(set(result), set(expected)) result = Pipeline({Job('ABC'): []})._graph # problem here if use string @@ -126,7 +149,7 @@ def test_validate_graph(self): #should not have cycles with self.assertRaises(ValueError): - print Pipeline({Job('A'): [Job('B')], Job('B'): [Job('C')], + Pipeline({Job('A'): [Job('B')], Job('B'): [Job('C')], Job('C'): [Job('B')]})._graph with self.assertRaises(ValueError): Pipeline({Job('A'): [Job('B')], Job('B'): [Job('C')], @@ -325,3 +348,45 @@ def test_available_jobs(self): self.assertEqual(pipeline.available_jobs(), set(expected)) self.assertTrue(pipeline.finished()) + + def test_serialize(self): + job_1, job_2, job_3, job_4 = (Job('spam'), Job('eggs'), Job('ham'), + Job('python')) + pipeline = Pipeline({job_1: job_2, job_2: (job_3, job_4)}) + result = pipeline.serialize() + expected = {'graph': ((job_1.serialize(), job_2.serialize()), + (job_2.serialize(), job_3.serialize()), + (job_2.serialize(), job_4.serialize())), + 'data': None} + expected = tuple(expected.items()) + + result = dict(result) + expected = dict(expected) + result['graph'] = dict(result['graph']) + expected['graph'] = dict(expected['graph']) + self.assertEqual(result, expected) + + def test_deserialize(self): + job_1, job_2, job_3, job_4, job_5 = (Job('spam'), Job('eggs'), + Job('ham'), Job('python'), + Job('answer_42')) + pipeline = Pipeline({job_1: job_2, job_2: (job_3, job_4), job_5: None}) + serialized = pipeline.serialize() + new_pipeline = Pipeline.deserialize(serialized) + self.assertEqual(pipeline, new_pipeline) + + def test_equal_not_equal_hash(self): + job_1, job_2, job_3, job_4 = (Job('spam'), Job('eggs'), Job('ham'), + Job('python')) + pipeline_1 = Pipeline({job_1: 
job_2, job_2: (job_3, job_4)}) + pipeline_2 = Pipeline({job_1: job_2, job_2: (job_3, job_4)}) + pipeline_3 = Pipeline({job_1: job_2, job_2: job_3, job_3: job_4}) + self.assertTrue(pipeline_1 == pipeline_2) + self.assertTrue(pipeline_2 == pipeline_1) + self.assertTrue(pipeline_1 != pipeline_3) + self.assertTrue(pipeline_3 != pipeline_1) + + my_set = set([pipeline_1, pipeline_2, pipeline_3]) #test __hash__ + self.assertIn(pipeline_1, my_set) + self.assertIn(pipeline_2, my_set) + self.assertIn(pipeline_3, my_set) From 44eee49e765159bbc965901ee9289ed243e8712b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Sun, 14 Oct 2012 22:44:29 -0300 Subject: [PATCH 26/37] Add new `Pipeliner` and its tests --- Makefile | 5 +- pypelinin/__init__.py | 2 +- pypelinin/pipeliner.py | 84 +++++----- tests/my_pipeliner.py | 22 +++ tests/test_pipeliner.py | 347 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 418 insertions(+), 42 deletions(-) create mode 100644 tests/my_pipeliner.py create mode 100644 tests/test_pipeliner.py diff --git a/Makefile b/Makefile index dc09915..05a033e 100644 --- a/Makefile +++ b/Makefile @@ -27,4 +27,7 @@ test-broker: bootstrap-tests test-pipeline: bootstrap-tests ${TEST_RUNNER} --with-coverage --cover-package=pypelinin.pipeline tests/test_pipeline.py -.PHONY: clean bootstrap-environment bootstrap-tests test test-router test-client test-broker test-pipeline +test-pipeliner: bootstrap-tests + ${TEST_RUNNER} -x tests/test_pipeliner.py + +.PHONY: clean bootstrap-environment bootstrap-tests test test-router test-client test-broker test-pipeline test-pipeliner diff --git a/pypelinin/__init__.py b/pypelinin/__init__.py index d940218..129597e 100644 --- a/pypelinin/__init__.py +++ b/pypelinin/__init__.py @@ -4,4 +4,4 @@ from .client import Client from .broker import Broker from .pipeline import Job, Pipeline#, PipelineManager -#from .pipeliner import Pipeliner +from .pipeliner import Pipeliner diff --git a/pypelinin/pipeliner.py b/pypelinin/pipeliner.py index 8869aea..4523eb3 100755 --- a/pypelinin/pipeliner.py +++ b/pypelinin/pipeliner.py @@ -5,7 +5,7 @@ from time import time from uuid import uuid4 -from . import Client +from . 
import Client, Job, Pipeline class Pipeliner(Client): @@ -54,8 +54,8 @@ def ask_for_a_pipeline(self): self.send_api_request({'command': 'get pipeline'}) message = self.get_api_reply() #TODO: if router stops and doesn't answer, pipeliner will stop here - if 'workers' in message and 'data' in message: - if message['data'] is not None: + if 'pipeline' in message: + if message['pipeline'] is not None: self.logger.info('Got this pipeline: {}'.format(message)) if self._new_pipelines is None: self._new_pipelines = 0 @@ -64,9 +64,8 @@ def ask_for_a_pipeline(self): return message else: self._new_pipelines = 0 - elif 'pipeline' in message and message['pipeline'] is None: - self.logger.info('Bad bad router, no pipeline for me.') - return None + self.logger.info('Bad bad router, no pipeline for me.') + return None else: self.logger.info('Ignoring malformed pipeline: {}'.format(message)) #TODO: send a 'rejecting pipeline' request to router @@ -77,60 +76,65 @@ def get_a_pipeline(self): while pipeline_definition is not None: pipeline_definition = self.ask_for_a_pipeline() if pipeline_definition is not None: - self.start_pipeline(pipeline_definition) + pipeline = \ + Pipeline.deserialize(pipeline_definition['pipeline']) + pipeline.id = pipeline_definition['pipeline id'] + pipeline.started_at = time() + self._pipelines[pipeline.id] = pipeline + self.start_pipeline_jobs(pipeline, pipeline.starters) - def _send_job(self, worker): - job_request = {'command': 'add job', 'worker': worker.name, - 'data': worker.data} + def _start_job(self, job): + job_request = {'command': 'add job', 'worker': job.worker_name, + 'data': job.data} self.send_api_request(job_request) self.logger.info('Sent job request: {}'.format(job_request)) message = self.get_api_reply() self.logger.info('Received from router API: {}'.format(message)) - self._jobs[message['job id']] = worker - subscribe_message = 'job finished: {}'.format(message['job id']) + job_id = message['job id'] + subscribe_message = 'job finished: {}'.format(job_id) self.broadcast_subscribe(subscribe_message) self.logger.info('Subscribed on router broadcast to: {}'\ .format(subscribe_message)) + return job_id - def start_pipeline(self, pipeline_definition): - pipeline = Worker.from_dict(pipeline_definition['workers']) - pipeline.pipeline_id = pipeline_definition['pipeline id'] - pipeline.data = pipeline_definition['data'] - pipeline.pipeline_started_at = time() - self._pipelines[pipeline.pipeline_id] = [pipeline] - self._send_job(pipeline) + def start_pipeline_jobs(self, pipeline, jobs): + job_ids = [] + for job in jobs: + job_id = self._start_job(job) + self._jobs[job_id] = job + pipeline.sent_jobs.add(job) def verify_jobs(self): self._update_broadcast() - new_messages = [] for message in self._messages: if message.startswith('job finished: '): job_id = message.split(': ')[1].split(' ')[0] if job_id in self._jobs: self.logger.info('Processing finished job id {}.'.format(job_id)) - worker = self._jobs[job_id] - self._pipelines[worker.pipeline_id].remove(worker) - next_workers = worker.after - for next_worker in next_workers: - self.logger.info(' worker after: {}'.format(next_worker.name)) - next_worker.data = worker.data - next_worker.pipeline_id = worker.pipeline_id - next_worker.pipeline_started_at = worker.pipeline_started_at - self._pipelines[worker.pipeline_id].append(next_worker) - self._send_job(next_worker) + job = self._jobs[job_id] + pipeline = job.pipeline + pipeline.add_finished_job(job) del self._jobs[job_id] - if not self._pipelines[worker.pipeline_id]: - 
total_time = time() - worker.pipeline_started_at - self.logger.info('Finished pipeline {}'\ - .format(worker.pipeline_id)) - self.send_api_request({'command': 'pipeline finished', - 'pipeline id': worker.pipeline_id, - 'duration': total_time}) - self.get_api_reply() - #TODO: check reply - del self._pipelines[worker.pipeline_id] + if pipeline.finished(): + total_time = time() - pipeline.started_at + self.send_api_request( + {'command': 'pipeline finished', + 'pipeline id': pipeline.id, + 'duration': total_time} + ) + self.logger.info('Finished pipeline_id={}, ' + 'duration={}'.format(pipeline.id, + total_time)) + self.get_api_reply() #TODO: check reply + del self._pipelines[pipeline.id] self.get_a_pipeline() + else: + jobs_to_send = pipeline.available_jobs() - pipeline.sent_jobs + if jobs_to_send: + self.start_pipeline_jobs(pipeline, jobs_to_send) self.broadcast_unsubscribe(message) + elif message == 'new pipeline': + self.get_a_pipeline() self._messages = [] def run(self): diff --git a/tests/my_pipeliner.py b/tests/my_pipeliner.py new file mode 100644 index 0000000..ea05f96 --- /dev/null +++ b/tests/my_pipeliner.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python2 +# coding: utf-8 + +from sys import stdout +from logging import Logger, StreamHandler, Formatter +from pypelinin import Pipeliner + + +def main(): + logger = Logger('Pipeliner') + handler = StreamHandler(stdout) + formatter = Formatter('%(asctime)s - %(name)s - %(levelname)s - ' + '%(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + pipeliner = Pipeliner(api='tcp://localhost:5555', + broadcast='tcp://localhost:5556', logger=logger) + pipeliner.start() + +if __name__ == '__main__': + main() + diff --git a/tests/test_pipeliner.py b/tests/test_pipeliner.py new file mode 100644 index 0000000..bf0c97e --- /dev/null +++ b/tests/test_pipeliner.py @@ -0,0 +1,347 @@ +# coding: utf-8 + +from __future__ import print_function +import unittest +import shlex +import select +from signal import SIGINT, SIGKILL +from time import sleep, time +from subprocess import Popen, PIPE +from uuid import uuid4 +import zmq +from psutil import Process, NoSuchProcess +from pypelinin import Job, Pipeline + + +TIMEOUT = 1500 +DEBUG_STDOUT = False +DEBUG_STDERR = False + +def _print_debug(name, message): + print() + print('----- {} BEGIN -----'.format(name)) + print(message) + print('----- {} END -----'.format(name)) + +def _kill(pid, timeout=1.5): + try: + process = Process(pid) + except NoSuchProcess: + return + try: + process.send_signal(SIGINT) + sleep(timeout) + except OSError: + pass + finally: + try: + process.send_signal(SIGKILL) + except (OSError, NoSuchProcess): + pass + process.wait() + +class TestPipeliner(unittest.TestCase): + def setUp(self): + self.context = zmq.Context() + self.start_router_sockets() + self.start_pipeliner_process() + sleep(1) # wait for subscribe to take effect + + def tearDown(self): + self.end_pipeliner_process() + self.close_sockets() + self.context.term() + + def start_pipeliner_process(self): + self.pipeliner = Popen(shlex.split('python ./tests/my_pipeliner.py'), + stdin=PIPE, stdout=PIPE, stderr=PIPE) + #TODO: use select and self.fail + for line in self.pipeliner.stdout.readline(): + if 'main loop' in line: + break + + def end_pipeliner_process(self): + try: + pipeliner_process = Process(self.pipeliner.pid) + except NoSuchProcess: + return # was killed + + # kill main process and its children + children = [process.pid for process in pipeliner_process.get_children()] + _kill(self.pipeliner.pid, 
timeout=TIMEOUT / 1000.0) + for child_pid in children: + _kill(child_pid, timeout=TIMEOUT / 1000.0) + + # get stdout and stderr + stdout = self.pipeliner.stdout.read() + stderr = self.pipeliner.stderr.read() + if stdout and DEBUG_STDOUT: + _print_debug('STDOUT', ''.join(stdout)) + if stderr and DEBUG_STDERR: + _print_debug('STDERR', ''.join(stderr)) + + + def start_router_sockets(self): + self.api = self.context.socket(zmq.REP) + self.broadcast = self.context.socket(zmq.PUB) + self.api.bind('tcp://*:5555') + self.broadcast.bind('tcp://*:5556') + + def close_sockets(self): + self.api.close() + self.broadcast.close() + + def send_no_pipeline(self): + self.api.send_json({'pipeline': None}) + + def send_pipeline(self, pipeline_definition): + pipeline = Pipeline(pipeline_definition['graph'], + data=pipeline_definition['data']) + self.api.send_json({'pipeline': pipeline.serialize(), + 'pipeline id': pipeline_definition['pipeline id']}) + + def check_add_job(self): + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'add job' from pipeliner") + message = self.api.recv_json() + if message['command'] == 'add job': + job_id = uuid4().hex + self.api.send_json({'answer': 'job accepted', + 'job id': job_id}) + return message, job_id + elif message['command'] == 'get pipeline': + self.send_no_pipeline() + return self.check_add_job() + + def ignore_get_pipeline(self): + if self.api.poll(TIMEOUT): + message = self.api.recv_json() + if message['command'] == 'get pipeline': + self.send_no_pipeline() + else: + self.fail('Should not receive message "{}" ' + 'here.'.format(message)) + + def get_api_request(self): + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive message in API channel") + else: + message = self.api.recv_json() + return message + + + def test_should_receive_get_pipeline_when_broadcast_new_pipeline(self): + for i in range(10): #everytime it receives a new pipeline broadcast... 
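As a rough sketch of the reply the fake router in these tests builds for the pipeliner (assuming the `pypelinin` API added earlier in this series is importable; the names below mirror `send_pipeline` and `send_no_pipeline` above):

```python
from uuid import uuid4

from pypelinin import Job, Pipeline

# Reply to a 'get pipeline' request: a serialized pipeline plus its id,
# as `send_pipeline` does; `send_no_pipeline` answers {'pipeline': None}.
pipeline = Pipeline({Job('Dummy'): None}, data={'index': 0})
reply = {'pipeline': pipeline.serialize(), 'pipeline id': uuid4().hex}
# The test router would then pass `reply` to `self.api.send_json(...)`.
```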
+ self.broadcast.send('new pipeline') + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'get pipeline' from pipeliner") + message = self.api.recv_json() + self.send_no_pipeline() + self.assertEqual(message, {'command': 'get pipeline'}) + + def test_should_create_a_job_request_after_getting_a_pipeline(self): + job_counter = 0 + for index in range(20): + if index < 10: + self.broadcast.send('new pipeline') + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'get pipeline' from pipeliner") + message = self.api.recv_json() + if message == {'command': 'get pipeline'}: + pipeline = {'graph': {Job('Dummy'): None}, + 'data': {'index': index}, + 'pipeline id': uuid4().hex} + self.send_pipeline(pipeline) + elif message['command'] == 'add job': + job_counter += 1 + self.assertEqual(message['worker'], 'Dummy') + self.api.send_json({'answer': 'job accepted', + 'job id': uuid4().hex}) + self.assertEqual(job_counter, 10) + + def test_pipeliner_should_send_pipeline_finished_when_router_sends_job_finished(self): + self.broadcast.send('job finished: {}'.format(uuid4().hex)) + if self.api.poll(TIMEOUT): + message = self.api.recv_json() + if message['command'] != 'get pipeline': + self.fail('Should not receive any message at this point') + else: + self.send_no_pipeline() + + self.broadcast.send('new pipeline') + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'get pipeline' from pipeliner") + message = self.api.recv_json() + pipeline_id = uuid4().hex + pipeline = {'graph': {Job('Dummy'): None}, + 'data': {}, 'pipeline id': pipeline_id} + self.send_pipeline(pipeline) + + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'get pipeline' from pipeliner") + message = self.api.recv_json() + job_id = uuid4().hex + self.api.send_json({'answer': 'job accepted', + 'job id': job_id}) + self.broadcast.send('job finished: {}'.format(job_id)) + + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'get pipeline' from pipeliner") + message = self.api.recv_json() + if message['command'] == 'get pipeline': + self.send_no_pipeline() + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'finished pipeline' from pipeliner") + message = self.api.recv_json() + self.assertIn('command', message) + self.assertIn('pipeline id', message) + self.assertIn('duration', message) + self.assertEquals(message['command'], 'pipeline finished') + self.assertEquals(message['pipeline id'], pipeline_id) + + def test_pipeliner_should_be_able_to_add_jobs_in_sequence(self): + self.broadcast.send('new pipeline') + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'get pipeline' from pipeliner") + message = self.api.recv_json() + pipeline_id = uuid4().hex + pipeline_graph = {Job('Dummy'): Job('Dummy2'), + Job('Dummy2'): Job('Dummy3')} + pipeline = {'graph': pipeline_graph, + 'data': {}, + 'pipeline id': pipeline_id} + self.send_pipeline(pipeline) + start_time = time() + + job_workers = [] + finished_job_counter = 0 + while finished_job_counter < 3: + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'add job' from pipeliner") + message = self.api.recv_json() + if message['command'] == 'add job': + job_id = uuid4().hex + self.api.send_json({'answer': 'job accepted', + 'job id': job_id}) + job_workers.append(message['worker']) + if self.api.poll(TIMEOUT * 10): + message = self.api.recv_json() + if message['command'] == 'get pipeline': + self.send_no_pipeline() + else: + self.fail("Should not receive messages at this point") + finished_job_counter += 1 + self.broadcast.send('job 
finished: {}'.format(job_id)) + elif message['command'] == 'get pipeline': + self.send_no_pipeline() + self.assertEqual(finished_job_counter, 3) + # then, check order of jobs sent + self.assertEqual(job_workers, ['Dummy', 'Dummy2', 'Dummy3']) + + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'finished pipeline' from pipelines") + message = self.api.recv_json() + end_time = time() + total_time = end_time - start_time + self.assertIn('command', message) + self.assertIn('pipeline id', message) + self.assertIn('duration', message) + self.assertEqual(message['command'], 'pipeline finished') + self.assertEqual(message['pipeline id'], pipeline_id) + self.assertTrue(message['duration'] <= total_time) + + def test_pipeliner_should_be_able_to_add_jobs_in_parallel(self): + self.broadcast.send('new pipeline') + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'get pipeline' from pipeliner") + message = self.api.recv_json() + pipeline_id = uuid4().hex + pipeline = {Job('Dummy'): None, Job('Dummy2'): None, + Job('Dummy3'): None} + pipeline = {'graph': pipeline, + 'data': {}, + 'pipeline id': pipeline_id} + self.send_pipeline(pipeline) + start_time = time() + + job_ids = [] + while len(job_ids) < 3: + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'add job' from pipeliner") + message = self.api.recv_json() + if message['command'] == 'add job': + job_id = uuid4().hex + self.api.send_json({'answer': 'job accepted', + 'job id': job_id}) + job_ids.append(job_id) + elif message['command'] == 'get pipeline': + self.send_no_pipeline() + for job_id in job_ids: + self.broadcast.send('job finished: {}'.format(job_id)) + + pipeline_finished = False + while not pipeline_finished: + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'finished pipeline' from pipelines") + message = self.api.recv_json() + if message['command'] == 'get pipeline': + self.send_no_pipeline() + else: + pipeline_finished = True + end_time = time() + total_time = end_time - start_time + self.assertIn('command', message) + self.assertIn('pipeline id', message) + self.assertIn('duration', message) + self.assertEqual(message['command'], 'pipeline finished') + self.assertEqual(message['pipeline id'], pipeline_id) + self.assertTrue(message['duration'] <= total_time) + + def test_pipeliner_should_be_able_to_add_jobs_in_sequence_and_parallel_mixed(self): + self.broadcast.send('new pipeline') + if not self.api.poll(TIMEOUT): + self.fail("Didn't receive 'get pipeline' from pipeliner") + message = self.api.recv_json() + pipeline_id = uuid4().hex + pipeline_graph = {Job('w1'): (Job('w2.1'), Job('w2.2'), Job('w2.3')), + (Job('w2.1'), Job('w2.2'), Job('w2.3')): Job('w3')} + pipeline = {'graph': pipeline_graph, + 'data': {}, + 'pipeline id': pipeline_id} + self.send_pipeline(pipeline) + start_time = time() + + message, job_id = self.check_add_job() + expected = {'command': 'add job', 'worker': 'w1', 'data': {}} + self.assertEqual(message, expected) + self.broadcast.send('job finished: {}'.format(job_id)) + + message_1, job_id_2_1 = self.check_add_job() + message_2, job_id_2_2 = self.check_add_job() + message_3, job_id_2_3 = self.check_add_job() + workers = set([message_1['worker'], message_2['worker'], + message_3['worker']]) + self.assertEqual(workers, set(['w2.1', 'w2.2', 'w2.3'])) + + self.ignore_get_pipeline() + self.broadcast.send('job finished: {}'.format(job_id_2_1)) + self.broadcast.send('job finished: {}'.format(job_id_2_2)) + self.broadcast.send('job finished: {}'.format(job_id_2_3)) + + message, job_id 
= self.check_add_job() + self.assertEqual(message['worker'], 'w3') + + end_time = time() + total_time = end_time - start_time + self.broadcast.send('job finished: {}'.format(job_id)) + message = self.get_api_request() + self.assertEqual(message['command'], 'pipeline finished') + self.assertEqual(message['pipeline id'], pipeline_id) + self.assertTrue(message['duration'] > total_time) + self.assertTrue(message['duration'] < 1.5 * total_time) + + +#TODO: create helper functions for interacting with pipeliner +#TODO: max of pipelines per Pipeliner? +#TODO: handle incorrect pipelines (ignored) - send message to Router +#TODO: move process management helper functions to another module From e2bbe2b2aa1f564c7cdb2436c34b704741283d8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Sun, 14 Oct 2012 23:38:12 -0300 Subject: [PATCH 27/37] Refactor `Pipeliner` tests --- pypelinin/pipeliner.py | 2 + tests/test_pipeliner.py | 115 ++++++++++++---------------------------- 2 files changed, 36 insertions(+), 81 deletions(-) diff --git a/pypelinin/pipeliner.py b/pypelinin/pipeliner.py index 4523eb3..afe699a 100755 --- a/pypelinin/pipeliner.py +++ b/pypelinin/pipeliner.py @@ -12,6 +12,8 @@ class Pipeliner(Client): #TODO: should send monitoring information? #TODO: should receive and handle a 'job error' from router when some job # could not be processed (timeout, worker not found etc.) + #TODO: max of pipelines per Pipeliner? + #TODO: handle incorrect pipelines (ignored) - send message to Router def __init__(self, api, broadcast, logger=None, poll_time=50): super(Pipeliner, self).__init__() diff --git a/tests/test_pipeliner.py b/tests/test_pipeliner.py index bf0c97e..d5cecd4 100644 --- a/tests/test_pipeliner.py +++ b/tests/test_pipeliner.py @@ -16,6 +16,7 @@ TIMEOUT = 1500 DEBUG_STDOUT = False DEBUG_STDERR = False +#TODO: move process management helper functions to another module def _print_debug(name, message): print() @@ -100,43 +101,44 @@ def send_pipeline(self, pipeline_definition): self.api.send_json({'pipeline': pipeline.serialize(), 'pipeline id': pipeline_definition['pipeline id']}) - def check_add_job(self): + def get_api_request(self, ignore_get_pipeline=True): if not self.api.poll(TIMEOUT): - self.fail("Didn't receive 'add job' from pipeliner") - message = self.api.recv_json() + self.fail("Didn't receive message in API channel") + else: + message = self.api.recv_json() + if not ignore_get_pipeline: + return message + else: + if message['command'] == 'get pipeline': + self.send_no_pipeline() + return self.get_api_request() + else: + return message + + def check_add_job(self): + message = self.get_api_request() if message['command'] == 'add job': job_id = uuid4().hex self.api.send_json({'answer': 'job accepted', 'job id': job_id}) return message, job_id - elif message['command'] == 'get pipeline': - self.send_no_pipeline() - return self.check_add_job() + else: + seif.fail('Should not receive message "{}" here'.format(message)) def ignore_get_pipeline(self): if self.api.poll(TIMEOUT): message = self.api.recv_json() if message['command'] == 'get pipeline': self.send_no_pipeline() + return message else: self.fail('Should not receive message "{}" ' 'here.'.format(message)) - def get_api_request(self): - if not self.api.poll(TIMEOUT): - self.fail("Didn't receive message in API channel") - else: - message = self.api.recv_json() - return message - - def test_should_receive_get_pipeline_when_broadcast_new_pipeline(self): for i in range(10): #everytime it receives a new 
pipeline broadcast... self.broadcast.send('new pipeline') - if not self.api.poll(TIMEOUT): - self.fail("Didn't receive 'get pipeline' from pipeliner") - message = self.api.recv_json() - self.send_no_pipeline() + message = self.ignore_get_pipeline() self.assertEqual(message, {'command': 'get pipeline'}) def test_should_create_a_job_request_after_getting_a_pipeline(self): @@ -144,9 +146,7 @@ def test_should_create_a_job_request_after_getting_a_pipeline(self): for index in range(20): if index < 10: self.broadcast.send('new pipeline') - if not self.api.poll(TIMEOUT): - self.fail("Didn't receive 'get pipeline' from pipeliner") - message = self.api.recv_json() + message = self.get_api_request(ignore_get_pipeline=False) if message == {'command': 'get pipeline'}: pipeline = {'graph': {Job('Dummy'): None}, 'data': {'index': index}, @@ -161,38 +161,22 @@ def test_should_create_a_job_request_after_getting_a_pipeline(self): def test_pipeliner_should_send_pipeline_finished_when_router_sends_job_finished(self): self.broadcast.send('job finished: {}'.format(uuid4().hex)) - if self.api.poll(TIMEOUT): - message = self.api.recv_json() - if message['command'] != 'get pipeline': - self.fail('Should not receive any message at this point') - else: - self.send_no_pipeline() - + self.ignore_get_pipeline() self.broadcast.send('new pipeline') - if not self.api.poll(TIMEOUT): - self.fail("Didn't receive 'get pipeline' from pipeliner") - message = self.api.recv_json() + message = self.get_api_request(ignore_get_pipeline=False) pipeline_id = uuid4().hex pipeline = {'graph': {Job('Dummy'): None}, 'data': {}, 'pipeline id': pipeline_id} self.send_pipeline(pipeline) - if not self.api.poll(TIMEOUT): - self.fail("Didn't receive 'get pipeline' from pipeliner") - message = self.api.recv_json() + message = self.get_api_request() job_id = uuid4().hex self.api.send_json({'answer': 'job accepted', 'job id': job_id}) self.broadcast.send('job finished: {}'.format(job_id)) - if not self.api.poll(TIMEOUT): - self.fail("Didn't receive 'get pipeline' from pipeliner") - message = self.api.recv_json() - if message['command'] == 'get pipeline': - self.send_no_pipeline() - if not self.api.poll(TIMEOUT): - self.fail("Didn't receive 'finished pipeline' from pipeliner") - message = self.api.recv_json() + self.ignore_get_pipeline() + message = self.get_api_request() self.assertIn('command', message) self.assertIn('pipeline id', message) self.assertIn('duration', message) @@ -201,9 +185,7 @@ def test_pipeliner_should_send_pipeline_finished_when_router_sends_job_finished( def test_pipeliner_should_be_able_to_add_jobs_in_sequence(self): self.broadcast.send('new pipeline') - if not self.api.poll(TIMEOUT): - self.fail("Didn't receive 'get pipeline' from pipeliner") - message = self.api.recv_json() + message = self.get_api_request(ignore_get_pipeline=False) pipeline_id = uuid4().hex pipeline_graph = {Job('Dummy'): Job('Dummy2'), Job('Dummy2'): Job('Dummy3')} @@ -216,20 +198,13 @@ def test_pipeliner_should_be_able_to_add_jobs_in_sequence(self): job_workers = [] finished_job_counter = 0 while finished_job_counter < 3: - if not self.api.poll(TIMEOUT): - self.fail("Didn't receive 'add job' from pipeliner") - message = self.api.recv_json() + message = self.get_api_request() if message['command'] == 'add job': job_id = uuid4().hex self.api.send_json({'answer': 'job accepted', 'job id': job_id}) job_workers.append(message['worker']) - if self.api.poll(TIMEOUT * 10): - message = self.api.recv_json() - if message['command'] == 'get pipeline': - 
self.send_no_pipeline() - else: - self.fail("Should not receive messages at this point") + self.ignore_get_pipeline() finished_job_counter += 1 self.broadcast.send('job finished: {}'.format(job_id)) elif message['command'] == 'get pipeline': @@ -238,9 +213,7 @@ def test_pipeliner_should_be_able_to_add_jobs_in_sequence(self): # then, check order of jobs sent self.assertEqual(job_workers, ['Dummy', 'Dummy2', 'Dummy3']) - if not self.api.poll(TIMEOUT): - self.fail("Didn't receive 'finished pipeline' from pipelines") - message = self.api.recv_json() + message = self.get_api_request() end_time = time() total_time = end_time - start_time self.assertIn('command', message) @@ -252,9 +225,7 @@ def test_pipeliner_should_be_able_to_add_jobs_in_sequence(self): def test_pipeliner_should_be_able_to_add_jobs_in_parallel(self): self.broadcast.send('new pipeline') - if not self.api.poll(TIMEOUT): - self.fail("Didn't receive 'get pipeline' from pipeliner") - message = self.api.recv_json() + message = self.get_api_request(ignore_get_pipeline=False) pipeline_id = uuid4().hex pipeline = {Job('Dummy'): None, Job('Dummy2'): None, Job('Dummy3'): None} @@ -266,9 +237,7 @@ def test_pipeliner_should_be_able_to_add_jobs_in_parallel(self): job_ids = [] while len(job_ids) < 3: - if not self.api.poll(TIMEOUT): - self.fail("Didn't receive 'add job' from pipeliner") - message = self.api.recv_json() + message = self.get_api_request() if message['command'] == 'add job': job_id = uuid4().hex self.api.send_json({'answer': 'job accepted', @@ -279,15 +248,7 @@ def test_pipeliner_should_be_able_to_add_jobs_in_parallel(self): for job_id in job_ids: self.broadcast.send('job finished: {}'.format(job_id)) - pipeline_finished = False - while not pipeline_finished: - if not self.api.poll(TIMEOUT): - self.fail("Didn't receive 'finished pipeline' from pipelines") - message = self.api.recv_json() - if message['command'] == 'get pipeline': - self.send_no_pipeline() - else: - pipeline_finished = True + message = self.get_api_request() end_time = time() total_time = end_time - start_time self.assertIn('command', message) @@ -299,9 +260,7 @@ def test_pipeliner_should_be_able_to_add_jobs_in_parallel(self): def test_pipeliner_should_be_able_to_add_jobs_in_sequence_and_parallel_mixed(self): self.broadcast.send('new pipeline') - if not self.api.poll(TIMEOUT): - self.fail("Didn't receive 'get pipeline' from pipeliner") - message = self.api.recv_json() + message = self.get_api_request(ignore_get_pipeline=False) pipeline_id = uuid4().hex pipeline_graph = {Job('w1'): (Job('w2.1'), Job('w2.2'), Job('w2.3')), (Job('w2.1'), Job('w2.2'), Job('w2.3')): Job('w3')} @@ -339,9 +298,3 @@ def test_pipeliner_should_be_able_to_add_jobs_in_sequence_and_parallel_mixed(sel self.assertEqual(message['pipeline id'], pipeline_id) self.assertTrue(message['duration'] > total_time) self.assertTrue(message['duration'] < 1.5 * total_time) - - -#TODO: create helper functions for interacting with pipeliner -#TODO: max of pipelines per Pipeliner? 
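As a rough sketch of the serialize/deserialize round trip these pipeliner tests rely on (assuming the `Pipeline`/`Job` serialization added earlier in this series):

```python
from pypelinin import Job, Pipeline

# A mixed sequence/parallel graph like the one used in the tests above.
pipeline = Pipeline({Job('w1'): (Job('w2.1'), Job('w2.2'), Job('w2.3')),
                     (Job('w2.1'), Job('w2.2'), Job('w2.3')): Job('w3')})
serialized = pipeline.serialize()  # nested tuples, sent over the router API
assert Pipeline.deserialize(serialized) == pipeline
```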
-#TODO: handle incorrect pipelines (ignored) - send message to Router -#TODO: move process management helper functions to another module From 3e800d8860c68ffbb4a9ed4044f24cbbe016cda7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Mon, 15 Oct 2012 02:03:13 -0300 Subject: [PATCH 28/37] Add `PipelineManager` and its tests Also split `Pipeline` into `Pipeline` and `PipelineForPipeliner` -- the last one should only be used by Pipeliner --- pypelinin/__init__.py | 2 +- pypelinin/pipeline.py | 82 ++++++++++++++++++--- pypelinin/pipeliner.py | 4 +- pypelinin/router.py | 4 +- tests/test_broker.py | 2 +- tests/test_pipeline.py | 162 ++++++++++++++++++++++++++++++++++++++--- 6 files changed, 228 insertions(+), 28 deletions(-) diff --git a/pypelinin/__init__.py b/pypelinin/__init__.py index 129597e..233a4d6 100644 --- a/pypelinin/__init__.py +++ b/pypelinin/__init__.py @@ -3,5 +3,5 @@ from .router import Router from .client import Client from .broker import Broker -from .pipeline import Job, Pipeline#, PipelineManager +from .pipeline import Job, Pipeline, PipelineManager, PipelineForPipeliner from .pipeliner import Pipeliner diff --git a/pypelinin/pipeline.py b/pypelinin/pipeline.py index 0a047c2..5c58404 100644 --- a/pypelinin/pipeline.py +++ b/pypelinin/pipeline.py @@ -1,11 +1,14 @@ # coding: utf-8 from itertools import product +from time import time from pygraph.classes.digraph import digraph as DiGraph from pygraph.algorithms.cycles import find_cycle from pygraph.readwrite.dot import write +from . import Client + class Job(object): def __init__(self, worker_name, data=None): @@ -51,6 +54,7 @@ class Pipeline(object): def __init__(self, pipeline, data=None): #TODO: should raise if pipeline is not composed of `Job`s? self.data = data + self.id = None self._finished_jobs = set() self._original_graph = self._graph = pipeline if type(pipeline) == dict: @@ -74,6 +78,15 @@ def __init__(self, pipeline, data=None): self._dependencies[job_2].add(job_1) self.sent_jobs = set() + def __eq__(self, other): + return self._graph == other._graph + + def __ne__(self, other): + return not self.__eq__(other) + + def __hash__(self): + return hash(self.serialize()) + def _normalize(self): new_graph = [] for keys, values in self._original_graph.items(): @@ -140,7 +153,7 @@ def serialize(self): return tuple({'graph': tuple(result), 'data': self.data}.items()) @staticmethod - def deserialize(info): + def _deserialize(info): info = dict(info) new_graph = [] for key, value in info['graph']: @@ -149,7 +162,18 @@ def deserialize(info): if value is not None: deserialized_value = Job.deserialize(value) new_graph.append((deserialized_key, deserialized_value)) - return Pipeline(new_graph, data=info['data']) + return new_graph, info['data'] + + @staticmethod + def deserialize(info): + new_graph, data = Pipeline._deserialize(info) + return Pipeline(new_graph, data=data) + +class PipelineForPipeliner(Pipeline): + @staticmethod + def deserialize(info): + new_graph, data = PipelineForPipeliner._deserialize(info) + return PipelineForPipeliner(new_graph, data=data) def add_finished_job(self, job): if job not in self.jobs: @@ -173,11 +197,49 @@ def available_jobs(self): available.add(job) return available - def __eq__(self, other): - return self._graph == other._graph - - def __ne__(self, other): - return not self.__eq__(other) - - def __hash__(self): - return hash(self.serialize()) +class PipelineManager(Client): + #TODO: is there any way to subscribe to job ids? 
So I can know the status + # of each job in pipeline with this object + def __init__(self, api, broadcast, poll_time=50): # milliseconds + super(PipelineManager, self).__init__() + self.poll_time = poll_time + self._pipelines = set() + self._pipeline_from_id = {} + self.connect(api=api, broadcast=broadcast) + + def start(self, pipeline): + if pipeline in self._pipelines: + raise ValueError('This pipeline was already started') + self._pipelines.add(pipeline) + request = {'command': 'add pipeline', 'pipeline': pipeline.serialize()} + self.send_api_request(request) + result = self.get_api_reply() + pipeline_id = str(result['pipeline id']) + pipeline.id = pipeline_id + pipeline.finished = False + self._pipeline_from_id[pipeline_id] = pipeline + pipeline.started_at = time() + self.broadcast_subscribe('pipeline finished: id=' + pipeline_id) + return pipeline_id + + def _update_broadcast(self): + while self.broadcast_poll(self.poll_time): + message = self.broadcast_receive() + if message.startswith('pipeline finished: '): + try: + data = message.split(': ')[1].split(', ') + pipeline_id = data[0].split('=')[1] + duration = float(data[1].split('=')[1]) + except (IndexError, ValueError): + continue + pipeline = self._pipeline_from_id[pipeline_id] + pipeline.duration = duration + pipeline.finished = True + self.broadcast_unsubscribe(message) + + def finished(self, pipeline): + if pipeline not in self._pipelines: + raise ValueError('This pipeline is not being managed by this ' + 'PipelineMager') + self._update_broadcast() + return pipeline.finished diff --git a/pypelinin/pipeliner.py b/pypelinin/pipeliner.py index afe699a..f7255e4 100755 --- a/pypelinin/pipeliner.py +++ b/pypelinin/pipeliner.py @@ -5,7 +5,7 @@ from time import time from uuid import uuid4 -from . import Client, Job, Pipeline +from . 
import Client, Job, PipelineForPipeliner class Pipeliner(Client): @@ -79,7 +79,7 @@ def get_a_pipeline(self): pipeline_definition = self.ask_for_a_pipeline() if pipeline_definition is not None: pipeline = \ - Pipeline.deserialize(pipeline_definition['pipeline']) + PipelineForPipeliner.deserialize(pipeline_definition['pipeline']) pipeline.id = pipeline_definition['pipeline id'] pipeline.started_at = time() self._pipelines[pipeline.id] = pipeline diff --git a/pypelinin/router.py b/pypelinin/router.py index 9bc56db..6d8a154 100755 --- a/pypelinin/router.py +++ b/pypelinin/router.py @@ -128,8 +128,8 @@ def run(self): else: self.pending_pipeline_ids.remove(pipeline_id) self.reply({'answer': 'good job!'}) - new_message = 'pipeline finished: {}'\ - .format(pipeline_id) + new_message = 'pipeline finished: id={}, duration={}'\ + .format(pipeline_id, message['duration']) self.broadcast.send(new_message) self.logger.info('[Broadcast] Sent: {}'\ .format(new_message)) diff --git a/tests/test_broker.py b/tests/test_broker.py index b4c304a..158c87b 100644 --- a/tests/test_broker.py +++ b/tests/test_broker.py @@ -19,7 +19,7 @@ TIMEOUT = 1500 DEBUG_STDOUT = False -DEBUG_STDERR = True +DEBUG_STDERR = False def _print_debug(name, message): print() diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 1f2369f..6ad0878 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,8 +1,16 @@ # coding: utf-8 import unittest + from textwrap import dedent -from pypelinin import Job, Pipeline +from multiprocessing import Pool +from uuid import uuid4 +from time import sleep, time + +import zmq + +from pypelinin import Job, Pipeline, PipelineManager, PipelineForPipeliner + class JobTest(unittest.TestCase): def test_worker_name(self): @@ -195,7 +203,8 @@ def test_pipeline_add_finished_job(self): job_2 = Job('w2') job_3 = Job('w3') pipeline_data = {'python': 42} - pipeline = Pipeline({job_1: job_2, job_2: job_3}, data=pipeline_data) + pipeline = PipelineForPipeliner({job_1: job_2, job_2: job_3}, + data=pipeline_data) job_4 = Job('w4') self.assertFalse(pipeline.finished_job(job_1)) @@ -227,7 +236,8 @@ def test_pipeline_finished(self): job_2 = Job('w2') job_3 = Job('w3') pipeline_data = {'python': 42} - pipeline = Pipeline({job_1: job_2, job_2: job_3}, data=pipeline_data) + pipeline = PipelineForPipeliner({job_1: job_2, job_2: job_3}, + data=pipeline_data) self.assertFalse(pipeline.finished()) pipeline.add_finished_job(job_1) @@ -240,6 +250,7 @@ def test_pipeline_finished(self): def test_default_attributes(self): pipeline = Pipeline({Job('test'): None}) self.assertEqual(pipeline.data, None) + self.assertEqual(pipeline.id, None) self.assertEqual(pipeline.jobs, (Job('test'),)) self.assertEqual(pipeline.sent_jobs, set()) @@ -248,7 +259,8 @@ def test_available_jobs(self): job_2 = Job('w2') job_3 = Job('w3') pipeline_data = {'python': 42} - pipeline = Pipeline({job_1: job_2, job_2: job_3}, data=pipeline_data) + pipeline = PipelineForPipeliner({job_1: job_2, job_2: job_3}, + data=pipeline_data) expected = [job_1] self.assertEqual(pipeline.available_jobs(), set(expected)) @@ -269,14 +281,15 @@ def test_available_jobs(self): job_11, job_12, job_13 = Job('11'), Job('12'), Job('13') job_14, job_15, job_16 = Job('14'), Job('15'), Job('16') pipeline_data = {'python': 42} - pipeline = Pipeline({job_1: (job_2, job_3), - job_2: (job_4, job_16), - job_3: job_4, - job_4: job_5, - job_5: (job_6, job_7, job_8, job_9), - (job_6, job_7, job_8): job_10, - (job_10, job_11): (job_12, job_13, job_14), - job_15: None}, + 
pipeline = PipelineForPipeliner({job_1: (job_2, job_3), + job_2: (job_4, job_16), + job_3: job_4, + job_4: job_5, + job_5: (job_6, job_7, job_8, job_9), + (job_6, job_7, job_8): job_10, + (job_10, job_11): (job_12, job_13, + job_14), + job_15: None}, data=pipeline_data) expected = [job_1, job_11, job_15] @@ -390,3 +403,128 @@ def test_equal_not_equal_hash(self): self.assertIn(pipeline_1, my_set) self.assertIn(pipeline_2, my_set) self.assertIn(pipeline_3, my_set) + +def run_in_parallel(function, args=tuple()): + pool = Pool(processes=1) + result = pool.apply_async(function, args) + return result, pool + +def send_pipeline(): + pipeline = Pipeline({Job(u'worker_1'): Job(u'worker_2'), + Job(u'worker_2'): Job(u'worker_3')}) + pipeline_manager = PipelineManager(api='tcp://localhost:5550', + broadcast='tcp://localhost:5551') + before = pipeline.id + pipeline_id = pipeline_manager.start(pipeline) + pipeline_manager.disconnect() + return before, pipeline_id, pipeline.id + +def send_pipeline_and_wait_finished(): + import time + + pipeline = Pipeline({Job(u'worker_1'): Job(u'worker_2'), + Job(u'worker_2'): Job(u'worker_3')}) + pipeline_manager = PipelineManager(api='tcp://localhost:5550', + broadcast='tcp://localhost:5551') + pipeline_manager.start(pipeline) + start = time.time() + while not pipeline_manager.finished(pipeline): + time.sleep(0.1) + end = time.time() + pipeline_manager.disconnect() + return {'duration': pipeline.duration, 'real_duration': end - start} + +def verify_PipelineManager_exceptions(): + pipeline_1 = Pipeline({Job(u'worker_1'): Job(u'worker_2'), + Job(u'worker_2'): Job(u'worker_3')}) + pipeline_2 = Pipeline({Job(u'worker_1'): Job(u'worker_2')}) + pipeline_manager = PipelineManager(api='tcp://localhost:5550', + broadcast='tcp://localhost:5551') + pipeline_manager.start(pipeline_1) + raise_1, raise_2 = False, False + try: + pipeline_manager.start(pipeline_1) + except ValueError: + raise_1 = True + try: + pipeline_manager.finished(pipeline_2) + except ValueError: + raise_2 = True + + pipeline_manager.disconnect() + return {'raise_1': raise_1, 'raise_2': raise_2, + 'started_at': pipeline_1.started_at} + +class PipelineManagerTest(unittest.TestCase): + def setUp(self): + self.context = zmq.Context() + self.start_router_sockets() + self.pipeline = Pipeline({Job(u'worker_1'): Job(u'worker_2'), + Job(u'worker_2'): Job(u'worker_3')}) + + def tearDown(self): + self.close_sockets() + self.context.term() + + def start_router_sockets(self): + self.api = self.context.socket(zmq.REP) + self.broadcast = self.context.socket(zmq.PUB) + self.api.bind('tcp://127.0.0.1:5550') + self.broadcast.bind('tcp://127.0.0.1:5551') + + def close_sockets(self): + self.api.close() + self.broadcast.close() + + def test_should_send_add_pipeline_with_serialized_pipeline(self): + result, pool = run_in_parallel(send_pipeline) + message = self.api.recv_json() + received = Pipeline.deserialize(message['pipeline']).serialize() + expected = self.pipeline.serialize() + self.assertEqual(set(message.keys()), set(['command', 'pipeline'])) + self.assertEqual(message['command'], 'add pipeline') + self.assertEqual(received, expected) + + pipeline_id = uuid4().hex + self.api.send_json({'answer': 'pipeline accepted', + 'pipeline id': pipeline_id}) + result.get() + pool.terminate() + + def test_should_save_pipeline_id_on_pipeline_object(self): + result, pool = run_in_parallel(send_pipeline) + message = self.api.recv_json() + pipeline_id = uuid4().hex + self.api.send_json({'answer': 'pipeline accepted', + 'pipeline id': 
pipeline_id}) + received = result.get() + pool.terminate() + self.assertEqual(received, (None, pipeline_id, pipeline_id)) + + def test_should_subscribe_to_broadcast_to_wait_for_finished_pipeline(self): + result, pool = run_in_parallel(send_pipeline_and_wait_finished) + message = self.api.recv_json() + pipeline_id = uuid4().hex + self.api.send_json({'answer': 'pipeline accepted', + 'pipeline id': pipeline_id}) + sleep(1) + self.broadcast.send('pipeline finished: id={}, duration=1.23456'\ + .format(pipeline_id)) + received = result.get() + pool.terminate() + self.assertEqual(received['duration'], 1.23456) + self.assertTrue(received['real_duration'] > 1) + + def test_should_raise_ValueError_in_some_cases(self): + result, pool = run_in_parallel(verify_PipelineManager_exceptions) + message = self.api.recv_json() + pipeline_id = uuid4().hex + self.api.send_json({'answer': 'pipeline accepted', + 'pipeline id': pipeline_id}) + start_time = time() + received = result.get() + pool.terminate() + self.assertTrue(received['raise_1']) + self.assertTrue(received['raise_2']) + started_at = received['started_at'] + self.assertTrue(start_time - 0.1 <= started_at <= start_time + 0.1) From d1330aa721656478d8d2f5236b51b77ef0bc7f94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Mon, 15 Oct 2012 13:39:15 -0300 Subject: [PATCH 29/37] Fix problems on `Job`/`Pipeline`.(de)?serialize --- pypelinin/__init__.py | 3 ++- pypelinin/pipeline.py | 23 ++++++++++++++++------- tests/test_pipeline.py | 21 +++++++++++++++++++-- 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/pypelinin/__init__.py b/pypelinin/__init__.py index 233a4d6..ce7ee18 100644 --- a/pypelinin/__init__.py +++ b/pypelinin/__init__.py @@ -3,5 +3,6 @@ from .router import Router from .client import Client from .broker import Broker -from .pipeline import Job, Pipeline, PipelineManager, PipelineForPipeliner +from .pipeline import (Job, Pipeline, PipelineManager, PipelineForPipeliner, + Worker) from .pipeliner import Pipeliner diff --git a/pypelinin/pipeline.py b/pypelinin/pipeline.py index 5c58404..fe6a32f 100644 --- a/pypelinin/pipeline.py +++ b/pypelinin/pipeline.py @@ -10,6 +10,9 @@ from . 
import Client +class Worker(object): + pass + class Job(object): def __init__(self, worker_name, data=None): self.worker_name = worker_name @@ -35,7 +38,7 @@ def serialize(self): #TODO: change this when add `input` if self.data is not None: return tuple({'worker_name': self.worker_name, - 'data': self.data}.items()) + 'data': tuple(self.data.items())}.items()) else: return tuple({'worker_name': self.worker_name}.items()) @@ -47,7 +50,7 @@ def deserialize(information): elif 'data' not in information: return Job(information['worker_name']) else: - return Job(information['worker_name'], information['data']) + return Job(information['worker_name'], dict(information['data'])) class Pipeline(object): @@ -79,7 +82,7 @@ def __init__(self, pipeline, data=None): self.sent_jobs = set() def __eq__(self, other): - return self._graph == other._graph + return self._graph == other._graph and self.data == other.data def __ne__(self, other): return not self.__eq__(other) @@ -150,7 +153,10 @@ def serialize(self): if value is not None: serialized_value = value.serialize() result.append((serialized_key, serialized_value)) - return tuple({'graph': tuple(result), 'data': self.data}.items()) + data = self.data + if data is not None: + data = tuple(self.data.items()) + return tuple({'graph': tuple(result), 'data': data}.items()) @staticmethod def _deserialize(info): @@ -162,7 +168,10 @@ def _deserialize(info): if value is not None: deserialized_value = Job.deserialize(value) new_graph.append((deserialized_key, deserialized_value)) - return new_graph, info['data'] + data = info['data'] + if data is not None: + data = dict(data) + return new_graph, data @staticmethod def deserialize(info): @@ -203,14 +212,14 @@ class PipelineManager(Client): def __init__(self, api, broadcast, poll_time=50): # milliseconds super(PipelineManager, self).__init__() self.poll_time = poll_time - self._pipelines = set() + self._pipelines = [] self._pipeline_from_id = {} self.connect(api=api, broadcast=broadcast) def start(self, pipeline): if pipeline in self._pipelines: raise ValueError('This pipeline was already started') - self._pipelines.add(pipeline) + self._pipelines.append(pipeline) request = {'command': 'add pipeline', 'pipeline': pipeline.serialize()} self.send_api_request(request) result = self.get_api_reply() diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 6ad0878..b4b7b77 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -44,10 +44,13 @@ def test_serialize_and_deserialize(self): job_with_data = Job('testing', data={'python': 42, 'spam': 'eggs'}) expected_with_data = {'worker_name': 'testing', - 'data': {'python': 42, 'spam': 'eggs'}} + 'data': tuple({'python': 42, + 'spam': 'eggs'}.items())} expected_with_data = tuple(expected_with_data.items()) self.assertEqual(job_with_data.serialize(), expected_with_data) self.assertEqual(Job.deserialize(expected_with_data), job_with_data) + self.assertEqual(Job.deserialize(job_with_data.serialize()).serialize(), + job_with_data.serialize()) class PipelineTest(unittest.TestCase): def test_only_accept_Job_objects(self): @@ -379,14 +382,19 @@ def test_serialize(self): expected['graph'] = dict(expected['graph']) self.assertEqual(result, expected) + pipeline = Pipeline({job_1: job_2}, data={'python': 42}) + self.assertEqual(pipeline, Pipeline.deserialize(pipeline.serialize())) + def test_deserialize(self): job_1, job_2, job_3, job_4, job_5 = (Job('spam'), Job('eggs'), Job('ham'), Job('python'), Job('answer_42')) - pipeline = Pipeline({job_1: job_2, job_2: 
(job_3, job_4), job_5: None}) + pipeline = Pipeline({job_1: job_2, job_2: (job_3, job_4), job_5: None}, + data={'key': 42}) serialized = pipeline.serialize() new_pipeline = Pipeline.deserialize(serialized) self.assertEqual(pipeline, new_pipeline) + self.assertEqual(serialized, new_pipeline.serialize()) def test_equal_not_equal_hash(self): job_1, job_2, job_3, job_4 = (Job('spam'), Job('eggs'), Job('ham'), @@ -404,6 +412,15 @@ def test_equal_not_equal_hash(self): self.assertIn(pipeline_2, my_set) self.assertIn(pipeline_3, my_set) + pipeline_with_data = Pipeline({job_1: job_2, job_2: (job_3, job_4)}, + data={'python': 42}) + pipeline_with_data_2 = Pipeline({job_1: job_2, job_2: (job_3, job_4)}, + data={'python': 42}) + self.assertTrue(pipeline_with_data == pipeline_with_data_2) + self.assertTrue(pipeline_with_data_2 == pipeline_with_data) + self.assertTrue(pipeline_1 != pipeline_with_data) + self.assertTrue(pipeline_with_data != pipeline_1) + def run_in_parallel(function, args=tuple()): pool = Pool(processes=1) result = pool.apply_async(function, args) From c2e08664b4598cd1627bac9bba8da09d723ccb46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Mon, 15 Oct 2012 13:39:30 -0300 Subject: [PATCH 30/37] Broker do'nt stop when can't use StoreClass --- pypelinin/broker.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/pypelinin/broker.py b/pypelinin/broker.py index 4da14ce..3e1b373 100755 --- a/pypelinin/broker.py +++ b/pypelinin/broker.py @@ -211,9 +211,16 @@ def save_monitoring_information(self): process_info['data'] = worker.job_info['data'] processes.append(process_info) data = {'host': host_info, 'timestamp': time(), 'processes': processes} - self._store.save_monitoring(data) + try: + self._store.save_monitoring(data) + except Exception as e: + #TODO: what to do? + self.logger.error('Could not save monitoring information into ' + 'store with parameters: {}. Exception: {}'\ + .format(data, e)) + return self.last_time_saved_monitoring_information = time() - self.logger.info('Saved monitoring information in MongoDB') + self.logger.info('Saved monitoring information') self.logger.debug(' Information: {}'.format(data)) def start(self): @@ -238,7 +245,14 @@ def start_job(self, job_description): 'worker_requires': worker_requires, 'data': job_description['data']} #TODO: handle if retrieve raises exception - worker_input = self._store.retrieve(info) + try: + worker_input = self._store.retrieve(info) + except Exception as e: + #TODO: what to do? + self.logger.error('Could not retrieve data from store ' + 'with parameters: {}. Exception: {}'\ + .format(info, e)) + return job_info = {'worker': worker, 'worker_input': worker_input, 'data': job_description['data'], @@ -308,7 +322,15 @@ def check_if_some_job_finished_and_do_what_you_need_to(self): #TODO: what if I want to the caller to receive job information # as a "return" from a function call? Should use a store? #TODO: handle if retrieve raises exception - self._store.save(job_information) + try: + self._store.save(job_information) + except Exception as e: + #TODO: what to do? + self.logger.error('Could not save data into store ' + 'with parameters: ' + '{}. 
Exception: {}'\ + .format(job_information, e)) + return except ValueError: self.request({'command': 'job failed', 'job id': job_id, From 8068ccf3b552db26cf0a526eafa2569ae30318aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Mon, 15 Oct 2012 14:21:44 -0300 Subject: [PATCH 31/37] Add a example of usage --- example/file_store.py | 52 ++++++++++++++++++++++++++++++++ example/my_broker.py | 33 ++++++++++++++++++++ example/my_pipeliner.py | 22 ++++++++++++++ example/my_router.py | 24 +++++++++++++++ example/requirements.txt | 1 + example/send_pipelines.py | 52 ++++++++++++++++++++++++++++++++ example/test_workers.py | 63 +++++++++++++++++++++++++++++++++++++++ example/workers.py | 53 ++++++++++++++++++++++++++++++++ 8 files changed, 300 insertions(+) create mode 100644 example/file_store.py create mode 100644 example/my_broker.py create mode 100644 example/my_pipeliner.py create mode 100644 example/my_router.py create mode 100644 example/requirements.txt create mode 100644 example/send_pipelines.py create mode 100644 example/test_workers.py create mode 100644 example/workers.py diff --git a/example/file_store.py b/example/file_store.py new file mode 100644 index 0000000..24bb528 --- /dev/null +++ b/example/file_store.py @@ -0,0 +1,52 @@ +# coding: utf-8 + +import json + + +class SimpleFileStore(object): + def __init__(self, **configuration): + self.monitoring_fp = open(configuration['monitoring filename'], 'w') + + def retrieve(self, info): + '''Retrieve data to pass to `WorkerClass.process` + + `info` has keys 'worker', 'worker_requires' and 'data': + - 'data' comes from pipeline data + - 'worker' is the worker name + - 'worker_requires' is 'requires' attribute of WorkerClass + ''' + filename = info['data']['filename'] # get filename with data + worker_requires = info['worker_requires'] + with open(filename, 'r') as fp: + file_data = json.loads(fp.read().strip()) # read filename + # get only information this worker needs + worker_input = {key: file_data[key] for key in worker_requires} + return worker_input + + def save(self, info): + '''Save information returned by `WorkerClass.process` + + `info` has keys 'worker', 'worker_requires', 'worker_result' and 'data': + - 'data' comes from pipeline data + - 'worker' is the worker name + - 'worker_requires' is 'requires' attribute of WorkerClass + - 'worker_result' is what WorkerClass.process returned + ''' + # read information from file + filename = info['data']['filename'] + with open(filename, 'r') as fp: + file_data = json.loads(fp.read().strip()) + + # update file with information returned by worker + worker_result = info['worker_result'] + file_data['_result-from-{}'.format(info['worker'])] = worker_result + for key, value in worker_result.items(): + file_data[key] = value + with open(filename, 'w') as fp: + fp.write(json.dumps(file_data)) + + def save_monitoring(self, data): + # serialize monitoring information to JSON and save in a file + data_as_json_string = json.dumps(data) + self.monitoring_fp.write(data_as_json_string + "\n") + self.monitoring_fp.flush() diff --git a/example/my_broker.py b/example/my_broker.py new file mode 100644 index 0000000..adfe36d --- /dev/null +++ b/example/my_broker.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# coding: utf-8 + +from logging import Logger, StreamHandler, Formatter, NullHandler +from multiprocessing import cpu_count +from sys import stdout + +from pypelinin import Broker + +from file_store import SimpleFileStore + + +def main(): + logger = Logger('Broker') + 
handler = StreamHandler(stdout) + formatter = Formatter('%(asctime)s - %(name)s - %(levelname)s - ' + '%(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + broker = Broker(api='tcp://localhost:5555', # router API + broadcast='tcp://localhost:5556', # router Broadcast + # class that will be called to retrieve/store information + # to pass to/to save from worker + store_class=SimpleFileStore, + logger=logger, + # name of the module that contain workers + workers='workers', + # each core will run 4 workers + number_of_workers=cpu_count() * 4) + broker.start() + +if __name__ == '__main__': + main() diff --git a/example/my_pipeliner.py b/example/my_pipeliner.py new file mode 100644 index 0000000..ea05f96 --- /dev/null +++ b/example/my_pipeliner.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python2 +# coding: utf-8 + +from sys import stdout +from logging import Logger, StreamHandler, Formatter +from pypelinin import Pipeliner + + +def main(): + logger = Logger('Pipeliner') + handler = StreamHandler(stdout) + formatter = Formatter('%(asctime)s - %(name)s - %(levelname)s - ' + '%(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + pipeliner = Pipeliner(api='tcp://localhost:5555', + broadcast='tcp://localhost:5556', logger=logger) + pipeliner.start() + +if __name__ == '__main__': + main() + diff --git a/example/my_router.py b/example/my_router.py new file mode 100644 index 0000000..6707db0 --- /dev/null +++ b/example/my_router.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python2 +# coding: utf-8 + +from sys import stdout +from logging import Logger, StreamHandler, Formatter +from pypelinin import Router + + +def main(): + logger = Logger('My Router') + handler = StreamHandler(stdout) + formatter = Formatter('%(asctime)s - %(name)s - %(levelname)s - ' + '%(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + api_host_port = ('*', 5555) + broadcast_host_port = ('*', 5556) + default_config = {'store': {'monitoring filename': '/tmp/monitoring.log'}, + 'monitoring interval': 60, } + router = Router(api_host_port, broadcast_host_port, default_config, logger) + router.start() + +if __name__ == '__main__': + main() diff --git a/example/requirements.txt b/example/requirements.txt new file mode 100644 index 0000000..0b66568 --- /dev/null +++ b/example/requirements.txt @@ -0,0 +1 @@ +pypelinin diff --git a/example/send_pipelines.py b/example/send_pipelines.py new file mode 100644 index 0000000..395c084 --- /dev/null +++ b/example/send_pipelines.py @@ -0,0 +1,52 @@ +# coding: utf-8 + +import json + +from pypelinin import Job, Pipeline, PipelineManager + + +def main(): + pipeline_definition = {Job('Downloader'): (Job('GetTextAndWords'), + Job('GetLinks'))} + urls = ['http://www.fsf.org', 'https://creativecommons.org', + 'https://github.com', 'http://emap.fgv.br', + 'https://twitter.com/turicas'] + + pipeline_manager = PipelineManager(api='tcp://127.0.0.1:5555', + broadcast='tcp://127.0.0.1:5556') + print 'Sending pipelines...' + my_pipelines = [] + for index, url in enumerate(urls): + filename = '/tmp/{}.data'.format(index) + data = json.dumps({'url': url}) + with open(filename, 'w') as fp: + fp.write(data) + pipeline = Pipeline(pipeline_definition, data={'filename': filename}) + pipeline_manager.start(pipeline) + my_pipelines.append(pipeline) + print ' Sent pipeline for url={}'.format(url) + + print 'Waiting for pipelines to finish...' 
+ pipelines_finished = 0 + while pipelines_finished < len(urls): + counter = 0 + for pipeline in my_pipelines: + if pipeline_manager.finished(pipeline): + counter += 1 + if counter != pipelines_finished: + print ' # of finished pipelines: {}'.format(counter) + pipelines_finished = counter + + durations = [str(pipeline.duration) for pipeline in my_pipelines] + print 'Pipeline durations (in seconds) = {}'.format(', '.join(durations)) + + for index, url in enumerate(urls): + filename = '/tmp/{}.data'.format(index) + with open(filename) as fp: + data = json.loads(fp.read()) + print (' url={url}, download_duration={download_duration}, ' + 'number_of_words={number_of_words}, ' + 'number_of_links={number_of_links}'.format(**data)) + +if __name__ == '__main__': + main() diff --git a/example/test_workers.py b/example/test_workers.py new file mode 100644 index 0000000..7bbcc2a --- /dev/null +++ b/example/test_workers.py @@ -0,0 +1,63 @@ +# coding: utf-8 + +import time +import unittest + +from textwrap import dedent + +from workers import Downloader, GetTextAndWords, GetLinks + + +sample_html = dedent(''' + + It's a test + + This is a link +
+ Another link +
+ Another link (repeated) +
+ The last one + + +''').strip() + +class TestDownloader(unittest.TestCase): + def test_download_local_file(self): + filename = '/tmp/testing-worker-downloader' + worker_input = {'url': filename} + with open(filename, 'w') as fp: + fp.write(sample_html) + start_time = time.time() + result = Downloader().process(worker_input) + end_time = time.time() + total_time = end_time - start_time + self.assertEqual(result['html'], sample_html) + self.assertEqual(result['length'], len(sample_html)) + self.assertTrue(result['download_duration'] < total_time) + +class TestGetTextAndWords(unittest.TestCase): + def test_simple_html(self): + worker_input = {'html': sample_html} + result = GetTextAndWords().process(worker_input) + expected_text = dedent(''' + It's a test + This is a link + Another link + Another link (repeated) + The last one + ''').strip() + expected_words = expected_text.split() + self.assertEqual(result['text'], expected_text) + self.assertEqual(result['words'], expected_words) + self.assertEqual(result['number_of_words'], len(expected_words)) + +class TestGetLinks(unittest.TestCase): + def test_non_repeat_links(self): + worker_input = {'html': sample_html} + result = GetLinks().process(worker_input) + expected_links = ['http://pypelin.in/', 'http://www.wikipedia.org/', + 'http://python.org/'] + self.assertEqual(set(result['links']), set(expected_links)) + self.assertEqual(result['number_of_links'], len(expected_links)) diff --git a/example/workers.py b/example/workers.py new file mode 100644 index 0000000..be6a42a --- /dev/null +++ b/example/workers.py @@ -0,0 +1,53 @@ +# coding: utf-8 + + +import re +import time +import urllib2 + +from pypelinin import Worker + + +__all__ = ['Downloader', 'GetTextAndWords', 'GetLinks'] + +class Downloader(Worker): + requires = ['url'] + + def process(self, data): + url = data['url'] + start_time = time.time() + opener = urllib2.build_opener() + opener.addheaders = [('User-agent', 'Mozilla/5.0')] + response = opener.open(url) + content = response.read() + response.close() + end_time = time.time() + total_time = end_time - start_time + return {'html': content, 'download_duration': total_time, + 'length': len(content)} + +regexp_tags = re.compile(r'<[^>]*>') +regexp_spaces = re.compile(r'\n[ ]*') + +class GetTextAndWords(Worker): + requires = ['html'] + + def process(self, data): + html = data['html'] + without_tags = regexp_tags.sub('', html) + text = regexp_spaces.sub('\n', without_tags).strip() + while '\n\n' in text: + text = text.replace('\n\n', '\n') + words = text.split() + return {'text': text, 'words': words, 'number_of_words': len(words)} + +regexp_links = re.compile(r'http://([^ "]*)') + +class GetLinks(Worker): + requires = ['html'] + + def process(self, data): + html = data['html'] + links = set(regexp_links.findall(html)) # do not repeat links + links = ['http://' + link for link in links] + return {'links': links, 'number_of_links': len(links)} From 7e35f3dfd8515f24288a6c8dbfa81deb33c86f6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Mon, 15 Oct 2012 14:51:39 -0300 Subject: [PATCH 32/37] Update README and add a tutorial --- README.markdown | 138 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 123 insertions(+), 15 deletions(-) diff --git a/README.markdown b/README.markdown index 988f97a..ed1f56b 100644 --- a/README.markdown +++ b/README.markdown @@ -9,7 +9,24 @@ the daemons. 
Architecture ------------ -TODO: talk about Router, Broker and Pipeliner +We have 3 daemons you need to run: + +- **Router**: it's the central point of communication in the network. Every + pipeline you want to execute must be submitted to Router, and every + other daemon communicates with Router to get pipelines to execute and + other information. You can have only one Router running. +- **Broker**: it runs worker processes and executes jobs. It does not know about + an entire pipeline: it just receives a job to be executed, retrieves the + information that job needs, runs the worker and then saves the information + returned by the worker. It uses a class defined by you (`StoreClass`) to + retrieve/save information. You should run as many Brokers as possible in your + cluster to increase the throughput of job/pipeline execution. +- **Pipeliner**: it takes care of pipelines. This daemon does not know how to + save/retrieve or even execute jobs, but it knows which job should be executed + after another one in a pipeline. Router gives Pipeliner a pipeline and + Pipeliner asks Router for the execution of each job (the request is then sent to a Broker). You can + run as many Pipeliners as you want (a single one can handle lots of pipelines + simultaneously). Usage @@ -17,35 +34,126 @@ Usage ### Daemons -TODO: talk about starting daemons +For each daemon, you need to create a script that instantiates the daemon class +and starts it. Please check our +[example](https://github.com/turicas/pypelinin/tree/develop/example) +(files `example/my_router.py`, `example/my_broker.py` and +`example/my_pipeliner.py`). ### Client -Pypelinin will provide a high level python dsl to describe your workflow. +You need to specify which jobs are in a pipeline and then send it to Router. +A pipeline is a +[directed acyclic graph](https://en.wikipedia.org/wiki/Directed_acyclic_graph) +(aka DAG) and is represented as a `dict`, where key/value pairs represent edges +(keys are the "from" side and values are the "to" side of each edge -- +[see notes about this representation](http://www.python.org/doc/essays/graphs/)). -#### Example 1 - Creating a pipeline + +#### Example: creating a pipeline and submitting it for execution ```python -from pypelinin import Pipeline +from pypelinin import Pipeline, Job, PipelineManager -pipeline = Pipeline('task1') | Pipeline('parallel_1', 'parallel_2') | Pipeline('last_task') +pipeline = Pipeline({Job('WorkerName1'): Job('WorkerName2'), + Job('WorkerName2'): Job('WorkerName3')}, + data={'foo': 'bar'}) ``` -#### Example 2 - Submitting a pipeline to be executed - -TODO: PipelineManager is not implemented +In this pipeline, `Job('WorkerName2')` will be executed after +`Job('WorkerName1')` and `Job('WorkerName3')` after `Job('WorkerName2')` -- +when you send it to `Pipeliner` (via `Router`), it'll take care of executing +the jobs in this order. `data` is what will be passed to `StoreClass` (which is +loaded on each `Broker`) when `Broker` needs to retrieve information from a +data store to pass to a worker for execution or to save information returned by +the worker. -After defined, you just have to start your pipeline. ```python -from pypelinin import Pipeline, PipelineManager manager = PipelineManager(api='tcp://localhost:5555', broadcast='tcp://localhost:5556') -pipeline = Pipeline('task1') | Pipeline('task2') -print 'starting executing tasks...'
-manager.start(pipeline) -pipeline.wait_finish() +manager.start(pipeline) # send it to the cluster to execute +while not manager.finished(pipeline): # wait for pipeline to finish + pass print 'done' ``` + +Note that you need to create a `StoreClass` and the workers (each one is +another class). These classes should be passed to a `Broker` when it is instantiated. + + +Tutorial +-------- + +Let's learn by doing! Create a virtualenv, install pypelinin and then download our +`example` folder to see it working. + + mkvirtualenv test-pypelinin + pip install pypelinin + wget https://github.com/turicas/pypelinin/tarball/develop -O pypelinin.tar.gz + tar -xfz pypelinin.tar.gz && rm pypelinin.tar.gz + cd turicas-pypelinin-*/example/ + +Now your environment is created and you need to run the daemons, each one in a +separate terminal: + +Router: + + $ python my_router.py + 2012-10-15 14:12:59,112 - My Router - INFO - Entering main loop + +Broker: + + $ python my_broker.py + 2012-10-15 14:13:17,956 - Broker - INFO - Starting worker processes + 2012-10-15 14:13:18,055 - Broker - INFO - Broker started + 2012-10-15 14:13:18,056 - Broker - INFO - Trying to connect to router... + 2012-10-15 14:13:18,057 - Broker - INFO - [API] Request to router: {'command': 'get configuration'} + 2012-10-15 14:13:18,058 - Broker - INFO - [API] Reply from router: {u'monitoring interval': 60, u'store': {u'monitoring filename': u'/tmp/monitoring.log'}} + +And Pipeliner: + + $ python my_pipeliner.py + 2012-10-15 14:13:56,476 - Pipeliner - INFO - Pipeliner started + 2012-10-15 14:13:56,477 - Pipeliner - INFO - Entering main loop + 2012-10-15 14:13:56,477 - Pipeliner - INFO - Bad bad router, no pipeline for me. + +Please read the files: +- `file\_store.py` - we have a simple store class (`SimpleFileStore`) which saves and retrieves + information from files. You can easily modify it to use a database instead. +- `workers.py` (and `test\_workers.py`) - we have created 3 workers: + `Downloader`, `GetTextAndWords` and `GetLinks`. The first one must run before + the last two. Each worker is basically a class that inherits from + `pypelinin.Worker`, has an attribute `requires` and a method `process`. +- `send\_pipelines.py` - this script creates some `Pipeline`s and + sends them for execution using a `PipelineManager` (as in the example above). You + need to run it to get the jobs executed. + +After executing `send\_pipelines.py` you can check the files +`/tmp/{0,1,2,3,4}.data` to see the results -- these files are Python +dictionaries encoded as JSON (this was done by `file\_store.SimpleFileStore`). +To read one of these files, just call this function: + +```python +import json + +def read_result_file(filename): + with open(filename, 'r') as fp: + data = fp.read() + return json.loads(data) +``` + +### Installing on other cluster nodes + +If you want to process more jobs/pipelines per second, you need to run more +Brokers on other machines. To do that, you need to: + +- Make sure `Router` is binding to an interface that is reachable from all machines + that will run `Broker` and `Pipeliner` (change `my\_router.py`); +- Update `my\_broker.py` with the new `Router` IP address/ports; +- Install `pypelinin` on all cluster machines; +- Copy `my\_broker.py`, `file\_store.py` and `workers.py` to all + "Broker machines"; +- Run everything!
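[Editor's note] The tutorial above says each worker is a class that inherits from `pypelinin.Worker`, has a `requires` attribute and a `process` method. The following minimal sketch only illustrates that contract as used in `example/workers.py`; the class name `WordCounter` and its keys are invented for illustration and are not part of the example project.

```python
# coding: utf-8
# Hypothetical worker, shown only to illustrate the `requires`/`process`
# contract used by the workers in example/workers.py.

from pypelinin import Worker


class WordCounter(Worker):
    # keys the store's `retrieve` must provide in the dict passed to `process`
    requires = ['text']

    def process(self, data):
        # `data` contains only the keys listed in `requires`
        words = data['text'].split()
        # the returned dict is handed back to the store's `save` by the Broker
        return {'words': words, 'number_of_words': len(words)}
```

To use such a worker it would presumably be added to `workers.py` (and to its `__all__` list), since the Broker receives the workers module by name via the `workers=` argument of `Broker(...)` in `my_broker.py`.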
From 7f9e938b9087695a5a2825bdb71aa85003e2d47f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Mon, 15 Oct 2012 15:04:15 -0300 Subject: [PATCH 33/37] Add installation notes about ZeroMQ --- README.markdown | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/README.markdown b/README.markdown index ed1f56b..83f4db0 100644 --- a/README.markdown +++ b/README.markdown @@ -29,6 +29,19 @@ We have 3 daemons you need to run: simultaneously). +Installation +------------ + +First you need to install `libzmq`, its headers and compilers needed to compile +it. On a Debian/Ubuntu machine, run: + + sudo aptitude install libzmq libzmq-dev build-essential + +Then, install the Python package: + + pip install pypelinin + + Usage ----- @@ -93,7 +106,7 @@ Let's learn doing! Create a virtualenv, install pypelinin and then download our mkvirtualenv test-pypelinin pip install pypelinin wget https://github.com/turicas/pypelinin/tarball/develop -O pypelinin.tar.gz - tar -xfz pypelinin.tar.gz && rm pypelinin.tar.gz + tar xfz pypelinin.tar.gz && rm pypelinin.tar.gz cd turicas-pypelinin-*/example/ Now your environment is created and you need to run the daemons, each one in a From e57d9b253865efee9b03819984117186fda6bd93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Mon, 15 Oct 2012 15:04:27 -0300 Subject: [PATCH 34/37] Update `install_requires` on setup.py --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7416ac8..0ae58a4 100644 --- a/setup.py +++ b/setup.py @@ -10,8 +10,9 @@ url='https://github.com/turicas/pypelinin/', description='Easily distribute and process jobs and pipelines among a cluster', packages=['pypelinin'], - install_requires=['pyzmq', 'psutil'], - license='GPL3', + install_requires=['pyzmq', 'psutil', 'python-graph-core', + 'python-graph-dot'], + license='LGPL', keywords=['jobs', 'tasks', 'distributed', 'pipelines', 'cluster'], classifiers=[ 'Development Status :: 3 - Alpha', From 0bedc51a9ca896e8a45f809408996b1b59ca42a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Mon, 15 Oct 2012 15:06:53 -0300 Subject: [PATCH 35/37] Add log of changes --- CHANGELOG.markdown | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 CHANGELOG.markdown diff --git a/CHANGELOG.markdown b/CHANGELOG.markdown new file mode 100644 index 0000000..6af8b3f --- /dev/null +++ b/CHANGELOG.markdown @@ -0,0 +1,10 @@ +pypelinin's ChangeLog +===================== + + +Version 0.1.0 +------------- + +- First version released! 
+- Have Router, Broker and Pipeliner on "server-side" +- Have Job, Pipeline and PipelineManager on "client-side" From ebace0b8fd48705f0bb5c4c52525f60b80a8b537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Mon, 15 Oct 2012 15:07:08 -0300 Subject: [PATCH 36/37] Add MANIFEST.in --- MANIFEST.in | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..e12e4f7 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include README.markdown +include CHANGELOG.markdown From d4988e6719f5d03b4030413730adcb4c2e7c0b29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Mon, 15 Oct 2012 15:07:17 -0300 Subject: [PATCH 37/37] Update setup.py --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 0ae58a4..06edf99 100644 --- a/setup.py +++ b/setup.py @@ -4,11 +4,11 @@ setup(name='pypelinin', - version='0.1.0-dev', + version='0.1.0', author=u'Álvaro Justen', author_email='alvarojusten@gmail.com', url='https://github.com/turicas/pypelinin/', - description='Easily distribute and process jobs and pipelines among a cluster', + description='Easily distribute jobs and pipelines among a cluster', packages=['pypelinin'], install_requires=['pyzmq', 'psutil', 'python-graph-core', 'python-graph-dot'], @@ -17,10 +17,10 @@ classifiers=[ 'Development Status :: 3 - Alpha', 'Intended Audience :: Developers', - 'License :: OSI Approved :: GNU General Public License (GPL)', - 'Natural Language :: English', + 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)', 'Operating System :: OS Independent', 'Programming Language :: Python :: 2.7', 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: System :: Distributed Computing', ], )
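[Editor's note] As a quick sanity check of the 0.1.0 packaging above, a snippet like the one below — assuming a fresh virtualenv where `pip install pypelinin` has been run — exercises only names exported by `pypelinin/__init__.py` and the `Pipeline`/`Job` API used throughout the tests; it is illustrative, not part of the patch series.

```python
# coding: utf-8
# Minimal post-install smoke test (illustrative only): builds a pipeline,
# serializes it and checks that deserialization round-trips.

from pypelinin import Job, Pipeline

pipeline = Pipeline({Job('WorkerName1'): Job('WorkerName2')},
                    data={'foo': 'bar'})
serialized = pipeline.serialize()
assert Pipeline.deserialize(serialized) == pipeline
print 'pypelinin imports fine and Pipeline serialization round-trips'
```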