Skip to content

Commit

Permalink
Merge pull request from GHSA-5m22-cfq9-86x6
Browse files Browse the repository at this point in the history
Removing pickle serialization/deserialization
  • Loading branch information
frankcorneliusmartin committed Jun 12, 2023
2 parents e3b5306 + 4f8da32 commit e62f03b
Show file tree
Hide file tree
Showing 15 changed files with 91 additions and 529 deletions.
26 changes: 0 additions & 26 deletions docs/algorithms/classic_tutorial.rst
Expand Up @@ -519,29 +519,3 @@ address (harbor2.vantage6.ai) and the project name (demo).
.. note::
Reach out to us on `Discord <https://discord.gg/yAyFf6Y>`__ if you want to
use our registries (harbor.vantage6.ai and harbor2.vantage6.ai).

Cross-language serialization
----------------------------

It is possible that a vantage6 algorithm is developed in one programming
language, but you would like to run the task from another language. For
these use-cases, the Python algorithm wrapper and client support
cross-language serialization. By default, input to the algorithms and
output back to the client are serialized using pickle. However, it is
possible to define a different serialization format.

Input and output serialization can be specified as follows:

.. code:: python
client.post_task(
name='mytask',
image='harbor2.vantage6.ai/testing/v6-test-py',
collaboration_id=COLLABORATION_ID,
organization_ids=ORGANIZATION_IDS,
data_format='json', # Specify input format to the algorithm
input_={
'method': 'column_names',
'kwargs': {'data_format': 'json'}, # Specify output format
}
)
8 changes: 2 additions & 6 deletions docs/user/pyclient.rst
Expand Up @@ -85,8 +85,6 @@ new user:
# Human readable description
# input : dict
# Algorithm input
# data_format : str, optional
# IO data format used, by default LEGACY
# database: str, optional
# Name of the database to use. This should match the key
# in the node configuration files. If not specified the
Expand Down Expand Up @@ -396,8 +394,7 @@ us create a task that runs the master algorithm of the
name="an-awesome-task",
image="harbor2.vantage6.ai/demo/average",
description='',
input=input_,
data_format='json')
input=input_)
Note that the ``kwargs`` we specified in the ``input_`` are specific to
this algorithm: this algorithm expects an argument ``column_name`` to be
Expand Down Expand Up @@ -431,8 +428,7 @@ master algorithm will normally do:
name="an-awesome-task",
image="harbor2.vantage6.ai/demo/average",
description='',
input=input_,
data_format='json')
input=input_)
**Inspecting the results**

Expand Down
49 changes: 12 additions & 37 deletions vantage6-client/tests/test_client.py
@@ -1,6 +1,6 @@
import base64
import json
import pickle

from unittest import TestCase
from unittest.mock import patch, MagicMock

Expand All @@ -26,48 +26,20 @@

class TestClient(TestCase):

def test_post_task_legacy_method(self):
    # With the 'legacy' data format the posted input, once base64-decoded,
    # is a plain pickle of the input dictionary.
    encoded = TestClient.post_task_on_mock_client(SAMPLE_INPUT, 'legacy')
    payload = pickle.loads(base64.b64decode(encoded))
    assert {'method': 'test-task'} == payload

def test_post_json_task(self):
    # With the 'json' data format the posted input, once base64-decoded,
    # is the JSON text prefixed with the format marker "json.".
    encoded = TestClient.post_task_on_mock_client(SAMPLE_INPUT, 'json')
    raw = base64.b64decode(encoded)
    assert b'json.{"method": "test-task"}' == raw

def test_post_pickle_task(self):
post_input = TestClient.post_task_on_mock_client(SAMPLE_INPUT, 'pickle')
def test_post_task(self):
post_input = TestClient.post_task_on_mock_client(SAMPLE_INPUT)
decoded_input = base64.b64decode(post_input)
assert b'{"method": "test-task"}' == decoded_input

assert b'pickle.' == decoded_input[0:7]

assert {'method': 'test-task'} == pickle.loads(decoded_input[7:])

def test_get_legacy_results(self):
    # A bare pickle payload (no format prefix) is expected to come back
    # as the unpickled value under the 'result' key.
    expected = [{'result': 1}]
    received = TestClient._receive_results_on_mock_client(pickle.dumps(1))
    assert received == expected

def test_get_json_results(self):
mock_result = b'json.' + json.dumps({'some_key': 'some_value'}).encode()
def test_get_results(self):
mock_result = json.dumps({'some_key': 'some_value'}).encode()

results = TestClient._receive_results_on_mock_client(mock_result)

assert results == [{'result': {'some_key': 'some_value'}}]

def test_get_pickle_results(self):
    # A payload made of the b'pickle.' marker followed by pickled data is
    # expected to come back as the unpickled value under the 'result' key.
    values = [1, 2, 3, 4, 5]
    payload = b''.join((b'pickle.', pickle.dumps(values)))

    received = TestClient._receive_results_on_mock_client(payload)

    assert received == [{'result': [1, 2, 3, 4, 5]}]

@staticmethod
def post_task_on_mock_client(input_, serialization: str) -> dict[str, any]:
def post_task_on_mock_client(input_) -> dict[str, any]:
mock_requests = MagicMock()
mock_requests.get.return_value.status_code = 200
mock_requests.post.return_value.status_code = 200
Expand All @@ -76,8 +48,11 @@ def post_task_on_mock_client(input_, serialization: str) -> dict[str, any]:
with patch.multiple('vantage6.client', requests=mock_requests, jwt=mock_jwt):
client = TestClient.setup_client()

client.post_task(name=TASK_NAME, image=TASK_IMAGE, collaboration_id=COLLABORATION_ID,
organization_ids=ORGANIZATION_IDS, input_=input_, data_format=serialization)
client.post_task(
name=TASK_NAME, image=TASK_IMAGE,
collaboration_id=COLLABORATION_ID,
organization_ids=ORGANIZATION_IDS, input_=input_
)

# In a requests.post call, the JSON body is passed via the keyword argument 'json'.
# call_args provides a tuple with the positional arguments followed by a dict with the keyword arguments.
Expand Down
17 changes: 1 addition & 16 deletions vantage6-client/tests/test_deserialization.py
@@ -1,7 +1,5 @@
import pickle
from pathlib import Path
from vantage6.tools import deserialization
from vantage6.tools.data_format import DataFormat

SIMPLE_TARGET_DATA = {'key': 'value'}

Expand All @@ -12,19 +10,6 @@ def test_deserialize_json(tmp_path: Path):
json_path.write_text(data)

with json_path.open('r') as f:
result = deserialization.deserialize(f, DataFormat.JSON)
result = deserialization.deserialize(f)

assert SIMPLE_TARGET_DATA == result


def test_deserialize_pickle(tmp_path: Path):
    """Check that a pickled dictionary on disk deserializes back to itself."""
    pickle_path = tmp_path / 'picklefile.pkl'
    # Write the fixture in one step; the bytes match pickle.dump() output.
    pickle_path.write_bytes(pickle.dumps({'key': 'value'}))

    with pickle_path.open('rb') as handle:
        loaded = deserialization.deserialize(handle, DataFormat.PICKLE)
        assert SIMPLE_TARGET_DATA == loaded
119 changes: 29 additions & 90 deletions vantage6-client/tests/test_docker_wrapper.py
@@ -1,13 +1,10 @@
import json
import pickle
from pathlib import Path
from unittest.mock import patch, MagicMock

import pandas as pd
from pytest import raises

from vantage6.tools import wrapper
from vantage6.tools.exceptions import DeserializationException

MODULE_NAME = 'algorithm_module'
DATA = 'column1,column2\n1,2'
Expand All @@ -16,107 +13,49 @@
JSON_FORMAT = 'json'
SEPARATOR = '.'
SAMPLE_DB = pd.DataFrame([[1, 2]], columns=['column1', 'column2'])
PICKLE_FORMAT = 'pickle'

MOCK_SPARQL_ENDPOINT = 'sparql://some_triplestore'


def test_old_pickle_input_wrapper(tmp_path):
"""
Testing if wrapper still parses legacy input.
"""
input_file = tmp_path / 'input.pkl'

with input_file.open('wb') as f:
pickle.dump(INPUT_PARAMETERS, f)
# def test_json_input_without_format_raises_deserializationexception(tmp_path):
# """
# It should only be possible to provide json input if it is preceded by the
# string "json." in unicode. Otherwise a `DeserializationException` should
# be thrown.
# """
# input_file = tmp_path / 'input.json'

output_file = run_docker_wrapper_with_echo_db(input_file, tmp_path)
assert file_echoes_db(output_file)
# with input_file.open('wb') as f:
# f.write(json.dumps(INPUT_PARAMETERS).encode())

# with raises(DeserializationException):
# run_docker_wrapper_with_echo_db(input_file, tmp_path)

def test_json_input_without_format_raises_deserializationexception(tmp_path):
    """
    JSON input is only accepted when the data is prefixed with the string
    "json."; raw JSON without that marker must make the wrapper raise a
    `DeserializationException`.
    """
    raw_json = json.dumps(INPUT_PARAMETERS).encode()
    input_file = tmp_path / 'input.json'
    input_file.write_bytes(raw_json)

    with raises(DeserializationException):
        run_docker_wrapper_with_echo_db(input_file, tmp_path)


def test_json_input_with_format_succeeds(tmp_path):
input_file = tmp_path / 'input.txt'

with input_file.open('wb') as f:
f.write(f'JSON{SEPARATOR}'.encode())
f.write(json.dumps(INPUT_PARAMETERS).encode())

output_file = run_docker_wrapper_with_echo_db(input_file, tmp_path)
assert file_echoes_db(output_file)
# def test_json_input_with_format_succeeds(tmp_path):
# input_file = tmp_path / 'input.txt'

# with input_file.open('wb') as f:
# f.write(json.dumps(INPUT_PARAMETERS).encode())

def test_pickle_input_with_format_succeeds(tmp_path):
input_file = create_pickle_input(tmp_path)
output_file = run_docker_wrapper_with_echo_db(input_file, tmp_path)
assert file_echoes_db(output_file)
# output_file = run_docker_wrapper_with_echo_db(input_file, tmp_path)
# assert file_echoes_db(output_file)


def test_wrapper_serializes_pickle_output(tmp_path):
    """Check that requesting pickle output yields 'pickle.' + pickled data."""
    params = {
        'method': 'hello_world',
        'output_format': PICKLE_FORMAT
    }
    output_file = run_docker_wrapper_with_echo_db(
        create_pickle_input(tmp_path, params), tmp_path
    )

    # The data format is announced by a "<format>." marker at the start of
    # the output; everything after the marker is the pickled payload.
    marker = f'{PICKLE_FORMAT}.'
    with output_file.open('rb') as stream:
        header = stream.read(len(PICKLE_FORMAT) + 1).decode()
        assert header == marker

        frame = pickle.loads(stream.read())
        pd.testing.assert_frame_equal(SAMPLE_DB, frame)


def test_wrapper_serializes_json_output(tmp_path):
    """Check that requesting json output yields 'json.' + a JSON table."""
    params = {'method': 'hello_world', 'output_format': JSON_FORMAT}
    output_file = run_docker_wrapper_with_echo_db(
        create_pickle_input(tmp_path, params), tmp_path
    )

    # The data format is announced by a "<format>." marker at the start of
    # the output.
    marker = f'{JSON_FORMAT}.'
    with output_file.open('rb') as stream:
        header = stream.read(len(JSON_FORMAT) + 1).decode()
        assert header == marker

        # The remainder is the table produced by the triggered algorithm,
        # in a form pandas can parse.
        frame = pd.read_json(stream.read())
        pd.testing.assert_frame_equal(SAMPLE_DB, frame)


def create_pickle_input(tmp_path, input_parameters=None):
    """
    Write a pickle-format wrapper input file and return its path.

    The file ``input.pkl`` is created inside ``tmp_path`` and contains the
    marker ``PICKLE`` + ``SEPARATOR`` followed by the pickled parameters.
    When ``input_parameters`` is None, ``INPUT_PARAMETERS`` is used.
    """
    params = INPUT_PARAMETERS if input_parameters is None else input_parameters

    target = tmp_path / 'input.pkl'
    target.write_bytes(f'PICKLE{SEPARATOR}'.encode() + pickle.dumps(params))
    return target
# def test_wrapper_serializes_json_output(tmp_path):
# input_parameters = {'method': 'hello_world', 'output_format': JSON_FORMAT}
# input_file = create_pickle_input(tmp_path, input_parameters)

# output_file = run_docker_wrapper_with_echo_db(input_file, tmp_path)

def file_echoes_db(output_file):
with output_file.open('rb') as f:
result = pickle.load(f)
target = SAMPLE_DB
# with output_file.open('rb') as f:
# # Check whether the data is preceded by json format string
# assert f.read(len(JSON_FORMAT) + 1).decode() == f'{JSON_FORMAT}.'

return target.equals(result)
# # Since the echo_db algorithm was triggered, output will be table that
# # can be read by pandas.
# result = pd.read_json(f.read())
# pd.testing.assert_frame_equal(SAMPLE_DB, result)


def run_docker_wrapper_with_echo_db(input_file, tmp_path):
Expand Down Expand Up @@ -169,7 +108,7 @@ def test_sparql_docker_wrapper_passes_dataframe(
input_args = {'query': 'select *'}

with input_file.open('wb') as f:
    # json.dumps() only returns a string and accepts no file argument
    # (passing one raises TypeError), so the text is written explicitly.
    # The file is opened in binary mode, hence the encode().
    f.write(json.dumps(input_args).encode())

with token_file.open('w') as f:
f.write(TOKEN)
Expand Down
29 changes: 3 additions & 26 deletions vantage6-client/tests/test_serialization.py
@@ -1,14 +1,8 @@
import pickle

from pytest import mark

from vantage6.tools import serialization
import pandas as pd

from vantage6.tools.data_format import DataFormat

JSON = 'json'


@mark.parametrize("data,target", [
# Default serialization
Expand All @@ -17,28 +11,11 @@
({'hello': 'goodbye'}, '{"hello": "goodbye"}'),
# Pandas serialization
(pd.DataFrame([[1, 2, 3]], columns=['one', 'two', 'three']), '{"one":{"0":1},"two":{"0":2},"three":{"0":3}}'),
(pd.DataFrame([[1, 2, 3]], columns=['one', 'two', 'three']),
'{"one":{"0":1},"two":{"0":2},"three":{"0":3}}'),
(pd.Series([1, 2, 3]), '{"0":1,"1":2,"2":3}')
])
def test_json_serialization(data, target):
result = serialization.serialize(data, DataFormat.JSON)
result = serialization.serialize(data)

assert target == result.decode()


@mark.parametrize("data", [
    {'key': 'value'},
    123,
    [1, 2, 3],
])
def test_pickle_serialization(data):
    """Check that pickle-format serialization round-trips plain values."""
    serialized = serialization.serialize(data, DataFormat.PICKLE)
    restored = pickle.loads(serialized)

    assert data == restored


def test_pickle_serialization_pandas():
    """Check that pickle-format serialization round-trips a DataFrame."""
    frame = pd.DataFrame([1, 2, 3])
    restored = pickle.loads(serialization.serialize(frame, DataFormat.PICKLE))

    # DataFrames need pandas' own comparison helper rather than ``==``.
    pd.testing.assert_frame_equal(frame, restored)

0 comments on commit e62f03b

Please sign in to comment.