In [23]:
import json
import shutil
from pathlib import Path

import numpy
import pandas
from pandas.testing import assert_frame_equal

from concrete.ml.dataframe.client import EncryptedDataFrameClient
from concrete.ml.dataframe.development import save_deployment
from concrete.ml.dataframe.server import EncryptedDataFrameServer

# Seed NumPy's global random generator so that the random data-frames built below are reproducible
numpy.random.seed(0)

# Development

In [24]:
# Directory holding the deployment files (created if it does not exist yet). The same path is
# later given to both the client and the server objects
DEPLOYMENT_DIR = Path("deployment")
DEPLOYMENT_DIR.mkdir(exist_ok=True)

# Save the deployment files in that directory
# NOTE(review): "merge" and 4 presumably are the operator name and bit-width; they duplicate
# OP_NAME / N_BITS defined in the client section and must be kept in sync with them
save_deployment(DEPLOYMENT_DIR, "merge", 4)

# Client

In [25]:
# Directory in which the client and the server exchange their serialized files.
# Always start from an empty one: a directory left over from a previous run is removed first
INPUTS_OUTPUTS_DIR = Path("inputs_outputs")
if INPUTS_OUTPUTS_DIR.is_dir():
    shutil.rmtree(INPUTS_OUTPUTS_DIR)
INPUTS_OUTPUTS_DIR.mkdir(exist_ok=True)

# Operator description (shared with the server)
OP_NAME = "merge"
N_BITS = 4
DTYPE = "uint" + str(N_BITS)

# Bounds only used on the client side, when drawing the random values
LOW = 1
HIGH = (1 << N_BITS) - 1

# Keyword arguments for Pandas (shared with the server)
ON = "id"
HOW = "left"

In [26]:
# Build the left data-frame: a key column (values 1 to n_left) followed by random feature columns
n_left, n_feat_left = 10, 3

dict_left = {ON: list(range(1, n_left + 1))}

# Columns are drawn one after the other so that the seeded random stream is always consumed in
# the same order
for i in range(n_feat_left):
    column_values = numpy.random.randint(low=LOW, high=HIGH, size=(n_left,))
    dict_left[f"feat_left_{i+1}"] = list(column_values)

df_left = pandas.DataFrame(dict_left, dtype="int")
print(df_left)

   id  feat_left_1  feat_left_2  feat_left_3
0   1           13            3            8
1   2            6            5            8
2   3            1            8            9
3   4            4            7            2
4   5           12            9            6
5   6            4            9           10
6   7            8           13           14
7   8           10           11            9
8   9            4            2           10
9  10            6            7            5


In [27]:
# Build the right data-frame: its keys start at index_shift, so only some of them match the keys
# of the left data-frame, and it is followed by random feature columns
n_right, n_feat_right = 3, 5
index_shift = 4

dict_right = {ON: list(range(index_shift, index_shift + n_right))}

# Columns are drawn one after the other so that the seeded random stream is always consumed in
# the same order
for i in range(n_feat_right):
    column_values = numpy.random.randint(low=LOW, high=HIGH, size=(n_right,))
    dict_right[f"feat_right_{i+1}"] = list(column_values)

df_right = pandas.DataFrame(dict_right, dtype="int")
print(df_right)

   id  feat_right_1  feat_right_2  feat_right_3  feat_right_4  feat_right_5
0   4             4             6             4             4             4
1   5             1             1             9            14             8
2   6             4             3             2             4             1


In [28]:
# Description of each operator the server has to run:
# - name: the operator's name
# - dtype: value dtype allowed in the data-frame (defines the min/max values as well as the
# circuit to use)
# - pandas_kwargs: the Pandas arguments to consider for this operator
merge_op_kwargs = {
    "name": OP_NAME,
    "dtype": DTYPE,
    "pandas_kwargs": {"how": HOW, "on": ON},
}
ops_kwargs = [merge_op_kwargs]

# Instantiate the client from the operators and the deployment files
df_client = EncryptedDataFrameClient(ops_kwargs=ops_kwargs, deployment_dir=DEPLOYMENT_DIR)

In [29]:
# Compute and serialize the inputs to send to the server (left data-frame)
# Key generation is forced for this first call; the right data-frame is processed afterwards
# with force_keygen=False so that both inputs rely on the same set of keys
serialized_input_left = df_client.pre_process_encrypt_serialize(df_left, force_keygen=True)

# Retrieve and serialize the operators to send to the server
serialized_ops_left = df_client.get_serialized_ops()

In [30]:
# Write the left-side serialized inputs and operators to JSON files for the server
input_left_path = INPUTS_OUTPUTS_DIR / "input_left.json"
ops_left_path = INPUTS_OUTPUTS_DIR / "ops_left.json"

with input_left_path.open("w") as input_left_file, ops_left_path.open("w") as ops_left_file:
    json.dump(serialized_input_left, input_left_file)
    json.dump(serialized_ops_left, ops_left_file)

In [31]:
# Compute and serialize the inputs to send to the server (right data-frame)
# Do not force keygen here so that both parties use the same set of keys (private and eval)
serialized_input_right = df_client.pre_process_encrypt_serialize(df_right, force_keygen=False)

# TODO: Retrieve the operators to send a second time only if we need to check that both parties
# agree on what to run; otherwise, the operators from a single side are enough
serialized_ops_right = df_client.get_serialized_ops()

In [32]:
# Write the right-side serialized inputs and operators to JSON files for the server
input_right_path = INPUTS_OUTPUTS_DIR / "input_right.json"
ops_right_path = INPUTS_OUTPUTS_DIR / "ops_right.json"

with input_right_path.open("w") as input_right_file:
    with ops_right_path.open("w") as ops_right_file:
        json.dump(serialized_input_right, input_right_file)
        json.dump(serialized_ops_right, ops_right_file)

# Server

In [33]:
# Instantiate the server from the deployment files
df_server = EncryptedDataFrameServer(deployment_dir=DEPLOYMENT_DIR)

In [34]:
def _read_json(file_name):
    """Load the JSON content of a file stored in the inputs/outputs directory."""
    with (INPUTS_OUTPUTS_DIR / file_name).open("r") as json_file:
        return json.load(json_file)


# Load, on the server side, the serialized inputs and operators written by the client
server_input_left = _read_json("input_left.json")
server_input_right = _read_json("input_right.json")
server_ops_left = _read_json("ops_left.json")
server_ops_right = _read_json("ops_right.json")

In [35]:
# Run the operators (here: left join only) on the server
# The server only receives what was loaded from the JSON files: the serialized inputs and
# operators of both the left and right sides
serialized_server_output = df_server.run(
    input_left=server_input_left,
    input_right=server_input_right,
    ops_left=server_ops_left,
    ops_right=server_ops_right,
)

Total FHE execution: 11.97s


In [36]:
# Write the server's serialized output to a JSON file for the client
server_output_path = INPUTS_OUTPUTS_DIR / "server_output.json"
with server_output_path.open("w") as server_output_file:
    json.dump(serialized_server_output, server_output_file)

# Client

In [37]:
# Load, on the client side, the serialized output written by the server
received_output_path = INPUTS_OUTPUTS_DIR / "server_output.json"
with received_output_path.open("r") as output_file:
    output = json.load(output_file)

In [38]:
# Recover the pandas data-frame from the server's output
# The same client object that produced the inputs is used here
df_joined_cml = df_client.deserialize_decrypt_post_process(output)

## Concrete ML vs Pandas comparison


In [48]:
def df_are_equal(df_1, df_2):
    """Check that two data-frames are identical, NaN values included.

    NaN values are not equal to one another (i.e., NaN != NaN). In this notebook, we want to check
    that the Concrete ML result matches the one from Pandas, including the positions of the NaN
    values (meaning NaN == NaN is expected to hold). Pandas' frame assertion is used for that, and
    dtypes are not compared.

    Args:
        df_1: The first data-frame.
        df_2: The second data-frame.

    Returns:
        bool: True if both data-frames are considered equal, False otherwise.
    """
    try:
        assert_frame_equal(df_1, df_2, check_dtype=False)
    except AssertionError:
        # Any mismatch reported by Pandas means the data-frames differ
        return False
    else:
        return True

In [49]:
# Reference result: join both clear data-frames directly with Pandas, using the same arguments
# as the ones sent to the server
df_joined_pandas = df_left.merge(df_right, how=HOW, on=ON)

print(df_joined_pandas)

   id  feat_left_1  feat_left_2  feat_left_3  feat_right_1  feat_right_2  \
0   1           13            3            8           NaN           NaN   
1   2            6            5            8           NaN           NaN   
2   3            1            8            9           NaN           NaN   
3   4            4            7            2           4.0           6.0   
4   5           12            9            6           1.0           1.0   
5   6            4            9           10           4.0           3.0   
6   7            8           13           14           NaN           NaN   
7   8           10           11            9           NaN           NaN   
8   9            4            2           10           NaN           NaN   
9  10            6            7            5           NaN           NaN   

   feat_right_3  feat_right_4  feat_right_5  
0           NaN           NaN           NaN  
1           NaN           NaN           NaN  
2           NaN          

In [51]:
# Compare the joined Pandas data-frame to the Concrete ML result
results_match = df_are_equal(df_joined_pandas, df_joined_cml)
print("Concrete ML result is equal to Pandas:", results_match, "\n")

print(df_joined_cml)

Concrete ML result is equal to Pandas: True 

     id  feat_left_1  feat_left_2  feat_left_3  feat_right_1  feat_right_2  \
0   1.0         13.0          3.0          8.0           NaN           NaN   
1   2.0          6.0          5.0          8.0           NaN           NaN   
2   3.0          1.0          8.0          9.0           NaN           NaN   
3   4.0          4.0          7.0          2.0           4.0           6.0   
4   5.0         12.0          9.0          6.0           1.0           1.0   
5   6.0          4.0          9.0         10.0           4.0           3.0   
6   7.0          8.0         13.0         14.0           NaN           NaN   
7   8.0         10.0         11.0          9.0           NaN           NaN   
8   9.0          4.0          2.0         10.0           NaN           NaN   
9  10.0          6.0          7.0          5.0           NaN           NaN   

   feat_right_3  feat_right_4  feat_right_5  
0           NaN           NaN           NaN  
1  