In [1]:
import json
import shutil
from pathlib import Path

import numpy
import pandas
from pandas.testing import assert_frame_equal

from concrete.ml.pandas import (
    encrypt_from_pandas,
    get_client_and_eval_keys,
    load_encrypted_dataframe,
    load_server,
)

# Print every column of a wide data-frame instead of eliding the middle ones
pandas.set_option("display.max_columns", None)
# Allow up to 1000 characters per printed line before Pandas wraps the output
pandas.set_option("display.width", 1000)
# Seed NumPy's global random generator so any NumPy-based randomness is repeatable across runs
numpy.random.seed(0)

# Clients

In [2]:
# INPUTS_OUTPUTS_DIR: where the encrypted data-frames exchanged between client and server are saved
# DATA_DIR: where the input CSV files are read from
INPUTS_OUTPUTS_DIR = Path("inputs_outputs")
DATA_DIR = Path("data")

# Start from an empty exchange directory: delete what a previous run left behind, then re-create it
if INPUTS_OUTPUTS_DIR.is_dir():
    shutil.rmtree(INPUTS_OUTPUTS_DIR)

INPUTS_OUTPUTS_DIR.mkdir(exist_ok=True)

# Pandas kwargs used for the merge: a left join on the "id" column
HOW = "left"
ON = "id"

# Operator name passed to the key-generation and encryption helpers
# TODO: remove this if possible
OPERATOR = "left_right_join"

In [3]:
# Load the two clear (non-encrypted) input tables; the first CSV column is used as the row index
df_left = pandas.read_csv(DATA_DIR / "df_left.csv", index_col=0)
df_right = pandas.read_csv(DATA_DIR / "df_right.csv", index_col=0)

print(df_left)

   id  feat_left_1  feat_left_2  feat_left_3
0   1           13            3            8
1   2            6            5            8
2   3            1            8            9
3   4            4            7            2
4   5           12            9            6
5   6            4            9           10
6   7            8           13           14
7   8           10           11            9
8   9            4            2           10
9  10            6            7            5


In [4]:
# Display the right-hand table that is joined onto df_left further down
print(df_right)

   id  feat_right_1  feat_right_2  feat_right_3  feat_right_4  feat_right_5
0   4             4             6             4             4             4
1   5             1             1             9            14             8
2   6             4             3             2             4             1


In [5]:
# Create the client object and the evaluation keys for the operator named by OPERATOR.
# NOTE(review): presumably the client holds the private key material and stays on the client side,
# while the evaluation keys are the part meant to be shared - confirm against the Concrete ML docs
client_1, evaluation_keys_1 = get_client_and_eval_keys(OPERATOR)

In [6]:
# Encrypt both tables with the same client and evaluation keys; the two encrypted data-frames are
# merged with each other in the server section
df_left_enc = encrypt_from_pandas(df_left, client_1, evaluation_keys_1, OPERATOR)
df_right_enc = encrypt_from_pandas(df_right, client_1, evaluation_keys_1, OPERATOR)

In [7]:
# Write both encrypted data-frames to JSON files in the exchange directory; the server section
# loads them back from these same paths
df_left_enc_path = INPUTS_OUTPUTS_DIR / "df_left_enc.json"
df_right_enc_path = INPUTS_OUTPUTS_DIR / "df_right_enc.json"

df_left_enc.to_json(df_left_enc_path)
df_right_enc.to_json(df_right_enc_path)

# Server

In [8]:
# Load the server for the same operator the client keys and encrypted inputs were created with;
# reusing the OPERATOR constant (instead of repeating the string) keeps both sides in sync
server = load_server(OPERATOR)

In [9]:
# Load the encrypted data-frames from the JSON files written in the client section
df_left_enc_server = load_encrypted_dataframe(df_left_enc_path)
df_right_enc_server = load_encrypted_dataframe(df_right_enc_path)

In [10]:
# Merge the two encrypted data-frames using the HOW / ON arguments from the configuration cell;
# unlike `pandas.DataFrame.merge`, the server object is passed as an extra positional argument
df_joined_enc_server = df_left_enc_server.merge(df_right_enc_server, server, how=HOW, on=ON)

In [11]:
# Write the encrypted join result to a JSON file in the exchange directory; the client section
# loads it back from this same path
df_joined_enc_server_path = INPUTS_OUTPUTS_DIR / "df_joined_enc.json"

df_joined_enc_server.to_json(df_joined_enc_server_path)

# Client

In [12]:
# Load the encrypted join result from the JSON file written in the server section
df_joined_enc = load_encrypted_dataframe(df_joined_enc_server_path)

In [13]:
# Decrypt the joined data-frame into a Pandas data-frame, using the same client that encrypted
# the inputs
df_joined_cml = df_joined_enc.decrypt_to_pandas(client_1)

## Concrete ML vs Pandas comparison


In [14]:
def df_are_equal(df_1, df_2):
    """Return True when two data-frames hold the same content, NaN positions included.

    A plain element-wise comparison cannot be used here because NaN never compares equal to
    itself (NaN != NaN). This notebook needs to check that the Concrete ML result matches the
    Pandas one, including where the NaNs are located, so NaNs found at the same position must
    count as equal. Column dtypes are not compared (`check_dtype=False`).

    Args:
        df_1: First data-frame to compare.
        df_2: Second data-frame to compare.

    Returns:
        bool: True if `assert_frame_equal` accepts the pair, False if it raises AssertionError.
    """
    try:
        assert_frame_equal(df_1, df_2, check_dtype=False)
    except AssertionError:
        # Any mismatch reported by Pandas means the frames differ
        return False
    else:
        return True

In [15]:
# Compute the left-joined data-frame using Pandas
df_joined_pandas = pandas.merge(df_left, df_right, on=ON, how=HOW)

print(df_joined_pandas)

   id  feat_left_1  feat_left_2  feat_left_3  feat_right_1  feat_right_2  feat_right_3  feat_right_4  feat_right_5
0   1           13            3            8           NaN           NaN           NaN           NaN           NaN
1   2            6            5            8           NaN           NaN           NaN           NaN           NaN
2   3            1            8            9           NaN           NaN           NaN           NaN           NaN
3   4            4            7            2           4.0           6.0           4.0           4.0           4.0
4   5           12            9            6           1.0           1.0           9.0          14.0           8.0
5   6            4            9           10           4.0           3.0           2.0           4.0           1.0
6   7            8           13           14           NaN           NaN           NaN           NaN           NaN
7   8           10           11            9           NaN           NaN        

In [16]:
# Compare the joined Pandas data-frame to the Concrete ML result
print("Concrete ML result is equal to Pandas:", df_are_equal(df_joined_pandas, df_joined_cml), "\n")

print(df_joined_cml)

Concrete ML result is equal to Pandas: True 

     id  feat_left_1  feat_left_2  feat_left_3  feat_right_1  feat_right_2  feat_right_3  feat_right_4  feat_right_5
0   1.0         13.0          3.0          8.0           NaN           NaN           NaN           NaN           NaN
1   2.0          6.0          5.0          8.0           NaN           NaN           NaN           NaN           NaN
2   3.0          1.0          8.0          9.0           NaN           NaN           NaN           NaN           NaN
3   4.0          4.0          7.0          2.0           4.0           6.0           4.0           4.0           4.0
4   5.0         12.0          9.0          6.0           1.0           1.0           9.0          14.0           8.0
5   6.0          4.0          9.0         10.0           4.0           3.0           2.0           4.0           1.0
6   7.0          8.0         13.0         14.0           NaN           NaN           NaN           NaN           NaN
7   8.0         10