# Testing Titanic with Decision Trees

This notebook uses the dataset of the Kaggle Titanic competition, https://www.kaggle.com/c/titanic/. Be sure to run `./script/kaggle_utils/download_datasets.sh` beforehand to have local versions of the dataset.

Inspired by the ppxgboost repository, https://github.com/awslabs/privacy-preserving-xgboost-inference/blob/master/example/Titanic.ipynb, which is "Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. SPDX-License-Identifier: Apache-2.0".

In [1]:
import os
import time

import numpy as np
import pandas as pd
import xgboost as xgb

from concrete.ml.sklearn import XGBClassifier as ConcreteXGBClassifier

In [2]:
# Load the Titanic datasets from their local copy. Nothing is downloaded here:
# the files must have been fetched beforehand with the helper script.
DATASET_DIR = "./local_datasets/titanic"
FEATURE_COLUMNS = ["Pclass", "Age", "Fare", "SibSp", "Parch"]

train_path = os.path.join(DATASET_DIR, "train.csv")
test_path = os.path.join(DATASET_DIR, "test.csv")

# Check both files up front, so that a missing test set produces the same helpful
# message as a missing training set instead of a raw pandas error.
if not (os.path.isfile(train_path) and os.path.isfile(test_path)):
    raise ValueError("Please launch ./script/kaggle_utils/download_datasets.sh to get datasets")

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Explicit copies, so that later edits to the feature frames never touch (nor
# warn about) the raw frames they were selected from.
X_train = train[FEATURE_COLUMNS].copy()
y_train = train[["Survived"]]
X_test = test[FEATURE_COLUMNS].copy()

In [3]:
# Sanitize a bit, ie replace empty ages by average age.
# Both statistics only consider the known (non-NaN) ages: Series.count() and
# Series.mean() skip missing values by default.
nb_age = X_train["Age"].count()
average_age = X_train["Age"].mean()

# Build a new frame instead of writing through X_train["Age"][i]: that chained
# assignment raises SettingWithCopyWarning and silently does nothing once pandas
# Copy-on-Write is enabled. Re-running this cell is harmless, as no NaN is left
# in the column after the first run.
X_train = X_train.assign(Age=X_train["Age"].fillna(average_age))

In [4]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Pclass,Age,Fare,SibSp,Parch
0,3,22.0,7.25,1,0
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,0,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,0


In [5]:
# Preview the first rows of the training labels (the "Survived" column).
y_train.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


### Train a Decision Tree with XGBoost (in clear)

In [6]:
# Fit a plain XGBoost classifier (n_estimators=10) on the training set, then
# report its accuracy on that very same training set.
train_features = X_train.to_numpy()
train_labels = y_train.to_numpy()

model = xgb.XGBClassifier(n_estimators=10)
model.fit(train_features, train_labels)

training_accuracy = model.score(train_features, train_labels)
print(f"Accuracy on training: {training_accuracy}")

Accuracy on training: 0.8013468013468014


### The same Decision Tree, but with Concrete-ML

In [7]:
# Train the Concrete-ML XGBoost classifier, compile it using the training set as
# input set, then report its accuracy on that same training set.
# NOTE(review): n_bits=6 presumably sets the quantization bit-width (the graph
# printed by show_mlir=True uses 6-bit inputs), and use_virtual_lib=True
# presumably simulates FHE rather than running it -- confirm in Concrete-ML docs.
concrete_train_features = X_train.to_numpy()
concrete_train_labels = y_train.to_numpy()

concrete_model = ConcreteXGBClassifier(
    n_bits=6,
    max_depth=7,
    n_estimators=10,
)
concrete_model.fit(concrete_train_features, concrete_train_labels)
concrete_model.compile(
    concrete_train_features,
    use_virtual_lib=True,
    show_mlir=True,
)

concrete_training_accuracy = concrete_model.score(concrete_train_features, concrete_train_labels)
print(f"Accuracy on training: {concrete_training_accuracy}")


Computation Graph
--------------------------------------------------------------------------------
 %0 = [[[-28  -1 ...   0   0]]]                # ClearTensor<int6, shape=(10, 1, 37)>
 %1 = [[4] [6] [ ... ] [0] [0]]                # ClearTensor<uint3, shape=(370, 1)>
 %2 = [[[ 1  1   ... 0  0  0]]]                # ClearTensor<int2, shape=(10, 37, 36)>
 %3 = [[47] [ 7] ... [ 0] [ 0]]                # ClearTensor<uint6, shape=(360, 1)>
 %4 = [[1 0 0 0  ...  0 0 0 0]]                # ClearTensor<uint1, shape=(360, 5)>
 %5 = _inputs                                  # EncryptedTensor<uint6, shape=(5, 1)>
 %6 = matmul(%4, %5)                           # EncryptedTensor<uint6, shape=(360, 1)>
 %7 = less(%6, %3)                             # EncryptedTensor<uint1, shape=(360, 1)>
 %8 = astype(%7, dtype=int_)                   # EncryptedTensor<uint1, shape=(360, 1)>
 %9 = reshape(%8, newshape=[10 36 -1])         # EncryptedTensor<uint1, shape=(10, 36, 1)>
%10 = matmul(%2, %9)              

### Compare clear and quantized predictions

In [8]:
# Predict on the test set with the clear XGBoost model. The model was fitted on
# NumPy arrays, so it is given a NumPy array here as well (not a DataFrame), in
# the same way as the Concrete-ML model.
clear_predictions = model.predict(X_test.to_numpy())

In [9]:
# Predict on the test set with the Concrete-ML model, without execute_in_fhe.
# NOTE(review): presumably this evaluates the quantized model in the clear -- confirm.
quantized_predictions = concrete_model.predict(X_test.to_numpy())

In [10]:
# Count the test samples on which the quantized and the clear models agree.
nb_total = len(clear_predictions)
same_as_clear = np.equal(quantized_predictions, clear_predictions)
nb_equal = np.sum(same_as_clear)
print(f"Predictions which are the same between clear and quantized: {nb_equal} / {nb_total}")

Predictions which are the same between clear and quantized: 362 / 418


### Compare clear and FHE predictions

In [11]:
# Time the predictions on the whole test set with execute_in_fhe=True.
# perf_counter() replaces time() for the measurement: it is monotonic and uses the
# highest-resolution clock available, so the duration cannot be skewed by a
# system clock adjustment happening during the run.
time_begin = time.perf_counter()
fhe_predictions = concrete_model.predict(X_test.to_numpy(), execute_in_fhe=True)
duration = time.perf_counter() - time_begin

In [12]:
# Count the test samples on which the FHE and the clear models agree, and report
# the measured duration divided by the number of predictions.
nb_total = len(fhe_predictions)
same_as_clear = np.equal(fhe_predictions, clear_predictions)
nb_equal = np.sum(same_as_clear)
print(f"Predictions which are the same between clear and FHE: {nb_equal} / {nb_total}")
print(f"Execution time per inference: {duration / nb_total:.04f} s")

Predictions which are the same between clear and FHE: 362 / 418
Execution time per inference: 0.0013 s


### Prepare the submission to Kaggle

In [13]:
# Build the Kaggle submission file: one "Survived" prediction per test passenger,
# indexed by passenger id.
# The ids come from the test set itself rather than from a hard-coded
# range(892, 892 + 418), so the file stays correct if the test set ever changes.
# NOTE(review): relies on test.csv exposing a "PassengerId" column, as the Kaggle
# Titanic test file does.
passenger_ids = pd.Index(test["PassengerId"], name="PassengerId")
submission = pd.DataFrame(fhe_predictions, columns=["Survived"], index=passenger_ids)
submission.to_csv("titanic_submission.csv")