# Predicting Ride Price using Linear Regression
There are 5 parts in this notebook.
1. Training on plaintext data
2. Evaluate model on plaintext data
3. Quantize the model
4. Compile the model to the equivalent FHE circuit
5. Evaluate the FHE model on encrypted data

In [1]:
# import required packages
import time
import numpy as np
from sklearn.linear_model import LinearRegression as SklearnLinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from concrete.ml.sklearn import LinearRegression as ConcreteLinearRegression

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# import dataset as panda's dataframe
import pandas as pd
taxi_dataset = pd.read_csv("./data/taxi_dataset.csv")
print(taxi_dataset)

        duration  price   tip  passengers  distance  hour_of_day  \
0          290.0   5.80  0.00           1  0.807897           14   
1          972.0  11.80  0.00           2  1.374723           11   
2          148.0   5.30  0.00           1  0.541303           19   
3          161.0   4.80  0.00           1  0.538977            8   
4          970.0  24.36  4.06           1  7.119970           20   
...          ...    ...   ...         ...       ...          ...   
485841     639.0  13.80  2.00           2  3.353926            2   
485842     241.0   7.56  1.26           1  1.213102           20   
485843     344.0   7.55  1.25           3  1.094541           17   
485844     564.0  11.75  1.95           1  1.597917           22   
485845    1386.0  15.80  0.00           1  1.861415           18   

        hour_of_week  day_of_week  start_location_id  end_location_id  \
0                110            4                 75               74   
1                 35            1    

In [3]:
# ConcreteMl currently does not support input or output data in categorical, string, or generic object data types
# so let's convert the data type of the target array to integer
# TODO: is this process necessary?
# TODO: maybe better to name target price
# target = taxi_dataset.price.astype("int")
target = taxi_dataset.price

print(target)

# split the inputs and targets into a train/test dataset
# TODO: look up random_state parameter
# split the dataset into 80% training data and 20% testing data
X_train, X_test, y_train, y_test = train_test_split(
    taxi_dataset, target, test_size=0.4, random_state=42
)


# print(X_test)


0          5.80
1         11.80
2          5.30
3          4.80
4         24.36
          ...  
485841    13.80
485842     7.56
485843     7.55
485844    11.75
485845    15.80
Name: price, Length: 485846, dtype: float64


## 1. Training on plaintext data

In [4]:
# train LinearRegression model on the clear
sklearn_lr = SklearnLinearRegression()
sklearn_lr.fit(X_train, y_train)

## 2. Evaluate model on plaintext data

In [5]:
time_begin = time.time()
y_pred = sklearn_lr.predict(X_test)
print(y_pred)
execution_time_on_plaintext = (time.time() - time_begin) / len(X_test)
print(f"Model evaluation time on the clear: {execution_time_on_plaintext:.8f} seconds per sample")
# Compute the R2 scores
sklearn_r2_score = r2_score(y_test, y_pred)
print(sklearn_r2_score)

[11.3  11.3  36.8  ... 11.62 24.8  12.96]
Model evaluation time on the clear: 0.00000013 seconds per sample
1.0


## 3. Quantize the model

In [6]:
# Quantize the model (convert all the floating point numbers of weights and biases (32 bits) to integers (8 bits))
# set the quantization parameter on input as 8 bits
concrete_lr = ConcreteLinearRegression(n_bits=8)

# train the concrete linear regression model on clear data
concrete_lr.fit(X_train, y_train)

# Now, we can test our Concrete ML model on the clear test data
y_pred_q = concrete_lr.predict(X_test)

# Compute the R2 scores
quantized_r2_score = r2_score(y_test, y_pred_q)

print(quantized_r2_score)

0.13915080803321267


## 4. Compile the model to the equivalent FHE circuit

In [10]:
# Compile the quantized model in to FHE circuit and run inference on it
# You have to provide the training dataset in order to compile the quantized model to equivalent FHE circuit
time_begin = time.time()
fhe_circuit = concrete_lr.compile(X_train)
print(f"Generating a key for a {fhe_circuit.graph.maximum_integer_bit_width()}-bit circuit")
print(f"Compilation time: {time.time() - time_begin:.4f} seconds")

# Compiler returns the circuit, which can be used to generated a secrete key and evaluation key
# secrete key: used for encryption and decryption. only accesible to the client
# evaluation key: used to evaluate the cirucit on encypted data. anyone can access it
print(f"Generating a key for a {fhe_circuit.graph.maximum_integer_bit_width()}-bit circuit")

time_begin = time.time()
fhe_circuit.client.keygen(force=False)
print(f"Key generation time: {time.time() - time_begin:.4f} seconds")

Generating a key for a 19-bit circuit
Compilation time: 37.7040 seconds
Generating a key for a 19-bit circuit
Key generation time: 0.0002 seconds


## 5. Evaluate the FHE model on encrypted data

In [8]:
# Evaluate the FHE-version of the model
time_begin = time.time()
y_pred_fhe = concrete_lr.predict(X_test, fhe="execute")

execution_time_on_ciphertext = (time.time() - time_begin) / len(X_test)
print(f"Execution time: {execution_time_on_ciphertext:.8f} seconds per sample")
print(f"which is {(execution_time_on_ciphertext / execution_time_on_plaintext):.2f} times slower than prediction on the plaintext data")

Execution time: 0.00303060 seconds per sample
which is 22718.86 times slower than prediction on the plaintext data


In [9]:
# Measure R2 score of FHE version of the model
fhe_r2_score = r2_score(y_test, y_pred_fhe)

print("R^2 scores:")
print(f"scikit-learn (clear): {sklearn_r2_score:.4f}")
print(f"Concrete ML (quantized model on plaintext): {quantized_r2_score:.4f}")
print(f"Concrete ML (FHE model on ciphertext): {fhe_r2_score:.4f}")

# Measure the error of the FHE quantized model with respect to the clear scikit-learn float model
concrete_score_difference = abs(fhe_r2_score - quantized_r2_score) * 100 / quantized_r2_score
print(
    "\nRelative score difference for Concrete ML (quantized model on clear) vs. Concrete ML (FHE):",
    f"{concrete_score_difference:.2f}%",
)

# Measure the error of the FHE quantized model with respect to the clear float model
score_difference = abs(fhe_r2_score - sklearn_r2_score) * 100 / sklearn_r2_score
print(
    "Relative score difference for scikit-learn (clear) vs. Concrete ML (FHE) scores:",
    f"{score_difference:.2f}%",
)

R^2 scores:
scikit-learn (clear): 1.0000
Concrete ML (quantized model on plaintext): 0.1392
Concrete ML (FHE model on ciphertext): 0.1392

Relative score difference for Concrete ML (quantized model on clear) vs. Concrete ML (FHE): 0.00%
Relative score difference for scikit-learn (clear) vs. Concrete ML (FHE) scores: 86.08%
