# Predicting Ride Price using Linear Regression
There are 7 parts in this notebook.

0. Preprocess the dataframe
1. Training on plaintext data
2. Evaluate model on plaintext data
3. Quantize the model
4. Compile the model to the equivalent FHE circuit
5. Evaluate the FHE model on encrypted data
6. MAE and R^2 measurement

In [5]:
# import required packages
import time
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPRegressor as SklearnMLPRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from concrete.ml.sklearn import NeuralNetRegressor as ConcreteNNRegressor
import torch
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

In [38]:
house_dataset = pd.read_csv("./data/Housing.csv")
df = pd.DataFrame(house_dataset)
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [39]:
#preprocessing

scaler_cat = OneHotEncoder()
scaler_y = MinMaxScaler()
std = np.std(df['price'])
y = np.array(df.pop('price'), dtype=np.float32)
y = np.expand_dims(y, axis=-1)
y = scaler_y.fit_transform(y)
x = np.empty((len(df), 0))

categorical_features = df.select_dtypes(include=['object']).columns.tolist()

for key, values in df.items():
  el = np.expand_dims(np.array(df[key]), axis=-1)
  if key in categorical_features:
    el = scaler_cat.fit_transform(el).toarray()
  x = np.concatenate((x, el), axis=1)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## 0. Preprocess the dataframe

## 1. Training on plaintext data

In [40]:
# train LinearRegression model on the clear
sklearn_lr = SklearnLinearRegression()
sklearn_lr.fit(X_train, y_train)

## 2. Evaluate model on plaintext data

In [60]:
time_begin = time.time()
# y_pred = sklearn_lr.predict(X_test)
execution_time_on_plaintext = (time.time() - time_begin) / len(X_test)
print(f"Model evaluation time in the clear: {execution_time_on_plaintext:.8f} seconds per sample")

# Compute MAE
# mae_clear = mean_absolute_error(y_test, y_pred)

y_pred = sklearn_lr.predict(X_test)
y_pred = scaler_y.inverse_transform(y_pred.reshape(-1, 1))
y_true = scaler_y.inverse_transform(y_test.reshape(-1, 1))
mae = mean_absolute_error(y_test, y_pred)
print(f'sklearn model MAE in the clear : {round(mae/1000000, 2)} millions, ie {round(mae/std, 3)} std')

# print('sklearn model MAE in the clear : ', mae_clear)

# Compute the R2 scores
# sklearn_r2_score = r2_score(y_test, y_pred)
# print("sklearn model r2 score in the clear: ", sklearn_r2_score)


Model evaluation time in the clear: 0.00000055 seconds per sample
sklearn model MAE in the clear : 4.86 millions, ie 2.601 std


In [45]:
print('few sample test')
for i in range(20):
  print('pred:', sklearn_lr.predict(X_test[i:i+1])/100, 'true:', y_test[i:i+1]/100)

few sample test
pred: [[0.00295641]] true: [[0.002]]
pred: [[0.00474002]] true: [[0.00424242]]
pred: [[0.00117737]] true: [[0.00169697]]
pred: [[0.00247799]] true: [[0.00406061]]
pred: [[0.00133736]] true: [[0.00090909]]
pred: [[0.0015431]] true: [[0.00272727]]
pred: [[0.00334353]] true: [[0.0030303]]
pred: [[0.00399839]] true: [[0.00241818]]
pred: [[0.0008423]] true: [[0.00060606]]
pred: [[0.00076139]] true: [[0.00138788]]
pred: [[0.00681129]] true: [[0.00727273]]
pred: [[0.00090743]] true: [[0.00078788]]
pred: [[0.00123039]] true: [[0.00139394]]
pred: [[0.00142393]] true: [[0.00139394]]
pred: [[0.00167194]] true: [[0.00045455]]
pred: [[0.00304172]] true: [[0.00078788]]
pred: [[0.00111339]] true: [[0.00078788]]
pred: [[0.00262868]] true: [[0.00484848]]
pred: [[0.00225069]] true: [[0.0010303]]
pred: [[0.0015778]] true: [[0.0009697]]


## 3. Quantize the model

In [62]:
# Quantize the model (convert all the floating point numbers of weights and biases (32 bits) to integers (8 bits))
# set the quantization parameter on input as 8 bits
concrete_lr = ConcreteLinearRegression(n_bits=8)

# train the concrete linear regression model on clear data
concrete_lr.fit(X_train, y_train)

# Now, we can test our Concrete ML model on the clear test data
y_pred_q = concrete_lr.predict(X_test)
y_pred_q = scaler_y.inverse_transform(y_pred_q.reshape(-1, 1))
mae_q = mean_absolute_error(y_true, y_pred_q)
print(f'quantized model MAE:  {round(mae_q/1000000, 2)} millions, ie {round(mae_q/std, 3)} std')


# # Compute the R2 scores
# quantized_r2_score = r2_score(y_test, y_pred_q)
# print('quantized model R^2 score: ', quantized_r2_score)

quantized model MAE:  3.26 millions, ie 1.743 std


## 4. Compile the model to the equivalent FHE circuit

In [50]:
# Compile the quantized model in to FHE circuit and run inference on it
# You have to provide the training dataset in order to compile the quantized model to equivalent FHE circuit
time_begin = time.time()
fhe_circuit = concrete_lr.compile(X_train)
print(f"Generating a key for a {fhe_circuit.graph.maximum_integer_bit_width()}-bit circuit")
print(f"Compilation time: {time.time() - time_begin:.4f} seconds")

# Compiler returns the circuit, which can be used to generated a secrete key and evaluation key
# secrete key: used for encryption and decryption. only accesible to the client
# evaluation key: used to evaluate the cirucit on encypted data. anyone can access it
print(f"Generating a key for a {fhe_circuit.graph.maximum_integer_bit_width()}-bit circuit")

time_begin = time.time()
fhe_circuit.client.keygen(force=False)
print(f"Key generation time: {time.time() - time_begin:.4f} seconds")

Generating a key for a 18-bit circuit
Compilation time: 8.4107 seconds
Generating a key for a 18-bit circuit
Key generation time: 0.0008 seconds


In [51]:
print(fhe_circuit.statistics)

{'size_of_inputs': 208000, 'size_of_outputs': 10400, 'programmable_bootstrap_count': 0, 'programmable_bootstrap_count_per_parameter': {}, 'programmable_bootstrap_count_per_tag': {}, 'programmable_bootstrap_count_per_tag_per_parameter': {}, 'key_switch_count': 0, 'key_switch_count_per_parameter': {}, 'key_switch_count_per_tag': {}, 'key_switch_count_per_tag_per_parameter': {}, 'packing_key_switch_count': 0, 'packing_key_switch_count_per_parameter': {}, 'packing_key_switch_count_per_tag': {}, 'packing_key_switch_count_per_tag_per_parameter': {}, 'clear_addition_count': 1, 'clear_addition_count_per_parameter': {LweSecretKeyParam(dimension=1299): 1}, 'clear_addition_count_per_tag': {}, 'clear_addition_count_per_tag_per_parameter': {}, 'encrypted_addition_count': 41, 'encrypted_addition_count_per_parameter': {LweSecretKeyParam(dimension=1299): 41}, 'encrypted_addition_count_per_tag': {}, 'encrypted_addition_count_per_tag_per_parameter': {}, 'clear_multiplication_count': 21, 'clear_multiplic

## 5. Evaluate the FHE model on encrypted data

In [56]:
# Evaluate the FHE-version of the model on encrypted data
time_begin = time.time()

decrypted_prediction_list = []

for i, input in enumerate(X_test):
	quantized_input = concrete_lr.quantize_input([np.array(input)])

	encrypted_input = fhe_circuit.encrypt(quantized_input)

	encrypted_prediction = fhe_circuit.run(encrypted_input)

	decrypted_prediction = concrete_lr.dequantize_output(fhe_circuit.decrypt(encrypted_prediction))

	decrypted_prediction_list.append(decrypted_prediction[0][0])



execution_time_on_ciphertext = (time.time() - time_begin)/len(X_test)

y_pred_fhe = scaler_y.inverse_transform(np.array(decrypted_prediction_list).reshape(-1, 1))

# Measure MSE of FHE version of the model
mae_fhe = mean_absolute_error(y_true, y_pred_fhe)

# Measure R2 score of FHE version of the model
# fhe_r2_score = r2_score(y_test, decrypted_prediction_list)

print(f"Execution time: {execution_time_on_ciphertext:.8f} seconds per sample")
print(f"which is {(execution_time_on_ciphertext / execution_time_on_plaintext):.2f} times slower than prediction on the plaintext data")

Execution time: 0.00350443 seconds per sample
which is 5601.93 times slower than prediction on the plaintext data


## 6. MAE and R^2 Score Measurement

In [59]:
print("MAE:")
print(f"- scikit-learn (clear): {round(mae/1000000, 2)} millions")
print(f"- Concrete ML (quantized model on plaintext): {round(mae_q/1000000, 2)} millions")
print(f"- Concrete ML (FHE model on ciphertext): {round(mae_fhe/1000000, 2)} millions\n")

# print("R^2 scores:")
# print(f"- scikit-learn (clear): {sklearn_r2_score:.4f}")
# print(f"- Concrete ML (quantized model on plaintext): {quantized_r2_score:.4f}")
# print(f"- Concrete ML (FHE model on ciphertext): {fhe_r2_score:.4f} \n")

#Measure the error of the FHE quantized model with respect to the clear float model
score_difference = abs(mae_q - mae) * 100 / mae
print(
    "Relative MAE score difference between scikit-learn (clear) and quntized model (clear):",
    f"{score_difference:.2f}%",
)

#Measure the error of the FHE quantized model with respect to the clear scikit-learn float model
concrete_score_difference = abs(mae_fhe - mae_q) * 100 / mae_q
print(
    "Relative MAE score difference between quantized model (clear) vs. Concrete ML (FHE):",
    f"{concrete_score_difference:.2f}%",
)



MAE:
- scikit-learn (clear): 4.86 millions
- Concrete ML (quantized model on plaintext): 3.26 millions
- Concrete ML (FHE model on ciphertext): 3.26 millions

Relative MAE score difference between scikit-learn (clear) and quntized model (clear): 32.99%
Relative MAE score difference between quantized model (clear) vs. Concrete ML (FHE): 0.00%
