# Predicting House Price using NeuralNet Regressor
There are 5 parts in this notebook.

0. Preprocess tne dataset
1. Training on plaintext data
2. Evaluate model on plaintext data
3. Train and Quantize the Concrete model (Quantization Aware Training)
4. Compile the model to the equivalent FHE circuit
5. Evaluate the FHE model on encrypted data
6. MAE measurement

In [2]:
# import required packages
import time
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from concrete.ml.sklearn import NeuralNetRegressor as ConcreteNNRegressor
import torch
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
# import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [3]:
# import dataset as panda's dataframe
import pandas as pd
house_dataset = pd.read_csv("./data/house_data/train.csv")
df = pd.DataFrame(house_dataset)
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


## 0. Preprocess the dataset

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor

# Separate features and target
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

# Automatically find categorical columns (assuming they're of type object)
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Define ColumnTransformer
# This transformer applies OneHotEncoder to categorical features and Imputer to handle NaN values
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute NaN values with most frequent value
            ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
        ]), categorical_features),
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),  # Impute NaN values with mean for numerical features
            ('scaler', StandardScaler())  # Standardize numerical features
        ]), X.select_dtypes(include=['int64', 'float64']).columns.tolist())
    ],
    remainder='passthrough'
)

# Create a pipeline with preprocessing and a model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', MLPRegressor(max_iter=1000))
])

preprocessor

In [6]:
# split dataset into 80% training data and 20% testing data
# house_dataset = np.array(house_dataset)
# target = np.array(round(target*100))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Fit the pipeline
pipeline.fit(X_train, y_train)

In [7]:
X_train_processed = pipeline.named_steps['preprocessor'].transform(X_train)
X_test_processed = pipeline.named_steps['preprocessor'].transform(X_test)
X_train_processed

<1168x286 sparse matrix of type '<class 'numpy.float64'>'
	with 93217 stored elements in Compressed Sparse Row format>

## 1. Training on plaintext data

In [19]:
# train sklearn MLPRegressor model on the clear
sklearn_MLP_regressor = MLPRegressor(
                    alpha=1,
                    activation="identity",
                    max_iter=1000,
                    hidden_layer_sizes=(25,),
                    learning_rate_init=0.005,)
sklearn_MLP_regressor.fit(X_train_processed, y_train)

## 2. Evaluate model on plaintext data

In [21]:
time_begin = time.time()
y_pred = sklearn_MLP_regressor.predict(X_test_processed)
print(y_pred)
execution_time_on_plaintext = (time.time() - time_begin) / len(X_test)
print(f"Model evaluation time on the clear: {execution_time_on_plaintext:.8f} seconds per sample")
# Compute the R2 scores
sklearn_r2_score = r2_score(y_test, y_pred)
print(sklearn_r2_score)

[139438.6431405  306125.26296416 105570.40819224 170943.73540651
 301077.2865045   60360.56844309 225766.36180114 141073.81944668
  51032.72336898 148851.27057068 133567.80486872 102337.02199679
 121253.35947324 211540.63872085 187369.30067351 132096.18961333
 206471.79504737 123425.15737008 112240.7218204  229685.7740212
 167765.57337971 201567.50684574 195113.35455746 129511.19666708
 212901.19952438 153304.4358188  200526.25360023  97166.34624095
 188520.99696818 197044.92129471 131747.94444056 282761.78474694
 225894.62956065  96477.59592805 268463.11199187 151662.42944483
 133756.27193373 216562.11128232 296627.01970771  87176.60371528
 139629.60472031 240671.64613827  97806.50229382 321258.22001825
 133258.2119138  137109.65565245  93691.16477165 124669.42052205
 364840.16879436 123658.91762579 100310.47129169 218245.57002494
 117242.06873562 268790.81741123 168716.4585132  243710.15581193
 214968.8429475  156371.27057957 142494.05786293 104637.60220217
  64121.06004547 159297.61

## 3. Train and Quantize the Concrete model (Quantization Aware Training)

In [None]:
# Instantiate the model with parameter
# TODO: tune the parameter
params_neural_net = {
    # "module__n_w_bits": 6,
    # "module__n_a_bits": 8,
    # "module__n_accum_bits": 16,
    "module__n_hidden_neurons_multiplier": 10,
    "module__n_layers": 2,  # total number of layers in the FCNN = 1 hidden layer
    "module__activation_function": torch.nn.ReLU,
    "max_epochs": 1,
    "verbose": 1,
    # "lr": 0.1,
}


#some sort of Feature preprocessing needed for quantization aware training
# Linear models require polynomial features to be applied before training to fit a non-linear model and other models perform better with this transoformation
pipe = Pipeline(
    [
        ("poly", PolynomialFeatures()),
        ("scaler", StandardScaler(with_mean=False)),
    ]
)

X_poly_train = pipe.fit_transform(X_train_processed)
X_poly_test = pipe.transform(X_test_processed)

X_poly_train = X_poly_train.toarray()
X_poly_test = X_poly_test.toarray()

concrete_NN_regressor = ConcreteNNRegressor(batch_size=16, **params_neural_net)

# train the concrete linear regression model on clear data
# The built-in NN regressor models will automatically quantize weights and activations with .fit() call. (Quantization Aware Training) These models use several layers for Quantization Aware Training, allowing good performance for low precision (down to 2-3 bits) weights and activations.
# The maximum accumulator bit-width is controlled by the number of weights and activation bits, as well as a pruning factor. This factor is automatically determined based on the desired accumulator bit-width and a multiplier factor can be optionally specified.

y_train_float32 = y_train.values.reshape(-1, 1).astype('float32')

concrete_NN_regressor.fit(X_poly_train, y_train_float32)
print("done training")

# Now, we can test our Concrete ML model on the clear test data
y_pred_q = concrete_NN_regressor.predict(X_poly_test)
print("done prediction")
# Compute the R2 scores
quantized_r2_score = r2_score(y_test, y_pred_q)

print("done calculating r2 score")

print(quantized_r2_score)

## 4. Compile the model to the equivalent FHE circuit

In [16]:
# Compile the quantized model in to FHE circuit and run inference on it
# You have to provide the training dataset in order to compile the quantized model to equivalent FHE circuit
time_begin = time.time()
fhe_circuit = concrete_NN_regressor.compile(X_poly_train)
print(f"Generating a key for a {fhe_circuit.graph.maximum_integer_bit_width()}-bit circuit")
print(f"Compilation time: {time.time() - time_begin:.4f} seconds")

# Compiler returns the circuit, which can be used to generated a secrete key and evaluation key
# secrete key: used for encryption and decryption. only accesible to the client
# evaluation key: used to evaluate the cirucit on encypted data. anyone can access it
print(f"Generating a key for a {fhe_circuit.graph.maximum_integer_bit_width()}-bit circuit")

time_begin = time.time()
fhe_circuit.client.keygen(force=False)
print(f"Key generation time: {time.time() - time_begin:.4f} seconds")

Generating a key for a 12-bit circuit
Compilation time: 248.9701 seconds
Generating a key for a 12-bit circuit
Key generation time: 154.4244 seconds


## 5. Evaluate the FHE model on encrypted data

In [17]:
# Evaluate the FHE-version of the model
time_begin = time.time()
y_pred_fhe = concrete_NN_regressor.predict(X_poly_test[:1], fhe="execute")

execution_time_on_ciphertext = (time.time() - time_begin) / len(X_test)
print(f"Execution time: {execution_time_on_ciphertext:.8f} seconds per sample")
print(f"which is {(execution_time_on_ciphertext / execution_time_on_plaintext):.2f} times slower than prediction on the plaintext data")

: 

In [None]:
# Measure R2 score of FHE version of the model
fhe_r2_score = r2_score(y_test, y_pred_fhe)

print("R^2 scores:")
print(f"scikit-learn (clear): {sklearn_r2_score:.4f}")
print(f"Concrete ML (quantized model on plaintext): {quantized_r2_score:.4f}")
print(f"Concrete ML (FHE model on ciphertext): {fhe_r2_score:.4f}")

# Measure the error of the FHE quantized model with respect to the clear scikit-learn float model
concrete_score_difference = abs(fhe_r2_score - quantized_r2_score) * 100 / quantized_r2_score
print(
    "\nRelative score difference for Concrete ML (quantized model on clear) vs. Concrete ML (FHE):",
    f"{concrete_score_difference:.2f}%",
)

# Measure the error of the FHE quantized model with respect to the clear float model
score_difference = abs(fhe_r2_score - sklearn_r2_score) * 100 / sklearn_r2_score
print(
    "Relative score difference for scikit-learn (clear) vs. Concrete ML (FHE) scores:",
    f"{score_difference:.2f}%",
)