In [1]:
import pandas as pd
# Load the MTSamples table, keep the specialty label of every row whose
# transcription is present, and number those rows 0..N-1 by position.
df_mt = pd.read_csv("/ssd003/projects/pets/datasets/mtsamples.csv")
Y_all = df_mt.loc[df_mt["transcription"].notna(), "medical_specialty"]
X_idx_all = list(range(len(Y_all)))

In [2]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
X_idx_train, X_idx_test, y_train, y_test = train_test_split(
    X_idx_all,
    Y_all,
    test_size=0.2,
    random_state=999,
)

In [3]:
# Number of training rows per specialty, indexed by specialty name.
label_counts = y_train.groupby(y_train).count()
# Specialty names ordered from most to least frequent; a name's position in
# this Series is used as its integer class id.
lbl = pd.Series(label_counts.sort_values(ascending=False).index)

def f(v):
    """Map a specialty name to its integer class id.

    Specialties with more than 100 training rows get their rank in ``lbl``
    (0 = most frequent). Every other specialty, including one that never
    occurs in the training split, is mapped to the catch-all class 11.

    Reads the module-level ``label_counts`` and ``lbl`` Series.
    """
    # .get() instead of [] so that a label present only in the test split is
    # treated as rare instead of raising KeyError.
    if label_counts.get(v, 0) > 100:
        return lbl[lbl == v].index[0]
    return 11

Y_data = y_train.apply(f).to_numpy()

In [4]:
import numpy as np
# Select the held-out rows from the array stored in bert_mt_pooled_output.npy.
# NOTE(review): X_idx_test holds positions within the transcription-filtered
# label series; this assumes the .npy rows are stored in that same order — TODO confirm.
X_test_data = np.load("bert_mt_pooled_output.npy")[X_idx_test]
# Encode the held-out specialties with the same mapping `f` applied to y_train.
Y_test_data = y_test.apply(f).to_numpy()

In [5]:
import tenseal as ts
# CKKS parameters used for the single TenSEAL context of this notebook.
poly_mod_degree = 4096
coeff_mod_bit_sizes = [40, 20, 40]
# Create the TenSEALContext for the CKKS scheme, using 4 threads.
# NOTE(review): plain_modulus is passed as -1; presumably it is not used by
# CKKS — confirm against the TenSEAL documentation.
ctx_eval = ts.context(scheme= ts.SCHEME_TYPE.CKKS, 
                      poly_modulus_degree = poly_mod_degree,
                      plain_modulus = -1,
                      coeff_mod_bit_sizes = coeff_mod_bit_sizes,
                      n_threads = 4)
# Scale applied to ciphertexts created with this context.
ctx_eval.global_scale = 2 ** 20
# Galois keys are needed for the dot-product operations performed later.
ctx_eval.generate_galois_keys()

In [6]:
X_test_data.shape

(994, 512)

In [45]:
%%time
# One CKKS vector per test sample (row of X_test_data).
X_test_data_vector_enc = [ts.ckks_vector(ctx_eval, row) for row in X_test_data]

CPU times: user 2.47 s, sys: 101 ms, total: 2.57 s
Wall time: 2.54 s


In [64]:
%%time
# Number of rows packed into each encrypted tensor.
sz=32
# range(1) means only the first batch of `sz` rows is encrypted here; the
# result is a one-element list holding a single CKKS tensor.
X_test_data_tensor_enc = [ts.ckks_tensor(ctx_eval, X_test_data[i*sz:i*sz+sz]) for i in range(1)]

CPU times: user 59.3 s, sys: 4.03 s, total: 1min 3s
Wall time: 17.5 s


In [7]:
import torch
class MLR(torch.nn.Module):
    """Multinomial logistic regression: one linear layer followed by log-softmax."""

    def __init__(self, n_features, n_classes):
        super().__init__()
        # The attribute name `lr` is read elsewhere in this notebook
        # (e.g. `model.lr.weight`), so it must stay as is.
        self.lr = torch.nn.Linear(n_features, n_classes)

    def forward(self, x):
        """Return per-class log-probabilities, normalised over dimension 1."""
        scores = self.lr(x)
        return torch.special.log_softmax(scores, dim=1)
    
model = torch.load(f"models3/mlr_7700.pkl")

In [37]:
class EncryptedMLR:
    """Linear (logit) layer of a trained MLR, applied to TenSEAL-encrypted inputs."""

    def __init__(self, torch_lr):
        # TenSEAL works on plain lists/arrays rather than torch tensors, so the
        # parameters are copied out of the PyTorch model. The weight is
        # transposed so that `x.mm(weight)` maps features to classes, and the
        # bias gets a leading axis so it broadcasts over a batch of rows.
        linear = torch_lr.lr
        self.weight = np.array(linear.weight.data.tolist()).T
        self.bias = np.array(linear.bias.data.tolist())[np.newaxis, :]

    def forward(self, enc_x):
        """Return the encrypted logits for a batch: ``enc_x.mm(weight) + bias``."""
        # The model's log-softmax is skipped: this class is only used for
        # evaluation, and the predicted label is the argmax of the logits,
        # which log-softmax does not change.
        return enc_x.mm(self.weight) + self.bias

    def forward_vector(self, enc_x):
        """Same as ``forward`` but adds the first bias row, for a single encrypted vector."""
        # As in forward(), no softmax is applied; argmax of the logits suffices.
        return enc_x.mm(self.weight) + self.bias[0]

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

    ################################################
    ## You can use the functions below to perform ##
    ## the evaluation with an encrypted model     ##
    ################################################
    def encrypt(self, context):
        """Replace weight and bias, in place, with CKKS tensors built under `context`."""
        for attr in ("weight", "bias"):
            setattr(self, attr, ts.ckks_tensor(context, getattr(self, attr)))

    def decrypt(self, context):
        """Replace weight and bias, in place, with their decrypted values.

        `context` is accepted but not used by this method.
        """
        for attr in ("weight", "bias"):
            setattr(self, attr, getattr(self, attr).decrypt())
    
enc_model = EncryptedMLR(model)

In [38]:
import json
import time
# Time inference on the per-sample encrypted vectors using the model held in
# `enc_model`, then store the timing and the decrypted scores as JSON.
result = {"inference_time": 0, "enc_inference_time": 0}

start_time = time.time()
# One decrypted score vector per encrypted test sample.
lr_results = [
    enc_model.forward_vector(enc_x).decrypt()
    for enc_x in X_test_data_vector_enc
]
result["inference_time"] += time.time() - start_time
result["lr_results"] = lr_results

with open("result_vector.json", "w") as fout:
    json.dump(result, fout)

In [39]:
y_hat1 = np.argmax(np.array(lr_results), axis=1)

np.sum(y_hat1 == Y_test_data)

352

In [10]:

y_hat = np.argmax(model.forward(torch.from_numpy(X_test_data).float()).detach().numpy(), axis=1)



In [18]:
# Number of test samples whose plaintext prediction differs from the true label.
len(Y_test_data[Y_test_data != y_hat])

642

In [19]:
y_hat, Y_test_data

(array([11, 11,  4,  0, 11,  0, 11,  0, 11,  4, 11, 11,  0, 11, 11, 11, 11,
         0, 11,  0, 11,  0, 11,  4, 11,  4, 11,  1, 11,  1, 11,  2,  0,  5,
        11,  0,  0, 11, 11,  1, 11,  0,  0, 11,  0, 11, 11,  0,  0,  0,  0,
         1, 11, 11,  0,  4,  1,  0,  0, 11,  0, 11,  1,  4, 11,  0,  6, 11,
         0,  0,  0,  0, 11,  0,  1,  1, 11, 11,  0,  0,  0,  1,  2,  0,  0,
         0,  0, 11,  0,  0,  0,  1,  0, 11, 11,  0,  0, 11,  4,  1,  0,  0,
         0,  4,  0,  1,  0,  4,  0, 11, 11, 11,  5,  0,  0,  4,  0,  8,  0,
         0,  0, 11,  0,  0,  0,  1,  1,  0,  0,  0, 11,  0,  0,  1,  1,  0,
        11,  0, 11,  1,  1, 11, 11,  0,  5, 11, 11,  0,  0,  0,  0, 11,  3,
         1,  0,  0,  0, 11,  1,  0, 11, 11,  0, 11,  1, 11, 11,  0,  4,  4,
         4,  0,  0,  4,  4,  0,  0,  0,  0,  0,  4,  0, 11,  0,  0,  1,  0,
         0,  0, 11, 11, 11,  0,  0,  0, 11,  0,  0,  0, 11,  0,  0,  4, 11,
         4,  4, 11,  1,  0, 11,  0,  4,  0, 11,  0,  6, 11,  1,  4, 11, 11,
         0, 

In [21]:
y_test[13:18]

4366        Consult - History and Phy.
3643                  Gastroenterology
4414        Consult - History and Phy.
1341     SOAP / Chart / Progress Notes
962                            Surgery
Name: medical_specialty, dtype: object

In [17]:
import time
import json
# Batch sizes (rows per encrypted tensor) to benchmark.
sz_list = [8, 16, 32]


for sz in sz_list:
    # Accumulated wall-clock seconds for plaintext-weight and encrypted-weight
    # inference; predictions and metrics are added once all batches are done.
    result = {
        "inference_time": 0,
        "enc_inference_time": 0
    }
    # Number of batches, rounded up so a trailing partial batch is included.
    n = X_test_data.shape[0]//sz
    if X_test_data.shape[0]%sz>0:
        n+=1

    # Model with plaintext weights (only the input is encrypted).
    enc_model = EncryptedMLR(model)

    # Model whose weights are encrypted as well.
    enc_model_enc = EncryptedMLR(model)
    enc_model_enc.encrypt(ctx_eval)

    lr_results = []
    enc_lr_results = []

    for i in range(n):
        # Encrypting the batch is deliberately outside both timed sections.
        x_enc = ts.ckks_tensor(ctx_eval, X_test_data[i*sz:i*sz+sz])

        start_time = time.time()
        rs = enc_model.forward(x_enc).decrypt().tolist()
        result["inference_time"] += (time.time() - start_time)
        lr_results.extend(rs)

        start_time = time.time()
        rs = enc_model_enc.forward(x_enc).decrypt().tolist()
        result["enc_inference_time"] += (time.time() - start_time)
        enc_lr_results.extend(rs)

        # Rewrite the running timings after every batch (presumably a progress
        # checkpoint for long runs); the final dump below overwrites this file.
        with open(f"result_sz{sz}.json", "w") as fout:
            json.dump(result, fout)


    # Metrics for the plaintext-weight model: correct predictions, and the
    # number of predictions that differ from the unencrypted PyTorch model.
    result["lr_results"] = lr_results

    y_hat1 = np.argmax(np.array(lr_results), axis=1)
    result["acc"] = int(np.sum(y_hat1 == Y_test_data))
    result["diff"] = int(np.sum(y_hat1 != y_hat))

    # Same metrics for the encrypted-weight model. These must be computed from
    # enc_lr_results; using lr_results here would just repeat the numbers above.
    result["enc_lr_results"] = enc_lr_results

    y_hat1 = np.argmax(np.array(enc_lr_results), axis=1)
    result["enc_acc"] = int(np.sum(y_hat1 == Y_test_data))
    result["enc_diff"] = int(np.sum(y_hat1 != y_hat))


    with open(f"result_sz{sz}.json", "w") as fout:
        json.dump(result, fout)

In [10]:
logits = model.lr(torch.from_numpy(X_test_data).float()).detach().numpy()

In [11]:
logits

array([[ 1.0349009 , -0.25117564,  1.0939565 , ..., -0.4523517 ,
         0.01484562,  1.5100555 ],
       [ 0.07647051,  1.376148  , -0.29634643, ..., -0.777474  ,
        -1.0150388 ,  1.7973723 ],
       [ 0.46277553, -0.35522628,  0.3918991 , ..., -0.9944029 ,
         0.16371481,  1.4771357 ],
       ...,
       [ 2.68868   , -2.0270047 ,  0.61391413, ...,  1.0245537 ,
        -0.5085133 ,  1.5060463 ],
       [ 2.3452663 , -1.7756293 ,  0.35105148, ..., -0.44321877,
         0.4465793 ,  1.8480144 ],
       [ 2.9141889 , -1.5517173 ,  0.5816109 , ..., -0.17610326,
         0.60270923,  2.4422083 ]], dtype=float32)

In [14]:
import json
with open(f"result_sz2.json") as fin:
    result = json.load(fin)

[[1.0552184454415894,
  -0.23866863247572845,
  1.118734932358503,
  1.2985269068734702,
  -0.33547823918824105,
  -0.06288056316081203,
  -1.0598332959197567,
  -1.4446103722790857,
  -0.900954638243939,
  -0.4645739883706028,
  0.011386916069290744,
  1.5192919586893454],
 [0.07935618951320833,
  1.4070378221104054,
  -0.3107583039782697,
  0.09266815907269728,
  -0.8251189059704368,
  0.7843122595735328,
  -0.9152034926506798,
  -0.513248636745138,
  -0.5277005530574833,
  -0.7723962566021811,
  -1.0281938482106505,
  1.8318034998338808],
 [0.4564652083522998,
  -0.3684788280081033,
  0.40519196572613875,
  0.2539150478542583,
  1.8559310266090228,
  -1.4826582511619442,
  0.17269638131670562,
  -0.7643367020403143,
  -1.6408126312171385,
  -1.0107268116382537,
  0.176656847896558,
  1.4919802163580393],
 [2.2696624111645103,
  -0.9138589624469249,
  0.284288510407908,
  1.40142447394856,
  -0.009709504204320823,
  -0.8810172051119766,
  -1.0567588033223916,
  0.5767542492742881,
  

In [32]:
# Mean absolute error between the decrypted encrypted-weight outputs and the
# plaintext logits, printed once per batch size.
for sz in (2, 4, 8, 16, 32):
    with open(f"result_sz{sz}.json") as fin:
        result = json.load(fin)
        mae = np.abs(np.array(result["enc_lr_results"]) - logits).mean()
    print(mae)

0.017443912417072376
0.01744672340918003
0.022507167209730568
0.024647933491368543
0.022444477022856322


In [42]:
np.mean(np.abs(np.array(lr_results)-logits))

0.014734069601626106

In [22]:
import numpy as np
# Random 994 x 512 and 12 x 512 matrices; every row is encrypted as its own
# CKKS vector.
a = np.random.randn(994, 512)
b = np.random.randn(12, 512)
enc_a = [ts.ckks_vector(ctx_eval, row) for row in a]
enc_b = [ts.ckks_vector(ctx_eval, row) for row in b]


In [23]:
import time
start_time = time.time()
# enc_p[j][i] is the encrypted dot product of row i of `a` with row j of `b`.
enc_p = [[enc_a1.dot(enc_b1) for enc_a1 in enc_a] for enc_b1 in enc_b]


In [27]:
# Decrypt every product and join them into one flat array per row of `b`.
p = [
    np.concatenate([enc_ppr.decrypt() for enc_ppr in enc_pr])
    for enc_pr in enc_p
]

In [31]:
p = np.array(p).T

In [36]:
enc_max = np.argmax(p, axis=1)

In [40]:
plain_max = np.argmax(np.matmul(a, b.T), axis=1)

In [42]:
np.sum(plain_max!=enc_max)

2

In [58]:
class EncryptedMLR2:
    """MLR logit layer evaluated with one encrypted weight vector per output class."""

    def __init__(self, torch_lr):
        # TenSEAL works on plain lists/arrays rather than torch tensors, so the
        # parameters are copied out of the PyTorch model as NumPy arrays.
        linear = torch_lr.lr
        self.weight = np.array(linear.weight.data.tolist()).T
        self.bias = np.array(linear.bias.data.tolist())[np.newaxis, :]
        print(self.weight.shape, self.bias.shape)

    def forward(self, enc_x):
        """Return a list with one encrypted score per entry of ``self.weight``.

        Written for the state after ``encrypt()``, where ``self.weight`` and
        ``self.bias`` are equal-length lists of CKKS vectors.
        """
        # The model's log-softmax is skipped: this class is only used for
        # evaluation, and the label is the argmax of the raw scores.
        products = [w.dot(enc_x) for w in self.weight]
        return [prod + b for prod, b in zip(products, self.bias)]

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

    ################################################
    ## You can use the functions below to perform ##
    ## the evaluation with an encrypted model     ##
    ################################################
    def encrypt(self, context):
        """Encrypt each weight column and each bias entry as its own CKKS vector."""
        # Rows of the transposed weight are the per-class columns.
        self.weight = [ts.ckks_vector(context, column) for column in self.weight.T]
        self.bias = [ts.ckks_vector(context, np.array([c])) for c in self.bias[0]]
        
enc_model = EncryptedMLR2(model)
enc_model.encrypt(ctx_eval)

(512, 12) (1, 12)


In [59]:
import time
# Time end-to-end inference: for each encrypted sample, evaluate the model and
# decrypt the first slot of every per-class output.
start_time = time.time()
lr_results = [
    [enc_score.decrypt()[0] for enc_score in enc_model.forward(enc_x)]
    for enc_x in X_test_data_vector_enc
]
print(time.time()-start_time)

81.70679497718811


In [60]:
y_hat1 = np.argmax(np.array(lr_results), axis=1)

In [61]:
int(np.sum(y_hat1 == Y_test_data))

360

In [62]:
int(np.sum(y_hat1 != y_hat))

42

In [63]:
np.sum(y_hat==Y_test_data)

352

In [65]:
len(X_test_data_tensor_enc[0].serialize())


1325370159

In [71]:
import math
def convert_size(size_bytes):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        size_bytes: Non-negative number of bytes.

    Returns:
        ``"0B"`` for zero; otherwise ``"<value> <unit>"`` with the value
        rounded to two decimals, e.g. ``"1.23 GB"``. Values of 1024 YB or more
        are still reported in YB.

    Raises:
        ValueError: If ``size_bytes`` is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative")
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    # Repeated division by 1024 is exact in binary floating point, so the unit
    # is chosen without the rounding risk of math.log, and the index can never
    # run past the last unit or go negative for sizes below one byte.
    value = float(size_bytes)
    i = 0
    while value >= 1024 and i < len(size_name) - 1:
        value /= 1024
        i += 1
    s = round(value, 2)
    return "%s %s" % (s, size_name[i])

In [72]:
convert_size(1325370159)

'1.23 GB'