In [None]:
!pip install tenseal

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
import matplotlib
import re

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch
import tenseal as ts

In [None]:
# Mount Google Drive at /content/drive (Google Colab only) so the data files
# used below can be read from it.
from google.colab import drive
drive.mount('/content/drive')

# Directory prefix that the loader calls below prepend to every data filename.
base_dir = '/content/drive/MyDrive/fhe-embedding/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### References
- https://github.com/OpenMined/TenSEAL
- https://arxiv.org/abs/2104.03152
- https://arxiv.org/pdf/2202.00004.pdf

In [None]:
%%time
# Larson
def load_embeddings(filename):
    """
    Load a DataFrame from the generalized text format used by word2vec, GloVe,
    fastText, and ConceptNet Numberbatch. The main point where they differ is
    whether there is an initial line with the dimensions of the matrix.

    Parameters
    ----------
    filename : str or path-like
        UTF-8 text file with one entry per line: a label followed by its
        space-separated float components.

    Returns
    -------
    pandas.DataFrame
        One float32 row per entry, indexed by the entry's label.

    Raises
    ------
    ValueError
        If the file contains no embedding rows.
    """
    labels = []
    rows = []
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            if not line.strip():
                # Blank lines (e.g. a trailing newline at the end of the file)
                # would otherwise become an empty vector and break np.vstack.
                continue
            items = line.rstrip().split(' ')
            if len(items) == 2:
                # This is a header row giving the shape of the matrix
                continue
            labels.append(items[0])
            rows.append(np.array([float(x) for x in items[1:]], 'f'))

    if not rows:
        raise ValueError(f"no embedding rows found in {filename!r}")

    arr = np.vstack(rows)
    return pd.DataFrame(arr, index=labels, dtype='f')

# Load the 300-dimensional GloVe vectors; the last expression displays the
# (rows, columns) shape of the resulting DataFrame.
embeddings = load_embeddings(base_dir + 'glove.6B.300d.txt')
embeddings.shape

CPU times: user 41.5 s, sys: 2.55 s, total: 44 s
Wall time: 1min


(400001, 300)

In [None]:
# Larson
def load_lexicon(filename):
    """
    Read one word list from Bing Liu's sentiment lexicon
    (https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html).

    The file is decoded as Latin-1. Trailing whitespace is stripped from every
    line; lines that are then empty, and comment lines starting with ';', are
    skipped.

    Returns the remaining lines, in file order, as a list of strings.
    """
    with open(filename, encoding='latin-1') as infile:
        stripped = (raw.rstrip() for raw in infile)
        return [entry for entry in stripped
                if entry and not entry.startswith(';')]

# Load the positive and negative word lists from the lexicon files.
pos_words = load_lexicon(base_dir + 'positive-words.txt')
neg_words = load_lexicon(base_dir + 'negative-words.txt')

# Show how many words each list contains.
print(len(pos_words), len(neg_words))

2006 4783


In [None]:
# Larson
def vecs_to_sentiment(vecs):
    """
    Score embedding vectors with the notebook-level `model`.

    Calls `model.predict_log_proba(vecs)`, converts the result to NumPy with
    `.numpy()` and flattens it to one dimension.

    NOTE(review): the `.numpy()` call means `model` must return a torch
    tensor here, i.e. this expects the PyTorch `LR` model rather than the
    scikit-learn classifier that is also bound to `model` earlier on.
    With `LR`, each returned value is log(p_1) - log(p_0), the log-odds of
    positive vs. negative sentiment.
    """
    scores = model.predict_log_proba(vecs)
    flat_scores = scores.numpy().ravel()
    return flat_scores

def enc_vecs_to_sentiment(vecs):
    """
    Score embedding vectors under encryption, simulating a client-server
    interaction.

      client: the data owner; encrypts the data and later decrypts the results
      server: holds the machine learning model (logistic regression here),
              receives encrypted data and returns encrypted results, so it
              never sees the data in plaintext

    Purpose: the client neither hosts the model nor spends compute evaluating
    it. It only encrypts the inputs at the start and decrypts the results at
    the end.

    Parameters
    ----------
    vecs : pandas.DataFrame
        One embedding vector per row.

    Returns
    -------
    numpy.ndarray
        1-D float32 array with one log-odds score per row of `vecs`.
    """
    # client-side: encrypt every row as its own CKKS vector
    enc_vecs = [ts.ckks_vector(ctx_eval, row) for row in vecs.values.tolist()]

    # server-side: evaluate the model on the encrypted rows
    enc_scores = [enc_lr.predict_log_proba(enc_vec) for enc_vec in enc_vecs]

    # client-side: decrypt the results
    logits = [enc_score.decrypt() for enc_score in enc_scores]

    # `enc_lr.predict_log_proba` returns the linear output w.x + b, which is
    # already the log-odds log(p / (1 - p)). Passing it through sigmoid and
    # then log(p) - log(1 - p) only loses precision and yields +/-inf once the
    # float32 sigmoid saturates, so the decrypted values are returned as-is.
    return np.asarray(logits, dtype=np.float32).ravel()

def words_to_sentiment(words):
    """
    Score each word in `words` that has an embedding, using the plaintext model.

    Words missing from `embeddings` are skipped. Selecting them with `.loc`
    raises KeyError in current pandas, so they are filtered out before the
    lookup rather than relying on `.dropna()` afterwards.

    Returns a DataFrame indexed by the scored words with a single
    'sentiment' column holding the values from `vecs_to_sentiment`.
    """
    vocabulary = embeddings.index
    known_words = [word for word in words if word in vocabulary]
    vecs = embeddings.loc[known_words].dropna()
    log_odds = vecs_to_sentiment(vecs)
    return pd.DataFrame({'sentiment': log_odds}, index=vecs.index)

def enc_words_to_sentiment(words):
    """
    Score each word in `words` that has an embedding, using encrypted evaluation.

    Words missing from `embeddings` are skipped. Selecting them with `.loc`
    raises KeyError in current pandas, so they are filtered out before the
    lookup rather than relying on `.dropna()` afterwards.

    Returns a DataFrame indexed by the scored words with a single
    'sentiment' column holding the values from `enc_vecs_to_sentiment`.
    """
    vocabulary = embeddings.index
    known_words = [word for word in words if word in vocabulary]
    vecs = embeddings.loc[known_words].dropna()
    log_odds = enc_vecs_to_sentiment(vecs)
    return pd.DataFrame({'sentiment': log_odds}, index=vecs.index)

In [None]:
# Larson
import re
TOKEN_RE = re.compile(r"\w.*?\b")
# The regex above finds tokens that start with a word-like character (\w), and continues
# matching characters lazily (.*?) until the next word break (\b). It's a relatively simple
# expression that manages to extract something very much like words from text.

def text_to_sentiment(text):
    """
    Return the mean per-token sentiment of `text` from the plaintext model.

    The text is split with TOKEN_RE, every token is case-folded, and the
    tokens are scored individually by `words_to_sentiment`; the result is the
    mean of the resulting 'sentiment' column.
    """
    tokens = []
    for match in TOKEN_RE.finditer(text):
        tokens.append(match.group().casefold())
    per_token = words_to_sentiment(tokens)
    return per_token['sentiment'].mean()

def enc_text_to_sentiment(text):
    """
    Return the mean per-token sentiment of `text` from the encrypted evaluation.

    The text is split with TOKEN_RE, every token is case-folded, and the
    tokens are scored individually by `enc_words_to_sentiment`; the result is
    the mean of the resulting 'sentiment' column.
    """
    tokens = []
    for match in TOKEN_RE.finditer(text):
        tokens.append(match.group().casefold())
    per_token = enc_words_to_sentiment(tokens)
    return per_token['sentiment'].mean()

## Training

In [None]:
# Larson
# Keep only the lexicon words that have an embedding. Sorting fixes the row
# order: iterating a set of strings can differ between interpreter runs, which
# would make the seeded train/test split below pick different rows each time.
embedding_vocab = set(embeddings.index)
pos_words_common = sorted(set(pos_words) & embedding_vocab)
neg_words_common = sorted(set(neg_words) & embedding_vocab)

pos_vectors = embeddings.loc[pos_words_common]
neg_vectors = embeddings.loc[neg_words_common]
print(pos_vectors.shape,neg_vectors.shape)

# Positive rows first, then negative rows; targets (1 = positive, 0 = negative)
# and labels (the words themselves) follow the same order.
vectors = pd.concat([pos_vectors, neg_vectors])
targets = np.array([1] * len(pos_vectors.index) + [0] * len(neg_vectors.index))
labels = list(pos_vectors.index) + list(neg_vectors.index)

# Standardize every embedding dimension using the lexicon rows' mean and std.
# NOTE(review): these statistics are not kept, and words_to_sentiment /
# enc_words_to_sentiment score raw `embeddings` rows, so inference inputs are
# not standardized like the training data. Confirm whether that is intended.
vectors = (vectors - vectors.mean()) / vectors.std()
vectors = torch.Tensor(vectors.values)
targets = torch.Tensor(targets.reshape(-1,1))

train_vectors, test_vectors, train_targets, test_targets, train_labels, test_labels = \
    train_test_split(vectors, targets, labels, test_size=0.2, random_state=0)

print(train_vectors.shape, train_targets.shape)

(1893, 300) (4345, 300)
torch.Size([4990, 300]) torch.Size([4990, 1])


#### plaintext model

In [None]:
# Larson

# create a linear classifier (scikit-learn baseline)
# scikit-learn expects 1-D label arrays; the (n, 1) column tensors built for
# PyTorch are flattened here, which avoids the column-vector conversion warning.
model = SGDClassifier(loss='log_loss', random_state=0, max_iter=100)
model.fit(train_vectors, train_targets.numpy().ravel())
# accuracy_score takes (y_true, y_pred)
accuracy_score(test_targets.numpy().ravel(), model.predict(test_vectors))

  y = column_or_1d(y, warn=True)


0.8878205128205128

#### FHE model
TenSEAL library documentation and source for code examples: https://github.com/OpenMined/TenSEAL

In [None]:
class LR(torch.nn.Module):
  """Logistic regression: one linear layer followed by a sigmoid."""

  def __init__(self, n_features):
    """Create the model for inputs with `n_features` columns."""
    super(LR, self).__init__()
    self.lr = torch.nn.Linear(n_features, 1)

  def forward(self, x):
    """Return sigmoid(linear(x)): one value in (0, 1) per input row."""
    out = torch.sigmoid(self.lr(x))
    return out

  def predict_log_proba(self, x):
    """
    Return the log-odds log(p / (1 - p)) for every row of `x`.

    With p = sigmoid(z), the log-odds equal the linear output z itself, so z
    is returned directly. Computing log(p) - log(1 - p) from the sigmoid
    output needs two forward passes and becomes +/-inf once the sigmoid
    saturates.

    Parameters
    ----------
    x : pandas.DataFrame, numpy.ndarray or torch.Tensor
        Feature rows, shape (n_rows, n_features).

    Returns
    -------
    torch.Tensor
        Shape (n_rows, 1), detached from the autograd graph.
    """
    dtype = self.lr.weight.dtype
    if isinstance(x, torch.Tensor):
      features = x.to(dtype)
    else:
      # np.asarray handles DataFrames as well as plain arrays and lists
      features = torch.tensor(np.asarray(x), dtype=dtype)

    with torch.no_grad():
      return self.lr(features)

In [None]:
# Seed PyTorch so the random weight initialisation, and therefore the training
# results below, are the same on every run.
torch.manual_seed(0)

n_features = train_vectors.shape[1]
model = LR(n_features)
optim = torch.optim.SGD(model.parameters(), lr=0.001)
# use Binary Cross Entropy Loss
criterion = torch.nn.BCELoss()

In [None]:
# Default number of full-batch gradient steps used by train().
EPOCHS = 4500

def train(model, optim, criterion, x, y, epochs=EPOCHS):
  """
  Run `epochs` full-batch optimisation steps and return the same `model`.

  Each step clears the gradients, evaluates `criterion(model(x), y)`,
  back-propagates and calls `optim.step()`. The loss is printed on every
  500th step.
  """
  epoch = 0
  while epoch < epochs:
    epoch += 1
    optim.zero_grad()
    loss = criterion(model(x), y)
    loss.backward()
    optim.step()
    if epoch % 500 == 0:
      print(f"Loss at epoch {epoch}: {loss.data}")
  return model

# Train the logistic regression on the training split (default EPOCHS steps).
model = train(model, optim, criterion, train_vectors, train_targets)

Loss at epoch 500: 0.49766862392425537
Loss at epoch 1000: 0.4027809500694275
Loss at epoch 1500: 0.354069322347641
Loss at epoch 2000: 0.32376620173454285
Loss at epoch 2500: 0.30275776982307434
Loss at epoch 3000: 0.28716495633125305
Loss at epoch 3500: 0.27504006028175354
Loss at epoch 4000: 0.2652883529663086
Loss at epoch 4500: 0.25724294781684875


In [None]:
def accuracy(model, x, y):
    """
    Return the fraction of rows where `model(x)` is within 0.5 of `y`.

    The result is a 0-dimensional float tensor.
    """
    errors = torch.abs(y - model(x))
    hits = (errors < 0.5).float()
    return hits.mean()

# Accuracy of the plaintext PyTorch model on the held-out split
# (a 0-dimensional tensor, as returned by accuracy()).
plain_accuracy = accuracy(model, test_vectors, test_targets)
print(f"Accuracy on plain test_set: {plain_accuracy}")

Accuracy on plain test_set: 0.9102563858032227


In [None]:
class EncryptedLR:
  """
  Evaluate a trained logistic regression on TenSEAL-encrypted inputs.

  The weight and bias are copied out of the PyTorch model as plain Python
  lists, because TenSEAL processes lists and not torch tensors.
  """

  def __init__(self, torch_lr):
    """Copy the parameters of `torch_lr.lr` (a single-output linear layer)."""
    layer = torch_lr.lr
    # weight has shape (1, n_features); keep its only row as a flat list
    self.weight = layer.weight.data.tolist()[0]
    self.bias = layer.bias.data.tolist()

  def forward(self, enc_x):
    """
    Return the encrypted linear output `enc_x . weight + bias`.

    No sigmoid is applied: this model is only used for evaluation, and the
    label can be deduced without applying the sigmoid.
    """
    return enc_x.dot(self.weight) + self.bias

  def forward_approx(self, enc_x):
    """Return the linear output passed through `sigmoid_approx`."""
    linear_out = enc_x.dot(self.weight) + self.bias
    return sigmoid_approx(linear_out)

  def predict_log_proba(self, enc_x):
    """Return the same encrypted linear output as `forward`."""
    return self.forward(enc_x)

  def __call__(self, *args, **kwargs):
    """Calling the object is shorthand for `forward`."""
    return self.forward(*args, **kwargs)

  def encrypt(self, context):
    """Replace weight and bias with CKKS vectors created under `context`."""
    self.weight = ts.ckks_vector(context, self.weight)
    self.bias = ts.ckks_vector(context, self.bias)

  def decrypt(self, context):
    """Replace weight and bias with their decrypted values (`context` is unused)."""
    self.weight = self.weight.decrypt()
    self.bias = self.bias.decrypt()


# Wrap the trained PyTorch model so it can be evaluated on encrypted vectors.
enc_lr = EncryptedLR(model)

In [None]:
# parameters
# (a smaller alternative is poly_mod_degree = 4096 with
#  coeff_mod_bit_sizes = [40, 20, 40] and a scale of 2 ** 20)
poly_mod_degree = 8192
coeff_mod_bit_sizes = [40, 21, 21, 21, 21, 21, 21, 40]
# create TenSEALContext
ctx_eval = ts.context(ts.SCHEME_TYPE.CKKS, poly_mod_degree, -1, coeff_mod_bit_sizes)
# scale of ciphertext to use: it should match the bit size of the inner primes
# (21 bits here), which are what CKKS rescales by after each multiplication.
# NOTE(review): the previous 2 ** 20 did not match the 21-bit primes and is a
# likely cause of the gap between plain and encrypted accuracy; re-run the
# evaluation cells to confirm.
ctx_eval.global_scale = 2 ** 21
# this key is needed for doing dot-product operations
ctx_eval.generate_galois_keys()

In [None]:
%%time
# Encrypt every test row as its own CKKS vector under ctx_eval; the last
# expression displays the shape of the plaintext test set.
enc_x_test = [ts.ckks_vector(ctx_eval, x.tolist()) for x in test_vectors]
test_vectors.shape

CPU times: user 17 s, sys: 779 ms, total: 17.8 s
Wall time: 19.6 s


torch.Size([1248, 300])

In [None]:
def encrypted_evaluation(model, enc_x_test, y_test):
  """
  Evaluate `model` on encrypted samples and return its accuracy.

  For every (encrypted sample, label) pair the model output is decrypted, a
  plaintext sigmoid is applied, and the prediction counts as correct when it
  is within 0.5 of the label.

  The accuracy is computed over the pairs that were actually evaluated
  rather than over the notebook-level `test_vectors`, so the function also
  works on subsets. With no pairs at all, 0.0 is returned.
  """
  correct = 0
  total = 0
  for enc_x, y in zip(enc_x_test, y_test):
    # encrypted evaluation, then client-side decryption and sigmoid
    enc_out = model(enc_x)
    out = torch.sigmoid(torch.tensor(enc_out.decrypt()))
    if torch.abs(out - y) < 0.5:
      correct += 1
    total += 1

  accuracy = correct / total if total else 0.0
  print(f"Evaluated test_set of {total} entries")
  print(f"Accuracy: {correct}/{total} = {accuracy}")
  return accuracy

# Compare the encrypted evaluation with the plaintext accuracy computed earlier.
encrypted_accuracy = encrypted_evaluation(enc_lr, enc_x_test, test_targets)
diff_accuracy = plain_accuracy - encrypted_accuracy
print(f"Difference between plain and encrypted accuracies: {diff_accuracy}")

Evaluated test_set of 1248 entries
Accuracy: 899/1248 = 0.7203525641025641
Difference between plain and encrypted accuracies: 0.18990379571914673


In [None]:
def sigmoid_approx(enc_x):
    """
    Evaluate the polynomial 0.5 + 0.197*x - 0.004*x**3 on `enc_x` via `polyval`.

    Defined in this cell so that the evaluation below also works on a fresh
    kernel run from top to bottom.
    """
    return enc_x.polyval([0.5, 0.197, 0, -0.004])

def encrypted_evaluation(model, enc_x_test, y_test):
  """
  Evaluate `model` on encrypted samples using the polynomial sigmoid.

  For every (encrypted sample, label) pair the model output is passed through
  `sigmoid_approx` while still encrypted and then decrypted; the prediction
  counts as correct when it is within 0.5 of the label.

  The accuracy is computed over the pairs that were actually evaluated
  rather than over the notebook-level `test_vectors`. With no pairs at all,
  0.0 is returned.
  """
  correct = 0
  total = 0
  for enc_x, y in zip(enc_x_test, y_test):
    # encrypted evaluation, including the approximated sigmoid
    enc_out = sigmoid_approx(model(enc_x))
    out = torch.tensor(enc_out.decrypt())
    if torch.abs(out - y) < 0.5:
      correct += 1
    total += 1

  accuracy = correct / total if total else 0.0
  print(f"Evaluated test_set of {total} entries with sigmoid approximation")
  print(f"Accuracy: {correct}/{total} = {accuracy}")
  return accuracy

# Compare the encrypted evaluation (approximated sigmoid) with the plaintext
# accuracy computed earlier.
encrypted_accuracy = encrypted_evaluation(enc_lr, enc_x_test, test_targets)
diff_accuracy = plain_accuracy - encrypted_accuracy
print(f"Difference between plain and encrypted accuracies: {diff_accuracy}")

Evaluated test_set of 1248 entries with sigmoid approximation
Accuracy: 890/1248 = 0.7131410256410257
Difference between plain and encrypted accuracies: 0.19711536169052124


In [None]:
def sigmoid_approx(enc_x):
    """
    Evaluate the polynomial 0.5 + 0.197*x - 0.004*x**3 on `enc_x`.

    The coefficients are passed to `enc_x.polyval` in ascending order of
    degree, and its result is returned unchanged.
    """
    coefficients = [0.5, 0.197, 0, -0.004]
    return enc_x.polyval(coefficients)

## Testing

In [None]:
# Custom Logistic Regression -- plaintext
# Each call prints the mean per-token score of the sentence.
print(text_to_sentiment("This movie felt like a beautifully animated amusement park ride"))
print(text_to_sentiment("Not high art or anything, but it ticks off almost everything for what Mario should be at least"))
print(text_to_sentiment("easily one of the worst movie Illumination produced so far"))

0.2688787
-0.10797867
-0.08086337


In [None]:
# Custom Logistic Regression -- FHE evaluation
# Same sentences as the plaintext cell, scored through the encrypted pipeline.
print(enc_text_to_sentiment("This movie felt like a beautifully animated amusement park ride"))
print(enc_text_to_sentiment("Not high art or anything, but it ticks off almost everything for what Mario should be at least"))
print(enc_text_to_sentiment("easily one of the worst movie Illumination produced so far"))

-2.87028
-3.0460367
-2.989269
