# Homomorphic Encryption Example

## This demo is copied from the following GitHub repo:
https://github.com/anujdutt9/Encrypted-Machine-Learning/blob/master/Homomorphic-Encryption-Demo.ipynb
+ You might need to install the library first: !pip install phe

+Logistic Regression for Spam/Not Spam e-mail Classification.

For this problem we have two users:

USER-1

USER-2

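AI Inc. builds a Machine Learning model that is trained on some email data to classify messages as Spam/Not Spam. Now, they want to take that model, encrypt it, and send it to USER-1 and USER-2, who will train the model on their own data, fully Homomorphically Encrypted, and send the trained, slightly better model back to AI Inc. (Note: the code below demonstrates only the encrypted scoring step: the user evaluates the encrypted model on local plaintext data, and AI Inc. decrypts the resulting scores.)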

In this process, AI Inc. gets a better-trained model every time without ever looking at USER-1's or USER-2's data. This way AI Inc. can serve its customers better with a smarter Machine Learning model, and each USER keeps complete control of his/her data.

In [97]:
# Import Dependencies
import phe as paillier

In [98]:
# Create Public and Private Keys
# n_length is the requested key size (in bits, per the phe documentation).
# The public key is used to encrypt values, the private key to decrypt them.
key_length = 1024
pub_key, private_key = paillier.generate_paillier_keypair(n_length=key_length)

In [99]:
pub_key

<PaillierPublicKey 7a26580b2d>

In [100]:
private_key

<PaillierPrivateKey for <PaillierPublicKey 7a26580b2d>>

In [101]:
# Encrypt a plaintext value using the Public Key
a = 10
print("a: ",a)

# encrypt() returns an EncryptedNumber object; printing it only shows the
# object's repr, not the underlying value.
encrypted_a = pub_key.encrypt(a)
print("Encrypted a: ",encrypted_a)

# The encrypted number exposes the public key it was encrypted with.
print("Encrypted a Public Key: ", encrypted_a.public_key)

a:  10
Encrypted a:  <phe.paillier.EncryptedNumber object at 0x7f5715d8a450>
Encrypted a Public Key:  <PaillierPublicKey 7a26580b2d>


In [102]:
# Encrypt another variable with the same Public Key
b = 5
print("b: ", b)

# As before, the result is an EncryptedNumber whose repr hides the value.
encrypted_b = pub_key.encrypt(b)
print("Encrypted b: ", encrypted_b)

# Same public key as for the first encrypted value.
print("Encrypted b Public Key: ",encrypted_b.public_key)

b:  5
Encrypted b:  <phe.paillier.EncryptedNumber object at 0x7f5715d8a950>
Encrypted b Public Key:  <PaillierPublicKey 7a26580b2d>


In [103]:
# Do an operation on Encrypted Variables
# Paillier is additively homomorphic: adding two ciphertexts yields a
# ciphertext of the sum, so the addition happens without decrypting a or b.
# Only the final result is decrypted (with the private key) for display.
encrypted_c = encrypted_a + encrypted_b
c = private_key.decrypt(encrypted_c)
print("c: ", c)

c:  15


In [104]:
# Paillier cannot multiply two ciphertexts with each other, but a ciphertext
# can be multiplied by a plaintext number. Scale the encrypted a by the
# plaintext b, then decrypt the product for display.
encrypted_d = encrypted_a * b
d = private_key.decrypt(encrypted_d)
print("d: ",d)

d:  50


In [105]:
# Plaintext reference value, kept so the decrypted result can be compared to it.
e = a - b

# Subtract the two ciphertexts directly instead of encrypting the plaintext
# difference: the result is an encrypted number that decrypts to a - b.
encrypted_e = encrypted_a - encrypted_b
print("Encrypted e: ", encrypted_e)

Encrypted e:  <phe.paillier.EncryptedNumber object at 0x7f5715d99410>


In [106]:
# Decrypt the Encrypted Data
# Decryption requires the private key; the public key alone cannot do it.
decrypted_e = private_key.decrypt(encrypted_e)

In [107]:
print("Decrypted e: ", decrypted_e)

Decrypted e:  5


In [76]:

# Import Dependencies

import time
import os.path
from zipfile import ZipFile
from urllib.request import urlopen
from contextlib import contextmanager

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
#download dataset. this only needs to be run the first time.

# Download locations of the two e-mail archives.
# download_data() fetches url[d - 1] when the directory ./enron<d> is missing,
# so the order of the entries matters.
url = [
    'https://cloudstor.aarnet.edu.au/plus/index.php/s/RpHZ57z2E3BTiSQ/download',
    'https://cloudstor.aarnet.edu.au/plus/index.php/s/QVD4Xk5Cz3UVYLp/download'
]


def download_data():
    """Download the Enron spam/ham e-mail archives that are not on disk yet.

    For every dataset number ``d`` (starting at 1) the directory
    ``./enron<d>`` is checked; when it already exists nothing is done.
    Otherwise the archive at ``url[d - 1]`` is saved as ``./enron<d>.zip``,
    extracted into the current working directory, and the zip file is
    removed again.

    Returns:
        None. The data is only written to disk.

    Raises:
        Whatever ``urlopen``, ``open`` or ``ZipFile`` raise on network,
        file-system or archive errors; the temporary zip file is removed
        before the error propagates.
    """
    # Function-scope import keeps this cell self-contained.
    from shutil import copyfileobj

    n_datasets = len(url)
    for d, archive_url in enumerate(url, start=1):
        if os.path.isdir('./enron%d' % d):
            continue

        print("Downloading %d/%d: %s" % (d, n_datasets, archive_url))
        folderzip = './enron%d.zip' % d

        try:
            # Stream the response to disk in chunks instead of holding the
            # whole archive in memory.
            with urlopen(archive_url) as remotedata, open(folderzip, 'wb') as z:
                copyfileobj(remotedata, z)

            with ZipFile(folderzip) as z:
                z.extractall()
        finally:
            # Never leave a partial or already-extracted archive behind.
            if os.path.exists(folderzip):
                os.remove(folderzip)

In [21]:
# Only needs to run when the enron1/enron2 folders are not on disk yet;
# download_data() skips every dataset whose folder already exists.

download_data()

Downloading 1/2: https://cloudstor.aarnet.edu.au/plus/index.php/s/RpHZ57z2E3BTiSQ/download
Downloading 2/2: https://cloudstor.aarnet.edu.au/plus/index.php/s/QVD4Xk5Cz3UVYLp/download


In [77]:
# Data Preprocessing
def _read_emails(folder):
    """Return the text of every regular file directly inside `folder`."""
    emails = []
    for name in os.listdir(folder):
        full_path = os.path.join(folder, name)
        if not os.path.isfile(full_path):
            continue
        # The context manager closes each file right away; undecodable bytes
        # are replaced rather than raising.
        with open(full_path, 'r', errors='replace') as handle:
            # "\n" (not the raw string r"\n") so that newlines are stripped,
            # not the literal characters backslash and 'n'.
            emails.append(handle.read().strip("\n"))
    return emails


def preprocess_data(data_dir='/home/jupyter'):
    """
    Load the email dataset and Represent them as bag-of-words.
    Shuffle and split train/test.

    Args:
        data_dir: Directory containing the extracted ``enron1`` and
            ``enron2`` folders, each with a ``ham`` and a ``spam``
            sub-folder. Defaults to the location used originally.

    Returns:
        ``(X_train, y_train, X_test, y_test)``. The feature matrices come
        from ``CountVectorizer``; labels are -1 for ham and 1 for spam. The
        train set is the last 500 rows of the shuffled data, the test set is
        everything before them.
    """

    print("Importing dataset...")
    ham1 = _read_emails(os.path.join(data_dir, 'enron1', 'ham'))
    spam1 = _read_emails(os.path.join(data_dir, 'enron1', 'spam'))
    ham2 = _read_emails(os.path.join(data_dir, 'enron2', 'ham'))
    spam2 = _read_emails(os.path.join(data_dir, 'enron2', 'spam'))

    # Merge and create labels (same order as the merged e-mail list)
    emails = ham1 + spam1 + ham2 + spam2
    y = np.array([-1] * len(ham1) + [1] * len(spam1) +
                 [-1] * len(ham2) + [1] * len(spam2))

    # Words count, keep only frequent words
    # Minimum Document Word Frequency: 0.001
    count_vect = CountVectorizer(decode_error='replace', stop_words='english', min_df=0.001)
    X = count_vect.fit_transform(emails)

    print('Vocabulary size: %d' % X.shape[1])

    # Shuffle rows and labels with the same random permutation
    perm = np.random.permutation(X.shape[0])
    X, y = X[perm, :], y[perm]

    # Split train and test: the last `split` rows train, the rest test
    split = 500
    X_train, X_test = X[-split:, :], X[:-split, :]
    y_train, y_test = y[-split:], y[:-split]

    print("Labels in trainset are {:.2f} spam : {:.2f} ham".format(
        np.mean(y_train == 1), np.mean(y_train == -1)))

    return X_train, y_train, X_test, y_test

In [78]:
@contextmanager
def timer():
    """Context manager that prints how long its `with` body took.

    Yields nothing; once the body finishes normally, the elapsed wall-clock
    time is printed in seconds with two decimals.
    """
    started = time.perf_counter()
    yield
    elapsed = time.perf_counter() - started
    print('[elapsed time: %.2f s]' % elapsed)

In [79]:
class AI_Inc:
    """
    Model owner in the demo.

    Fits a Logistic Regression classifier on plaintext data, produces a
    Paillier-encrypted copy of the fitted parameters for remote use by
    USER-1 and USER-2, and decrypts encrypted scores with its private key.
    """

    def __init__(self):
        self.model = LogisticRegression()

    def generate_paillier_keypair(self, n_length):
        """Create a Paillier key pair and keep both halves on the instance.

        The public key (`self.pubkey`) encrypts, the private key
        (`self.privkey`) decrypts. `n_length` is forwarded to the key
        generator unchanged.
        """
        keypair = paillier.generate_paillier_keypair(n_length=n_length)
        self.pubkey, self.privkey = keypair

    def fit(self, X, y):
        """Fit the wrapped classifier on features `X` and labels `y`."""
        fitted = self.model.fit(X, y)
        self.model = fitted

    def predict(self, X):
        """Return the wrapped classifier's predictions for `X`."""
        predictions = self.model.predict(X)
        return predictions

    def encrypt_weights(self):
        """Encrypt the fitted parameters with the public key.

        The weights can reveal information about the training data, which is
        why they are encrypted before leaving AI Inc.

        Returns:
            A pair ``(encrypted_weights, encrypted_intercept)``: a list with
            one encrypted value per entry of the first row of ``coef_``, and
            the encrypted first entry of ``intercept_``.
        """
        first_row = self.model.coef_[0, :]
        encrypted_weights = []
        for position in range(first_row.shape[0]):
            encrypted_weights.append(self.pubkey.encrypt(first_row[position]))
        encrypted_intercept = self.pubkey.encrypt(self.model.intercept_[0])
        return encrypted_weights, encrypted_intercept

    def decrypt_scores(self, encrypted_scores):
        """Decrypt every encrypted score with the private key, in order."""
        plain_scores = []
        for score in encrypted_scores:
            plain_scores.append(self.privkey.decrypt(score))
        return plain_scores

In [80]:
# USER-1 receives the encrypted model from AI Inc. and applies it to its own
# plaintext data; everything it computes with that model stays encrypted.
class User_1:
    """
    Data owner in the demo (USER-1/USER-2).

    Receives the public key and the encrypted model trained by AI Inc.,
    scores local plaintext data with that encrypted model, and cannot
    decrypt the resulting scores because the private key stays with AI Inc.
    """

    def __init__(self, pubkey):
        self.pubkey = pubkey

    def set_weights(self, weights, intercept):
        """Store the (encrypted) coefficients and the (encrypted) intercept."""
        self.intercept = intercept
        self.weights = weights

    def encrypted_score(self, x):
        """Return the score of the single-row input `x` under the stored model.

        Starts from the stored intercept and adds ``x[0, i] * weights[i]`` for
        every column ``i`` where `x` is non-zero. With weights that are
        `paillier.EncryptedNumber` objects the result stays encrypted.
        """
        total = self.intercept
        # nonzero() gives (row indices, column indices); only columns matter
        # because `x` is a single row.
        rows, columns = x.nonzero()
        for col in columns:
            term = x[0, col] * self.weights[col]
            total += term
        return total

    def encrypted_evaluate(self, X):
        """Return one score per row of `X`, in row order."""
        scores = []
        for row in range(X.shape[0]):
            scores.append(self.encrypted_score(X[row, :]))
        return scores

In [81]:
# Get the Preprocessed Split Data
# preprocess_data() loads the e-mails, turns them into bag-of-words features,
# shuffles them and returns the train/test features and labels.
X_train, y_train, X_test, y_test = preprocess_data()

Importing dataset...
Vocabulary size: 7997
Labels in trainset are 0.27 spam : 0.73 ham


In [82]:
# First, AI Inc. generates the Public and Private Keys
print("AI Inc.: Generating Paillier Public Private Keypair")
ai_inc = AI_Inc()
# NOTE: using smaller key sizes wouldn't be cryptographically safe
ai_inc.generate_paillier_keypair(n_length=1024)

AI Inc.: Generating Paillier Public Private Keypair


In [83]:
# Train the plaintext classifier on AI Inc.'s own data and report the runtime.
print("AI Inc.: Training Initial Spam Classifier")
# timer() yields no value, so `t` is bound to None and is not used.
with timer() as t:
    ai_inc.fit(X_train, y_train)

AI Inc.: Training Initial Spam Classifier
[elapsed time: 0.07 s]


In [84]:
# Baseline: plaintext predictions on the held-out rows.
print("AI Inc.'s Classification on Test Data, what it would expect the performance to be on USER-1/2's data...")
with timer() as t:
    # Fraction of test rows whose predicted label differs from the true label.
    error = np.mean(ai_inc.predict(X_test) != y_test)
print("Error {:.3f}".format(error))

AI Inc.'s Classification on Test Data, what it would expect the performance to be on USER-1/2's data...
[elapsed time: 0.00 s]
Error 0.040


In [85]:
# Encrypt every model coefficient and the intercept with the public key.
print("AI Inc.: Encrypting Trained Classifier before sending to USER-1/2")
with timer() as t:
    encrypted_weights, encrypted_intercept = ai_inc.encrypt_weights()

AI Inc.: Encrypting Trained Classifier before sending to USER-1/2
[elapsed time: 15.56 s]


In [62]:
# Confirming the Weights are Encrypted
# Each entry prints as an EncryptedNumber object repr, not as a number.
print("A Few Encrypted Weights: ", encrypted_weights[0:5]) # Not showing the full list
print("Encrypted Intercept: ", encrypted_intercept)

A Few Encrypted Weights:  [<phe.paillier.EncryptedNumber object at 0x7f5715551790>, <phe.paillier.EncryptedNumber object at 0x7f5715551150>, <phe.paillier.EncryptedNumber object at 0x7f5715551250>, <phe.paillier.EncryptedNumber object at 0x7f5716209b50>, <phe.paillier.EncryptedNumber object at 0x7f5714fc0050>]
Encrypted Intercept:  <phe.paillier.EncryptedNumber object at 0x7f571542e410>


In [63]:
# TC: work in progress. The user makes a prediction on unencrypted data using the encrypted model.
user1 = User_1(pubkey=ai_inc.pubkey)
user1.set_weights(encrypted_weights, encrypted_intercept)
# Only the first test row is scored here; the result is a one-element list.
evaluateValue = user1.encrypted_evaluate(X_test[0])

# Both prints show EncryptedNumber object reprs: the user cannot read the score.
print(evaluateValue)
print(evaluateValue[0])

[<phe.paillier.EncryptedNumber object at 0x7f5714fc0a90>]
<phe.paillier.EncryptedNumber object at 0x7f5714fc0a90>


In [64]:
# AI company decrypts score
# Only the private key held by AI Inc. turns the encrypted score back into a
# number. The value is the intercept plus the sum of x[0, i] * weight[i], as
# computed by User_1.encrypted_score.
ai_inc.privkey.decrypt(evaluateValue[0])

-16.413734225260374