In [1]:
import codecs
import copy
import numpy as np
import phe
np.random.seed(12345)

In [2]:
# Preprocessing

In [3]:
# Read both corpora fully into memory, one list entry per line of the file.
# Undecodable bytes are dropped (errors='ignore'). The `with` blocks make sure
# the file handles are closed as soon as the lines have been read.
with codecs.open('data/enron/spam.txt', 'r', encoding='utf-8', errors='ignore') as spam_file:
    spam_lines = spam_file.readlines()
with codecs.open('data/enron/ham.txt', 'r', encoding='utf-8', errors='ignore') as ham_file:
    ham_lines = ham_file.readlines()

In [4]:
# Padding / out-of-vocabulary token; it is always part of the vocabulary.
unknown = '<unk>'

# Every email becomes the set of its distinct space-separated tokens.
# The last two characters of each raw line are cut off before splitting
# (presumably the line terminator -- TODO confirm against the data files).
spam = [set(line[:-2].split(" ")) for line in spam_lines]
ham = [set(line[:-2].split(" ")) for line in ham_lines]

# Collect all tokens, spam first, then ham, one at a time.
vocab_set = {unknown}
for word in (token for email in spam + ham for token in email):
    vocab_set.add(word)

# Freeze an arbitrary but fixed ordering and build the reverse lookup.
vocab = list(vocab_set)
word2index = dict(zip(vocab, range(len(vocab))))

In [5]:
def to_indices(inp, length=500):
    """Convert token collections into fixed-length lists of vocabulary indices.

    Each entry of `inp` with fewer than `length` tokens is padded with the
    `unknown` token up to exactly `length` entries and mapped through
    `word2index`. Entries that already have `length` or more tokens are
    dropped, so the result may be shorter than `inp`.

    Returns a list of lists of ints, each of size `length`.
    """
    indices = []
    for line in inp:
        n_missing = length - len(line)
        if n_missing <= 0:
            # Too long (or exactly full): skipped rather than truncated.
            continue
        padded = list(line) + [unknown] * n_missing
        indices.append([word2index[token] for token in padded])
    return indices

In [6]:
def _interleave(spam_rows, ham_rows):
    """Alternate spam (label [1]) and ham (label [0]) rows.

    Runs for as many steps as the longer list has rows; the shorter list
    wraps around (modulo indexing) so both classes appear equally often.
    """
    data, target = [], []
    for i in range(max(len(spam_rows), len(ham_rows))):
        data += [spam_rows[i % len(spam_rows)], ham_rows[i % len(ham_rows)]]
        target += [[1], [0]]
    return data, target


spam_idx = to_indices(spam)
ham_idx = to_indices(ham)

# The last 1000 rows of each class are held out for testing.
train_spam_idx, test_spam_idx = spam_idx[0:-1000], spam_idx[-1000:]
train_ham_idx, test_ham_idx = ham_idx[0:-1000], ham_idx[-1000:]

train_data, train_target = _interleave(train_spam_idx, train_ham_idx)
test_data, test_target = _interleave(test_spam_idx, test_ham_idx)

In [7]:
# Classes from previous chapters

In [8]:
class Layer(object):
    """Base class for layers; keeps the list of trainable parameters."""

    def __init__(self):
        # Subclasses append their trainable tensors to this list.
        self.parameters = []

    def get_parameters(self):
        """Return the (mutable) list of this layer's parameters."""
        return self.parameters

In [9]:
class Tensor(object):
    """Minimal autograd tensor wrapping a NumPy array.

    A tensor created by an operation remembers its inputs (`creators`) and the
    name of the operation (`creation_op`). `backward` walks that graph and
    accumulates gradients in `.grad` (itself a non-autograd Tensor).
    """

    def __init__(self, data, autograd=False, creators=None, creation_op=None, id=None):
        self.data = np.array(data)
        self.creation_op = creation_op
        self.creators = creators
        self.grad = None
        self.autograd = autograd
        # Maps child id -> number of gradients still expected from that child.
        self.children = {}
        # NOTE(review): ids are random in [0, 100000), so two tensors can end
        # up with the same id; kept as-is so the seeded RNG stream is unchanged.
        self.id = np.random.randint(0, 100000) if id is None else id

        if creators is not None:
            # Register this tensor as a child of each input (an input used
            # twice, e.g. x * x, is counted twice).
            for c in creators:
                c.children[self.id] = c.children.get(self.id, 0) + 1

    def all_children_grads_accounted_for(self):
        """Return True when no child still owes this tensor a gradient."""
        return all(cnt == 0 for cnt in self.children.values())

    def backward(self, grad=None, grad_origin=None):
        """Accumulate `grad` into this tensor and propagate it to its creators.

        grad: non-autograd Tensor with the incoming gradient; defaults to ones
            shaped like `self.data`.
        grad_origin: the child tensor the gradient comes from, or None. When
            given, the pending counter for that child is decremented; a
            gradient from a child whose counter is already 0 is ignored.

        Propagation continues once all children have reported or when
        `grad_origin` is None.
        NOTE(review): several ops below call `backward` on their creators
        without passing `self` as origin, which bypasses the child counting.
        """
        if not self.autograd:
            return

        if grad is None:
            grad = Tensor(np.ones_like(self.data))

        if grad_origin is not None:
            if self.children[grad_origin.id] == 0:
                # Every expected gradient from this child was already received.
                return
            self.children[grad_origin.id] -= 1

        if self.grad is None:
            self.grad = grad
        else:
            # Tensor has no in-place add, so this rebinds to a new Tensor.
            self.grad += grad

        assert grad.autograd == False

        if self.creators is not None and (self.all_children_grads_accounted_for() or grad_origin is None):
            if self.creation_op == "neg":
                self.creators[0].backward(self.grad.__neg__(), self)
            elif self.creation_op == "add":
                self.creators[0].backward(self.grad, self)
                self.creators[1].backward(self.grad, self)
            elif self.creation_op == "sub":
                positive_grad = Tensor(self.grad.data)
                self.creators[0].backward(positive_grad, self)
                negative_grad = Tensor(self.grad.__neg__().data)
                self.creators[1].backward(negative_grad, self)
            elif self.creation_op == "mul":
                # Product rule: d(a*b)/da = b and d(a*b)/db = a, so each
                # operand receives the gradient scaled by the OTHER operand.
                mul_grad_0 = self.grad * self.creators[1]
                self.creators[0].backward(mul_grad_0, self)
                mul_grad_1 = self.grad * self.creators[0]
                self.creators[1].backward(mul_grad_1, self)
            elif self.creation_op == "transpose":
                self.creators[0].backward(self.grad.transpose())
            elif self.creation_op == "mm":
                activations = self.creators[0]
                weights = self.creators[1]
                activations_grad = self.grad.mm(weights.transpose())
                activations.backward(activations_grad)
                weights_grad = self.grad.transpose().mm(activations).transpose()
                weights.backward(weights_grad)
            elif "sum" in self.creation_op:
                # creation_op is "sum_<dim>": re-expand along the summed axis.
                dim = int(self.creation_op.split("_")[1])
                ds = self.creators[0].data.shape[dim]
                self.creators[0].backward(self.grad.expand(dim, ds))
            elif "expand" in self.creation_op:
                # creation_op is "expand_<dim>": sum the copies back together.
                dim = int(self.creation_op.split("_")[1])
                self.creators[0].backward(self.grad.sum(dim))
            elif self.creation_op == "sigmoid":
                ones = Tensor(np.ones_like(self.grad.data))
                self.creators[0].backward(self.grad * self * (ones - self))
            elif self.creation_op == "tanh":
                ones = Tensor(np.ones_like(self.grad.data))
                self.creators[0].backward(self.grad * (ones - (self * self)))
            elif self.creation_op == "index_select":
                # Scatter-add the accumulated gradient back onto the selected
                # rows; rows selected several times receive several additions.
                new_grad = np.zeros_like(self.creators[0].data)
                indices = self.index_select_indices.data.flatten()
                grad_reshaped = self.grad.data.reshape(len(indices), -1)
                for i in range(len(indices)):
                    new_grad[indices[i]] += grad_reshaped[i]
                self.creators[0].backward(Tensor(new_grad))
            elif self.creation_op == "cross_entropy":
                dx = self.softmax_output - self.target_dist
                self.creators[0].backward(Tensor(dx))

    def __neg__(self):
        """Element-wise negation."""
        if self.autograd:
            return Tensor(self.data * -1, autograd=True, creators=[self], creation_op="neg")
        return Tensor(self.data * -1)

    def __add__(self, other):
        """Element-wise sum; tracked only when both operands use autograd."""
        if self.autograd and other.autograd:
            return Tensor(self.data + other.data, autograd=True, creators=[self, other], creation_op="add")
        return Tensor(self.data + other.data)

    def __sub__(self, other):
        """Element-wise difference; tracked only when both operands use autograd."""
        if self.autograd and other.autograd:
            return Tensor(self.data - other.data, autograd=True, creators=[self, other], creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        """Element-wise product; tracked only when both operands use autograd."""
        if self.autograd and other.autograd:
            return Tensor(self.data * other.data, autograd=True, creators=[self, other], creation_op="mul")
        return Tensor(self.data * other.data)

    def sum(self, dimension):
        """Sum along axis `dimension`."""
        if self.autograd:
            return Tensor(self.data.sum(dimension), autograd=True, creators=[self], creation_op="sum_"+str(dimension))
        return Tensor(self.data.sum(dimension))

    def expand(self, dimension, copies):
        """Repeat the data `copies` times along a new axis placed at `dimension`."""
        # Copies are first laid out along a new trailing axis, which the
        # transpose then moves to position `dimension`.
        transpose_cmd = list(range(0, len(self.data.shape)))
        transpose_cmd.insert(dimension, len(self.data.shape))
        new_shape = list(self.data.shape) + [copies]
        new_data = self.data.repeat(copies).reshape(new_shape).transpose(transpose_cmd)

        if self.autograd:
            return Tensor(new_data, autograd=True, creators=[self], creation_op="expand_"+str(dimension))
        return Tensor(new_data)

    def transpose(self):
        """Return the transposed tensor (NumPy `transpose` with no arguments)."""
        if self.autograd:
            return Tensor(self.data.transpose(), autograd=True, creators=[self], creation_op="transpose")
        return Tensor(self.data.transpose())

    def mm(self, x):
        """Matrix product `self.data.dot(x.data)`; tracked only when both use autograd."""
        if self.autograd and x.autograd:
            return Tensor(self.data.dot(x.data), autograd=True, creators=[self, x], creation_op="mm")
        return Tensor(self.data.dot(x.data))

    def sigmoid(self):
        """Element-wise logistic function 1 / (1 + exp(-x))."""
        if self.autograd:
            return Tensor(1 / (1 + np.exp(-self.data)), autograd=True, creators=[self], creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        """Element-wise hyperbolic tangent."""
        if self.autograd:
            return Tensor(np.tanh(self.data), autograd=True, creators=[self], creation_op="tanh")
        return Tensor(np.tanh(self.data))

    def index_select(self, indices):
        """Gather rows `self.data[indices.data]`; `indices` is a Tensor of ints."""
        if self.autograd:
            tensor = Tensor(self.data[indices.data], autograd=True, creators=[self], creation_op="index_select")
            # Remembered so backward can scatter the gradient to the same rows.
            tensor.index_select_indices = indices
            return tensor
        return Tensor(self.data[indices.data])

    def softmax(self):
        """Return the softmax over the last axis as a plain NumPy array."""
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp, axis=len(self.data.shape) - 1, keepdims=True)
        return softmax_output

    def cross_entropy(self, target_indices):
        """Mean cross-entropy between softmax(self) and integer class targets.

        target_indices: Tensor of class indices; it is flattened, and the
            softmax output is reshaped to one row per target.
        """
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp, axis=len(self.data.shape) - 1, keepdims=True)
        t = target_indices.data.flatten()
        p = softmax_output.reshape(len(t), -1)
        # One-hot rows for the target classes.
        target_dist = np.eye(p.shape[1])[t]
        loss = -(np.log(p) * target_dist).sum(1).mean()

        if self.autograd:
            tensor = Tensor(loss, autograd=True, creators=[self], creation_op="cross_entropy")
            # Cached for the backward pass (gradient is softmax - target).
            tensor.softmax_output = softmax_output
            tensor.target_dist = target_dist
            return tensor
        return Tensor(loss)

    def __repr__(self):
        return str(self.data.__repr__())

    def __str__(self):
        return str(self.data.__str__())

In [10]:
class Embedding(Layer):
    """Lookup table with one trainable row of size `dimensions` per vocabulary entry."""

    def __init__(self, vocab_size, dimensions):
        super().__init__()
        self.vocab_size, self.dimensions = vocab_size, dimensions

        # Uniform initialisation in [-0.5 / dimensions, 0.5 / dimensions).
        initial = (np.random.rand(vocab_size, dimensions) - 0.5) / dimensions
        self.weights = Tensor(initial, autograd=True)
        self.parameters.append(self.weights)

    def forward(self, input):
        """Return the weight rows selected by the index Tensor `input`."""
        return self.weights.index_select(input)

In [11]:
class MSELoss(Layer):
    """Squared-error criterion.

    Despite the name, the squared differences are summed along axis 0 and
    not divided by the number of rows.
    """

    def __init__(self):
        super().__init__()

    def forward(self, prediction, target):
        """Return sum over axis 0 of (prediction - target) ** 2."""
        error = prediction - target
        squared_error = error * error
        return squared_error.sum(0)

In [12]:
class SGD(object):
    """Plain stochastic gradient descent over a list of parameters."""

    def __init__(self, parameters, alpha=0.1):
        self.parameters = parameters
        self.alpha = alpha  # learning rate

    def zero(self):
        """Reset every parameter's gradient to zero, in place."""
        for p in self.parameters:
            p.grad.data *= 0

    def step(self, zero=True):
        """Apply one update `data -= alpha * grad` to every parameter.

        When `zero` is true, each parameter's gradient is cleared in place
        right after that parameter has been updated.
        """
        for p in self.parameters:
            update = p.grad.data * self.alpha
            p.data -= update

            if not zero:
                continue
            p.grad.data *= 0

In [13]:
# Train/test functions

In [14]:
def train(model, input_data, target_data, batch_size=500, iterations=5):
    """Train `model` in place with mini-batch SGD (alpha=0.01) and return it.

    input_data / target_data: parallel sequences of index rows and labels.
    batch_size: rows per batch; trailing rows that do not fill a whole batch
        are not used.
    iterations: number of passes over the batches.

    The prediction for a row is the sigmoid of the sum of its word weights.
    After every batch the running average loss of the current pass is printed.
    """
    n_batches = int(len(input_data) / batch_size)
    criterion = MSELoss()
    optimizer = SGD(parameters=model.get_parameters(), alpha=0.01)

    for _ in range(iterations):
        running_loss = 0
        for batch_no in range(n_batches):
            lo = batch_no * batch_size
            hi = lo + batch_size

            # Keep the padding token's weight at zero so padding is neutral.
            model.weights.data[word2index[unknown]] *= 0
            inp = Tensor(input_data[lo:hi], autograd=True)
            target = Tensor(target_data[lo:hi], autograd=True)

            prediction = model.forward(inp).sum(1).sigmoid()
            loss = criterion.forward(prediction, target)
            loss.backward()
            optimizer.step()

            running_loss += loss.data[0] / batch_size
            print('Loss: %f' % (running_loss / (batch_no + 1)))

    return model

In [15]:
def test(model, input_data, target_data):
    """Return the fraction of rows whose thresholded prediction equals the target.

    A row is predicted as class 1 when the sigmoid of its summed word weights
    exceeds 0.5. The padding token's weight is zeroed in place beforehand.
    """
    model.weights.data[word2index[unknown]] *= 0

    inp = Tensor(input_data, autograd=True)
    target = Tensor(target_data, autograd=True)

    scores = model.forward(inp).sum(1).sigmoid().data
    predicted_positive = scores > 0.5
    return (predicted_positive == target.data).mean()

In [16]:
# Vanilla training

In [17]:
# One scalar weight per vocabulary word (dimensions=1).
# The random initialisation is multiplied by zero, so training starts from an
# all-zero model.
model = Embedding(vocab_size=len(vocab), dimensions=1)
model.weights.data *= 0

In [18]:
# Three rounds of centralised training: each round is one pass of `train`
# over the full training set, followed by the accuracy on the test set.
for i in range(3):
    model = train(model, train_data, train_target, iterations=1)
    print('%% Correct on test set: %f' % (test(model, test_data, test_target) * 100))

Loss: 0.250000
Loss: 0.219082
Loss: 0.293608
Loss: 0.259571
Loss: 0.223674
Loss: 0.204721
Loss: 0.182112
Loss: 0.163808
Loss: 0.150422
Loss: 0.140804
Loss: 0.132100
Loss: 0.125227
Loss: 0.118157
Loss: 0.113585
Loss: 0.110229
Loss: 0.106477
Loss: 0.102434
Loss: 0.098753
Loss: 0.094793
Loss: 0.091434
Loss: 0.088385
Loss: 0.085427
Loss: 0.082804
Loss: 0.080167
Loss: 0.078222
Loss: 0.076305
Loss: 0.074647
Loss: 0.072889
Loss: 0.072076
Loss: 0.070552
Loss: 0.069271
Loss: 0.067928
Loss: 0.066542
Loss: 0.065171
Loss: 0.063914
Loss: 0.062610
Loss: 0.061345
Loss: 0.060248
Loss: 0.059126
Loss: 0.058162
Loss: 0.057278
Loss: 0.056440
Loss: 0.055709
Loss: 0.055264
Loss: 0.054511
Loss: 0.053940
Loss: 0.053269
Loss: 0.052553
Loss: 0.051779
Loss: 0.051058
Loss: 0.050299
Loss: 0.049647
Loss: 0.048958
Loss: 0.048369
Loss: 0.047765
Loss: 0.047195
Loss: 0.046704
Loss: 0.046376
Loss: 0.045985
Loss: 0.045565
Loss: 0.045156
Loss: 0.044695
Loss: 0.044248
Loss: 0.043722
Loss: 0.043226
Loss: 0.042773
Loss: 0.04

In [19]:
# Federated

In [20]:
# Fresh all-zero model for the federated experiment, so it does not reuse the
# weights trained in the previous section.
model = Embedding(vocab_size=len(vocab), dimensions=1)
model.weights.data *= 0

In [21]:
# Split the training set between three participants as (inputs, targets)
# pairs: Bob gets the first 5000 rows, Alice the next 5000, Sue the rest.
bob = (train_data[0:5000], train_target[0:5000])
alice = (train_data[5000:10000], train_target[5000:10000])
sue = (train_data[10000:], train_target[10000:])

In [22]:
# Federated averaging: in every round each participant trains a private deep
# copy of the current global model on their own data; the global weights are
# then replaced by the element-wise mean of the three trained copies.
for i in range(3):
    print("Starting training round %d" % i)
    print("\tStep 1: send the model to Bob")
    bob_model = train(copy.deepcopy(model), bob[0], bob[1], iterations=1)
    
    print("\tStep 2: send the model to Alice")
    alice_model = train(copy.deepcopy(model), alice[0], alice[1], iterations=1)
    
    print("\tStep 3: send the model to Sue")
    sue_model = train(copy.deepcopy(model), sue[0], sue[1], iterations=1)
    
    print("\tAverage everyone's new models")
    model.weights.data = (bob_model.weights.data + alice_model.weights.data + sue_model.weights.data) / 3
    
    print("\t%% Correct on test set: %f" % (test(model, test_data, test_target) * 100))

Starting training round 0
	Step 1: send the model to Bob
Loss: 0.250000
Loss: 0.219082
Loss: 0.293608
Loss: 0.259571
Loss: 0.223674
Loss: 0.204721
Loss: 0.182112
Loss: 0.163808
Loss: 0.150422
Loss: 0.140804
	Step 2: send the model to Alice
Loss: 0.250000
Loss: 0.172907
Loss: 0.147506
Loss: 0.180480
Loss: 0.197209
Loss: 0.179812
Loss: 0.162350
Loss: 0.149679
Loss: 0.137586
Loss: 0.128769
	Step 3: send the model to Sue
Loss: 0.250000
Loss: 0.201089
Loss: 0.261282
Loss: 0.233241
Loss: 0.203720
Loss: 0.181804
Loss: 0.163969
Loss: 0.148835
Loss: 0.141523
Loss: 0.136314
Loss: 0.132527
Loss: 0.125221
Loss: 0.118252
Loss: 0.112073
Loss: 0.106696
Loss: 0.101579
Loss: 0.096970
Loss: 0.093086
Loss: 0.089641
Loss: 0.086555
Loss: 0.084155
Loss: 0.081847
Loss: 0.080122
Loss: 0.078493
Loss: 0.076690
Loss: 0.075199
Loss: 0.073372
Loss: 0.071622
Loss: 0.069822
Loss: 0.068120
Loss: 0.066450
Loss: 0.065001
Loss: 0.063491
Loss: 0.062270
Loss: 0.061120
Loss: 0.059990
Loss: 0.058993
Loss: 0.058298
Loss: 0.0

In [23]:
# Hacking into federated learning

In [24]:
# A single private email, encoded as one row of vocabulary indices.
# Every word must be present in `word2index`, otherwise the lookup raises KeyError.
email = ["my", "computer", "password", "is", "pizza"]
victim_input = np.array([[word2index[word] for word in email]])
# Target 0 is the label used for ham rows when the training set was built.
victim_target = np.array([[0]])

In [25]:
# Start again from an all-zero global model and let the "victim" train a deep
# copy on their single email. batch_size=1 is needed because `train` only
# uses full batches and there is just one example.
model = Embedding(vocab_size=len(vocab), dimensions=1)
model.weights.data *= 0
victim_model = train(copy.deepcopy(model), victim_input, victim_target, iterations=1, batch_size=1)

Loss: 0.250000


In [26]:
# Compare the victim's trained weights with the untouched global model.
# Any row whose weight changed belongs to a word that occurred in the victim's
# email, so the weight update alone reveals the email's vocabulary.
for i, v in enumerate(victim_model.weights.data - model.weights.data):
    if v != 0:
        print("GOT", vocab[i])

GOT computer
GOT password
GOT pizza
GOT my
GOT is


In [27]:
# Homomorphic encryption

In [28]:
# Paillier key pair (1024-bit modulus) for the small encryption demo below.
public_key, private_key = phe.generate_paillier_keypair(n_length=1024)

In [29]:
# Additive homomorphism demo: the two ciphertexts are added without being
# decrypted, and decrypting the result yields the sum of the plaintexts.
x = public_key.encrypt(5)
y = public_key.encrypt(3)
z = x + y
z_ = private_key.decrypt(z)

print("Encrypted values:", x.ciphertext(), y.ciphertext())
print("Answer:", z_)

Encrypted values: 383425377072847064575680323960544409719690850472195748727382305161425579523814563448991757841401260406011484778223982626063797971719962251730655074401018251226887264406861243586378480749458619893832526067491876663439673342534567203460037613231429209118328969254772474492037187078776515290140279382933476833986646228157842116083907955289716548794842232054065637840864891900023844675484314697047534567075581502458893657260673588728362429460277157222201780210304752818985266920378854635207319315099306297703675474068281342192896929514656794057182568934603840941411744527575870896306871676307427084500270756317954418710 330049022800895348068482402529073529311527049822967619463905183676177395958266073057743624044227002627764784514799171363254956364472825039072222000474037896797812118965776001391974696390828620211212401509205141953390799711198776506400121037751496555151459005541663572532225719755067326131395389414555529941620460080320790196001534297790419714014222581931515447235494

In [30]:
# Homomorphically encrypted federated learning

In [31]:
# Fresh all-zero global model for the encrypted federated experiment.
model = Embedding(vocab_size=len(vocab), dimensions=1)
model.weights.data *= 0

In [32]:
# Replaces the 1024-bit demo key pair with a 128-bit one.
# NOTE(review): a 128-bit modulus is far too small to be secure; presumably
# chosen so that encrypting every weight stays fast -- use a much larger
# n_length for anything beyond a demonstration.
public_key, private_key = phe.generate_paillier_keypair(n_length=128)

In [33]:
def train_and_encrypt(model, inp, target, pubkey):
    """Train `model` for one iteration and return its weights encrypted.

    Each value of the first weight column is encrypted separately with
    `pubkey`; the result is a NumPy array of ciphertext objects with the
    same shape as the weight matrix.
    """
    trained = train(model, inp, target, iterations=1)
    weights = trained.weights.data

    ciphertexts = []
    for value in weights[:, 0]:
        ciphertexts.append(pubkey.encrypt(value))

    return np.array(ciphertexts).reshape(weights.shape)

In [None]:
# Federated averaging on encrypted weights: every participant trains a deep
# copy of the global model and only shares the Paillier-encrypted result.
# The ciphertexts are added element-wise, so the model owner can decrypt
# nothing but the aggregate.
for i in range(3):
    print("\nStarting training round")
    print("\tStep 1: Send the model to Bob")
    bob_encrypted_model = train_and_encrypt(copy.deepcopy(model), bob[0], bob[1], public_key)
    
    print("\tStep 2: Send the model to Alice")
    alice_encrypted_model = train_and_encrypt(copy.deepcopy(model), alice[0], alice[1], public_key)
    
    print("\tStep 3: Send the model to Sue")
    sue_encrypted_model = train_and_encrypt(copy.deepcopy(model), sue[0], sue[1], public_key)
    
    print("\tStep 4: Bob, Alice, and Sue send their encrypted models to each other")
    aggregated_model = bob_encrypted_model + alice_encrypted_model + sue_encrypted_model
    
    print("\tStep 5: Only the aggregated model is sent back to the model owner who can decrypt it")
    raw_values = [private_key.decrypt(v) for v in aggregated_model.flatten()]
    # The aggregate is the SUM of the three models; divide by the number of
    # participants so the new global weights are their average, exactly as in
    # the unencrypted federated loop.
    model.weights.data = np.array(raw_values).reshape(model.weights.data.shape) / 3
    
    print("\t%f%% correct on test set" % (test(model, test_data, test_target) * 100))


Starting training round
	Step 1: Send the model to Bob
Loss: 0.250000
Loss: 0.219082
Loss: 0.293608
Loss: 0.259571
Loss: 0.223674
Loss: 0.204721
Loss: 0.182112
Loss: 0.163808
Loss: 0.150422
Loss: 0.140804
	Step 2: Send the model to Alice
Loss: 0.250000
Loss: 0.172907
Loss: 0.147506
Loss: 0.180480
Loss: 0.197209
Loss: 0.179812
Loss: 0.162350
Loss: 0.149679
Loss: 0.137586
Loss: 0.128769
	Step 3: Send the model to Sue
Loss: 0.250000
Loss: 0.201089
Loss: 0.261282
Loss: 0.233241
Loss: 0.203720
Loss: 0.181804
Loss: 0.163969
Loss: 0.148835
Loss: 0.141523
Loss: 0.136314
Loss: 0.132527
Loss: 0.125221
Loss: 0.118252
Loss: 0.112073
Loss: 0.106696
Loss: 0.101579
Loss: 0.096970
Loss: 0.093086
Loss: 0.089641
Loss: 0.086555
Loss: 0.084155
Loss: 0.081847
Loss: 0.080122
Loss: 0.078493
Loss: 0.076690
Loss: 0.075199
Loss: 0.073372
Loss: 0.071622
Loss: 0.069822
Loss: 0.068120
Loss: 0.066450
Loss: 0.065001
Loss: 0.063491
Loss: 0.062270
Loss: 0.061120
Loss: 0.059990
Loss: 0.058993
Loss: 0.058298
Loss: 0.05