In [1]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning, append=True)

import numpy as np
np.random.seed(1988)

import torch
torch.manual_seed(1988)
# torch.set_deterministic(True)

import random 
random.seed(1988) 

In [2]:
# SyferText imports
import syfertext
from syfertext.pipeline import SimpleTagger

# PySyft and PyTorch import
import syft as sy
from syft.generic.string import String
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.optim as optim

# Useful imports
import numpy as np
from tqdm import tqdm
import csv
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sb
import os
from pprint import pprint

sb.set()




In [3]:
# Importar los archivos y crear 2 listas.
# reviews, con las lineas de texto
# labels,  con la correspondiente clase

with open('./raw/badqueries.unix', 'r', encoding='utf-8') as dataset_file:
    bads = dataset_file.readlines()
with open('./raw/goodqueries.unix', 'r', encoding='utf-8') as dataset_file:
    goods = dataset_file.readlines()
    
# Concateno las listas    
reviews = bads + goods
# Creo 2 listas con [1|0] de acuerdo al tamanio de cada dataset
labels  = ([1] * len(bads)) + ([0] * len(goods))

In [4]:
print("Cantidad de Registros Malos [{}]".format(len(bads)))
print("Cantidad de Registros Buenos [{}]".format(len(goods)))
print("Cantidad de Registros Registros Totales [{}]".format(len(reviews)))
print("Cantidad de Registros Labels Totales [{}]".format(len(labels)))

Cantidad de Registros Malos [48126]
Cantidad de Registros Buenos [1294531]
Cantidad de Registros Registros Totales [1342657]
Cantidad de Registros Labels Totales [1342657]


In [5]:
# Importamos los simbolos de puntuacion para poder eliminarlos de los registros
from string import punctuation
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [6]:
# Generamos un array con todas las palabras del dataset completo
# Se eliminan todos los posibles simbolos de puntuacion y se dejan 
# lo que son palabras / numeros
all_reviews=list()
for text in reviews:
  text = text.lower()
  for a in punctuation:
      text = text.replace(a, ' ')
  all_reviews.append(text)
all_text = " ".join(all_reviews)

In [7]:
print("Cantidad de Palabras Diferentes    [{}]".format(len(all_reviews)))

Cantidad de Palabras Diferentes    [1342657]


In [8]:
# Create a torch hook for PySyft
hook = sy.TorchHook(torch)

# Create some PySyft workers
me = hook.local_worker # This is the worker representing the deep learning company
bob = sy.VirtualWorker(hook, id = 'bob') # Bob owns the first dataset
alice = sy.VirtualWorker(hook, id = 'alice') # Alice owns the second dataset

crypto_provider = sy.VirtualWorker(hook, id = 'crypto_provider') # provides encryption primitive for SMPC

# Create a summary writer for logging performance with Tensorboard
writer = SummaryWriter()



In [9]:
# Crear un dataset_local que es un dict con 2 entradas, review y label
# Key['review'] -> El texto entero
# Key['label']  -> La label Codificada 0/1

dataset_local = []

for i in range(len(all_reviews)):
    example = dict(review = String(all_reviews[i]), label = labels[i])
    dataset_local.append(example)


In [10]:
example = dataset_local[1]
pprint(example)

{'label': 1,
 'review': ' h21y8w52 nsf  script cross site scripting nasl  script \n'}


In [11]:
print(type(example['review']))
print(type(example['label']))

<class 'syft.generic.string.String'>
<class 'int'>


In [12]:
# Create two datasets, one for Bob, and the other for Alice

a, b = train_test_split(dataset_local, train_size = 0.10)
dataset_bob, dataset_alice = train_test_split(a, train_size = 0.5)

# dataset_bob, dataset_alice = train_test_split(dataset_local, train_size = 0.5)

# Now create a validation set for Bob, and another for Alice
train_bob,   val_bob   = train_test_split(dataset_bob, train_size = 0.7)
train_alice, val_alice = train_test_split(dataset_alice, train_size = 0.7)

In [13]:
a = 0
for i in range(len(dataset_bob)):
    if dataset_bob[i]['label'] == 0: 
        a += 1
print("Number of Positive [{}] of [{}] recrords".format(a, len(dataset_bob)))

Number of Positive [64698] of [67132] recrords


In [14]:
# A function that sends the content of each split to a remote worker
def make_remote_dataset(dataset, worker):

    # Got through each example in the dataset
    for example in tqdm(dataset):
        
        # Send each review text
        example['review'] = example['review'].send(worker)

        # Send each label as a one-hot-enceded vector
        one_hot_label = torch.zeros(2).scatter(0, torch.Tensor([example['label']]).long(), 1)
        
        # Send the review label
        example['label'] = one_hot_label.send(worker)

In [15]:
# Bob's remote dataset
make_remote_dataset(train_bob, bob)
make_remote_dataset(val_bob,   bob)

# Alice's remote dataset
make_remote_dataset(train_alice, alice)
make_remote_dataset(val_alice,   alice)

100%|██████████| 46992/46992 [00:27<00:00, 1678.98it/s]
100%|██████████| 20140/20140 [00:13<00:00, 1519.72it/s]
100%|██████████| 46993/46993 [00:27<00:00, 1693.41it/s]
100%|██████████| 20140/20140 [00:11<00:00, 1696.70it/s]


In [16]:
# Take an element from the dataset
example = train_bob[10]

print(type(example['review']))
print(example['label'])

<class 'syft.generic.pointers.string_pointer.StringPointer'>
(Wrapper)>[PointerTensor | me:31632830984 -> bob:91479478368]


In [17]:
print(example['review'].location)
print(example['label'].location)

<VirtualWorker id:bob #objects:134264>
<VirtualWorker id:bob #objects:134264>


In [18]:
# Create a Language object with SyferText
nlp = syfertext.load('en_core_web_lg', owner = me)

In [19]:
nlp.pipeline_template

[{'remote': True, 'name': 'tokenizer'}]

In [20]:
use_stop_tagger = False
use_polarity_tagger = False

# Tokens with these custom tags
# will be excluded from creating
# the Doc vector
excluded_tokens = {}

In [21]:
class DatasetIMDB(Dataset):
    
    def __init__(self, sets, share_workers, crypto_provider, nlp):
        """Initialize the Dataset object
        
        Args:
            sets (list): A list containing all training OR 
                all validation sets to be used.
            share_workers (list): A list of workers that will
                be used to hold the SMPC shares.
            crypto_provider (worker): A worker that will 
                provide SMPC primitives for encryption.
            nlp: This is SyferText's Language object containing
                the preprocessing pipeline.
        """
        self.sets = sets
        self.crypto_provider = crypto_provider
        self.workers = share_workers
    
        # Create a single dataset unifying all datasets.
        # A property called `self.dataset` is created 
        # as a result of this call.
        self._create_dataset()
        
        # The language model
        self.nlp = nlp
        
    def __getitem__(self, index):
        """In this function, preprocessing with SyferText 
        of one review will be triggered. Encryption will also
        be performed and the encrypted vector will be obtained.
        The encrypted label will be computed too.
        
        Args:
            index (int): This is an integer received by the 
                PyTorch DataLoader. It specifies the index of
                the example to be fetched. This actually indexes
                one example in `self.dataset` which pools over
                examples of all the remote datasets.
        """
        
        # get the example
        example = self.dataset[index]
        
        # Run the preprocessing pipeline on 
        # the review text and get a DocPointer object
        doc_ptr = self.nlp(example['review'])
        
        # Get the encrypted vector embedding for the document
        vector_enc = doc_ptr.get_encrypted_vector(bob, 
                                                  alice, 
                                                  crypto_provider = self.crypto_provider,
                                                  requires_grad = True,
                                                  excluded_tokens = excluded_tokens
                                                 )
        

        # Encrypte the target label
        label_enc = example['label'].fix_precision().share(bob, 
                                                           alice, 
                                                           crypto_provider = self.crypto_provider,
                                                           requires_grad = True
                                                          ).get()


        return vector_enc, label_enc

    
    def __len__(self):
        """Returns the combined size of all of the 
        remote training/validation sets.
        """
        
        # The size of the combined datasets
        return len(self.dataset)

    def _create_dataset(self):
        """Create a single list unifying examples from all remote datasets
        """
        
        # Initialize the dataset
        self.dataset = []
      
        # populate the dataset list
        for dataset in self.sets:
            for example in dataset:
                self.dataset.append(example)
                
    @staticmethod
    def collate_fn(batch):
        """The collat_fn method to be used by the
        PyTorch data loader.
        """
        
        # Unzip the batch
        vectors, targets = list(zip(*batch))

        # concatenate the vectors
        vectors = torch.stack(vectors)
        
        #concatenate the labels
        targets = torch.stack(targets)
        
        return vectors, targets

In [22]:
# Instantiate a training Dataset object
trainset = DatasetIMDB(sets = [train_bob,
                               train_alice],
                       share_workers = [bob, alice],
                       crypto_provider = crypto_provider,
                       nlp = nlp
                      )

# Instantiate a validation Dataset object
valset = DatasetIMDB(sets = [val_bob,
                             val_alice],
                     share_workers = [bob, alice],
                     crypto_provider = crypto_provider,
                     nlp = nlp
                    )

In [23]:

# Set some hyper parameters
learning_rate = 0.001
batch_size = 32
epochs = 1

In [24]:
# Instantiate the DataLoader object for the training set
trainloader = DataLoader(trainset, shuffle = True,
                         batch_size = batch_size, num_workers = 0, 
                         collate_fn = trainset.collate_fn)


# Instantiate the DataLoader object for the validation set
valloader = DataLoader(valset, shuffle = True,
                       batch_size = batch_size, num_workers = 0, 
                       collate_fn = valset.collate_fn)

In [25]:
class Classifier(torch.nn.Module):
    def __init__(self, in_features, out_features):
        super(Classifier, self).__init__()        
        self.fc = torch.nn.Linear(in_features, out_features)
                
    def forward(self, x):
        logits = self.fc(x)
        probs = F.relu(logits)
        return probs, logits

In [26]:
# Create the classifer
classifier = Classifier(in_features = 300, out_features = 2)

# Apply SMPC encryption
classifier = classifier.fix_precision().share(bob, alice, 
                                              crypto_provider = crypto_provider,
                                              requires_grad = True
                                              )
print(classifier)

Classifier(
  (fc): Linear(in_features=300, out_features=2, bias=True)
)


In [27]:
optim = optim.SGD(params = classifier.parameters(),
                  lr = learning_rate)

optim = optim.fix_precision()

In [None]:
for epoch in range(epochs):
    
    for iter, (vectors, targets) in enumerate(trainloader):
        
        # Set train mode
        classifier.train()

        # Zero out previous gradients
        optim.zero_grad()

        # Predict sentiment probabilities
        probs, logits = classifier(vectors)

        # Compute loss and accuracy
        loss = ((probs -  targets)**2).sum()


        # Get the predicted labels
        preds = probs.argmax(dim=1)
        targets = targets.argmax(dim=1)
        
        # Compute the prediction accuracy
        accuracy = (preds == targets).sum()
        accuracy = accuracy.get().float_precision()
        accuracy = 100 * (accuracy / batch_size)
        
        # Backpropagate the loss
        loss.backward()

        # Update weights
        optim.step()

        # Decrypt the loss for logging
        loss = loss.get().float_precision()

        
        # Log to Tensorboard
        writer.add_scalar('train/loss', loss, epoch * len(trainloader) + iter )
        writer.add_scalar('train/acc', accuracy, epoch * len(trainloader) + iter )

        
        """ Perform validation on exactly one batch """
        
        # Set validation mode
        classifier.eval()

        for vectors, targets in valloader:
            
            probs, logits = classifier(vectors)

            loss = ((probs -  targets)**2).sum()

            preds = probs.argmax(dim=1)
            targets = targets.argmax(dim=1)

            accuracy = preds.eq(targets).sum()
            accuracy = accuracy.get().float_precision()
            accuracy = 100 * (accuracy / batch_size)

            loss = loss.get().float_precision()
            
            
            # Log to tensorboard
            writer.add_scalar('val/loss', loss, epoch * len(trainloader) + iter )
            writer.add_scalar('val/acc', accuracy, epoch * len(trainloader) + iter )
            
            break

            
writer.close()

In [None]:
# On bob's machine
[bob._objects[id] for id in bob._objects if  isinstance(bob._objects[id], syfertext.SubPipeline)]

In [None]:
# On Alices's machine
[alice._objects[id] for id in alice._objects if  isinstance(alice._objects[id], syfertext.SubPipeline)]