# Prototyping model2regex

In [2]:
import torch
from torch import nn, optim

Let's start by making a simple DGA:

Rules:
* always ends in .net
* a is followed by random number of digits of length 10-20
* seed is day of the month
* the simple regex for this would be
  ```py
  r"a[0-9]{10,20}"
  ```

In [3]:
from random import Random
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from itertools import chain
import torch.nn.functional as F
rand = Random(str(datetime.today().day))
def simple_dga():
    digits = '0123456789'
    return 'a' + ''.join(rand.choice(digits) for _ in range(rand.randint(10, 20))) + '.net'

domains = []
for i in range(1000):
    domains.append(simple_dga())
print(rand.choices(domains, k=5))

class Domains(Dataset):

    def __init__(self, size):
        self.data = [simple_dga() for _ in range(size)]
        self.chars = sorted(list(set(chain(*self.data))))
        self.max_size = len(max(self.data, key=lambda d: len(d)))
        self.char2ind = {ch : i for i,ch in enumerate(self.chars, start=1)}
        self.ind2char = {i : ch for i,ch in enumerate(self.chars, start=1)}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int):
        item = torch.tensor([self.char2ind[c] for c in self.data[idx]])
        item = F.pad(item, (0,self.max_size - len(item)), "constant", 0)
        return (item, item[1:])
        

['a015034044502969.net', 'a4940242808385971.net', 'a952678823010460941.net', 'a178329295236.net', 'a7012476869089974329.net']


In [4]:
dataset = Domains(10000)
for batch, (x,y) in enumerate(DataLoader(dataset, batch_size=64, shuffle=True)):
    print(batch)
    print("x:\n\t", x, x.shape)
    print("y:\n\t", y, y.shape)
    if batch > 5:
        break

0
x:
	 tensor([[12,  8,  5,  ..., 15,  0,  0],
        [12, 10, 10,  ..., 14, 13, 15],
        [12, 10,  5,  ...,  0,  0,  0],
        ...,
        [12,  5,  4,  ...,  0,  0,  0],
        [12,  4,  8,  ...,  0,  0,  0],
        [12,  4,  2,  ..., 15,  0,  0]]) torch.Size([64, 25])
y:
	 tensor([[ 8,  5,  9,  ..., 15,  0,  0],
        [10, 10,  9,  ..., 14, 13, 15],
        [10,  5,  7,  ...,  0,  0,  0],
        ...,
        [ 5,  4, 10,  ...,  0,  0,  0],
        [ 4,  8,  4,  ...,  0,  0,  0],
        [ 4,  2,  9,  ..., 15,  0,  0]]) torch.Size([64, 24])
1
x:
	 tensor([[12, 11, 11,  ...,  0,  0,  0],
        [12,  3, 11,  ..., 14, 13, 15],
        [12, 11,  6,  ...,  0,  0,  0],
        ...,
        [12,  8,  8,  ...,  0,  0,  0],
        [12, 11, 11,  ...,  0,  0,  0],
        [12,  8,  6,  ..., 14, 13, 15]]) torch.Size([64, 25])
y:
	 tensor([[11, 11,  5,  ...,  0,  0,  0],
        [ 3, 11,  4,  ..., 14, 13, 15],
        [11,  6, 10,  ...,  0,  0,  0],
        ...,
        [ 8,  8,  

Next we want to use the example model from the talk.pdf page

In [5]:
from torch import nn
class Model(nn.Module):
    def __init__(self, vocabSize, emb, size, nlayers):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocabSize,
                                      embedding_dim=emb)
        self.rnn = nn.GRU(input_size=emb, hidden_size=size,
                          num_layers=nlayers)
        self.decoder = nn.Linear(size, vocabSize)
        
    def forward(self, input_seq, hidden_state):
        embedding = self.embedding(input_seq)
        output, hidden_state = self.rnn(embedding, hidden_state)
        output = self.decoder(output)
        return output, hidden_state.detach()

In [6]:
set(domains[0])
vocab = set()
for domain in domains:
    vocab = vocab.union(set(domain))
c_to_ix = {c:i for i,c in enumerate(vocab)}
ix_to_c = {i:c for i,c in enumerate(vocab)}
c_to_ix, ix_to_c, vocab

({'.': 0,
  'a': 1,
  'n': 2,
  't': 3,
  '4': 4,
  '7': 5,
  '6': 6,
  '3': 7,
  '5': 8,
  '9': 9,
  '8': 10,
  'e': 11,
  '1': 12,
  '2': 13,
  '0': 14},
 {0: '.',
  1: 'a',
  2: 'n',
  3: 't',
  4: '4',
  5: '7',
  6: '6',
  7: '3',
  8: '5',
  9: '9',
  10: '8',
  11: 'e',
  12: '1',
  13: '2',
  14: '0'},
 {'.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'e', 'n', 't'})

In [8]:
HIDDEN_SIZE = 128
NLAYERS = 16
EMBEDDING_DIM = 64
losses = []
def make_word_vector(domain):
    idxs = [c_to_ix[c] for c in domain]
    return torch.tensor(idxs, dtype=torch.long)

h0 = torch.zeros((NLAYERS, HIDDEN_SIZE))
criterion = nn.CrossEntropyLoss(reduction="mean")
dataset = Domains(10000)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
model = Model(len(dataset.chars),EMBEDDING_DIM, HIDDEN_SIZE, NLAYERS)
optimizer = optim.SGD(model.parameters(), lr=0.001)
for epoch in range(10):
    total_loss = 0
    for batch, (x,y) in enumerate(dataloader):
        print(x, x.shape)
        optimizer.zero_grad()
        output, h0 = model(x, h0)
        loss = criterion(output.permute(1,2,0), y)
        loss.backward()
        optimizer.step()
    print()
    print("----------------------------------------------")
    print({'epoch': epoch, 'batch': batch, 'loss': loss.item()})
    print("----------------------------------------------")
    model.train()


tensor([[12,  7,  3,  ..., 15,  0,  0],
        [12,  6,  2,  ...,  0,  0,  0],
        [12,  7,  8,  ...,  0,  0,  0],
        ...,
        [12,  7, 11,  ..., 15,  0,  0],
        [12,  3,  7,  ..., 14, 13, 15],
        [12,  8,  7,  ...,  0,  0,  0]]) torch.Size([64, 25])


IndexError: index out of range in self

In [None]:
output.shape, h0.shape