# Prototyping model2regex

In [1]:
import torch
from torch import nn, optim
from torch.distributions import Categorical
from random import Random
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from itertools import chain
import torch.nn.functional as F
import string

Let's start by making a simple DGA:

Rules:
* always ends in .net
* a is followed by random number of digits of length 10-20
* seed is day of the month
* the simple regex for this would be
  ```py
  r"a[0-9]{10,20}"
  ```

In [2]:
from dga import banjori, generate_dataset

domains = generate_dataset(banjori,'earnestnessbiophysicalohax.com', 2**16)

In [3]:
class Domains(Dataset):
    def __init__(self, data: list[str]):
        self.data = data
        self.chars = sorted(list(set(chain(*self.data))))
        self.vocabSize = len(self.chars) + 1
        self.max_size = len(max(self.data, key=lambda d: len(d)))
        self.char2ind = {ch : i for i,ch in enumerate(self.chars, start=1)}
        self.ind2char = {i : ch for i,ch in enumerate(self.chars, start=1)}

    def __len__(self):
        return len(self.data)

    def isEndChar(self, ind):
        return ind == 0
    def charTensor(self, _input):
        return torch.tensor([[self.char2ind[c] for c in _input]]).permute(1,0)
        
    def __getitem__(self, idx: int):
        item = torch.tensor([self.char2ind[c] for c in self.data[idx]])
        item = F.pad(item, (0,self.max_size - len(item)), "constant", 0)
        return (item, F.pad(item[1:], (0,1), "constant", 0))
        

In [4]:
dataset = Domains(domains)
for batch, (x,y) in enumerate(DataLoader(dataset, batch_size=64, shuffle=True)):
    print(batch)
    print("x:\n\t", x, x.shape)
    print("x.permute\n\t", x.permute(1,0))
    print("y:\n\t", y, y.shape)
    if batch > 5:
        break

0
x:
	 tensor([[ 3, 27, 24,  ...,  4, 16, 14],
        [18, 16, 16,  ...,  4, 16, 14],
        [24,  8,  6,  ...,  4, 16, 14],
        ...,
        [24, 18, 18,  ...,  4, 16, 14],
        [ 6,  6, 21,  ...,  4, 16, 14],
        [10, 20, 16,  ...,  4, 16, 14]]) torch.Size([64, 30])
x.permute
	 tensor([[ 3, 18, 24,  ..., 24,  6, 10],
        [27, 16,  8,  ..., 18,  6, 20],
        [24, 16,  6,  ..., 18, 21, 16],
        ...,
        [ 4,  4,  4,  ...,  4,  4,  4],
        [16, 16, 16,  ..., 16, 16, 16],
        [14, 14, 14,  ..., 14, 14, 14]])
y:
	 tensor([[27, 24, 22,  ..., 16, 14,  0],
        [16, 16,  6,  ..., 16, 14,  0],
        [ 8,  6,  2,  ..., 16, 14,  0],
        ...,
        [18, 18, 14,  ..., 16, 14,  0],
        [ 6, 21,  6,  ..., 16, 14,  0],
        [20, 16,  8,  ..., 16, 14,  0]]) torch.Size([64, 30])
1
x:
	 tensor([[21, 21, 25,  ...,  4, 16, 14],
        [22,  8, 17,  ...,  4, 16, 14],
        [17,  5, 15,  ...,  4, 16, 14],
        ...,
        [20, 18, 16,  ...,  4, 1

Next we want to use the example model from the talk.pdf page

In [5]:
from torch import nn
class Model(nn.Module):
    def __init__(self, vocabSize, emb, size, nlayers):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocabSize,
                                      embedding_dim=emb)
        self.rnn = nn.GRU(input_size=emb, hidden_size=size,
                          num_layers=nlayers)
        self.decoder = nn.Linear(size, vocabSize)
        
    def forward(self, input_seq, hidden_state):
        embedding = self.embedding(input_seq)
        output, hidden_state = self.rnn(embedding, hidden_state)
        output = self.decoder(output)
        return output, hidden_state.detach()

In [6]:
# Trying to understand how the embedding put out the permuted input
dataset = Domains(domains)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
(x,y) = next(iter(dataloader))
print(x.permute(1,0))
embedding = nn.Embedding(num_embeddings=dataset.vocabSize, embedding_dim=64)
emb = embedding(x.permute(1,0))
emb.shape


tensor([[ 7, 26, 18,  ...,  3, 15,  4],
        [13, 18, 26,  ...,  7, 27, 22],
        [ 2, 15, 20,  ...,  9, 12, 11],
        ...,
        [ 4,  4,  4,  ...,  4,  4,  4],
        [16, 16, 16,  ..., 16, 16, 16],
        [14, 14, 14,  ..., 14, 14, 14]])


torch.Size([30, 64, 64])

In [7]:
def predict(starter: str, model, dataset):
    chart = dataset.charTensor(starter)
    state = None
    for _ in range(100):
        if state is not None:
            state = state.to(device)
        output, state = model(chart.to(device), state)
        output = F.softmax(torch.squeeze(output[-1, :]), dim=0)
        dist = Categorical(output)
        index = dist.sample()
        ind = index.item()
        if dataset.isEndChar(ind):
            break
        starter += dataset.ind2char[ind]
        chart = torch.tensor([[ind]])
    return starter 


In [9]:
HIDDEN_SIZE = 128
NLAYERS = 1
EMBEDDING_DIM = 64
losses = []
def make_word_vector(domain):
    idxs = [c_to_ix[c] for c in domain]
    return torch.tensor(idxs, dtype=torch.long)

device = "cuda:0"
h0 = None
criterion = nn.CrossEntropyLoss(reduction="mean")
dataset = Domains(domains)
print(len(dataset.data))
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
print("vocabSize",len(dataset.chars))
model = Model(dataset.vocabSize,EMBEDDING_DIM, HIDDEN_SIZE, NLAYERS)
model.to(device)
model.train()
optimizer = optim.Adam(model.parameters(), lr=0.001)
for epoch in range(100):
    for batch, (x,y) in enumerate(dataloader):
        optimizer.zero_grad()
        if h0 is not None:
            h0 = h0.to(device)
        output, h0 = model(x.permute(1,0).to(device), h0)
        loss = criterion(output.permute(1,2,0), y.to(device))
        loss.backward()
        optimizer.step()
    print()
    print("----------------------------------------------")
    print({'epoch': epoch, 'batch': batch, 'loss': loss.item()})
    for char in string.ascii_lowercase:
        print(predict(char, model, dataset))
    print("----------------------------------------------")
    model.train()
predict('ca', model, dataset)

65536
vocabSize 27

----------------------------------------------
{'epoch': 0, 'batch': 4095, 'loss': 0.2963936924934387}
acom
bysicalohax.com
com
dhestnessbiophysicalohax.com
estnessbiophysicalohax.com
fbiophysicalohax.com
gestnessbiophysicalohax.com
hax.com
iosicalohax.com
jjestnessbiophysicalohax.com
kiophysicalohax.com
lbdhysicalohax.com
mosestnessbiophysicalohax.com
nx.com
om
physicalohax.com
qestnessbiophysicalohax.com
rysicalohax.com
sicalohax.com
tnestnessbiophysicalohax.com
uestnessbiophysicalohax.com
vzestnessbiophysicalohax.com
waestnessbiophysicalohax.com
x.com
ysicalohax.com
zjestnessbiophysicalohax.com
----------------------------------------------

----------------------------------------------
{'epoch': 1, 'batch': 4095, 'loss': 0.30134764313697815}
aiophysicalohax.com
bx.com
coestnessbiophysicalohax.com
dnestnessbiophysicalohax.com
estnessbiophysicalohax.com
fzestnessbiophysicalohax.com
gestnessbiophysicalohax.com
hjh.com
iom
jfestnessbiophysicalohax.com
ksicalohax.co

KeyboardInterrupt: 

In [None]:
predict('google.', model, dataset)