# Prototyping model2regex

In [1]:
import torch
from torch import nn, optim
from torch.distributions import Categorical
from random import Random
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from itertools import chain
import torch.nn.functional as F
import string

Let's start by making a simple DGA:

Rules:
* ends in one of `.net`, `.com` or `.xyz`, chosen at random
* starts with `a`, followed by a random run of 10–20 digits
* the random generator is seeded with the day of the month
* a simple regex for the generated names would be
  ```py
  r"a[0-9]{10,20}\.(net|com|xyz)"
  ```

In [2]:
# Shared random generator for the samplers in this notebook. It is seeded with
# the current day of the month (as a string), so a fresh kernel started on the
# same day of the month replays the same sequence.
rand = Random(str(datetime.today().day))

def simple_dga():
    """Return one toy DGA domain: ``a`` + 10-20 random digits + a random TLD.

    Draws from the module-level ``rand`` in a fixed order: first the digit
    count, then each digit, and finally the TLD (one of ``.net``, ``.com``,
    ``.xyz``).
    """
    digit_count = rand.randint(10, 20)
    number = ''.join(rand.choice('0123456789') for _ in range(digit_count))
    tld = rand.choice(['.net', '.com','.xyz'])
    return 'a' + number + tld
    
def banjori(size: int):
    """Generate a chain of Banjori-style domains starting from a fixed seed.

    Each domain is derived from the previous one by rewriting its first four
    characters; every character after the fourth is carried over unchanged.

    Args:
        size: Number of successor domains to derive from the seed.

    Returns:
        A list of ``size + 1`` domains: the seed followed by its successors,
        in generation order.

    Side effects:
        Prints five domains drawn (with replacement) from the result using
        the module-level ``rand``.
    """
    base = ord('a')

    def wrap(code):
        # Fold an arbitrary code point back into the range 'a'..'z'.
        return base + (code - base) % 26

    def successor(current):
        codes = list(map(ord, current))
        # The four updates are order-dependent: each one reads values that the
        # previous lines have already overwritten.
        codes[0] = wrap(codes[0] + codes[3])
        codes[1] = wrap(codes[0] + 2 * codes[1])
        codes[2] = wrap(codes[0] + codes[2] - 1)
        codes[3] = wrap(codes[1] + codes[2] + codes[3])
        return ''.join(map(chr, codes))

    seed = 'earnestnessbiophysicalohax.com' # 15372 equal to 0 (seed = 0)
    domains = [seed]
    for _ in range(size):
        domains.append(successor(domains[-1]))
    print(rand.choices(domains, k=5))
    return domains
    
# Draw 1000 names from the toy generator and show five of them
# (sampled with replacement) as a quick sanity check.
domains = [simple_dga() for _ in range(1000)]
print(rand.choices(domains, k=5))

class Domains(Dataset):
    """Character-level dataset over the domains produced by ``banjori``.

    Characters are mapped to integer indices starting at 1; index 0 is
    reserved and serves both as padding and as the end-of-sequence marker.

    Attributes:
        data: The list of domain strings.
        chars: Sorted list of the distinct characters found in ``data``.
        vocabSize: ``len(chars) + 1`` (the extra slot is index 0).
        max_size: Length of the longest domain in ``data``.
        char2ind / ind2char: Character <-> index lookup tables (1-based).
    """

    def __init__(self, size):
        """Generate ``banjori(size)`` and build the vocabulary from it."""
        self.data = banjori(size)

        alphabet = set()
        for domain in self.data:
            alphabet.update(domain)
        self.chars = sorted(alphabet)

        self.vocabSize = len(self.chars) + 1
        self.max_size = max(map(len, self.data))
        self.char2ind = {ch: pos for pos, ch in enumerate(self.chars, 1)}
        self.ind2char = {pos: ch for ch, pos in self.char2ind.items()}

    def __len__(self):
        """Number of domains in the dataset."""
        return len(self.data)

    def isEndChar(self, ind):
        """Return True when ``ind`` is the reserved end/padding index 0."""
        return ind == 0

    def charTensor(self, _input):
        """Encode a string as a column tensor of shape ``(len(_input), 1)``.

        Raises ``KeyError`` for characters that are not in the vocabulary.
        """
        row = torch.tensor([[self.char2ind[ch] for ch in _input]])
        return row.transpose(0, 1)

    def __getitem__(self, idx: int):
        """Return ``(inputs, targets)`` for the domain at ``idx``.

        ``inputs`` is the encoded domain right-padded with 0 to ``max_size``;
        ``targets`` is ``inputs`` shifted left by one with a 0 appended, i.e.
        the next character at every position.
        """
        encoded = torch.tensor([self.char2ind[ch] for ch in self.data[idx]])
        missing = self.max_size - len(encoded)
        inputs = F.pad(encoded, (0, missing), "constant", 0)
        targets = F.pad(inputs[1:], (0, 1), "constant", 0)
        return (inputs, targets)
        

['a11545064651039533.xyz', 'a27766201860134774905.net', 'a68647553720780572834.net', 'a0469629534370158245.xyz', 'a4151826791415609.net']


In [3]:
# Dump the full Banjori list for 16*100 iterations (seed included, so 1601
# names); banjori() additionally prints five random samples of its own.
print(banjori(16*100))
# Build the dataset and look at the first few shuffled batches of 64.
dataset = Domains(10000)
for batch, (x,y) in enumerate(DataLoader(dataset, batch_size=64, shuffle=True)):
    print(batch)
    # x holds the encoded domains; y holds the same rows shifted left by one
    # position with a trailing 0 (see Domains.__getitem__).
    print("x:\n\t", x, x.shape)
    # Swap the two axes of x, which is the layout later fed to the model.
    print("x.permute\n\t", x.permute(1,0))
    print("y:\n\t", y, y.shape)
    # Stop once batch index 6 has been printed (seven batches in total).
    if batch > 5:
        break

['bjysestnessbiophysicalohax.com', 'esriestnessbiophysicalohax.com', 'tnxdestnessbiophysicalohax.com', 'iczbestnessbiophysicalohax.com', 'jtuzestnessbiophysicalohax.com']
['earnestnessbiophysicalohax.com', 'kwtoestnessbiophysicalohax.com', 'rvcxestnessbiophysicalohax.com', 'hjbtestnessbiophysicalohax.com', 'txmoestnessbiophysicalohax.com', 'agekestnessbiophysicalohax.com', 'dbzwestnessbiophysicalohax.com', 'sgjxestnessbiophysicalohax.com', 'igjyestnessbiophysicalohax.com', 'zxahestnessbiophysicalohax.com', 'zfrpestnessbiophysicalohax.com', 'hdquestnessbiophysicalohax.com', 'umcuestnessbiophysicalohax.com', 'hrbyestnessbiophysicalohax.com', 'ysrtestnessbiophysicalohax.com', 'kgteestnessbiophysicalohax.com', 'hfsnestnessbiophysicalohax.com', 'njxfestnessbiophysicalohax.com', 'lpagestnessbiophysicalohax.com', 'kacuestnessbiophysicalohax.com', 'xjrgestnessbiophysicalohax.com', 'wafxestnessbiophysicalohax.com', 'myjqestnessbiophysicalohax.com', 'vdwbestnessbiophysicalohax.com', 'phdxestness

Next, we want to use the example model from `talk.pdf`.

In [4]:
from torch import nn
class Model(nn.Module):
    """Sequence model: embedding -> GRU -> linear decoder over the vocabulary.

    Args:
        vocabSize: Number of distinct token indices (embedding rows and
            decoder outputs).
        emb: Embedding dimension.
        size: GRU hidden size.
        nlayers: Number of stacked GRU layers.
    """

    def __init__(self, vocabSize, emb, size, nlayers):
        super().__init__()
        # Creation order is kept as embedding, rnn, decoder so that seeded
        # parameter initialisation stays the same.
        self.embedding = nn.Embedding(num_embeddings=vocabSize, embedding_dim=emb)
        self.rnn = nn.GRU(input_size=emb, hidden_size=size, num_layers=nlayers)
        self.decoder = nn.Linear(size, vocabSize)

    def forward(self, input_seq, hidden_state):
        """Run one forward pass.

        Args:
            input_seq: Integer index tensor; the GRU is not batch-first, so
                the first axis is treated as the sequence axis.
            hidden_state: Initial GRU hidden state, or ``None``.

        Returns:
            ``(logits, hidden)`` where ``logits`` has one score per vocabulary
            entry for every input position and ``hidden`` is the final GRU
            state, detached from the autograd graph.
        """
        recurrent_out, final_state = self.rnn(self.embedding(input_seq), hidden_state)
        logits = self.decoder(recurrent_out)
        return logits, final_state.detach()

In [5]:
# Exploration: check what shape an embedding layer produces when it is fed a
# batch whose two axes have been swapped with permute(1, 0).
dataset = Domains(10000)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
# Take a single shuffled batch of 64 (inputs, targets) pairs.
(x,y) = next(iter(dataloader))
print(x.permute(1,0))
# Stand-alone embedding with one row per vocabulary index and 64 features.
embedding = nn.Embedding(num_embeddings=dataset.vocabSize, embedding_dim=64)
emb = embedding(x.permute(1,0))
# The embedding appends a feature axis of size 64 to the permuted index tensor.
emb.shape


['qunhestnessbiophysicalohax.com', 'pzylestnessbiophysicalohax.com', 'thxwestnessbiophysicalohax.com', 'hdlnestnessbiophysicalohax.com', 'catdestnessbiophysicalohax.com']
tensor([[12, 21, 16,  ..., 20, 23, 14],
        [ 2,  7, 20,  ...,  4, 23, 22],
        [17,  8, 27,  ..., 22, 14,  5],
        ...,
        [ 4,  4,  4,  ...,  4,  4,  4],
        [16, 16, 16,  ..., 16, 16, 16],
        [14, 14, 14,  ..., 14, 14, 14]])


torch.Size([30, 64, 64])

In [6]:
def predict(starter: str, model, dataset, max_len: int = 100):
    """Sample a completion of ``starter`` from ``model``, one character at a time.

    The whole prefix is fed first; afterwards only the most recently sampled
    character is fed, together with the hidden state returned by the model.
    Sampling stops when the end index is drawn (``dataset.isEndChar``) or
    after ``max_len`` characters have been requested.

    Args:
        starter: Non-empty prefix; every character must be in the dataset
            vocabulary (``dataset.charTensor`` raises ``KeyError`` otherwise).
        model: Callable ``model(input, state) -> (logits, state)`` whose
            logits have the sequence position on the first axis.
        dataset: Provides ``charTensor``, ``isEndChar`` and ``ind2char``.
        max_len: Upper bound on the number of sampling steps.

    Returns:
        ``starter`` extended with the sampled characters.

    Raises:
        ValueError: If ``starter`` is empty.
    """
    if not starter:
        raise ValueError("starter must contain at least one character")

    # Run on whatever device the model lives on instead of relying on a
    # notebook-global `device` that may not exist yet (or may not match).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Sample in eval mode and hand the model back in the mode we found it in.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for sampling; skip building the graph.
        with torch.no_grad():
            chart = dataset.charTensor(starter)
            state = None
            for _ in range(max_len):
                output, state = model(chart.to(device), state)
                # Distribution over the vocabulary for the last position only.
                probs = F.softmax(torch.squeeze(output[-1, :]), dim=0)
                ind = Categorical(probs).sample().item()
                if dataset.isEndChar(ind):
                    break
                starter += dataset.ind2char[ind]
                # Next step: feed just the sampled character, shape (1, 1).
                chart = torch.tensor([[ind]])
    finally:
        model.train(was_training)
    return starter


In [7]:
# Hyperparameters handed to Model(...) further down in this cell.
HIDDEN_SIZE = 128  # GRU hidden size
NLAYERS = 1  # number of stacked GRU layers
EMBEDDING_DIM = 64  # width of each character embedding
# NOTE(review): nothing in the visible cells appends to this list — confirm
# whether it is still needed.
losses = []
def make_word_vector(domain):
    """Encode ``domain`` as a 1-D ``torch.long`` tensor of character indices.

    Each character is looked up in the global mapping ``c_to_ix``.

    NOTE(review): ``c_to_ix`` is not defined in the visible part of this
    notebook and this helper is not called anywhere visible; with a non-empty
    ``domain`` it raises ``NameError`` unless that mapping exists — confirm
    whether it should use ``Domains.char2ind`` or be removed.
    """
    indices = []
    for ch in domain:
        indices.append(c_to_ix[ch])
    return torch.tensor(indices, dtype=torch.long)

# Use the first GPU when one is available; otherwise fall back to the CPU so
# the cell still runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Initial hidden state handed to the model for every batch. None lets the GRU
# start from its default all-zero state, which is also how predict() starts.
h0 = None
criterion = nn.CrossEntropyLoss(reduction="mean")
# banjori(n) returns n + 1 names (the seed is included), so this is exactly
# 16 * 1000 samples, i.e. 1000 full batches of 16.
dataset = Domains(16*1000-1)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
print("vocabSize",len(dataset.chars))
model = Model(dataset.vocabSize,EMBEDDING_DIM, HIDDEN_SIZE, NLAYERS)
model.to(device)
model.train()
optimizer = optim.Adam(model.parameters(), lr=0.001)
for epoch in range(100):
    for batch, (x,y) in enumerate(dataloader):
        optimizer.zero_grad()
        # Every domain is an independent sequence and the batches are
        # shuffled, so the final state of one batch must not seed the next:
        # that would leak state between unrelated samples and would also fail
        # on a batch of a different size. Always start from h0 (None).
        # x is (batch, seq); the model expects the sequence axis first.
        output, final_state = model(x.permute(1,0).to(device), h0)
        # CrossEntropyLoss wants the class axis second: (batch, vocab, seq)
        # against targets of shape (batch, seq).
        loss = criterion(output.permute(1,2,0), y.to(device))
        loss.backward()
        optimizer.step()
    print()
    print("----------------------------------------------")
    # Reports the loss of the last batch of the epoch only.
    print({'epoch': epoch, 'batch': batch, 'loss': loss.item()})
    # Sample one completion per lowercase starting letter as a progress check.
    for char in string.ascii_lowercase:
        print(predict(char, model, dataset))
    print("----------------------------------------------")
    model.train()
predict('ca', model, dataset)

['qooeestnessbiophysicalohax.com', 'jjfoestnessbiophysicalohax.com', 'flduestnessbiophysicalohax.com', 'cqjmestnessbiophysicalohax.com', 'mwaiestnessbiophysicalohax.com']
vocabSize 27

----------------------------------------------
{'epoch': 0, 'batch': 999, 'loss': 0.31919538974761963}
alohax.com
byohax.com
coaxuestnessbiophysicalohax.com
dsiophysicalohax.com
estnessbiophysicalohax.com
fyestnessbiophysicalohax.com
gqysicalohax.com
haestnessbiophysicalohax.com
iicalohax.com
jcom
kstnessbiophysicalohax.com
lohax.com
mkgyestnessbiophysicalohax.com
nastnessbiophysicalohax.com
ophysicalohax.com
physicalohax.com
qqestnessbiophysicalohax.com
rnestnessbiophysicalohax.com
sxnessbiophysicalohax.com
tnestnessbiophysicalohax.com
ufhestnessbiophysicalohax.com
vfhestnessbiophysicalohax.com
wax.com
xnestnessbiophysicalohax.com
ytnessbiophysicalohax.com
zfhbssbiophysicalohax.com
----------------------------------------------

----------------------------------------------
{'epoch': 1, 'batch': 999, '

'camestnessbiophysicalohax.com'

In [27]:
# Ask the trained model to continue a hand-picked prefix. predict() samples
# each character, so the completion can differ from run to run.
predict('google.', model, dataset)

'google.netsbiophysicalohax.com'