# Prototyping model2regex

In [52]:
# Install the notebook's dependencies from requirements.txt.
# `sys.executable` is the interpreter running this kernel, so `-m pip` installs
# into that interpreter's environment rather than whichever `pip` is first on PATH.
import sys
!{sys.executable} -m pip install -r requirements.txt




[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import torch
from torch import nn, optim
from torch.distributions import Categorical
from random import Random
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from itertools import chain
import torch.nn.functional as F
import string

We are going to try to learn a simple DGA (domain generation algorithm) called banjori.

First, we generate a dataset of domains.

In [2]:
from dga import banjori, generate_dataset
# Generate the training domains with the banjori DGA.
# NOTE(review): the string looks like the seed domain and 2**16 like the number
# of domains to produce -- confirm against dga.generate_dataset.
domains: list[str] = generate_dataset(
    banjori,
    'earnestnessbiophysicalohax.com',
    2**16,
)

We are going to define a PyTorch [Dataset](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html) to help us with batching and shuffling the data.

In [3]:
class Domains(Dataset):
    """Character-level dataset over a list of domain names.

    Every distinct character found in ``data`` is mapped to an integer index
    starting at 1; index 0 is reserved as the padding / "end char" marker.
    Each item is an ``(input, target)`` pair of index tensors of length
    ``max_size``, where ``target`` is ``input`` shifted left by one position.

    Args:
        data: the domain names to encode.
    """

    def __init__(self, data: list[str]):
        self.data = data
        self.chars = sorted(set(chain.from_iterable(self.data)))
        # +1 accounts for index 0, the padding / end-of-domain marker.
        self.vocabSize = len(self.chars) + 1
        # default=0 keeps an empty dataset constructible instead of raising ValueError.
        self.max_size = max(map(len, self.data), default=0)
        self.char2ind = {ch: i for i, ch in enumerate(self.chars, start=1)}
        self.ind2char = {i: ch for ch, i in self.char2ind.items()}

    def __len__(self):
        """Return the number of domains in the dataset."""
        return len(self.data)

    def isEndChar(self, ind):
        """Return True when ``ind`` is the reserved padding / end-of-domain index (0)."""
        return ind == 0

    def charTensor(self, _input):
        """Encode ``_input`` as a ``(len(_input), 1)`` tensor of character indices.

        Raises:
            KeyError: if ``_input`` contains a character absent from the dataset.
        """
        indices = [self.char2ind[c] for c in _input]
        # Explicit dtype: torch.tensor() would infer float32 for an empty string,
        # and a float tensor cannot be used as embedding indices.
        return torch.tensor([indices], dtype=torch.long).permute(1, 0)

    def __getitem__(self, idx: int):
        """Return the ``(input, target)`` index tensors for the domain at ``idx``."""
        # Explicit dtype for the same reason as in charTensor (empty domains).
        item = torch.tensor([self.char2ind[c] for c in self.data[idx]], dtype=torch.long)
        # All tensors must share one size, so shorter domains are right-padded
        # with 0, which doubles as the "end char".
        item = F.pad(item, (0, self.max_size - len(item)), "constant", 0)
        # The target is the input shifted by one: position i predicts character i+1.
        return (item, F.pad(item[1:], (0, 1), "constant", 0))
        

In [17]:
# Build the dataset and pull a single shuffled batch to sanity-check tensor shapes.
dataset = Domains(domains)
x, y = next(iter(DataLoader(dataset, shuffle=True, batch_size=64)))
for label, tensor in (("x:\n\t", x), ("y:\n\t", y)):
    print(label, tensor.shape)
# First raw (un-encoded) domain, for reference.
print(dataset.data[0])

x:
	 torch.Size([64, 30])
y:
	 torch.Size([64, 30])
kwtoestnessbiophysicalohax.com


With the dataset in place, we define a model that learns the structure of our DGA.

In [5]:
from torch import nn
class Model(nn.Module):
    """Character-level language model: embedding -> GRU -> linear decoder.

    Args:
        vocabSize: number of distinct token indices (also the output width).
        emb: embedding dimension.
        size: GRU hidden size.
        nlayers: number of stacked GRU layers.
    """

    def __init__(self, vocabSize, emb, size, nlayers):
        super().__init__()
        self.embedding = nn.Embedding(vocabSize, emb)
        # batch_first is not set, so the GRU consumes (seq_len, batch, emb) input.
        self.rnn = nn.GRU(emb, size, nlayers)
        self.decoder = nn.Linear(size, vocabSize)

    def forward(self, input_seq, hidden_state):
        """Return per-position vocabulary logits and the detached final hidden state.

        Args:
            input_seq: integer index tensor fed to the embedding layer.
            hidden_state: initial GRU hidden state, or None.
        """
        embedded = self.embedding(input_seq)
        rnn_out, next_hidden = self.rnn(embedded, hidden_state)
        logits = self.decoder(rnn_out)
        # The hidden state is detached so gradients do not flow through it
        # if the caller feeds it back in.
        return logits, next_hidden.detach()

In [6]:
# Inspect what the embedding layer produces for a permuted, sequence-first batch
dataset = Domains(domains)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
x, y = next(iter(dataloader))
x_seq_first = x.permute(1, 0)  # swap the two axes: (batch, seq) -> (seq, batch)
print(x_seq_first)
embedding = nn.Embedding(dataset.vocabSize, 64)
emb = embedding(x_seq_first)
emb.shape


tensor([[19, 27, 11,  ..., 14, 19,  9],
        [ 7,  3, 11,  ...,  2, 27,  5],
        [21,  6,  9,  ..., 26, 13,  2],
        ...,
        [ 4,  4,  4,  ...,  4,  4,  4],
        [16, 16, 16,  ..., 16, 16, 16],
        [14, 14, 14,  ..., 14, 14, 14]])


torch.Size([30, 64, 64])

In [14]:
class Trainer:
    """Train a character-level ``Model`` on a ``Domains`` dataset and sample from it.

    Args:
        dataset: the encoded domain dataset to learn from.

    Keyword Args:
        hidden_size: GRU hidden size (default 128).
        num_layers: number of stacked GRU layers (default 1).
        embed_dim: character embedding dimension (default 64).
        device: torch device to run on; defaults to ``'cuda:0'`` when CUDA is
            available and to ``'cpu'`` otherwise.
    """

    def __init__(self, dataset: Domains, **kwargs):
        self.dataset = dataset
        self.hidden_size = kwargs.get('hidden_size', 128)
        self.num_layers = kwargs.get('num_layers', 1)
        self.embed_dim = kwargs.get('embed_dim', 64)
        # Fall back to the CPU so the notebook also runs on machines without a GPU.
        default_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.device = kwargs.get('device', default_device)
        self.model = Model(dataset.vocabSize, self.embed_dim, self.hidden_size, self.num_layers)
        # Move the model right away so predict() also works before train() was called.
        self.model.to(self.device)

    def predict(self, starter: str, max_len: int = 100):
        """Sample a domain from the model, starting with the characters in ``starter``.

        Characters are drawn from the model's predicted distribution until the
        end char (index 0) is sampled or ``max_len`` characters were generated.

        Args:
            starter: non-empty prefix; every character must be in the dataset vocabulary.
            max_len: maximum number of characters to generate.

        Returns:
            ``starter`` followed by the sampled characters.

        Raises:
            KeyError: if ``starter`` contains a character unknown to the dataset.
        """
        model = self.model
        device = self.device
        chart = self.dataset.charTensor(starter)
        state = None
        was_training = model.training
        model.eval()
        try:
            # Sampling needs no gradients; no_grad avoids building the autograd graph.
            with torch.no_grad():
                for _ in range(max_len):
                    output, state = model(chart.to(device), state)
                    # Distribution over the next character: last time step, single batch entry.
                    probs = F.softmax(torch.squeeze(output[-1, :]), dim=0)
                    ind = Categorical(probs).sample().item()
                    # Use this trainer's own dataset, not whatever global `dataset` exists.
                    if self.dataset.isEndChar(ind):
                        break
                    starter += self.dataset.ind2char[ind]
                    # From now on only the newly sampled character is fed in; the
                    # GRU state carries the context.
                    chart = torch.tensor([[ind]])
        finally:
            # Restore the mode the model was in, so sampling during training is transparent.
            model.train(was_training)
        return starter

    def train(self, epochs: int = 100, batch_size: int = 16, lr: float = 0.001):
        """Train the model with Adam and print samples after every epoch.

        Args:
            epochs: number of passes over the dataset.
            batch_size: number of domains per batch.
            lr: Adam learning rate.
        """
        criterion = nn.CrossEntropyLoss(reduction="mean")
        device = self.device
        dataloader = DataLoader(self.dataset, batch_size=batch_size, shuffle=True)
        model = self.model
        model.to(device)
        model.train()
        optimizer = optim.Adam(model.parameters(), lr=lr)
        for epoch in range(epochs):
            for batch, (x, y) in enumerate(dataloader):
                optimizer.zero_grad()
                # Every domain is an independent sequence and batches are shuffled,
                # so each batch starts from a fresh (zero) hidden state. Re-using the
                # previous batch's state would mix unrelated domains, break on a
                # smaller final batch, and differ from predict(), which starts at None.
                output, _ = model(x.permute(1, 0).to(device), None)
                # CrossEntropyLoss expects (batch, classes, seq) against (batch, seq) targets.
                loss = criterion(output.permute(1, 2, 0), y.to(device))
                loss.backward()
                optimizer.step()
            print()
            print("----------------------------------------------")
            print({'epoch': epoch, 'batch': batch, 'loss': loss.item()})
            for char in string.ascii_lowercase:
                # Letters that never occur in the dataset cannot be encoded; skip them.
                if char in self.dataset.char2ind:
                    print(self.predict(char))
            print("----------------------------------------------")


In [18]:
# Model hyper-parameters
EMBEDDING_DIM = 64
HIDDEN_SIZE = 128
NLAYERS = 1

trainer = Trainer(
    dataset,
    embed_dim=EMBEDDING_DIM,
    hidden_size=HIDDEN_SIZE,
    num_layers=NLAYERS,
)
trainer.train()


----------------------------------------------
{'epoch': 0, 'batch': 4095, 'loss': 0.3035871386528015}
ax.com
bjohesicalohax.com
com
dtestnessbiophysicalohax.com
estnessbiophysicalohax.com
fphysicalohax.com
gcalohax.com
hysicalohax.com
iqcalohax.com
jfdestnessbiophysicalohax.com
kcalohax.com
lcaysicalohax.com
micoestnessbiophysicalohax.com
nphysicalohax.com
ophysicalohax.com
pnestnessbiophysicalohax.com
qusicalohax.com
rx.coestnessbiophysicalohax.com
sicalohax.com
tnestnessbiophysicalohax.com
uustnessbiophysicalohax.com
vqestnessbiophysicalohax.com
wnestnessbiophysicalohax.com
xfwestnessbiophysicalohax.com
ysicalohax.com
zdhestnessbiophysicalohax.com
----------------------------------------------

----------------------------------------------
{'epoch': 1, 'batch': 4095, 'loss': 0.3004227578639984}
ax.com
bohax.com
caestnessbiophysicalohax.com
dlestnessbiophysicalohax.com
essbiophysicalohax.com
fzestnessbiophysicalohax.com
gwstnessbiophysicalohax.com
hlxhysicalohax.com
ioestnessbiophy

In [42]:
class DGAClassifier(nn.Module):
    """Sequence classifier: embedding -> GRU -> linear layer producing two logits.

    Args:
        vocabSize: number of distinct token indices.
        emb: embedding dimension.
        size: GRU hidden size.
        nlayers: number of stacked GRU layers.
    """

    def __init__(self, vocabSize, emb, size, nlayers):
        super().__init__()
        self.embedding = nn.Embedding(vocabSize, emb)
        # batch_first is not set, so the GRU consumes (seq_len, batch, emb) input.
        self.rnn = nn.GRU(emb, size, nlayers)
        self.linear = nn.Linear(size, 2)

    def forward(self, input_seq, hidden_state):
        """Return one pair of logits per sequence in the batch.

        Args:
            input_seq: integer index tensor fed to the embedding layer.
            hidden_state: initial GRU hidden state, or None.
        """
        embedded = self.embedding(input_seq)
        # Only the final hidden state is needed; the per-step outputs are dropped.
        _, final_hidden = self.rnn(embedded, hidden_state)
        # final_hidden[-1] is the last GRU layer's state.
        return self.linear(final_hidden[-1])

# Smoke test: run a single encoded domain through an untrained classifier.
model = DGAClassifier(dataset.vocabSize, EMBEDDING_DIM, HIDDEN_SIZE, NLAYERS)
out = model(dataset.charTensor(domains[0]), None)
# A bare last expression is needed for the notebook to display the logits;
# an assignment alone produces no cell output.
out


tensor([[-0.0572,  0.0613]], grad_fn=<AddmmBackward0>)

In [61]:
import pandas as pd
from pathlib import Path
# Load the top-1m domain list from a CSV file located relative to the working directory.
# NOTE(review): read_csv treats the first row as a header by default -- confirm the
# file really has one, otherwise pass header=None.
top1m = pd.read_csv(Path('data/top-1m.csv'))
# Second column of the frame as a NumPy array. This is not the cell's last
# expression, so the value is neither displayed nor stored.
# NOTE(review): presumably the domain-name column (object dtype); such an array
# cannot be handed to torch.tensor directly -- TODO confirm.
top1m.values[:,1]
class DomainDGA(Dataset)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.