In [1]:
import torch
from torch import nn, optim
from torch.distributions import Categorical
import random
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from itertools import chain
import torch.nn.functional as F
import string
import os
from typing import Sequence, Tuple
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
class DGAClassifier(nn.Module):
    def __init__(self, vocabSize, emb, size, nlayers):
        super(DGAClassifier, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocabSize,
                                      embedding_dim=emb)
        self.rnn = nn.GRU(input_size=emb, hidden_size=size,
                          num_layers=nlayers)
        self.out = nn.Linear(in_features=size, out_features=1)
        self.drop = nn.Dropout(0.3)
        self.sig = nn.Sigmoid()
        
    def forward(self, input_seq, hidden_state):
        embedding = self.embedding(input_seq)
        output, hidden_state = self.rnn(embedding, hidden_state)
        x = hidden_state[-1, :]
        x = self.drop(x)
        x = self.out(x)
        x = self.sig(x)
        return x, hidden_state.detach()


In [3]:
class DomainsAndDGA(Dataset):
    def __init__(self, domains: Sequence[Tuple[str, int]]):
        self.data = domains
        self.max_size = len(max(self.data, key=lambda d: len(d[0]))[0])
        self.chars = sorted(list(set(chain(*[d[0] for d in self.data]))))
        self.vocabSize = len(self.chars) + 1
        self.char2ind = {ch : i for i,ch in enumerate(self.chars, start=1)}
        self.ind2char = {i : ch for i,ch in enumerate(self.chars, start=1)}

    def __len__(self):
        return len(self.data)

    def isEndChar(self, ind):
        return ind == 0
    def charTensor(self, _input):
        return torch.tensor([[self.char2ind[c] for c in _input]]).permute(1,0)
        
    def __getitem__(self, idx: int):
        item, label = self.data[idx]
        item = torch.tensor([self.char2ind[c] for c in item])
        # we need tensors of same size, so if any domain has a different size we then pad it with 0 which will be our "end char"
        item = F.pad(item, (0,self.max_size - len(item)), "constant", 0)
        return (item, torch.tensor(label, dtype=torch.float))

In [4]:

def predict(self, starter: str):
    chart = self.dataset.charTensor(starter)
    state = None
    device = self.device
    for _ in range(100):
        if state is not None:
            state = state.to(device)
        output, state = self.model(chart.to(device), state)
        output = F.softmax(torch.squeeze(output[-1, :]), dim=0)
        dist = Categorical(output)
        index = dist.sample()
        ind = index.item()
        if dataset.isEndChar(ind):
            break
        starter += self.dataset.ind2char[ind]
        chart = torch.tensor([[ind]])
    return starter 

def train(*, dataset: DomainsAndDGA, dataloader: DataLoader, device: str, model: DGAClassifier, epochs: int=100):
    criterion = nn.CrossEntropyLoss()
    model.to(device)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    h0 = None
    
    for epoch in range(epochs):
        print("epoch: ", epoch)
        for batch, (x,y) in enumerate(dataloader):
            optimizer.zero_grad()
            if h0 is not None:
                h0 = h0.to(device)
            output, h0 = model(x.permute(1,0).to(device), h0)
            if batch % 500 == 0:
                idx = random.randint(0, len(x)-1)
                print("----------------------------------------------------")
                print(f"showing one prediction for random sample of batch: {batch:,}")
                print("inputstr:\t", ''.join(dataset.ind2char[c] for c in x[idx].tolist() if c != 0), " label: ", y[idx].tolist())
                print("output:\t", output[idx].round().tolist())
                print("----------------------------------------------------")
            loss = criterion(output.squeeze(), y.to(device))
            loss.backward()
            optimizer.step()
            if batch % 500 == 0:
                print(f"loss at batch {batch:,}: {loss.item()}")
            
        model.train()
    return model, h0

In [5]:
import pandas as pd
from pathlib import Path
from dga import banjori, generate_dataset
from random import shuffle
from torch.utils.data import SubsetRandomSampler
from sklearn.model_selection import KFold
top1m = pd.read_csv(Path('data/top-1m.csv'))
real_domains = top1m.values[:,1]
real_domains = list(tuple(zip(real_domains, (1 for _ in range(len(real_domains))))))
dga_domains = generate_dataset(algorithm=banjori, seed='earnestnessbiophysicalohax.com', size=len(real_domains))
dga_domains = list(tuple(zip(dga_domains, (0 for _ in range(len(dga_domains))))))

dataset = list(chain(real_domains, dga_domains))
dataset = DomainsAndDGA(dataset)
kfold = KFold(n_splits=5, shuffle=True)
hidden_size = 128
num_layers = 1
embed_dim =64
device ='cuda:0'

for fold, (train_dataset, test_dataset) in enumerate(kfold.split(dataset)):
    model = DGAClassifier(dataset.vocabSize, embed_dim, hidden_size, num_layers)
    train_sampler = SubsetRandomSampler(train_dataset)
    test_sampler = SubsetRandomSampler(test_dataset)
    trainloader = DataLoader(dataset, batch_size=500, sampler=train_sampler, drop_last=True)
    testloader = DataLoader(dataset, batch_size=500, sampler=test_sampler, drop_last=True)
    model, h0 = train(dataset=dataset, dataloader=trainloader, device=device, model=model, epochs=10)
    save_path = Path(f'./model-fold-{fold}.pth')
    torch.save(model.state_dict(), save_path)
    correct, total = 0, 0
    print("------------------------------------")
    print("verifying via test dataset")
    with torch.no_grad():
        for batch, (x,y) in enumerate(testloader):
            output, h0 = model(x.permute(1,0).to(device), h0)
            total += y.size(0)
            correct += (output.permute(1,0).round() == y.to(device)).sum().item()
            print("total: ", total, " correct: ", correct)
    print(f'Accuracy for fold {fold}: {correct / total:%}')
    print('---------------------------------------------------')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


epoch:  0
----------------------------------------------------
showing one prediction for random sample of batch: 0
inputstr:	 guttestnessbiophysicalohax.com  label:  0.0
output:	 [1.0]
----------------------------------------------------
loss at batch 0: 1609.8421630859375
----------------------------------------------------
showing one prediction for random sample of batch: 500
inputstr:	 djbjestnessbiophysicalohax.com  label:  0.0
output:	 [0.0]
----------------------------------------------------
loss at batch 500: 1462.440673828125
----------------------------------------------------
showing one prediction for random sample of batch: 1,000
inputstr:	 assets.cdnbf.net  label:  1.0
output:	 [1.0]
----------------------------------------------------
loss at batch 1,000: 1619.537109375
----------------------------------------------------
showing one prediction for random sample of batch: 1,500
inputstr:	 zrhpestnessbiophysicalohax.com  label:  0.0
output:	 [0.0]
----------------------