#### Learned Bloom Filter - Malicious URLs Dataset 

In [1]:
import random
import sympy as sp
import math
from tqdm import tqdm
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import numpy as np
import psutil
import xxhash
import bitarray
from pympler import asizeof

random.seed(123)

In [2]:
import csv

#file_path = 'datasets/URLs_Singh/Webpages_Classification_train_data.csv/Webpages_Classification_train_data.csv'
file_path = 'datasets/AdaBF_URL_data.csv'

# load only 'url' and 'label' columns
columns_to_load = [0,1] #[1, 11]

# Initialize empty lists to hold the data of the two columns
urls = []
labels = []

# Open the CSV file and read line by line
with open(file_path, 'r') as csvfile:
    reader = csv.reader(csvfile)
    next(reader) # skip header
    for row in reader:
        # Append the data from the specified columns to the lists
        urls.append(row[columns_to_load[0]])
        labels.append(row[columns_to_load[1]])

# separate the data into good and bad URLs
benign_urls = [url for url, label in zip(urls, labels) if label == '-1']
malicious_urls = [url for url, label in zip(urls, labels) if label == '1']

print(f"Number of good URLs: {len(benign_urls)}, Number of bad URLs: {len(malicious_urls)}")


Number of good URLs: 344799, Number of bad URLs: 66385


In [3]:
# show some example instances
print(f"Bening URLs:")
for i in range(1,5):
    print(f"{benign_urls[i]}")

print(f"\nMalicious URLs:")
for i in range(1,5):
    print(f"{malicious_urls[i]}")    

Bening URLs:
monsterzine.com/200301/kingkong.php
budiz.com/socialnetworks/arts/drawing/fashion-illustration-social-network
disney.go.com/home/html/index.html
waatp.com/people/erin-everly/658742/

Malicious URLs:
congressomossoroense.com.br/nophfjkgjfshgjdfhjfhkj/
oweridreamsact.com.ng/_vti_009/serverphp/cp.php?m=login
ranhadinhen.ru/gate.php
sie-liebt-mich.de/yuilk/djchsj/03a7028413fb70343d6344476ad6a5b1/


In [4]:
class TwoUniversalHashFamily:
    def __init__(self, m, max_key):
        self.m = m  # Size of the hash table
        self.p = sp.nextprime(max_key)  # generate a large prime number, greater than any key
        self.a = random.randint(1, self.p-1)  # Choose a randomly
        self.b = random.randint(0, self.p-1)  # Choose b randomly

    def hash(self, k):
        return ((self.a * k + self.b) % self.p) % self.m
    
    def __call__(self, k):
        return self.hash(k)


class BloomStandard:
    def __init__(self, S, m, k=None):
        self.m = m
        if k is None:
            self.k = max(1,round((m/len(S)) * math.log(2)))  # optimal number of hash functions for a given m and n 
            print(f"Optimal number of hash functions: {self.k}")      
        else:
            self.k = k

        self.B = bitarray.bitarray(m)
        self.B.setall(0)  # initialize all bits to 0

        # construct bit array
        for key in S:
            self.insert(key)

        print(f"Bloom filter constructed! Size: {self.m}, Number of hash functions: {self.k}")


    # insert new integer key into the bloom filter 
    def insert(self, key):
        for i in range(self.k):
            hash_val = xxhash.xxh3_64(key.encode('utf-8'), seed=i).intdigest()%self.m 
            self.B[hash_val] = 1

    # poerform membership query for the given key
    def query(self, key):
        q = [0]*self.k
        for i in range(self.k):
            hash_val = xxhash.xxh3_64(key.encode('utf-8'), seed=i).intdigest()%self.m 
            q[i] = self.B[hash_val]
        if 0 in q:
            return False
        else:
            return True    
            
    def __str__(self):
        return str(self.B)


# define pytroch dataset class for training
class URLDataset(torch.utils.data.Dataset):
    def __init__(self, urls, labels):
        self.samples = list(zip(urls, labels))
        random.shuffle(self.samples)
        self.label2idx = {'malicious': 1, 'benign': 0}
        # character vocabulary
        self.vocab = ['<PAD>'] + list(set(''.join(urls))) 
        self.char2idx = {char: idx for idx, char in enumerate(self.vocab)}
        self.idx2char = {idx: char for char, idx in self.char2idx.items()}
        self.vocab_size = len(self.vocab)
        print(f"Vocabulary size: {self.vocab_size}")

    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        url, label = self.samples[idx]
        # tokenize the url
        x = [self.char2idx[char] for char in url]
        y = self.label2idx[label]
        return x, y


def custom_collate_fn(batch):
    # Unzip the batch to separate sequences and labels
    sequences, labels = zip(*batch)
    # Convert sequences to tensors and pad
    sequences = [torch.tensor(sequence, dtype=torch.long) for sequence in sequences]
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)  # padding index is 0
    # Convert labels to a tensor
    labels = torch.tensor(labels, dtype=torch.long)
    
    return padded_sequences, labels


# define stacked RNN classifier oracle
class Oracle(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dims=16, hidden_dims=8, num_layers=2, dropout_rate=0.2, padding_idx=0):
        super(Oracle, self).__init__()
        self.hidden_dims = hidden_dims
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dims)
        # 2 stacked RNN GRU layers
        self.rnn = torch.nn.GRU(embedding_dims, hidden_dims, num_layers=num_layers, bidirectional=True, batch_first=True, dropout=dropout_rate)
        self.dropout = torch.nn.Dropout(dropout_rate)    
        self.output_layer = torch.nn.Linear(hidden_dims*2, 2)
        self.padding_idx = padding_idx

    def forward(self, x, y=None):
        x = self.embedding(x) # shape: (batch_size, seq_len, embedding_dims)
        x = self.dropout(x) # shape: (batch_size, seq_len, embedding_dims)
        # get RNN final hidden states
        _, h = self.rnn(x) # shape: (num_layers*2, batch_size, hidden_dims*2)
        # concatenate the final forward and backward hidden states of the last layer
        h_forward = h[-2, :, :]  # Last layer forward hidden state
        h_backward = h[-1, :, :]  # Last layer backward hidden state
        x = torch.cat((h_forward, h_backward), dim=1)  # shape: (batch_size, hidden_dims*2)
        # map to output layer
        x = self.output_layer(x) # shape: (batch_size, 2)
        if y is not None:
            y = y.view(-1)
            loss = F.cross_entropy(x, y)
            return x, loss
        
        return x
    

# training loop
def train(train_dataloader, val_dataloader, model, optimizer, num_epochs=10, val_every=1, device='cpu'):
    model.train()
    val_loss, val_accuracy = 0, 0
    for epoch in range(num_epochs):
        pbar = tqdm(train_dataloader, desc="Epochs")
        num_total, num_pos_total, num_neg_total = 0, 0, 0
        num_correct, num_pos_correct, num_neg_correct = 0, 0, 0
        avg_loss = 0
        for batch in pbar:
            x, y = batch
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            output_logits, loss = model(x, y)           
            loss.backward()
            # clip gradients to avoid exploding gradients
            #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            # compute moving average loss
            avg_loss = 0.9 * avg_loss + 0.1 * loss.item()

            # compute accuracy for batch precictions
            _, predicted = torch.max(output_logits, 1)
            num_total += y.shape[0]
            num_correct += (predicted == y).sum().item()
            accuracy = num_correct / num_total

            # compute accuracy for positive samples
            num_pos_total += y[y == 1].shape[0]
            num_pos_correct += (predicted[y == 1] == 1).sum().item()
            pos_accuracy = num_pos_correct / max(1,num_pos_total)

            # compute accuracy for negative samples
            num_neg_total += y[y == 0].shape[0]
            num_neg_correct += (predicted[y == 0] == 0).sum().item()
            neg_accuracy = num_neg_correct / max(1,num_neg_total)  

            pbar.set_description(f"Epoch {epoch + 1}, Train Loss: {avg_loss:.5f}, Train Accuracy Overall: {accuracy: .5f}, Train Accuracy Positive: {pos_accuracy:.5f}, Train Accuracy Negative: {neg_accuracy:.5f}, Val Loss: {val_loss:.5f}, Val Accuracy: {val_accuracy:.5f}")

        if (epoch + 1) % val_every == 0:
            val_loss, val_accuracy = evaluate(val_dataloader, model, device=device)

# evaluation on test samples
def evaluate(test_dataloader, model, tau=0.5, device='cpu', verbose=False):
    num_total, num_pos_total, num_neg_total = 0, 0, 0
    num_correct = 0
    num_FP, num_FN = 0, 0
    avg_loss = 0.0
    model.eval()
    #xy = []
    with torch.no_grad():
        for x, y in test_dataloader:
            x, y = x.to(device), y.to(device)
            output_logits, loss = model(x, y)
            avg_loss += loss.item()
            positive_probs = torch.nn.Softmax(dim=1)(output_logits)[:, 1]
            predicted = (positive_probs >= tau).long()
            
            num_total += y.shape[0]
            num_correct += (predicted == y).sum().item()
            num_pos_total += y[y == 1].shape[0]            
            num_neg_total += y[y == 0].shape[0]

            # compute number of false positives and false negatives
            num_FP += (predicted[y == 0] == 1).sum().item()
            num_FN += (predicted[y == 1] == 0).sum().item() 

            #xy.extend(list(zip(x.cpu().numpy(), y.cpu().numpy(), predicted.cpu().numpy())))
    model.train()

    avg_loss /= len(test_dataloader)
    accuracy = num_correct / num_total

    if verbose:
        FP_rate = num_FP / max(1,num_neg_total)
        FN_rate = num_FN / max(1,num_pos_total)                 
        print(f"Num total: {num_total}, Num correct: {num_correct}, Num False Positives: {num_FP}, Num False Negatives: {num_FN}")
        print(f"Test Accuracy: {accuracy}, Test FP rate: {FP_rate:.5f}, Test FN rate: {FN_rate:.5f}")
    return  avg_loss, accuracy #, xy


class LearnedBF:
    def __init__(self, oracle, backup_bf, x_mean, x_std, tau=0.5, device='cuda'):
        self.oracle = oracle
        self.backup_bf = backup_bf
        self.tau = tau
        self.x_mean = x_mean
        self.x_std = x_std
        self.device = device

    def query(self, key):
        if self.oracle_predict(key):
            return True
        else:
            return self.backup_bf.query(key)

    def oracle_predict(self, x):
        x = (key - self.x_mean) / self.x_std
        x = torch.tensor(x, dtype=torch.float32).unsqueeze(0).to(self.device)
        outputs = self.oracle(x)
        predicted = torch.nn.Softmax(dim=0)(outputs)
        return predicted[1].item() > self.tau
 

Test the standard bloom filter on this dataset.

In [5]:
# construct a bloom filter on the bad URLs
bf = BloomStandard(S=malicious_urls, m=4*len(malicious_urls))

# draw some random negative samples from the good URLs
neg_samples = random.sample(benign_urls, 5000)

# evaluate the bloom filter on the negative samples
FP_rate = sum([bf.query(url) for url in neg_samples])/len(neg_samples)
print(f"False Positive Rate: {FP_rate}")


print(f"Memory usage for bad URLs: {asizeof.asizeof(malicious_urls)/1000} kilobytes, Memory usage for Bloom filter: {asizeof.asizeof(bf.B)/1000} kilobytes")

Optimal number of hash functions: 3
Bloom filter constructed! Size: 265540, Number of hash functions: 3
False Positive Rate: 0.1434
Memory usage for bad URLs: 7754.68 kilobytes, Memory usage for Bloom filter: 33.28 kilobytes


Now we will train an oracle on the dataset.

In [6]:
batch_size = 64

# hold out 5% of good URLs for testing
test_size = int(0.1*len(benign_urls))
benign_urls_train = benign_urls[:-test_size]
benign_urls_test = benign_urls[-test_size:]

urls_train = benign_urls_train + malicious_urls
labels_train = ['benign']*len(benign_urls_train) + ['malicious']*len(malicious_urls)
urls_test = benign_urls_test
labels_test = ['benign']*len(benign_urls_test)

print(len(benign_urls_train), len(malicious_urls))

train_dataset = URLDataset(urls_train, labels_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=custom_collate_fn, shuffle=True, pin_memory=False)

test_dataset = URLDataset(urls_test, labels_test)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=custom_collate_fn, shuffle=False, pin_memory=False)

310320

 66385
Vocabulary size: 177
Vocabulary size: 101


In [7]:
# model hyperparameters
embedding_dims = 16
hidden_dims = 8
device = 'cuda'


# define model and optimizer
model = Oracle(vocab_size=train_dataset.vocab_size, embedding_dims=embedding_dims, hidden_dims=hidden_dims, dropout_rate=0.1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

  _torch_pytree._register_pytree_node(


In [8]:
# train oracle
train(train_dataloader, test_dataloader, model, optimizer, num_epochs=20, val_every=2, device=device)

Epoch 1, Train Loss: 0.55159, Train Accuracy Overall:  0.82377, Train Accuracy Positive: 0.00000, Train Accuracy Negative: 1.00000, Val Loss: 0.00000, Val Accuracy: 0.00000: 100%|██████████| 5887/5887 [00:47<00:00, 123.74it/s]
Epoch 2, Train Loss: 0.53621, Train Accuracy Overall:  0.82377, Train Accuracy Positive: 0.00000, Train Accuracy Negative: 1.00000, Val Loss: 0.20676, Val Accuracy: 1.00000: 100%|██████████| 5887/5887 [00:46<00:00, 125.70it/s]
Epoch 3, Train Loss: 0.42260, Train Accuracy Overall:  0.82378, Train Accuracy Positive: 0.00002, Train Accuracy Negative: 1.00000, Val Loss: 0.21247, Val Accuracy: 1.00000: 100%|██████████| 5887/5887 [00:44<00:00, 132.06it/s]
Epoch 4, Train Loss: 0.49840, Train Accuracy Overall:  0.82381, Train Accuracy Positive: 0.00023, Train Accuracy Negative: 0.99999, Val Loss: 0.22349, Val Accuracy: 1.00000: 100%|██████████| 5887/5887 [00:45<00:00, 129.12it/s]
Epoch 5, Train Loss: 0.45127, Train Accuracy Overall:  0.82426, Train Accuracy Positive: 0.0

In [25]:
urls_test = good_urls_test
labels_test = ['good']*len(good_urls_test)
test_dataset = URLDataset(urls_test, labels_test)

# Create a DataLoader with the custom collate function
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=custom_collate_fn, shuffle=False, pin_memory=False)

Vocabulary size: 101


In [26]:
# evaluate on test set
FP_rate, xy = evaluate(test_dataloader, model, tau=0.5, device=device)

Num total: 34479, Num correct: 471, Num False Positives: 34008, Num False Negatives: 0
Test Accuracy: 0.01366048899330027, Test FP rate: 0.98634, Test FN rate: 0.00000


In [15]:
# evaluate on training set
FP_rate, xy = evaluate(train_dataloader, model, tau=0.5, device=device)

Num total: 376705, Num correct: 357574, Num False Positives: 2007, Num False Negatives: 17124
Test Accuracy: 0.9492149029081112, Test FP rate: 0.00647, Test FN rate: 0.25795
