#### Learned Bloom Filter - Malicious URLs Dataset 

In [1]:
import random
import sympy as sp
import math
from tqdm import tqdm
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import numpy as np
import psutil
import xxhash

# Seed Python's built-in `random` module so that the random.randint /
# random.shuffle calls made by the classes below are reproducible.
# NOTE(review): only `random` is seeded in this cell; torch and numpy RNGs are
# not seeded here - confirm whether model initialisation should be seeded too.
random.seed(123)

In [None]:
class TwoUniversalHashFamily:
    """One hash function of the form h(k) = ((a*k + b) mod p) mod m.

    The modulus ``p`` is the next prime after ``max_key`` (as returned by
    ``sympy.nextprime``) and the coefficients ``a`` and ``b`` are drawn with
    Python's ``random`` module when the object is created.
    """

    def __init__(self, m, max_key):
        """Draw the coefficients of a new hash function.

        Args:
            m: Size of the hash table; results are reduced modulo ``m``.
            max_key: Value the prime modulus has to exceed, i.e. an upper
                bound on the keys that will be hashed.
        """
        self.m = m
        self.p = sp.nextprime(max_key)
        # a is drawn before b; keeping that order keeps seeded runs identical.
        upper = self.p - 1
        self.a = random.randint(1, upper)  # a in [1, p-1]
        self.b = random.randint(0, upper)  # b in [0, p-1]

    def hash(self, k):
        """Return ``((a*k + b) mod p) mod m`` for key ``k``."""
        reduced_mod_p = (self.a * k + self.b) % self.p
        return reduced_mod_p % self.m

    def __call__(self, k):
        """Make the instance callable; equivalent to ``self.hash(k)``."""
        return self.hash(k)


class BloomStandard:
    """Standard Bloom filter over string keys.

    The ``k`` hash functions are ``xxhash.xxh3_64`` with seeds ``0..k-1``,
    each reduced modulo the bit-array size ``m``.
    """

    def __init__(self, S, m, k=None):
        """Build the filter and insert every key of ``S``.

        Args:
            S: Sized iterable of string keys (each key must support
                ``.encode('utf-8')``).
            m: Number of slots in the bit array.
            k: Number of hash functions. When ``None`` the value
                ``round(m / len(S) * ln 2)`` is used, with a minimum of 1.
        """
        self.m = m
        if k is None:
            n = len(S)
            # With no keys the m/n ratio is undefined; fall back to a single
            # hash function instead of dividing by zero.
            self.k = max(1, round((m / n) * math.log(2))) if n else 1
            print(f"Optimal number of hash functions: {self.k}")
        else:
            self.k = k

        # One list slot per bit, all cleared initially.
        self.B = [0] * self.m

        # construct bit array
        for key in S:
            self.insert(key)

        print(f"Bloom filter constructed! Size: {self.m}, Number of hash functions: {self.k}")

    def _positions(self, key):
        """Yield the ``k`` bit-array indices that ``key`` hashes to."""
        data = key.encode('utf-8')  # encode once, reuse for every seed
        return (
            xxhash.xxh3_64(data, seed=i).intdigest() % self.m
            for i in range(self.k)
        )

    def insert(self, key):
        """Insert string ``key`` by setting all of its ``k`` positions."""
        for pos in self._positions(key):
            self.B[pos] = 1

    def query(self, key):
        """Return ``True`` if every position of ``key`` is set, else ``False``.

        Evaluation stops at the first unset position.
        """
        return all(self.B[pos] for pos in self._positions(key))

    def __str__(self):
        """Return the string form of the underlying bit list."""
        return str(self.B)


# define pytorch dataset class for training
class BloomDataset(torch.utils.data.Dataset):
    """Dataset of ``(x, y)`` samples built from positive and negative lists.

    The two inputs are concatenated and shuffled once, at construction time,
    with Python's ``random`` module.
    """

    def __init__(self, positive_samples, negative_samples):
        """Combine and shuffle the samples.

        Args:
            positive_samples: Iterable of ``(x, y)`` pairs, or ``None`` for
                no positive samples.
            negative_samples: Iterable of ``(x, y)`` pairs, or ``None`` for
                no negative samples.
        """
        # Treat None symmetrically for both arguments and copy into fresh
        # lists so tuples/other iterables work and the caller's containers
        # are never shuffled in place.
        positives = [] if positive_samples is None else list(positive_samples)
        negatives = [] if negative_samples is None else list(negative_samples)
        self.samples = positives + negatives
        random.shuffle(self.samples)

    def __len__(self):
        """Return the total number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(x, y)`` tensors.

        ``x`` is a float32 tensor with an extra leading dimension added by
        ``unsqueeze(0)``; ``y`` is an int64 (long) tensor.
        """
        x, y = self.samples[idx]
        # convert to tensor
        x = torch.tensor(x, dtype=torch.float32).unsqueeze(0)
        y = torch.tensor(y, dtype=torch.long)
        return x, y


# define neural network with two hidden layers
class Oracle(torch.nn.Module):
    """Small classifier mapping one scalar feature to two class logits.

    Args:
        hidden_dims: Width of both hidden linear layers.
        dropout_rate: Probability given to the ``Dropout`` module.
    """

    def __init__(self, hidden_dims=10, dropout_rate=0.2):
        super().__init__()
        nn = torch.nn
        self.hidden_dims = hidden_dims
        # Sub-modules are created in the same order as before so that seeded
        # parameter initialisation is unchanged.
        self.hidden_layer1 = nn.Linear(1, hidden_dims)
        self.hidden_layer2 = nn.Linear(hidden_dims, hidden_dims)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        self.output_layer = nn.Linear(hidden_dims, 2)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return the raw (un-normalised) logits for input ``x``.

        NOTE(review): the two hidden linear layers are applied back to back
        with no activation in between, and ``self.tanh`` / ``self.dropout``
        are constructed but not used here - confirm this is intended.
        """
        hidden = self.hidden_layer2(self.hidden_layer1(x))
        # compute output logits
        return self.output_layer(self.relu(hidden))
    

# training loop
def train(train_dataloader, model, optimizer, num_epochs=10, device='cpu'):
    """Train ``model`` with cross-entropy loss and report progress via tqdm.

    Args:
        train_dataloader: Iterable yielding ``(x, y)`` batches.
        model: Module producing class logits for ``x``.
        optimizer: Optimizer stepping the model's parameters.
        num_epochs: Number of passes over ``train_dataloader``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        None. Running metrics are only shown in the progress-bar description.
    """
    model.train()
    for epoch in range(num_epochs):
        pbar = tqdm(train_dataloader, desc="Epochs")
        # Per-epoch counters: samples seen and samples predicted correctly.
        seen_all = seen_pos = seen_neg = 0
        hit_all = hit_pos = hit_neg = 0
        running_loss = 0
        for inputs, targets in pbar:
            inputs = inputs.to(device)
            # flatten the labels
            targets = targets.to(device).view(-1)

            optimizer.zero_grad()
            logits = model(inputs)
            loss = F.cross_entropy(logits, targets)
            loss.backward()
            optimizer.step()

            # exponential moving average of the batch loss
            running_loss = 0.9 * running_loss + 0.1 * loss.item()

            # predicted class = index of the largest logit
            _, guesses = torch.max(logits, 1)
            is_pos = targets == 1
            is_neg = targets == 0

            # overall accuracy so far in this epoch
            seen_all += targets.shape[0]
            hit_all += (guesses == targets).sum().item()
            overall_acc = hit_all / seen_all

            # accuracy restricted to positive samples
            seen_pos += int(is_pos.sum().item())
            hit_pos += (guesses[is_pos] == 1).sum().item()
            positive_acc = hit_pos / max(1, seen_pos)

            # accuracy restricted to negative samples
            seen_neg += int(is_neg.sum().item())
            hit_neg += (guesses[is_neg] == 0).sum().item()
            negative_acc = hit_neg / max(1, seen_neg)

            pbar.set_description(f"Epoch {epoch + 1}, Train Loss: {running_loss:.5f}, Train Accuracy Overall: {overall_acc: .5f}, Train Accuracy Positive: {positive_acc:.5f}, Train Accuracy Negative: {negative_acc:.5f}")


# evaluation on test samples
def evaluate(test_dataloader, model, tau=0.5, device='cpu'):
    """Evaluate ``model`` on a labelled set using probability threshold ``tau``.

    A sample is predicted positive when the softmax probability of class 1 is
    at least ``tau``. This thresholds the same quantity (softmax probability
    of class 1) that ``LearnedBF.oracle_predict`` compares against ``tau``,
    rather than the raw logit.

    Args:
        test_dataloader: Iterable yielding ``(x, y)`` batches.
        model: Module producing two-class logits of shape (batch, 2).
        tau: Probability threshold for predicting class 1.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(FP_rate, xy)`` where ``FP_rate`` is the fraction of negative
        samples predicted positive and ``xy`` is a list of
        ``(x, y, predicted)`` numpy triples, one per sample.
    """
    num_total, num_pos_total, num_neg_total = 0, 0, 0
    num_correct = 0
    num_FP, num_FN = 0, 0
    model.eval()
    xy = []
    with torch.no_grad():
        for x, y in test_dataloader:
            x, y = x.to(device), y.to(device)
            outputs = model(x)
            # Threshold the class-1 probability, not the unbounded logit.
            prob_pos = torch.softmax(outputs, dim=1)[:, 1]
            predicted = (prob_pos >= tau).long()

            num_total += y.shape[0]
            num_correct += (predicted == y).sum().item()
            num_pos_total += y[y == 1].shape[0]
            num_neg_total += y[y == 0].shape[0]

            # compute number of false positives and false negatives
            num_FP += (predicted[y == 0] == 1).sum().item()
            num_FN += (predicted[y == 1] == 0).sum().item()

            xy.extend(list(zip(x.cpu().numpy(), y.cpu().numpy(), predicted.cpu().numpy())))

    print(f"Num total: {num_total}, Num correct: {num_correct}, Num False Positives: {num_FP}, Num False Negatives: {num_FN}")

    # max(1, ...) guards every ratio against an empty loader / missing class.
    accuracy = num_correct / max(1, num_total)
    FP_rate = num_FP / max(1, num_neg_total)
    FN_rate = num_FN / max(1, num_pos_total)
    print(f"Test Accuracy: {accuracy}, Test FP rate: {FP_rate:.5f}, Test FN rate: {FN_rate:.5f}")
    return FP_rate, xy


class LearnedBF:
    """Learned Bloom filter: a model "oracle" backed by a fallback filter.

    A key is reported present when the oracle's class-1 probability exceeds
    ``tau``; otherwise the answer of ``backup_bf.query`` is returned.
    """

    def __init__(self, oracle, backup_bf, x_mean, x_std, tau=0.5, device='cuda'):
        """Store the components of the filter.

        Args:
            oracle: Callable model returning two-class logits.
            backup_bf: Object exposing ``query(key)``, consulted when the
                oracle predicts negative.
            x_mean: Mean subtracted from a key before it is fed to the oracle.
            x_std: Standard deviation the centred key is divided by.
            tau: Probability threshold above which the oracle answers True.
            device: Device the input tensor is moved to.
                NOTE(review): the oracle is not moved here - it is presumably
                already on ``device``; confirm at the call site.
        """
        self.oracle = oracle
        self.backup_bf = backup_bf
        self.tau = tau
        self.x_mean = x_mean
        self.x_std = x_std
        self.device = device

    def query(self, key):
        """Return True if the oracle accepts ``key``, else ask the backup filter.

        NOTE(review): the same ``key`` goes to both components; the oracle
        path needs a numeric key while ``BloomStandard.query`` calls
        ``key.encode`` - verify the key type used by callers.
        """
        return self.oracle_predict(key) or self.backup_bf.query(key)

    def oracle_predict(self, x):
        """Return True when the oracle's class-1 probability for ``x`` exceeds ``tau``.

        Args:
            x: Raw (un-normalised) numeric key.
        """
        # Normalise the argument itself (the previous code read an undefined
        # name `key` here, so every call raised NameError).
        x = (x - self.x_mean) / self.x_std
        x = torch.tensor(x, dtype=torch.float32).unsqueeze(0).to(self.device)
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            outputs = self.oracle(x)
        # Softmax over the class dimension (the last one).
        predicted = torch.softmax(outputs, dim=-1)
        return predicted[..., 1].item() > self.tau
 

In [8]:
import csv

file_path = 'datasets/URLs_Singh/Webpages_Classification_train_data.csv/Webpages_Classification_train_data.csv'

# load only 'url' and 'label' columns (given as column indices within a row)
columns_to_load = [1, 11]
url_col, label_col = columns_to_load

# Initialize empty lists to hold the data of the two columns
urls = []
labels = []

# Open the CSV file and read it row by row.
# newline='' is what the csv module expects for file objects, so that newlines
# embedded in quoted fields are handled correctly.
# NOTE: the header row is not skipped, so urls[0] / labels[0] hold the column
# names ('url' / 'label') and the list lengths include that row.
with open(file_path, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        # Append the data from the specified columns to the lists
        urls.append(row[url_col])
        labels.append(row[label_col])

In [10]:
# Count how many entries of `labels` carry each class name.
num_good = labels.count('good')
num_bad = labels.count('bad')

print(f"Number of good URLs: {num_good}, Number of bad URLs: {num_bad}")

Number of good URLs: 1172747, Number of bad URLs: 27253


In [9]:
# Number of rows read from the CSV (this count includes the header row)
print(len(urls))

# `urls` and `labels` hold the two loaded columns; show the first five
# entries of each (index 0 is the header cell).
first_five = slice(0, 5)
print(urls[first_five])
print(labels[first_five])


1200001
['url', 'http://members.tripod.com/russiastation/', 'http://www.ddj.com/cpp/184403822', 'http://www.naef-usa.com/', 'http://www.ff-b2b.de/']
['label', 'good', 'good', 'good', 'bad']
