# Malicious URL Detection

Project by:  
[Jen Patrick Nataba](https://ph.linkedin.com/in/cytojen)  
[John Ferry Lagman](https://ph.linkedin.com/in/thatjohnlagman)

In [None]:
# Colab-only: mount Google Drive at /content/drive. The training output
# directories used later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers datasets torch scikit-learn safetensors

  pid, fd = os.forkpty()




# import needed libraries

In [None]:
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# dataset

In [None]:
# load dataset
def load_data():
    """Fetch the ``kmack/Phishing_urls`` dataset through ``load_dataset``.

    Returns:
        The object produced by ``load_dataset`` for that dataset id.
    """
    phishing_urls = load_dataset("kmack/Phishing_urls")
    return phishing_urls

In [None]:
# preprocess
def preprocess_data(dataset):
    """Tokenize the ``text`` column and expose the model inputs as torch tensors.

    Args:
        dataset: Object with ``map`` and ``set_format`` methods whose examples
            carry a ``text`` field (and a ``label`` column for ``set_format``).

    Returns:
        tuple: ``(encoded_dataset, tokenizer)`` where ``encoded_dataset`` is the
        mapped dataset formatted to return ``input_ids``, ``attention_mask`` and
        ``label`` as torch values, and ``tokenizer`` is the DistilBERT tokenizer
        that produced the encoding.
    """
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    def _tokenize_batch(batch):
        # Fixed-length encoding: every example is truncated or padded to 64 tokens.
        return tokenizer(
            batch['text'],
            truncation=True,
            padding='max_length',
            max_length=64,
        )

    tokenized = dataset.map(_tokenize_batch, batched=True, num_proc=4)

    # Only these columns are returned (as torch values) when the dataset is indexed.
    tensor_columns = ['input_ids', 'attention_mask', 'label']
    tokenized.set_format(type='torch', columns=tensor_columns)
    return tokenized, tokenizer

In [None]:
# split into training, validation, and test sets
def split_data(dataset, seed=42):
    """Split ``dataset['train']`` into reproducible train/validation/test subsets.

    Only the ``'train'`` entry of ``dataset`` is used. 20% of it is held out,
    and that hold-out is divided 75/25, which yields roughly 80% train,
    15% validation and 5% test of the original ``'train'`` rows.

    Args:
        dataset: Mapping-like object with a ``shuffle(seed=...)`` method whose
            ``'train'`` entry supports ``train_test_split``.
        seed: Seed used for the initial shuffle and for both splits, so that
            the membership of each subset is identical across runs.

    Returns:
        tuple: ``(train_data, val_data, test_data)``.
    """
    shuffled = dataset.shuffle(seed=seed)

    # train_test_split shuffles again by default; without an explicit seed the
    # resulting subsets would differ on every run despite the seeded shuffle above.
    holdout_split = shuffled['train'].train_test_split(test_size=0.2, seed=seed)
    val_test_split = holdout_split['test'].train_test_split(test_size=0.25, seed=seed)

    train_data = holdout_split['train']
    val_data = val_test_split['train']
    test_data = val_test_split['test']

    return train_data, val_data, test_data

# modeling

In [None]:
# load distilbert model sequence classification
def load_model(num_labels=2):
    """Instantiate DistilBERT with a sequence-classification head.

    Args:
        num_labels: Number of output classes passed to ``from_pretrained``.

    Returns:
        A ``DistilBertForSequenceClassification`` loaded from the
        ``distilbert-base-uncased`` checkpoint.
    """
    checkpoint = "distilbert-base-uncased"
    classifier = DistilBertForSequenceClassification.from_pretrained(
        checkpoint, num_labels=num_labels
    )
    return classifier

In [None]:
# training configuration (TrainingArguments) shared by the tuning and final runs
def create_training_args(output_dir, learning_rate=2e-5, batch_size=16, epochs=2, run_name="malicious_url_detector"):
    """Build the ``TrainingArguments`` for one training run.

    Evaluation and checkpointing both happen once per epoch, and the best
    checkpoint is reloaded when training finishes.

    Args:
        output_dir: Directory for checkpoints; also used as the logging directory.
        learning_rate: Optimizer learning rate.
        batch_size: Per-device batch size for both training and evaluation.
        epochs: Number of training epochs.
        run_name: Name attached to the run.

    Returns:
        A configured ``TrainingArguments`` instance.
    """
    import inspect

    import torch

    # Newer transformers releases renamed ``evaluation_strategy`` to
    # ``eval_strategy``. The install cell does not pin a version, so use
    # whichever keyword the installed TrainingArguments actually accepts.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    return TrainingArguments(
        output_dir=output_dir,
        logging_dir=output_dir,
        logging_steps=50,
        # save_steps is only consulted when save_strategy is "steps"; kept for
        # parity with the original configuration.
        save_steps=1000,
        save_total_limit=1,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        # Mixed precision needs a CUDA device; fall back to full precision
        # instead of failing when the notebook runs without a GPU.
        fp16=torch.cuda.is_available(),
        run_name=run_name,
        report_to="none",
        load_best_model_at_end=True,
        save_strategy="epoch",
        **{eval_key: "epoch"},
    )

In [None]:
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a binary classifier.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores, reduced with ``argmax`` along axis 1).

    Returns:
        dict: Keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    y_true = pred.label_ids
    # The highest-scoring column is taken as the predicted class.
    y_pred = np.argmax(pred.predictions, axis=1)

    prf = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": prf[0],
        "recall": prf[1],
        "f1": prf[2],
    }

# hyperparameter tuning

In [None]:
# grid search over learning rate and batch size (epoch count is held fixed)
def hyperparameter_tuning(train_data, val_data, tokenizer, learning_rates, batch_sizes, epochs=2,
                          output_root="/content/drive/MyDrive/omdena_hackathon/models/malicious_url_detector"):
    """Train one model per (learning rate, batch size) pair and keep the best by F1.

    Args:
        train_data: Training dataset handed to ``Trainer``.
        val_data: Validation dataset used for per-epoch evaluation and for the
            final score of each run.
        tokenizer: Tokenizer handed to ``Trainer``.
        learning_rates: Iterable of learning rates to try.
        batch_sizes: Iterable of per-device batch sizes to try.
        epochs: Number of epochs for every run (not part of the search).
        output_root: Parent directory; each run writes its checkpoints to its
            own ``LR_<lr>_BS_<bs>_E_<epochs>`` subdirectory so runs do not
            share or rotate each other's checkpoints. Note this keeps
            checkpoints for every run on disk.

    Returns:
        dict: ``{"learning_rate": ..., "batch_size": ...}`` for the run with the
        highest ``eval_f1``; empty only when no combination was tried.
    """
    from itertools import product

    best_params = {}
    # Start below any attainable score so a run is always selected, even if
    # every F1 is 0 (callers index best_params["learning_rate"] afterwards).
    best_f1 = float("-inf")

    for lr, bs in product(learning_rates, batch_sizes):
        print(f"Training with LR={lr}, BS={bs}, Epochs={epochs}")
        run_name = f"LR_{lr}_BS_{bs}_E_{epochs}"

        # A fresh model per run so no weights carry over between combinations.
        model = load_model()
        args = create_training_args(
            output_dir=f"{output_root}/{run_name}",
            learning_rate=lr,
            batch_size=bs,
            epochs=epochs,
            run_name=run_name
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_data,
            eval_dataset=val_data,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

        trainer.train()
        metrics = trainer.evaluate(val_data)
        f1 = metrics["eval_f1"]

        if f1 > best_f1:
            best_f1 = f1
            best_params = {"learning_rate": lr, "batch_size": bs}

    print(f"Best hyperparameters: {best_params}")
    return best_params

# performance function

In [None]:
# check model performance
def evaluate_model(trainer, test_data):
    """Evaluate ``trainer`` on ``test_data``, print the metrics and return them.

    Args:
        trainer: Object exposing ``evaluate(dataset)``.
        test_data: Dataset passed straight through to ``trainer.evaluate``.

    Returns:
        Whatever ``trainer.evaluate`` returns for ``test_data``.
    """
    results = trainer.evaluate(test_data)
    print("evaluation Metrics:", results)
    return results

# call the functions and run the codes

In [None]:
# Run the data pipeline in order: load the dataset, tokenize it, then split
# the encoded data into train / validation / test subsets.
dataset = load_data()
encoded_dataset, tokenizer = preprocess_data(dataset)
train_data, val_data, test_data = split_data(encoded_dataset)

README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

(…)-00000-of-00001-d8afc95a165ea87b.parquet:   0%|          | 0.00/25.4M [00:00<?, ?B/s]

(…)-00000-of-00001-4d6cbda5297196e7.parquet:   0%|          | 0.00/3.18M [00:00<?, ?B/s]

(…)-00000-of-00001-4e1abfe96aa382c2.parquet:   0%|          | 0.00/3.18M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/567056 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/70882 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/70882 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/567056 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/70882 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/70882 [00:00<?, ? examples/s]

In [None]:
# hyperparameters: 2 learning rates x 2 batch sizes = 4 training runs
learning_rates = [2e-5, 3e-5]
batch_sizes = [16, 32]

# best_params holds the combination with the highest validation F1
best_params = hyperparameter_tuning(train_data, val_data, tokenizer, learning_rates, batch_sizes)

Training with LR=2e-05, BS=16, Epochs=2


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2572,0.235134,0.89686,0.857865,0.949317,0.901277
2,0.2039,0.234728,0.898647,0.85796,0.953489,0.903206




Training with LR=2e-05, BS=32, Epochs=2


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2304,0.230629,0.896766,0.859242,0.94697,0.900975
2,0.2021,0.226526,0.898941,0.858402,0.953513,0.903461




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with LR=3e-05, BS=16, Epochs=2


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.25,0.236194,0.896719,0.855801,0.952186,0.901424
2,0.2062,0.238984,0.898153,0.857557,0.952921,0.902727




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with LR=3e-05, BS=32, Epochs=2


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2283,0.228084,0.897577,0.857729,0.951261,0.902077
2,0.1936,0.227206,0.899117,0.858035,0.954509,0.903704




Best hyperparameters: {'learning_rate': 3e-05, 'batch_size': 32}


In [None]:
# Configure the final run with the learning rate / batch size chosen by tuning.
final_args = create_training_args(
    output_dir="/content/drive/MyDrive/omdena_hackathon/models/malicious_url_detector/final_results",
    learning_rate=best_params["learning_rate"],
    batch_size=best_params["batch_size"],
    epochs=2,
    run_name="malicious_url_detector_final"
)

# Start from a freshly loaded model rather than reusing one from the tuning runs.
model = load_model()
trainer = Trainer(
    model=model,
    args=final_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


# Last expression of the cell, so the value returned by train() is displayed.
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2283,0.228084,0.897577,0.857729,0.951261,0.902077
2,0.1936,0.227206,0.899117,0.858035,0.954509,0.903704




TrainOutput(global_step=14178, training_loss=0.23168202754152664, metrics={'train_runtime': 3818.8602, 'train_samples_per_second': 237.581, 'train_steps_per_second': 3.713, 'total_flos': 1.5023260148975616e+16, 'train_loss': 0.23168202754152664, 'epoch': 2.0})

In [None]:
# Score the final model on the test subset. The metrics appear twice in the
# output: once from the print inside evaluate_model and once as the cell result.
evaluate_model(trainer, test_data)



evaluation Metrics: {'eval_loss': 0.23061107099056244, 'eval_accuracy': 0.8962367297993158, 'eval_precision': 0.8554451710261569, 'eval_recall': 0.9548708590679393, 'eval_f1': 0.9024276996550809, 'eval_runtime': 37.6729, 'eval_samples_per_second': 752.61, 'eval_steps_per_second': 11.786, 'epoch': 2.0}


{'eval_loss': 0.23061107099056244,
 'eval_accuracy': 0.8962367297993158,
 'eval_precision': 0.8554451710261569,
 'eval_recall': 0.9548708590679393,
 'eval_f1': 0.9024276996550809,
 'eval_runtime': 37.6729,
 'eval_samples_per_second': 752.61,
 'eval_steps_per_second': 11.786,
 'epoch': 2.0}

# thoughts

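This model performs well at detecting malicious URLs. On the held-out test set it reaches about 0.90 accuracy and 0.90 F1, with recall (about 0.95) noticeably higher than precision (about 0.86): it catches most malicious URLs, at the cost of some false alarms. It is also efficient, evaluating roughly 750 URLs per second in this run, which makes it well-suited for real-world applications where speed is key. Overall the model shows a lot of promise; improving precision is the most obvious next step.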