In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tldextract
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
# Load the dataset
data = pd.read_csv('dataset_phishing.csv')

# Feature and label extraction
X = data['url']
y = data['status'].apply(lambda x: 1 if x == 'phishing' else 0)

# Character n-gram TF-IDF vectorization
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 5), max_features=1000)
X_tfidf = vectorizer.fit_transform(X)

# Fit the scaler only on the training data and transform both sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled.toarray(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# DataLoader
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=64, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=64, shuffle=False)

In [3]:
class Attention(nn.Module):
    def __init__(self, feature_dim):
        super(Attention, self).__init__()
        self.feature_dim = feature_dim
        self.proj = nn.Linear(feature_dim, 64)
        self.out = nn.Linear(64, feature_dim)  # Ensure output dimension matches feature_dim

    def forward(self, x):
        eij = self.proj(x)
        eij = torch.tanh(eij)
        eij = self.out(eij)
        a = torch.softmax(eij, dim=1)
        weighted_input = x * a
        return weighted_input  # Return the weighted input for further processing

class AdvancedURLNetWithAttention(nn.Module):
    def __init__(self, num_features):
        super(AdvancedURLNetWithAttention, self).__init__()
        self.fc1 = nn.Linear(num_features, 512)
        self.relu1 = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.attention = Attention(512)
        self.fc2 = nn.Linear(512, 256)  # Ensure this matches the output of attention
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(256, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout(x)
        x = self.attention(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x

In [4]:

# Check CUDA availability
cuda_available = torch.cuda.is_available()
print('CUDA available:', cuda_available)

# Define the model, loss function, and optimizer
model = AdvancedURLNetWithAttention(X_train_tensor.shape[1])  # assuming this class is defined correctly
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0004)

# Training and evaluation logic goes here

CUDA available: True


In [5]:
# Check CUDA availability
cuda_available = torch.cuda.is_available()

# Training function
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    if cuda_available:
        model = model.cuda()
    for epoch in range(epochs):
        model.train()
        for inputs, labels in train_loader:
            if cuda_available:
                inputs, labels = inputs.cuda(), labels.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for inputs, labels in test_loader:
            if cuda_available:
                inputs, labels = inputs.cuda(), labels.cuda()
            outputs = model(inputs)
            predicted = (outputs.squeeze() > 0.5).float()
            all_preds.extend(predicted.cpu().numpy())  # Move predictions back to CPU
            all_labels.extend(labels.cpu().numpy())    # Move labels back to CPU
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}")



In [6]:
# Train and evaluate the model
train_model(model, train_loader, criterion, optimizer, epochs=50)

Epoch 1/50, Loss: 0.2174403816461563
Epoch 2/50, Loss: 0.11305555701255798
Epoch 3/50, Loss: 0.20063000917434692
Epoch 4/50, Loss: 0.1452265828847885
Epoch 5/50, Loss: 0.19266310334205627
Epoch 6/50, Loss: 0.2560330033302307
Epoch 7/50, Loss: 0.19283118844032288
Epoch 8/50, Loss: 0.1334250420331955
Epoch 9/50, Loss: 0.04060792922973633
Epoch 10/50, Loss: 0.08784619718790054
Epoch 11/50, Loss: 0.056720562279224396
Epoch 12/50, Loss: 0.013113554567098618
Epoch 13/50, Loss: 0.14176133275032043
Epoch 14/50, Loss: 0.017495980486273766
Epoch 15/50, Loss: 0.010683671571314335
Epoch 16/50, Loss: 0.06407639384269714
Epoch 17/50, Loss: 0.012251357547938824
Epoch 18/50, Loss: 0.03344601020216942
Epoch 19/50, Loss: 0.0532824732363224
Epoch 20/50, Loss: 0.058811381459236145
Epoch 21/50, Loss: 0.018506919965147972
Epoch 22/50, Loss: 0.010648740455508232
Epoch 23/50, Loss: 0.05117183178663254
Epoch 24/50, Loss: 0.09678561240434647
Epoch 25/50, Loss: 0.027561470866203308
Epoch 26/50, Loss: 0.001914588

In [7]:
evaluate_model(model, test_loader)

Accuracy: 0.9125109361329834, Precision: 0.9042645778938208, Recall: 0.9202834366696191, F1-Score: 0.9122036874451273


In [17]:
def transform_and_predict_single(url, vectorizer, scaler, model):
    # Transform the URL using the pre-fitted TF-IDF vectorizer
    tfidf_features = vectorizer.transform([url]).toarray()
    
    # Standardize the features using the pre-fitted scaler
    scaled_features = scaler.transform(tfidf_features)

    # Convert to PyTorch tensor
    features_tensor = torch.tensor(scaled_features, dtype=torch.float32)

    # Predict using the model
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        prediction = model(features_tensor)
        predicted_class = (prediction.squeeze() > 0.5).float()
        return predicted_class.item()  # Return the prediction as a Python scalar

# Example usage for a single URL
single_url = "http://example.com"
prediction = transform_and_predict_single(single_url, vectorizer, scaler, model)
print("Malicious" if prediction == 1 else "Benign")



Benign
