In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tldextract
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
# Load the dataset
data = pd.read_csv('dataset_phishing.csv')

# Feature and label extraction, including additional numerical features
selected_features = ['url', 'length_url', 'nb_dots', 'https_token', 'nb_subdomains']  # Add other selected feature column names
X_numerical = data[selected_features[1:]].values  # Extract numerical features
y = data['status'].apply(lambda x: 1 if x == 'phishing' else 0).values

# Character n-gram TF-IDF vectorization for the 'url' feature
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 5), max_features=1000)
X_tfidf = vectorizer.fit_transform(data['url']).toarray()

# Combine TF-IDF features with numerical features
X_combined = np.hstack((X_tfidf, X_numerical))

# Fit the scaler only on the training data and transform both sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# DataLoader
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=64, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=64, shuffle=False)


In [3]:
class Attention(nn.Module):
    def __init__(self, feature_dim):
        super(Attention, self).__init__()
        self.feature_dim = feature_dim
        self.proj = nn.Linear(feature_dim, 64)
        self.out = nn.Linear(64, feature_dim)  # Ensure output dimension matches feature_dim

    def forward(self, x):
        eij = self.proj(x)
        eij = torch.tanh(eij)
        eij = self.out(eij)
        a = torch.softmax(eij, dim=1)
        weighted_input = x * a
        return weighted_input  # Return the weighted input for further processing

class AdvancedURLNetWithAttention(nn.Module):
    def __init__(self, num_features):
        super(AdvancedURLNetWithAttention, self).__init__()
        # Increased the complexity of the first layer and added a second attention layer
        self.fc1 = nn.Linear(num_features, 1024)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)  # Reduced dropout
        self.attention1 = Attention(1024)
        self.fc2 = nn.Linear(1024, 512)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)
        self.attention2 = Attention(512)  # New attention layer
        self.fc3 = nn.Linear(512, 128)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(0.3)
        self.fc4 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.attention1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.attention2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.dropout3(x)
        x = self.fc4(x)
        x = self.sigmoid(x)
        return x

In [4]:

# Define the model, loss function, and optimizer
model = AdvancedURLNetWithAttention(X_train_tensor.shape[1])  # assuming this class is defined correctly
criterion = nn.BCELoss()  # Binary cross-entropy loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

In [5]:
# Check CUDA availability
cuda_available = torch.cuda.is_available()

# Training function
def train_model(model, train_loader, criterion, optimizer, scheduler, epochs=10):
    if cuda_available:
        model = model.cuda()
    for epoch in range(epochs):
        model.train()
        for inputs, labels in train_loader:
            if cuda_available:
                inputs, labels = inputs.cuda(), labels.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
        
        scheduler.step()
        
        if epoch % 5 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for inputs, labels in test_loader:
            if cuda_available:
                inputs, labels = inputs.cuda(), labels.cuda()
            outputs = model(inputs)
            predicted = (outputs.squeeze() > 0.5).float()
            all_preds.extend(predicted.cpu().numpy())  # Move predictions back to CPU
            all_labels.extend(labels.cpu().numpy())    # Move labels back to CPU
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}")



In [6]:
# Train and evaluate the model
train_model(model, train_loader, criterion, optimizer, scheduler, epochs=100)

Epoch 1/100, Loss: 0.35087350010871887
Epoch 6/100, Loss: 0.0742310881614685
Epoch 11/100, Loss: 0.033635057508945465
Epoch 16/100, Loss: 0.028876053169369698
Epoch 21/100, Loss: 0.08816889673471451
Epoch 26/100, Loss: 0.0003738495579455048
Epoch 31/100, Loss: 0.004314770922064781
Epoch 36/100, Loss: 0.015597413294017315
Epoch 41/100, Loss: 0.004892611410468817
Epoch 46/100, Loss: 0.00022619387891609222
Epoch 51/100, Loss: 0.0002798225323203951
Epoch 56/100, Loss: 0.00036960034049116075
Epoch 61/100, Loss: 0.0008046565926633775
Epoch 66/100, Loss: 0.000380099838366732
Epoch 71/100, Loss: 0.0001737980346661061
Epoch 76/100, Loss: 0.011571326293051243
Epoch 81/100, Loss: 0.0006883034948259592
Epoch 86/100, Loss: 0.0014102268032729626
Epoch 91/100, Loss: 0.0008534871158190072
Epoch 96/100, Loss: 0.0002813483006320894


In [7]:
model.to('cuda')
evaluate_model(model, test_loader)

Accuracy: 0.9190726159230096, Precision: 0.9206773618538324, Recall: 0.9149689991142604, F1-Score: 0.9178143047534429


In [8]:
model.to('cpu')

def transform_and_predict(features_list, vectorizer, scaler, model):
    # Separate the URL from the numerical features
    url, numerical_features = features_list[0], np.array(features_list[1:])

    # Transform the URL using the pre-fitted TF-IDF vectorizer
    tfidf_features = vectorizer.transform([url]).toarray()

    # Ensure numerical_features is a 2D array with a single sample
    numerical_features = numerical_features.reshape(1, -1)

    # Combine TF-IDF features with other numerical features
    combined_features = np.hstack((tfidf_features, numerical_features))

    # Standardize the features using the pre-fitted scaler
    scaled_features = scaler.transform(combined_features)  # No need for additional brackets now

    # Convert to PyTorch tensor
    features_tensor = torch.tensor(scaled_features, dtype=torch.float32)

    # Predict using the model
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        prediction = model(features_tensor)
        predicted_class = (prediction.squeeze() > 0.5).float()
        return predicted_class.item()  # Return the prediction as a Python scalar

# Example usage for a single URL and its numerical features
single_features = ["http://www.crestonwood.com/router.php", 35, 3, 1, 2]  # URL followed by its numerical features
prediction = transform_and_predict(single_features, vectorizer, scaler, model)
print("Phishing" if prediction == 1 else "Legitimate")

Phishing


In [9]:
import joblib

# Save the PyTorch model's state dictionary
torch.save(model.state_dict(), 'model_state_dict.pth')

# Save the fitted TF-IDF vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Save the fitted scaler
joblib.dump(scaler, 'standard_scaler.joblib')

['standard_scaler.joblib']