In [None]:
# import Dependencies
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
from nltk.tokenize import RegexpTokenizer
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.stem import WordNetLemmatizer
# Download the WordNet and Open Multilingual WordNet corpora into the local nltk_data directory.
# NOTE(review): presumably needed by WordNetLemmatizer, which is imported above but not used
# in the visible cells — confirm these downloads are still required.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Load the dataset from the device through the Google Colab upload helper.
# NOTE(review): `google.colab` is only importable inside a Colab runtime, so this cell
# fails elsewhere. `uploaded` is keyed by uploaded file name (the next cell reads its keys).
from google.colab import files
uploaded = files.upload()

Saving phishing_site_urls.csv to phishing_site_urls.csv


In [None]:
# The upload result is keyed by file name; use the first uploaded file as the dataset
uploaded_names = list(uploaded)
filename = uploaded_names[0]
print(f"Name of the dataset: {filename}")

Name of the dataset: phishing_site_urls.csv


In [None]:
# Read the uploaded CSV into a DataFrame and preview its first rows
dataset = pd.read_csv(filename)
dataset.head()

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [None]:
# Shape of the raw dataset as (rows, columns), before any cleaning
print(dataset.shape)

(549346, 2)


In [None]:
# Summarize the columns: names, non-null counts, dtypes, and memory usage
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549346 entries, 0 to 549345
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   URL     549346 non-null  object
 1   Label   549346 non-null  object
dtypes: object(2)
memory usage: 8.4+ MB


In [None]:
# Data Preprocessing
# Written as reassignments (no in-place mutation) and guarded so that re-running this cell
# produces the same `dataset` as running it once.

# Lower-case every URL, then remove duplicates: keep the first occurrence of each URL
# and renumber the remaining rows from 0.
dataset = (
    dataset
    .assign(URL=dataset['URL'].str.lower())
    .drop_duplicates(subset='URL', keep='first')
    .reset_index(drop=True)
)

# Replace target text labels with integer values: 'good' -> 0, anything else -> 1.
# Convert only while the column is not integer-coded yet: once it holds 0/1, comparing
# against 'good' is false for every row and would silently relabel the whole dataset as 1.
if not pd.api.types.is_integer_dtype(dataset['Label']):
    dataset = dataset.assign(Label=(dataset['Label'] != 'good').astype('int64'))

In [None]:
# Count the rows per class label (0 = 'good', 1 = every other label) to see the class balance
dataset['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,392897
1,114214


In [None]:
# Dataset size as (rows, columns) after lower-casing and de-duplication
dataset.shape

(507111, 2)

In [None]:
# Pull the model input (URL strings) and the target (integer labels) out as NumPy arrays
X, y = dataset['URL'].to_numpy(), dataset['Label'].to_numpy()

In [None]:
# Split the data into train and test sets.
# The inputs are read straight from `dataset` instead of from the current `X`/`y`: this cell
# rebinds `X` and `y` to the training half, so splitting `X` itself would halve the data
# again every time the cell is re-run.
# NOTE(review): test_size=0.5 holds out half of the rows, and the raw-text hold-out `X_test`
# is not used by any of the visible cells — confirm this first split is intended.
X, X_test, y, y_test = train_test_split(
    dataset['URL'].values, dataset['Label'].values, test_size=0.5, random_state=42
)

In [None]:
# Tokenize at character level: every character of a URL becomes an integer id
max_len = 200  # every encoded URL is padded or truncated to this many characters

tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(X)  # build the character vocabulary from the URLs in X
sequences = tokenizer.texts_to_sequences(X)

# padding='post' appends zeros after the URL; truncating='pre' is the Keras default written
# out explicitly (URLs longer than max_len lose their leading characters).
X_padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='pre')

In [None]:
# Sanity check: one raw URL, its padded integer encoding, the first four labels,
# and the length of the padded sequence (should equal max_len)
print(X[1])
print(X_padded[1])
print(y[:4])
print(len(X_padded[1]))

jorgensenconveyors.com/replacement.aspx
[39  2  9 20  1 10  8  1 10  7  2 10 29  1 25  2  9  8 12  7  2 11  5  9
  1 15 13  3  7  1 11  1 10  6 12  3  8 15 37  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0]
[1 0 0 0]
200


In [None]:
# Split the padded sequences again into train and test arrays (they become tensors in the next cell).
# NOTE(review): this rebinds `y_test` from the earlier split and leaves only half of the
# already-halved data for training — confirm the second 50/50 split is intended.
x_train, x_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.5, random_state=42)

In [None]:
# Convert the NumPy splits to tensors: token ids as int64 (integer indices for nn.Embedding),
# labels as float32 (float targets for nn.BCELoss)
X_tensor_train, X_tensor_test = (
    torch.tensor(token_ids, dtype=torch.long) for token_ids in (x_train, x_test)
)
y_tensor_train, y_tensor_test = (
    torch.tensor(labels, dtype=torch.float32) for labels in (y_train, y_test)
)

In [None]:
# Define the dataset class
class PhishingDataset(Dataset):
    """Map-style dataset that pairs each encoded URL with its label.

    Args:
        X: Indexable collection of encoded URLs, one entry per sample.
        y: Indexable collection of labels, aligned with ``X`` by position.

    Raises:
        ValueError: If ``X`` and ``y`` do not hold the same number of samples.
    """

    def __init__(self, X, y):
        # Fail at construction time instead of with an IndexError in the middle of an epoch.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]


# Create DataLoader instances (mini-batches of 32 samples for both splits)

# Training data is reshuffled every epoch
train_dataset = PhishingDataset(X_tensor_train, y_tensor_train)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)

# Test data is served in its stored order
test_dataset = PhishingDataset(X_tensor_test, y_tensor_test)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)


In [None]:
# Define the LSTM model
class LSTMPhishingClassifier(nn.Module):
    """Embedding -> LSTM -> linear -> sigmoid classifier over integer-encoded sequences.

    Index 0 is treated as padding. Each sequence is fed to the LSTM only up to its last
    non-zero token, so the prediction does not depend on how much trailing padding was added.

    Args:
        vocab_size: Number of rows in the embedding table (largest token id + 1).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output units of the final linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # token id -> dense vector
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)  # LSTM layer
        self.fc = nn.Linear(hidden_dim, output_dim)  # fully connected layer
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return probabilities of shape ``(batch, output_dim)`` for token ids ``x`` of shape ``(batch, seq_len)``."""
        embedded = self.embedding(x)

        # Effective length = 1-based position of the last non-padding token. For trailing
        # (post) padding this is the real sequence length; for sequences whose last token
        # is real it is the full length, i.e. the whole sequence is processed.
        # Rows that are entirely padding are clamped to length 1 because packing rejects 0.
        positions = torch.arange(1, x.size(1) + 1, device=x.device)
        lengths = ((x != 0) * positions).max(dim=1).values.clamp(min=1)

        # pack_padded_sequence expects the lengths on the CPU; enforce_sorted=False lets the
        # batch stay in its original order (the returned hidden state is in that order too).
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)

        out = self.fc(hidden[-1])  # only the final hidden state of the last layer is classified
        return self.sigmoid(out)  # sigmoid turns the scores into probabilities


In [None]:
# Instantiate the model
# Seed PyTorch first so that, on a fresh Restart & Run All, the weight initialisation and the
# training-time shuffling (both drawn from PyTorch's global generator) are the same every run.
torch.manual_seed(42)

# +1 because the tokenizer's ids start at 1 and id 0 is the padding value
vocab_size = len(tokenizer.word_index) + 1
model = LSTMPhishingClassifier(vocab_size, embedding_dim=32, hidden_dim=64, output_dim=1)


# Define loss function and optimizer
criterion = nn.BCELoss()  # binary cross-entropy on the sigmoid probabilities the model returns
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Training the Model
num_epochs = 10  # Define the number of training epochs
accuracy_train = float('nan')  # stays NaN only if num_epochs is 0, so the final print cannot fail

for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    correct = 0
    total = 0
    running_loss = 0.0

    for data, target in train_loader:
        optimizer.zero_grad()  # Reset gradients before each batch

        # Forward pass: the model returns one probability per sample, shaped (batch, 1)
        output = model(data)

        # Drop only the trailing dimension. A bare squeeze() would also drop the batch
        # dimension of a final batch holding a single sample and no longer match `target`.
        probs = output.squeeze(1)
        loss = criterion(probs, target)

        # Backpropagation
        loss.backward()  # Compute gradients
        optimizer.step()  # Update model parameters

        # Convert probabilities to binary predictions (threshold at 0.5) and update the counters
        predictions = (probs >= 0.5).int()
        batch_size = target.size(0)
        correct += (predictions == target).sum().item()
        total += batch_size
        running_loss += loss.item() * batch_size  # criterion returns the batch mean

    accuracy_train = (correct / total) * 100
    print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {running_loss / total:.4f}, Accuracy: {accuracy_train:.2f}%")

# Accuracy of the last epoch
print(f"Model Training Accuracy: {accuracy_train:.2f}%\n")

Model Training Accuracy: 98.01%



In [None]:
# Evaluate the model
model.eval()  # Set model to evaluation mode
test_loss = 0.0  # sum of the per-batch mean losses
correct = 0
total = 0

with torch.no_grad():  # no gradients are needed for evaluation
    for data, target in test_loader:
        output = model(data)  # Forward pass, shaped (batch, 1)

        # Drop only the trailing dimension. A bare squeeze() would also drop the batch
        # dimension of a final batch holding a single sample and no longer match `target`.
        probs = output.squeeze(1)

        loss = criterion(probs, target)
        test_loss += loss.item()

        predictions = (probs >= 0.5).int()  # Threshold at 0.5

        # Count correct predictions
        correct += (predictions == target).sum().item()
        total += target.size(0)

accuracy_test = (correct / total) * 100

# Report the loss that is accumulated above, averaged over the batches
print(f"Model Test Loss: {test_loss / len(test_loader):.4f}")
print(f"Model Test Accuracy: {accuracy_test:.2f}%")

Model Test Accuracy: 96.86%
