In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from utils import run_ML

In [3]:
# !pip install tldextract
# !pip install torch_geometric

In [4]:
# Location of the labelled URL dataset (a CSV file whose first column is the index).
data_dir = "data/URLdatasetX2_1.csv"
df = pd.read_csv(data_dir, index_col=0)

In [5]:
# Quick sanity check: frame dimensions plus the first two rows.
df.shape, df.head(2)

((2802, 2),
                                           url        type
 0       http://www.crestonwood.com/router.php  legitimate
 1  http://vamoaestudiarmedicina.blogspot.com/  legitimate)

In [6]:
# smalldata = df.sample(n = 20000, random_state=1)
smalldata = df.sample(n = 2000, random_state=1) # draw 2000 random rows; random_state=1 makes the draw repeatable

In [7]:
from sklearn.preprocessing import LabelEncoder

# The class name of every sampled URL lives in the last column; map those
# strings to integer class ids. The fitted encoder is kept so the mapping
# can be inspected or inverted later.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(smalldata.iloc[:, -1].values)

### Feature extraction

In [8]:
from utils import extract_features

In [9]:
# Example usage: extract the feature dict of one hand-written URL and print
# it, to show which keys extract_features produces.
url = "http://www.example.com/path/to/==file.html"
url_features = extract_features(url)
print(url_features)

{'domain': 'www.example.com', 'num_subdomains': 2, 'contains_ip': 0, 'path_length': 20, 'num_path_segments': 3, 'uses_https': 0, 'file_extension': 'html', 'count_special_characters': 11, 'count_non_alphanumeric_characters': 11, 'TLD': 'com', 'count_obfuscated_characters': 0, 'letter_ratio_in_url': 0.7380952380952381, 'digit_ratio_in_url': 0.0, 'count_equals_in_url': 2, 'NoOfAmpersandInURL': 0, 'CharContinuationRate': 0.11904761904761904, 'ratio_obfuscated_characters': 0.0, 'NoOfQMarkInURL': 0}


In [10]:
# print(url_features.keys())

In [11]:
# Build one feature row per sampled URL (numerical and categorical values).
# The first entry of every feature dict is skipped, so each row starts at the
# second feature.
phish_url = []
for link in smalldata.iloc[:, 0]:
    url_features = extract_features(link)
    feature_values = list(url_features.values())
    phish_url.append(feature_values[1:])

In [12]:
# Column names are the feature-dict keys minus the first one, matching the
# rows collected in phish_url.
feature_names = list(url_features)[1:]
phish_url_df = pd.DataFrame(phish_url, columns=feature_names)

In [13]:
# phish_url_df.head(2)

In [14]:
# Replace the two string-valued features with integer category codes.
# The columns are addressed by name (they sit at positions 5 and 8 of the
# frame) so the encoding keeps working if the feature order ever changes.
# Assigning to the column label replaces the whole column, so the result
# holds the integer codes rather than the original strings.
for categorical_column in ("file_extension", "TLD"):
    phish_url_df[categorical_column] = pd.Categorical(
        phish_url_df[categorical_column]
    ).codes

In [15]:
# Preview the encoded feature table (first two rows).
phish_url_df.head(2)

Unnamed: 0,num_subdomains,contains_ip,path_length,num_path_segments,uses_https,file_extension,count_special_characters,count_non_alphanumeric_characters,TLD,count_obfuscated_characters,letter_ratio_in_url,digit_ratio_in_url,count_equals_in_url,NoOfAmpersandInURL,CharContinuationRate,ratio_obfuscated_characters,NoOfQMarkInURL
0,2,0,40,3,1,0,9,9,37,0,0.569231,0.292308,0,0,0.076923,0.0,0
1,2,0,1,1,1,0,6,6,37,0,0.785714,0.0,0,0,0.142857,0.0,0


In [16]:
# Evaluate on the full manual feature table (categorical columns already
# integer-coded). run_ML is imported from utils; the last two arguments are
# string tags passed through to it (presumably dataset and feature-set names
# -- TODO confirm against utils.run_ML).
run_ML(phish_url_df, labels, "URLdatasetX2", "manual")

Run:  0 , fold:  0
Train freq:  [1179, 421]
kNN, LightGBM, Run:  0 , fold:  1
Train freq:  [1165, 435]
kNN, LightGBM, Run:  0 , fold:  2
Train freq:  [1180, 420]
kNN, LightGBM, Run:  0 , fold:  3
Train freq:  [1166, 434]
kNN, LightGBM, Run:  0 , fold:  4
Train freq:  [1158, 442]
kNN, LightGBM, ['kNN', 'LightGBM']
[0.68 0.87]


In [17]:
## test on numerical URLs features
from utils import extract_numerical_features

# Same evaluation as before, restricted to the numeric-only feature set:
# one row of values per sampled URL, stacked into a NumPy array.
phish_url = []
for link in smalldata.iloc[:, 0]:
    url_features = extract_numerical_features(link)
    phish_url.append(list(url_features.values()))
numerical_features = np.array(phish_url)
run_ML(numerical_features, labels, "URLdatasetX2", "manual_numerical")

Run:  0 , fold:  0
Train freq:  [1179, 421]
kNN, LightGBM, Run:  0 , fold:  1
Train freq:  [1165, 435]
kNN, LightGBM, Run:  0 , fold:  2
Train freq:  [1180, 420]
kNN, LightGBM, Run:  0 , fold:  3
Train freq:  [1166, 434]
kNN, LightGBM, Run:  0 , fold:  4
Train freq:  [1158, 442]
kNN, LightGBM, ['kNN', 'LightGBM']
[0.69 0.83]


### Extract graph features

In [18]:
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

In [19]:
# return root and hyperlinks features
def get_graph_features(idx, timeout=10, max_anchors=51):
    """Collect the feature rows for one sampled URL and its outgoing links.

    Row 0 holds the numerical features of the URL at position ``idx`` of
    ``smalldata``. It is followed by one row per absolute http(s) hyperlink
    found on the fetched page. When the page cannot be fetched, or contains
    no such hyperlink, a single all-zero placeholder row is appended instead,
    so the result always has at least two rows.

    Args:
        idx: positional row index into ``smalldata``.
        timeout: seconds to wait for the HTTP request. Without a timeout a
            single unresponsive host would block its worker indefinitely.
        max_anchors: at most this many ``<a>`` tags of the page are scanned.

    Returns:
        ``(idx, rows)`` where ``rows`` is a list of equally long feature lists.
    """
    url = smalldata.iloc[idx, 0]
    root_row = list(extract_numerical_features(url).values())
    # Size the placeholder from the root row so that every row has the same
    # width, whatever number of features the extractor returns.
    empty_row = [0.0] * len(root_row)
    hyperlink_data = [root_row]

    try:
        response = requests.get(url, timeout=timeout)
        html = response.text
    except requests.exceptions.RequestException:
        # Base class of every requests failure (connection errors, timeouts,
        # redirect loops, invalid URLs, ...): treat the page as having no
        # reachable hyperlinks instead of aborting the whole crawl.
        hyperlink_data.append(list(empty_row))
        return (idx, hyperlink_data)

    soup = BeautifulSoup(html, 'html.parser')

    # Keep only absolute http(s) links. A plain substring test would also
    # accept relative links that merely contain "http" somewhere.
    urls = []
    for anchor in soup.find_all('a')[:max_anchors]:
        href = anchor.get('href')
        if href is None:
            continue
        href = href.strip()
        if href.lower().startswith(('http://', 'https://')):
            urls.append(href)

    # extract numerical features from the hyperlinks
    for link in urls:
        try:
            link_row = list(extract_numerical_features(link).values())
        except ValueError:
            # Feature extraction rejected this link: keep the node but give
            # it an all-zero feature row.
            link_row = list(empty_row)
        hyperlink_data.append(link_row)

    if not urls:
        hyperlink_data.append(list(empty_row))

    return (idx, hyperlink_data)

In [20]:
n_test_samples = 200 # number of sampled URLs to crawl for hyperlinks
from joblib import Parallel, delayed
results = Parallel(n_jobs=8)(delayed(get_graph_features)(i) for i in range(n_test_samples)) # crawl rows 0..n_test_samples-1 of smalldata with 8 parallel jobs

## Graph classification with PyTorch Geometric (PyG)

In [21]:
# # Transfer data object to GPU.
# device = torch.device('cuda')
# data = data.to(device)

In [22]:
import torch
from torch_geometric.data import Data, Dataset

class GraphClassificationDataset(Dataset):
    """In-memory PyG dataset backed by a list of pre-built ``Data`` graphs.

    The class implements the ``len`` / ``get`` hooks of the PyG ``Dataset``
    base class and initialises that base class. Integer indexing, slicing,
    ``len()``, ``shuffle()`` and ``index_select()`` are then all provided by
    the base implementation, and a slice is itself a dataset.

    Args:
        graphs: list of ``torch_geometric.data.Data`` objects, each carrying
            its own label in ``y``.
    """

    def __init__(self, graphs):
        self.graphs = graphs
        # No root directory or transforms: the graphs are already in memory.
        super().__init__()

    def len(self):
        """Return the number of stored graphs."""
        return len(self.graphs)

    def get(self, idx):
        """Return the graph stored at integer position ``idx``."""
        return self.graphs[idx]

In [23]:
# Turn every crawl result into a star-shaped graph: node 0 is the root URL,
# nodes 1..n are the rows collected for its hyperlinks, and each of them is
# connected to the root in both directions.
# Only graphs with more than 5 hyperlink nodes are kept.
graphs = []
for idx, graph_feature in results:
    n_hyperlinks = len(graph_feature) - 1
    child_id = list(range(1, n_hyperlinks + 1))
    source_id = [0] * n_hyperlinks
    # First row lists edge sources, second row edge targets; root->child
    # edges come first, followed by the child->root edges.
    edge_index = torch.tensor(
        [source_id + child_id, child_id + source_id],
        dtype=torch.long,
    )
    data = Data(
        x=torch.tensor(graph_feature, dtype=torch.float),
        edge_index=edge_index,
        y=torch.tensor([labels[idx]], dtype=torch.int64),
    )
    if n_hyperlinks > 5:
        graphs.append(data)

# Create a dataset instance
dataset = GraphClassificationDataset(graphs)

In [24]:
# Inspect one graph of the dataset.
dataset[10]

Data(x=[18, 15], edge_index=[2, 34], y=[1])

In [25]:
# Print a short summary of the graph dataset.
print()
for summary_line in (
    f'Dataset: {dataset}:',
    '====================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
):
    print(summary_line)

data = dataset[0]  # Get the first graph object.


Dataset: GraphClassificationDataset(94):
Number of graphs: 94
Number of features: 15
Number of classes: 2


In [26]:
# dataset = dataset.shuffle()
# Sequential 80/20 split: the first 80% of the graphs are used for training,
# the remainder for testing (no shuffling is applied here).
n_samples = len(dataset)
split_at = int(0.8 * n_samples)
train_dataset, test_dataset = dataset[:split_at], dataset[split_at:]

In [27]:
# Number of graphs in the training and test parts.
len(train_dataset), len(test_dataset)

(75, 19)

In [28]:
from torch_geometric.loader import DataLoader

# Batches of 16 graphs; shuffling is enabled for the training loader only.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [29]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    """Three GCNConv layers, mean-pool readout and a linear classifier.

    Args:
        hidden_channels: output width of each of the three GCNConv layers.
        in_channels: size of the node feature vectors. Defaults to
            ``dataset.num_node_features`` of the module-level ``dataset``.
        num_classes: number of output logits. Defaults to
            ``dataset.num_classes`` of the module-level ``dataset``.
    """

    def __init__(self, hidden_channels, in_channels=None, num_classes=None):
        super().__init__()
        # Seed before the layers are created so that their initial weights
        # are the same on every instantiation.
        torch.manual_seed(12345)
        if in_channels is None:
            in_channels = dataset.num_node_features
        if num_classes is None:
            num_classes = dataset.num_classes
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of class logits per graph in the batch.

        Args:
            x: node feature matrix of the batched graphs.
            edge_index: edge list of the batched graphs.
            batch: vector assigning every node to its graph.
        """
        # 1. Obtain node embeddings (no activation after the last layer)
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer: average the node embeddings of each graph
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier (dropout is active in training mode only)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)

        return x

# Instantiate the network with 64 hidden channels and print its layers.
model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GCNConv(15, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [30]:
# Fresh model plus the Adam optimizer (lr=0.01) and cross-entropy loss that
# train() and test() below read as module-level names.
model = GCN(hidden_channels=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Run one optimisation pass over every batch of ``train_loader``.

    Uses the module-level ``model``, ``criterion`` and ``optimizer``.
    """
    model.train()

    for batch in train_loader:
        # Forward pass on the batched graphs, then loss against the labels.
        logits = model(batch.x, batch.edge_index, batch.batch)
        batch_loss = criterion(logits, batch.y)
        # Backpropagate, update the parameters, then reset the gradients
        # for the next batch.
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

def test(loader):
    """Return the accuracy of the module-level ``model`` on ``loader``.

    Args:
        loader: data loader yielding batched graphs with labels in ``y``.

    Returns:
        Fraction of graphs in ``loader.dataset`` whose predicted class equals
        the label.
    """
    model.eval()

    correct = 0
    # Evaluation needs no gradients; disabling autograd avoids building a
    # computation graph for every batch.
    with torch.no_grad():
        for batch in loader:  # Iterate in batches over the training/test dataset.
            logits = model(batch.x, batch.edge_index, batch.batch)
            pred = logits.argmax(dim=1)  # Use the class with the highest score.
            correct += int((pred == batch.y).sum())  # Check against ground-truth labels.
    return correct / len(loader.dataset)  # Derive ratio of correct predictions.


# range(1, 10) yields epochs 1..9, i.e. nine passes over the training data.
# After each pass, accuracy is measured on both the training and test loaders.
for epoch in range(1, 10):
    train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

Epoch: 001, Train Acc: 0.9333, Test Acc: 0.8947
Epoch: 002, Train Acc: 0.9333, Test Acc: 0.8947
Epoch: 003, Train Acc: 0.9333, Test Acc: 0.8947
Epoch: 004, Train Acc: 0.9333, Test Acc: 0.8947
Epoch: 005, Train Acc: 0.9333, Test Acc: 0.8947
Epoch: 006, Train Acc: 0.9467, Test Acc: 0.8947
Epoch: 007, Train Acc: 0.9333, Test Acc: 0.8947
Epoch: 008, Train Acc: 0.9333, Test Acc: 0.8947
Epoch: 009, Train Acc: 0.9467, Test Acc: 0.8947
