In [1]:
pip install networkx beautifulsoup4 requests

Collecting networkx
  Downloading networkx-3.1-py3-none-any.whl.metadata (5.3 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting requests
  Using cached requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.5-py3-none-any.whl.metadata (4.7 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl.metadata (33 kB)
Collecting idna<4,>=2.5 (from requests)
  Downloading idna-3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.2.1-py3-none-any.whl.metadata (6.4 kB)
Collecting certifi>=2017.4.17 (from requests)
  Using cached certifi-2024.2.2-py3-none-any.whl.metadata (2.2 kB)
Downloading networkx-3.1-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0

In [2]:
pip install pandas torch torchvision torch-geometric


Collecting torch
  Downloading torch-2.2.2-cp38-none-macosx_11_0_arm64.whl.metadata (25 kB)
Collecting torchvision
  Downloading torchvision-0.17.2-cp38-cp38-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting filelock (from torch)
  Downloading filelock-3.13.4-py3-none-any.whl.metadata (2.8 kB)
Collecting sympy (from torch)
  Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)
Collecting jinja2 (from torch)
  Using cached Jinja2-3.1.3-py3-none-any.whl.metadata (3.3 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
Collecting tqdm (from torch-geometric)
  Downloading tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m3.2 M

In [1]:
# Every dependency of the pipeline, gathered ahead of the first executable statement.
import networkx as nx
import numpy as np
import pandas as pd
import requests
import torch
from bs4 import BeautifulSoup
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv

# Location of the filtered phishing-URL table.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
DATASET_CSV = '/Users/rollie/Documents/URL_Phishing_Detection/notebooks/filtered_file.csv'
dataset = pd.read_csv(DATASET_CSV)

# Extract every hyperlink found on the page served at a URL.
def fetch_hyperlinks(url, timeout=10):
    """Download ``url`` and return the ``href`` of every ``<a>`` tag on the page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up on this URL.

    Returns:
        list: The non-empty ``href`` values in document order, exactly as
        written in the HTML. An empty list is returned when the download or
        the parsing raises; the error is printed rather than propagated.
    """
    try:
        # requests waits without limit unless a timeout is given, so a single
        # unresponsive host would otherwise stall the whole crawl.
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        return [href for href in hrefs if href]
    except Exception as e:
        # Deliberate best effort: one bad page must not abort the graph build.
        print(f"Error fetching {url}: {e}")
        return []

def build_graph(dataset):
    """Build a directed hyperlink graph over the URLs listed in ``dataset``.

    Every distinct value of the ``URL`` column becomes a node; the first row
    seen for a URL supplies its feature vector and label. An edge ``u -> v``
    is added when the page at ``u`` contains a hyperlink whose ``href`` is
    exactly ``v`` and ``v`` is itself a URL in the dataset.

    Args:
        dataset: pandas DataFrame with a ``URL`` column, a ``label`` column
            and the feature columns listed in ``feature_columns`` below.

    Returns:
        tuple: ``(G, features, labels)`` — the ``networkx.DiGraph`` plus two
        NumPy arrays whose i-th row / entry belongs to the i-th node added
        to ``G``.
    """
    # Columns of the dataset used as node features.
    # NOTE(review): 'TLD' looks categorical; if it holds strings, np.array(features)
    # will not be numeric — confirm it is numerically encoded in the CSV.
    feature_columns = ['TLD', 'URLLength', 'IsDomainIP', 'NoOfSubDomain', 'NoOfObfuscatedChar',
                       'IsHTTPS', 'NoOfDegitsInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL',
                       'NoOfAmpersandInURL']

    G = nx.DiGraph()
    url_to_index = {}
    features = []
    labels = []

    # Pass 1: register every distinct URL before any edge is considered, so that
    # a link to a URL appearing later in the dataset is not silently dropped.
    for _, row in dataset.iterrows():
        url = row['URL']
        if url in url_to_index:
            continue
        G.add_node(url)
        url_to_index[url] = len(url_to_index)
        features.append([row[column] for column in feature_columns])
        labels.append(row['label'])

    # Pass 2: download each distinct page once (duplicate rows no longer trigger
    # repeated requests) and keep only links that point at another known node.
    for url in url_to_index:
        for link in fetch_hyperlinks(url):
            if link in url_to_index:
                G.add_edge(url, link)

    features = np.array(features)
    labels = np.array(labels)
    return G, features, labels

graph, features, labels = build_graph(dataset)

# Prepare the PyTorch Geometric inputs.
x = torch.tensor(features, dtype=torch.float)
y = torch.tensor(labels, dtype=torch.long)

# Graph nodes are URL strings, so they cannot be cast with int(). Map each URL to
# its position in graph.nodes() instead; build_graph appends a feature row every
# time it adds a node and networkx keeps insertion order, so that position is also
# the node's row in `x`.
node_position = {url: position for position, url in enumerate(graph.nodes())}
edge_pairs = [(node_position[source], node_position[target]) for source, target in graph.edges()]
# reshape(-1, 2) keeps the result at shape [2, 0] when the graph has no edges.
edge_index = torch.tensor(edge_pairs, dtype=torch.long).reshape(-1, 2).t().contiguous()

data = Data(x=x, edge_index=edge_index, y=y)

# train() indexes with data.train_mask, so create a reproducible random 80/20
# node split; the remaining nodes are exposed as data.test_mask.
num_nodes = x.size(0)
split_generator = torch.Generator().manual_seed(42)
shuffled_nodes = torch.randperm(num_nodes, generator=split_generator)
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[shuffled_nodes[:int(0.8 * num_nodes)]] = True
data.train_mask = train_mask
data.test_mask = ~train_mask

class GCN(torch.nn.Module):
    """Two GCNConv layers with ReLU and dropout in between.

    ``forward`` returns the second layer's output as-is (no softmax), one row
    per node and one column per class.
    """

    def __init__(self, num_features, num_classes, hidden_channels=16, dropout=0.5):
        """Create the two convolution layers.

        Args:
            num_features: Number of input channels per node.
            num_classes: Number of output channels per node.
            hidden_channels: Width of the layer between the two convolutions.
            dropout: Probability used by the dropout applied after the ReLU.
        """
        super().__init__()
        self.dropout = dropout
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Return the per-node class scores for ``data``.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.
        """
        x, edge_index = data.x, data.edge_index
        x = torch.relu(self.conv1(x, edge_index))
        # Use the functional API: its flag is named `training`, whereas the
        # low-level torch.dropout op expects `train` and rejects `training=`.
        x = torch.nn.functional.dropout(x, p=self.dropout, training=self.training)
        return self.conv2(x, edge_index)

# Size the network from the data instead of hard-coding it: one input channel per
# feature column, and enough outputs to cover the largest label id
# (CrossEntropyLoss requires every target to lie in [0, num_classes - 1]).
model = GCN(num_features=data.num_node_features, num_classes=int(data.y.max()) + 1)

# Training setup.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Run one optimisation step on the training nodes and return the loss value.

    Works on the module-level ``model``, ``optimizer``, ``criterion`` and
    ``data``; assumes ``data.train_mask`` has already been created.
    """
    model.train()
    optimizer.zero_grad()

    logits = model(data)
    train_nodes = data.train_mask
    step_loss = criterion(logits[train_nodes], data.y[train_nodes])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

# Number of training epochs.
NUM_EPOCHS = 200

for epoch in range(NUM_EPOCHS):
    loss = train()
    print(f'Epoch {epoch}, Loss: {loss}')



KeyboardInterrupt: 