# **Phishing Detection: Trích xuất đặc trưng**



In [1]:
#importing required packages for this module
import sys
import ipaddress
import re
from dateutil.parser import parse as date_parse
from urllib.parse import urlparse
import pandas as pd

# Thêm đường dẫn Google Drive vào sys.path
#sys.path.append('/content/drive/MyDrive/an_toan_thong_tin_3')

# Import feature extraction class từ Google Drive
from feature import FeatureExtraction

In [2]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [2]:
#loading the URLs data to dataframe
data_url = pd.read_csv("DataFiles/dataset_with_domain.csv")
print(f"Total dataset size: {data_url.shape}")
print(f"Type distribution:")
print(data_url['type'].value_counts())

Total dataset size: (504983, 3)
Type distribution:
type
legitimate    345738
phishing      159245
Name: count, dtype: int64


In [5]:
data_url.shape

(504983, 3)

In [4]:
# Lấy ngẫu nhiên 5000 mẫu legitimate và 5000 mẫu phishing
import numpy as np

# Đặt seed để kết quả có thể tái lập
np.random.seed(42)

# Lọc dữ liệu theo type
legitimate_data = data_url[data_url['type'] == 'legitimate']
phishing_data = data_url[data_url['type'] == 'phishing']

print(f"Available legitimate URLs: {len(legitimate_data)}")
print(f"Available phishing URLs: {len(phishing_data)}")

# Lấy ngẫu nhiên 5000 mẫu từ mỗi loại
legitimate_sample = legitimate_data.sample(n=1000, random_state=42)
phishing_sample = phishing_data.sample(n=1000, random_state=42)

# Gộp lại và xáo trộn
sampled_data = pd.concat([legitimate_sample, phishing_sample], ignore_index=True)
sampled_data = sampled_data.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Final sampled dataset size: {sampled_data.shape}")
print(f"Sampled type distribution:")
print(sampled_data['type'].value_counts())

# Hiển thị một vài mẫu
sampled_data.head()

Available legitimate URLs: 345738
Available phishing URLs: 159245
Final sampled dataset size: (2000, 3)
Sampled type distribution:
type
phishing      1000
legitimate    1000
Name: count, dtype: int64


Unnamed: 0,url,domain,type
0,http://www.asean-works.com/modules/mod_simplef...,www.asean-works.com,phishing
1,https://www.yellowbook.com/profile/jean-denis-...,www.yellowbook.com,legitimate
2,https://sign-in-106472.weeblysite.com/,sign-in-106472.weeblysite.com,phishing
3,https://www.stingraygraphix.co.za/,www.stingraygraphix.co.za,legitimate
4,http://trehoada.org/878hf33f34f,trehoada.org,phishing


# **Feature Extraction:**

Trích xuất đặc trưng


1.   Address Bar based Features



### **3.1. Address Bar Based Features:**


1.1. Address Bar based Features
1.1.1.	Using the IP Address
1.1.2.	Long URL to Hide the Suspicious Part
1.1.3.	Using URL Shortening Services “TinyURL”
1.1.4.	URL’s having “@” Symbol
1.1.5.	Redirecting using “//”
1.1.6.	Adding Prefix or Suffix Separated by (-) to the Domain
1.1.7.	Sub Domain and Multi Sub Domains
1.1.8.	HTTPS (Hyper Text Transfer Protocol with Secure Sockets Layer) 
1.1.9.	The Existence of “HTTPS” Token in the Domain Part of the URL



In [5]:
#Function to extract features
def featureExtraction(url, label):
    try:
        extractor = FeatureExtraction(url)
        features = extractor.features
        features.append(1 if label == 'legitimate' else -1)  # Convert label to numeric
        return features
    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        pass

In [None]:
# Extracting features từ sampled data
features = []
total_samples = len(sampled_data)

print("Bắt đầu trích xuất đặc trưng...")
print(f"Tổng số mẫu cần xử lý: {total_samples}")

for i in range(total_samples):
    if i % 100 == 0:  # In progress mỗi 1000 mẫu
        print(f"Đã xử lý: {i}/{total_samples} mẫu ({i/total_samples*100:.1f}%)")
    
    url = sampled_data['url'].iloc[i]
    label = sampled_data['type'].iloc[i]
    
    extracted_features = featureExtraction(url, label)
    features.append(extracted_features)

print(f"Hoàn thành! Đã trích xuất đặc trưng cho {len(features)} URL.")

Bắt đầu trích xuất đặc trưng...
Tổng số mẫu cần xử lý: 2000
Đã xử lý: 0/2000 mẫu (0.0%)
Đã xử lý: 100/2000 mẫu (5.0%)
Đã xử lý: 200/2000 mẫu (10.0%)
Đã xử lý: 300/2000 mẫu (15.0%)


In [None]:
featureExtraction("https://www.facebook.com/", 1)

In [None]:
#converting the list to dataframe
feature_names = ['UsingIp', 'LongUrl', 'ShortUrl', 'Symbol', 'Redirecting', 'PrefixSuffix',
                 'SubDomains', 'Https', 'Favicon', 'NonStdPort', 'HTTPSDomainURL', 'RequestURL', 
                 'AnchorURL', 'LinksInScriptTags', 'ServerFormHandler', 'InfoEmail', 
                 'WebsiteForwarding', 'StatusBarCust', 'DisableRightClick', 'UsingPopupWindow',
                 'IframeRedirection', 'LinksPointingToPage', 'Label']

data = pd.DataFrame(features, columns=feature_names)
print(f"DataFrame shape: {data.shape}")
print(f"Label distribution:")
print(data['Label'].value_counts())
data.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,Https_Domain,TinyURL,Prefix/Suffix,Label
0,sites.google.com,0,0,1,3,0,0,0,0,1
1,pub-6b5396a7447e450890996021695aad4d.r2.dev,0,0,1,1,0,0,0,1,1
2,firebasestorage.googleapis.com,0,0,1,5,0,0,1,0,1
3,open-webmail-help-desk1.weebly.com,0,0,0,0,0,0,0,1,1
4,paybyplatema.site,0,0,0,0,0,0,0,0,1


In [None]:
# Storing the extracted URLs features to csv file
data.to_csv('extracted_features_10k_samples.csv', index=False)
print("Đã lưu file extracted_features_10k_samples.csv thành công!")
print(f"File chứa {len(data)} mẫu với {len(data.columns)} đặc trưng.")