In [1]:
import pandas as pd
import time
import os
import csv
import traceback
from tqdm import tqdm
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
import re
from dateutil import parser
import signal
import urllib.parse
import tldextract
from urllib.parse import urlparse

In [2]:
# Function to extract features from a URL
def extract_url_features(url):
    # Parse the URL
    parsed_url = urlparse(url)

    # Feature 1: URL length
    url_length = len(url)

    # Feature 2: Number of dots in the URL
    num_dots = url.count('.')

    # Feature 3: Presence of hyphen in domain
    has_hyphen = 1 if re.search(r'-', parsed_url.netloc) else 0

    # Feature 4: Presence of '@' symbol
    has_at_symbol = 1 if re.search(r'@', url) else 0

    # Feature 5: URL path length
    path_length = len(parsed_url.path)

    # Feature 6: Query parameter length
    query_length = len(parsed_url.query)

    # Feature 7: Number of URL parameters
    num_params = int(len(parsed_url.query.split('&'))) if parsed_url.query else 0

    # Feature 8: Top-level domain (TLD) length
    tld_info = tldextract.extract(url)
    tld_length = len(tld_info.suffix)

    # Feature 9: Use of HTTPS
    uses_https = 1 if re.search(r'https', url) else 0

    # Feature 10: URL depth (number of '/' in path)
    url_depth = parsed_url.path.count('/')

    # Feature 11: Presence of IP address in domain
    ip_pattern = re.compile(r'(\d{1,3}\.){3}\d{1,3}')
    netloc_without_port = parsed_url.netloc.split(':')[0]  # Remove port number if present
    has_ip = 1 if ip_pattern.search(netloc_without_port) else 0

    # Feature 12: Length of the domain
    domain_length = int(len(parsed_url.netloc))

    # Feature 13: Number of subdomains
    num_subdomains = len(parsed_url.netloc.split('.')) - 2

    # Feature 14: Presence of suspicious words (e.g., 'login', 'secure', 'account')
    suspicious_words = ['login', 'secure', 'account', 'webscr', 'banking', 'confirm', 'verification']
    contains_suspicious_word = 0
    for word in suspicious_words:
        if word in url.lower():
            contains_suspicious_word = 1
            break

    # Feature 15: Length of the hostname
    hostname_length = int(len(parsed_url.hostname)) if parsed_url.hostname else 0

    # Feature 16: Use of port in URL
    has_port = 1 if parsed_url.port else 0

    # Feature 17: Number of special characters in the URL (like '?', '=', '#', etc.)
    special_chars = ['?', '=', '#', '%', '&']
    num_special_chars = sum([url.count(char) for char in special_chars])

    # Feature 18: Whether the URL is shortened (e.g., using bit.ly, tinyurl)
    shortening_services = ['bit.ly', 'tinyurl.com', 'goo.gl', 'shorte.st', 'go2l.ink', 'x.co', 'ow.ly', 't.co']
    is_shortened = 1 if any(service in url.lower() for service in shortening_services) else 0

    # Feature 19: Length of the URL fragment (after '#')
    fragment_length = int(len(parsed_url.fragment)) if parsed_url.fragment else 0

    # Feature 20: Position of the first occurrence of "//" in the URL
    double_slash_pos = url.find('//')

    # Feature 21: Ratio of digits to the total length of the URL
    digit_count = sum(c.isdigit() for c in url)
    digit_to_length_ratio = digit_count / url_length if url_length > 0 else 0

    # Feature 22: Number of unique characters in the URL
    num_unique_chars = len(set(url))

    # Feature 23: Presence of hexadecimal characters (indicative of encoded URLs)
    has_hex_chars = 1 if re.search(r'%[0-9a-fA-F]{2}', url) else 0

    # Feature 24: Length of the file extension (if present)
    file_extension = parsed_url.path.split('.')[-1] if '.' in parsed_url.path else ''
    file_extension_length = len(file_extension)

    # Feature 25: Number of underscores ('_') in the URL
    num_underscores = url.count('_')


    netcraft_url = f"https://sitereport.netcraft.com/?url={url}"
    driver.get(netcraft_url)
    WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

    html_content = driver.page_source
    soup = BeautifulSoup(html_content, 'html.parser')

    # bagroung section - 2 features
    baground_section = soup.find('section', {'id': 'background_table_section'})
    date_first_seen = None
    domain_age_days = None
    site_rank = None

    if baground_section:
      rows = baground_section.find_all('tr')
      for row in rows:
        th = row.find('th')
        if th and "Site rank" in th.text: #need to adjust Not Present
          td = row.find('td')
          site_rank = td.text.strip()
          try:
            site_rank = int(site_rank)
          except:
            site_rank = None
        if th and "Date first seen" in th.text:
          td = row.find('td')
          date_first_seen = td.text.strip()
          break

    if date_first_seen:
      try:
        first_seen_date = datetime.strptime(date_first_seen, "%B %Y")
        current_date = datetime.now()
        domain_age_days = (current_date - first_seen_date).days
      except:
        domain_age_days = None

    # network Section - 5 features
    network_section = soup.find('section', {'id': 'network_table_section'})
    asn = None
    reverse_dns_present = None
    netblock_owner = None
    organisation = None
    dnsssec = None

    if network_section:
      rows = network_section.find_all('tr')
      for row in rows:
        th = row.find('th')
        if th and "Netblock Owner" in th.text:
          td = row.find('td')
          netblock_owner = td.text.strip()
        if th and "IPv4 autonomous systems" in th.text:
          anchor = row.find('a')
          asn = anchor.text.strip()
          asn = asn[2:]
        if th and "Reverse DNS" in th.text:
          td = row.find('td')
          reverse_dns = td.text.strip()
          if reverse_dns not in ["Unknown","Not available"]:
            reverse_dns_present = 1
          else:
            reverse_dns_present = 0
        if th and "Organisation" in th.text:
          td = row.find('td')
          organisation = td.text.strip()
          if organisation not in ["Unknown","Not available"]:
            organisation = 1
          else:
            organisation = 0
        if th and "DNS Security Extensions" in th.text:
          td = row.find('td')
          dnsssec = td.text.strip()
          if dnsssec == "Enabled":
            dnsssec = 1
          else:
            dnsssec = 0
          break

      # Collect all the features in a list
    features = [
    url_length, num_dots, has_hyphen, has_at_symbol, path_length, query_length,
    num_params, tld_length, uses_https, url_depth, has_ip, domain_length,
    num_subdomains, contains_suspicious_word, hostname_length, has_port,
    num_special_chars, is_shortened, fragment_length, double_slash_pos,
    digit_to_length_ratio, num_unique_chars, has_hex_chars, file_extension_length,
    num_underscores, domain_age_days, site_rank, netblock_owner, asn,
    reverse_dns_present, organisation, dnsssec]

    return features

In [3]:
# Load the input CSV file
file_path = 'legitimate_urls_chunk_no_22.csv'
df = pd.read_csv(file_path)

feature_names = [
    "url_length", "num_dots", "has_hyphen", "has_at_symbol", "path_length", "query_length",
    "num_params", "tld_length", "uses_https", "url_depth", "has_ip", "domain_length",
    "num_subdomains", "contains_suspicious_word", "hostname_length", "has_port",
    "num_special_chars", "is_shortened", "fragment_length", "double_slash_pos",
    "digit_to_length_ratio", "num_unique_chars", "has_hex_chars", "file_extension_length",
    "num_underscores", "domain_age_days", "site_rank", "netblock_owner", "asn",
    "reverse_dns_present", "organisation", "dnsssec","class"
]

# Create the DataFrame with the first row for column titles
features_df = pd.DataFrame(columns=feature_names)

In [4]:
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
driver = webdriver.Chrome(options=chrome_options)

for index, row in tqdm(df.iterrows(), total=len(df)):
    url = row['URL']
    features = extract_url_features(url)

    class_label = 1

    data_row = features + [class_label]

    # Append the data to the DataFrame
    features_df.loc[len(features_df)] = data_row

  features_df.loc[len(features_df)] = data_row
 71%|███████   | 1331/1878 [2:04:51<51:18,  5.63s/it]  


WebDriverException: Message: unknown error: net::ERR_INTERNET_DISCONNECTED
  (Session info: chrome=136.0.7103.94)
Stacktrace:
#0 0x5ed611fc775a <unknown>
#1 0x5ed611a6a0a0 <unknown>
#2 0x5ed611a61177 <unknown>
#3 0x5ed611a52199 <unknown>
#4 0x5ed611a53e8d <unknown>
#5 0x5ed611a5252e <unknown>
#6 0x5ed611a51ece <unknown>
#7 0x5ed611a51ba2 <unknown>
#8 0x5ed611a4f9ef <unknown>
#9 0x5ed611a501aa <unknown>
#10 0x5ed611a6d5d9 <unknown>
#11 0x5ed611b07eb5 <unknown>
#12 0x5ed611ae13a2 <unknown>
#13 0x5ed611b072a0 <unknown>
#14 0x5ed611ae1173 <unknown>
#15 0x5ed611aadd4b <unknown>
#16 0x5ed611aae9b1 <unknown>
#17 0x5ed611f8c90b <unknown>
#18 0x5ed611f9080a <unknown>
#19 0x5ed611f74662 <unknown>
#20 0x5ed611f91394 <unknown>
#21 0x5ed611f5949f <unknown>
#22 0x5ed611fb5538 <unknown>
#23 0x5ed611fb5716 <unknown>
#24 0x5ed611fc65c6 <unknown>
#25 0x75c14609caa4 <unknown>
#26 0x75c146129c3c <unknown>


In [5]:
#save data to output file.
features_df.to_csv('/home/tejas/Desktop/phish_dataset/dataset_chunks/legitimate_dataset_chunk_22A.csv', index=False)