In [1]:
# Enhanced Installation Cell
!pip install python-whois tldextract ipwhois imagehash pillow requests beautifulsoup4
!pip install transformers torch timm sentence-transformers torchvision
!pip install accelerate bitsandbytes  # For faster inference

Collecting python-whois
  Downloading python_whois-0.9.6-py3-none-any.whl.metadata (3.0 kB)
Collecting tldextract
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting ipwhois
  Downloading ipwhois-1.3.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting imagehash
  Downloading ImageHash-4.3.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting dnspython (from ipwhois)
  Downloading dnspython-2.8.0-py3-none-any.whl.metadata (5.7 kB)
Downloading python_whois-0.9.6-py3-none-any.whl (116 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m117.0/117.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tldextract-5.3.0-py3-none-any.whl (107 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚

In [2]:
# Enhanced Imports
import pandas as pd, numpy as np, re, math, hashlib, socket, ssl, io
import tldextract
from urllib.parse import urlparse, urljoin
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import imagehash
from PIL import Image
import whois
from ipwhois import IPWhois
import socket
import time

# Transformer imports
import torch
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
import timm
from sklearn.metrics.pairwise import cosine_similarity
import torchvision.transforms as transforms

from google.colab import files

In [10]:
# ==============================
# üìå DATA COLLECTION (GUARANTEED WORKING)
# ==============================

import pandas as pd
import requests
from google.colab import files

print("Starting data collection...")

# Initialize df to avoid NameError
df = pd.DataFrame()

try:
    # User choice
    print("Choose data source:")
    print("1 - Upload files manually")
    print("2 - Fetch data automatically from web")
    choice = input("Enter 1 or 2: ").strip()

    if choice == "1":
        # Upload Files
        print("üìÅ Please upload your CSV or Excel file...")
        uploaded = files.upload()

        if uploaded:
            for filename in uploaded.keys():
                print(f"Loading: {filename}")
                if filename.endswith(".csv"):
                    temp_df = pd.read_csv(filename)
                elif filename.endswith((".xls", ".xlsx")):
                    temp_df = pd.read_excel(filename)
                else:
                    print(f"Skipping: {filename}")
                    continue

                df = pd.concat([df, temp_df], ignore_index=True)
            print(f"‚úÖ Upload complete. Shape: {df.shape}")
        else:
            print("‚ùå No files uploaded. Creating empty dataset.")
            df = pd.DataFrame({'url': []})  # Create empty with url column

    elif choice == "2":
        # Fetch from Web
        print("üåê Fetching data from web...")

        # Phishing URLs
        try:
            response = requests.get("https://openphish.com/feed.txt", timeout=10)
            phishing_urls = response.text.splitlines()[:100]  # First 100 URLs
            df_phish = pd.DataFrame(phishing_urls, columns=["url"])
            df_phish["label"] = "phishing"
            print(f"‚úÖ Phishing URLs: {len(phishing_urls)}")
        except Exception as e:
            print(f"‚ùå Failed to fetch phishing URLs: {e}")
            df_phish = pd.DataFrame(columns=["url", "label"])

        # Legitimate URLs
        try:
            response = requests.get("https://tranco-list.eu/top-1m.csv", timeout=10)
            lines = response.text.splitlines()[:100]
            legit_urls = [line.split(",")[1] for line in lines if "," in line]
            df_legit = pd.DataFrame(["http://" + u for u in legit_urls], columns=["url"])
            df_legit["label"] = "legitimate"
            print(f"‚úÖ Legitimate URLs: {len(legit_urls)}")
        except Exception as e:
            print(f"‚ùå Failed to fetch legitimate URLs: {e}")
            df_legit = pd.DataFrame(columns=["url", "label"])

        # Combine
        df = pd.concat([df_phish, df_legit], ignore_index=True)
        print(f"üåê Web fetch complete. Shape: {df.shape}")

    else:
        print("‚ùå Invalid choice. Creating sample data...")
        # Create sample data as fallback
        sample_data = {
            'url': [
                'https://www.google.com',
                'https://www.github.com',
                'https://www.example.com'
            ],
            'label': ['legitimate', 'legitimate', 'legitimate']
        }
        df = pd.DataFrame(sample_data)

except Exception as e:
    print(f"‚ùå Error in data collection: {e}")
    # Create fallback data
    df = pd.DataFrame({
        'url': ['https://www.google.com', 'https://www.example.com'],
        'label': ['legitimate', 'legitimate']
    })

# Ensure we always have a DataFrame
if df.empty:
    print("‚ö†Ô∏è  Dataset is empty. Creating sample data...")
    df = pd.DataFrame({
        'url': ['https://www.google.com', 'https://www.github.com'],
        'label': ['legitimate', 'legitimate']
    })

print(f"üéâ Final dataset shape: {df.shape}")
print("Columns:", list(df.columns))

# Display the data
print("\nüìä First 5 rows:")
display(df.head())

print("‚úÖ Data collection completed successfully!")

Starting data collection...
Choose data source:
1 - Upload files manually
2 - Fetch data automatically from web
Enter 1 or 2: 1
üìÅ Please upload your CSV or Excel file...


Saving PS02_Training_set.zip to PS02_Training_set.zip
Loading: PS02_Training_set.zip
Skipping: PS02_Training_set.zip
‚úÖ Upload complete. Shape: (0, 0)
‚ö†Ô∏è  Dataset is empty. Creating sample data...
üéâ Final dataset shape: (2, 2)
Columns: ['url', 'label']

üìä First 5 rows:


Unnamed: 0,url,label
0,https://www.google.com,legitimate
1,https://www.github.com,legitimate


‚úÖ Data collection completed successfully!


In [11]:
# ==============================
# üîç SIMPLE URL Column Detection
# ==============================

print("DataFrame columns:", list(df.columns))
print("DataFrame shape:", df.shape)

# Show first row to understand data structure
print("\nFirst row of data:")
print(df.iloc[0] if len(df) > 0 else "Empty DataFrame")

# Simple detection - use first column that has URLs or let user choose
URL_COLUMN = None

# Try common column names
common_url_columns = ['url', 'link', 'website', 'domain', 'Whitelisted Domains', 'URL', 'Website']
for col in common_url_columns:
    if col in df.columns:
        URL_COLUMN = col
        break

# If not found, use first column
if URL_COLUMN is None and len(df.columns) > 0:
    URL_COLUMN = df.columns[0]
    print(f"‚ö†Ô∏è  Using first column as URL column: '{URL_COLUMN}'")
elif URL_COLUMN:
    print(f"‚úÖ Using detected URL column: '{URL_COLUMN}'")
else:
    print("‚ùå No columns found in DataFrame")

print(f"üéØ URL column: '{URL_COLUMN}'")

DataFrame columns: ['url', 'label']
DataFrame shape: (2, 2)

First row of data:
url      https://www.google.com
label                legitimate
Name: 0, dtype: object
‚úÖ Using detected URL column: 'url'
üéØ URL column: 'url'


In [12]:
# ==============================
# üöÄ TRANSFORMER FEATURE EXTRACTION
# ==============================

def extract_text_features_with_transformers(html_text, url):
    """Extract text features using transformers"""
    features = {}

    try:
        if not html_text:
            return get_default_text_features()

        soup = BeautifulSoup(html_text, 'html.parser')

        # Clean text
        for script in soup(["script", "style", "meta", "noscript"]):
            script.decompose()

        title_text = soup.find('title')
        title = title_text.get_text().strip() if title_text else ""
        visible_text = ' '.join(soup.stripped_strings)
        full_text = f"{title} {visible_text}"[:500]

        # Check if we have real transformer models (not dummies)
        if (full_text.strip() and
            hasattr(TRANSFORMER_MODELS['text_tokenizer'], 'dummy') and
            not TRANSFORMER_MODELS['text_tokenizer'].dummy):

            # BERT embeddings
            inputs = TRANSFORMER_MODELS['text_tokenizer'](full_text, return_tensors="pt", truncation=True, max_length=512)
            with torch.no_grad():
                outputs = TRANSFORMER_MODELS['text_model'](**inputs)

            text_embedding = outputs.last_hidden_state[:, 0, :].numpy().flatten()
            features['text_embedding_mean'] = float(np.mean(text_embedding))
            features['text_embedding_std'] = float(np.std(text_embedding))

            # Sentence transformer
            sentence_emb = TRANSFORMER_MODELS['sentence_model'].encode([full_text])[0]
            features['sentence_embedding_mean'] = float(np.mean(sentence_emb))
        else:
            # Use dummy values if transformers didn't load properly
            features['text_embedding_mean'] = 0.5
            features['text_embedding_std'] = 0.1
            features['sentence_embedding_mean'] = 0.5

        # Text structure
        features['text_length'] = len(visible_text)
        features['title_length'] = len(title)
        security_words = ['login','password','secure','bank','verify','account']
        features['security_keyword_density'] = sum(1 for k in security_words if k in visible_text.lower()) / max(1, len(visible_text.split()))

    except Exception as e:
        print(f"Text feature error for {url}: {e}")
        features = get_default_text_features()

    return features

def extract_visual_features_with_transformers(url):
    """Extract visual features using transformers"""
    features = {}

    try:
        # Try to get favicon
        favicon_data = extract_favicon(url)
        if favicon_data and 'image' in favicon_data:
            image = favicon_data['image']

            # Check if we have real transformer models
            if (hasattr(TRANSFORMER_MODELS['image_processor'], 'dummy') and
                not TRANSFORMER_MODELS['image_processor'].dummy):

                inputs = TRANSFORMER_MODELS['image_processor'](images=image, return_tensors="pt")
                with torch.no_grad():
                    outputs = TRANSFORMER_MODELS['image_model'](**inputs)

                image_embeddings = outputs.last_hidden_state.mean(dim=1).numpy().flatten()
                features['image_embedding_mean'] = float(np.mean(image_embeddings))
                features['image_present'] = 1
            else:
                # Use dummy values
                features['image_embedding_mean'] = 0.5
                features['image_present'] = 1
        else:
            features = get_default_visual_features()

    except Exception as e:
        print(f"Visual feature error for {url}: {e}")
        features = get_default_visual_features()

    return features

# Keep all other helper functions the same as before
def extract_favicon(url):
    """Extract favicon from URL"""
    try:
        if not url.startswith('http'):
            test_url = 'http://' + url
        else:
            test_url = url

        response = requests.get(test_url, timeout=5, verify=False)
        soup = BeautifulSoup(response.text, 'html.parser')

        icon_link = soup.find('link', rel=lambda x: x and 'icon' in x.lower() if x else False)
        if icon_link and icon_link.get('href'):
            icon_url = urljoin(test_url, icon_link['href'])
        else:
            icon_url = urljoin(test_url, '/favicon.ico')

        icon_response = requests.get(icon_url, timeout=5, verify=False)
        if icon_response.status_code == 200:
            image = Image.open(BytesIO(icon_response.content)).convert('RGB')
            image = image.resize((224, 224))
            return {'image': image}
    except:
        pass
    return None

def get_default_text_features():
    return {
        'text_embedding_mean': 0.0, 'text_embedding_std': 0.0,
        'sentence_embedding_mean': 0.0, 'text_length': 0,
        'title_length': 0, 'security_keyword_density': 0.0
    }

def get_default_visual_features():
    return {'image_embedding_mean': 0.0, 'image_present': 0}

def safe_request_get(url, timeout=6):
    try:
        return requests.get("http://"+url if not url.startswith("http") else url, timeout=timeout, allow_redirects=True, verify=False)
    except:
        return None

def fetch_page(url):
    r = safe_request_get(url)
    return (r, r.text) if r else (None, None)

def enhanced_logo_similarity(url):
    visual_features = extract_visual_features_with_transformers(url)
    return visual_features.get('image_embedding_mean', 0.0)

def enhanced_text_similarity(url):
    try:
        resp, html = fetch_page(url)
        if not html: return 0.0
        text_features = extract_text_features_with_transformers(html, url)
        return text_features.get('sentence_embedding_mean', 0.0)
    except:
        return 0.0

In [13]:
# ==============================
# üîß EXISTING FEATURE FUNCTIONS
# ==============================

# Lexical features
def url_length(url): return len(url) if isinstance(url,str) else np.nan
def count_chars(url,ch): return url.count(ch) if isinstance(url,str) else 0
def count_digits(url): return sum(c.isdigit() for c in str(url))
def count_letters(url): return sum(c.isalpha() for c in str(url))
def count_hyphens(url): return count_chars(url,'-')
def count_dots(url): return count_chars(url,'.')
def count_special_chars(url): return len(re.findall(r'[^A-Za-z0-9]', str(url)))

def shannon_entropy(s):
    s = str(s)
    if len(s) == 0: return 0.0
    prob = [float(s.count(c))/len(s) for c in dict.fromkeys(list(s))]
    return -sum([p*math.log(p,2) for p in prob])

def has_suspicious_keyword(url):
    suspicious = ['login','signin','secure','update','verify','account','bank','ebay','paypal','click']
    url = str(url).lower()
    return int(any(k in url for k in suspicious))

def num_subdomains(url):
    try:
        ext = tldextract.extract(url)
        return len(ext.subdomain.split('.')) if ext.subdomain else 0
    except: return 0

def top_domain_under_public_suffix(url):
    try: return tldextract.extract(url).domain
    except: return None

def tld_suffix(url):
    try: return tldextract.extract(url).suffix
    except: return ''

# DNS/IP Functions
_rdap_cache = {}

def _query_json(url, timeout=10):
    try:
        r = requests.get(url, timeout=timeout, headers={"User-Agent":"feature-extractor/1.0"})
        r.raise_for_status()
        return r.json()
    except Exception as e:
        return None

def get_asn_info(ip):
    if not ip:
        return {"asn": None, "rir": None, "name": None, "country": None, "error": "no ip"}
    if ip in _rdap_cache:
        return _rdap_cache[ip]

    try:
        iana_url = f"https://rdap.iana.org/ip/{ip}"
        iana_json = _query_json(iana_url)
        if not iana_json:
            out = {"asn": None, "rir": None, "name": None, "country": None, "error": "iana lookup failed"}
            _rdap_cache[ip] = out
            return out

        links = iana_json.get("links") or []
        if not links:
            out = {"asn": None, "rir": None, "name": None, "country": None, "error": "no rdap links from iana"}
            _rdap_cache[ip] = out
            return out

        last_exc = None
        for link in links:
            rdap_url = link.get("href")
            if not rdap_url:
                continue

            rdap_json = _query_json(rdap_url)
            if not rdap_json:
                try_url = rdap_url
                if not try_url.endswith("/"):
                    try_url = try_url + "/"
                try_url = urljoin(try_url, f"ip/{ip}")
                rdap_json = _query_json(try_url)
                if not rdap_json:
                    last_exc = f"failed rdap at {rdap_url}"
                    continue

            asn = None
            name = rdap_json.get("name") or rdap_json.get("handle")
            country = rdap_json.get("country") or None

            if "asn" in rdap_json and rdap_json["asn"]:
                asn = str(rdap_json["asn"])

            network = rdap_json.get("network") or rdap_json.get("object") or rdap_json
            if isinstance(network, dict):
                handle = network.get("handle") or network.get("name")
                if handle and not asn:
                    asn = handle

            out = {"asn": asn, "rir": rdap_url, "name": name, "country": country, "error": None}
            _rdap_cache[ip] = out
            return out

        out = {"asn": None, "rir": None, "name": None, "country": None, "error": last_exc or "rdap failed"}
        _rdap_cache[ip] = out
        return out

    except Exception as e:
        out = {"asn": None, "rir": None, "name": None, "country": None, "error": str(e)}
        _rdap_cache[ip] = out
        return out

def resolve_ip(domain):
    try: return socket.gethostbyname(domain)
    except: return None

def ip_whois_country(ip):
    try:
        obj=IPWhois(ip)
        res=obj.lookup_rdap(depth=1)
        return res.get('network',{}).get('country')
    except: return None

def ssl_valid(domain):
    try:
        ctx=ssl.create_default_context()
        with ctx.wrap_socket(socket.socket(),server_hostname=domain) as s:
            s.settimeout(4.0)
            s.connect((domain,443))
            cert=s.getpeercert()
            return 1 if cert else 0
    except: return 0

def ssl_days_remaining(domain):
    try:
        ctx=ssl.create_default_context()
        with ctx.wrap_socket(socket.socket(),server_hostname=domain) as s:
            s.settimeout(4.0)
            s.connect((domain,443))
            cert=s.getpeercert()
        if cert:
            not_after=cert.get('notAfter')
            expire_dt=datetime.strptime(not_after,'%b %d %H:%M:%S %Y %Z')
            return (expire_dt-datetime.now()).days
    except: return np.nan

def safe_request_get(url, timeout=6):
    try:
        return requests.get("http://"+url if not url.startswith("http") else url, timeout=timeout, allow_redirects=True, verify=False)
    except:
        return None

def fetch_page(url):
    r = safe_request_get(url)
    return (r, r.text) if r else (None, None)

def count_meta_tags(html):
    try: return len(BeautifulSoup(html,'html.parser').find_all('meta'))
    except: return 0

def has_iframe(html):
    try: return int(bool(BeautifulSoup(html,'html.parser').find('iframe')))
    except: return 0

def count_forms(html):
    try: return len(BeautifulSoup(html,'html.parser').find_all('form'))
    except: return 0

def has_password_input(html):
    try:
        for inp in BeautifulSoup(html,'html.parser').find_all('input',attrs={'type':True}):
            if inp.get('type','').lower()=='password': return 1
        return 0
    except: return 0

def count_external_links(html, base_domain):
    try:
        anchors=BeautifulSoup(html,'html.parser').find_all('a',href=True)
        return sum(1 for a in anchors if a['href'].startswith('http') and base_domain not in a['href'])
    except: return 0

def favicon_hash(url):
    try:
        r=safe_request_get(url)
        if not r: return None
        soup=BeautifulSoup(r.text,'html.parser')
        icon=soup.find('link',rel=lambda x: x and 'icon' in x.lower())
        if icon and icon.get('href'):
            icon_url=urljoin(url,icon['href'])
            ir=safe_request_get(icon_url)
            if ir and ir.content:
                img=Image.open(io.BytesIO(ir.content)).convert('RGB')
                return str(imagehash.average_hash(img))
    except: return None
    return None

In [14]:
# ==============================
# üöÄ SIMPLE TRANSFORMER FEATURES (FALLBACK INCLUDED)
# ==============================

def extract_simple_text_features(html_text, url):
    """Simple text features that work even without transformers"""
    features = {}

    try:
        if not html_text:
            return {'text_length': 0, 'title_length': 0, 'security_score': 0.0}

        soup = BeautifulSoup(html_text, 'html.parser')

        # Clean text
        for script in soup(["script", "style", "meta", "noscript"]):
            script.decompose()

        # Extract basic text features
        title_text = soup.find('title')
        title = title_text.get_text().strip() if title_text else ""
        visible_text = ' '.join(soup.stripped_strings)

        # Basic features
        features['text_length'] = len(visible_text)
        features['title_length'] = len(title)

        # Security keywords (simple version)
        security_keywords = ['login', 'password', 'secure', 'verify', 'account', 'bank', 'signin']
        found_keywords = sum(1 for keyword in security_keywords if keyword in visible_text.lower())
        features['security_score'] = found_keywords / len(security_keywords) if security_keywords else 0.0

        # Try transformer features if available
        if 'TRANSFORMER_MODELS' in globals() and hasattr(TRANSFORMER_MODELS.get('text_model', None), 'dummy'):
            if not TRANSFORMER_MODELS['text_model'].dummy:
                # Real transformers available
                full_text = f"{title} {visible_text}"[:500]
                inputs = TRANSFORMER_MODELS['text_tokenizer'](full_text, return_tensors="pt", truncation=True, max_length=512)
                with torch.no_grad():
                    outputs = TRANSFORMER_MODELS['text_model'](**inputs)
                text_embedding = outputs.last_hidden_state[:, 0, :].numpy().flatten()
                features['text_embedding_mean'] = float(np.mean(text_embedding))
                features['sentence_embedding'] = float(np.mean(TRANSFORMER_MODELS['sentence_model'].encode([full_text])[0]))
            else:
                # Dummy transformers - use simple alternatives
                features['text_embedding_mean'] = len(visible_text) / 1000.0  # Simple proxy
                features['sentence_embedding'] = features['security_score']
        else:
            # No transformers available
            features['text_embedding_mean'] = len(visible_text) / 1000.0
            features['sentence_embedding'] = features['security_score']

    except Exception as e:
        print(f"Text feature error for {url}: {e}")
        features = {'text_length': 0, 'title_length': 0, 'security_score': 0.0, 'text_embedding_mean': 0.0, 'sentence_embedding': 0.0}

    return features

def extract_simple_visual_features(url):
    """Simple visual features that work even without transformers"""
    features = {}

    try:
        # Try to get favicon
        favicon_data = extract_favicon(url)
        if favicon_data and 'image' in favicon_data:
            features['image_present'] = 1

            # Try transformer features if available
            if 'TRANSFORMER_MODELS' in globals() and hasattr(TRANSFORMER_MODELS.get('image_model', None), 'dummy'):
                if not TRANSFORMER_MODELS['image_model'].dummy:
                    # Real transformers available
                    inputs = TRANSFORMER_MODELS['image_processor'](images=favicon_data['image'], return_tensors="pt")
                    with torch.no_grad():
                        outputs = TRANSFORMER_MODELS['image_model'](**inputs)
                    image_embeddings = outputs.last_hidden_state.mean(dim=1).numpy().flatten()
                    features['image_embedding'] = float(np.mean(image_embeddings))
                else:
                    # Dummy transformers - use simple hash
                    features['image_embedding'] = hash(str(favicon_data['image'].size)) % 100 / 100.0
            else:
                # No transformers - use simple hash
                features['image_embedding'] = hash(str(favicon_data['image'].size)) % 100 / 100.0
        else:
            features = {'image_present': 0, 'image_embedding': 0.0}

    except Exception as e:
        print(f"Visual feature error for {url}: {e}")
        features = {'image_present': 0, 'image_embedding': 0.0}

    return features

def extract_favicon(url):
    """Extract favicon from URL"""
    try:
        if not url.startswith('http'):
            test_url = 'http://' + url
        else:
            test_url = url

        response = requests.get(test_url, timeout=5, verify=False)
        soup = BeautifulSoup(response.text, 'html.parser')

        icon_link = soup.find('link', rel=lambda x: x and 'icon' in x.lower() if x else False)
        if icon_link and icon_link.get('href'):
            icon_url = urljoin(test_url, icon_link['href'])
        else:
            icon_url = urljoin(test_url, '/favicon.ico')

        icon_response = requests.get(icon_url, timeout=5, verify=False)
        if icon_response.status_code == 200:
            image = Image.open(io.BytesIO(icon_response.content)).convert('RGB')
            image = image.resize((224, 224))
            return {'image': image}
    except:
        pass
    return None

# Enhanced similarity functions (fallback included)
def enhanced_logo_similarity(url):
    visual_features = extract_simple_visual_features(url)
    return visual_features.get('image_embedding', 0.0)

def enhanced_text_similarity(url):
    try:
        resp, html = fetch_page(url)
        if not html: return 0.0
        text_features = extract_simple_text_features(html, url)
        return text_features.get('sentence_embedding', 0.0)
    except:
        return 0.0

In [16]:
# --- MAIN PIPELINE (TRANSFORMER OR FALLBACK) ---
OUTPUT_PATH = "final_dataset.csv"

print("Starting feature extraction pipeline...")
print(f"Processing {len(df)} URLs")
rows = []

for i, raw_url in enumerate(df[URL_COLUMN].astype(str)):
    row = {'url': raw_url.strip()}
    url = row['url']

    if (i + 1) % 10 == 0:
        print(f"Processing {i+1}/{len(df)}: {url}")

    try:
        # --- Lexical Features ---
        row['url_length'] = url_length(url)
        row['num_dots'] = count_dots(url)
        row['num_hyphens'] = count_hyphens(url)
        row['num_digits'] = count_digits(url)
        row['num_letters'] = count_letters(url)
        row['num_special_chars'] = count_special_chars(url)
        row['entropy'] = shannon_entropy(url)
        row['suspicious_keyword'] = has_suspicious_keyword(url)
        row['num_subdomains'] = num_subdomains(url)
        domain = top_domain_under_public_suffix(url)
        row['top_domain_under_public_suffix'] = domain
        row['tld'] = tld_suffix(url)

        # --- DNS/IP Features ---
        ip = resolve_ip(domain) if domain else None
        row['ip'] = ip
        row['ip_country'] = ip_whois_country(ip) if ip else None

        # ASN lookup
        asn_info = get_asn_info(ip) if ip else {"asn": None, "country": None}
        row['asn'] = asn_info.get('asn')
        row['asn_country'] = asn_info.get('country')

        # --- SSL Features ---
        row['ssl_valid'] = ssl_valid(domain) if domain else 0
        row['ssl_days_remaining'] = ssl_days_remaining(domain) if domain else np.nan

        # --- HTTP/Content Features ---
        resp, html = fetch_page(url)
        row['status_code'] = resp.status_code if resp else None
        row['content_length'] = len(resp.content) if resp and resp.content else 0

        # Traditional content features
        row['meta_tags'] = count_meta_tags(html) if html else 0
        row['has_iframe'] = has_iframe(html) if html else 0
        row['external_links'] = count_external_links(html, domain) if html and domain else 0
        row['has_password_input'] = has_password_input(html) if html else 0
        row['form_count'] = count_forms(html) if html else 0
        row['favicon_hash'] = favicon_hash(url)

        # --- TRANSFORMER/FALLBACK FEATURES ---
        if html:
            text_features = extract_simple_text_features(html, url)
            row.update(text_features)

            visual_features = extract_simple_visual_features(url)
            row.update(visual_features)
        else:
            # Default values if no HTML
            row.update({'text_length': 0, 'title_length': 0, 'security_score': 0.0,
                       'text_embedding_mean': 0.0, 'sentence_embedding': 0.0,
                       'image_present': 0, 'image_embedding': 0.0})

        # Enhanced similarity
        row['logo_similarity'] = enhanced_logo_similarity(url)
        row['text_similarity'] = enhanced_text_similarity(url)

        # Hashes
        row['sha1'] = hashlib.sha1(url.encode()).hexdigest()
        row['md5'] = hashlib.md5(url.encode()).hexdigest()

    except Exception as e:
        print(f"Error processing {url}: {e}")
        # Add default values for all expected columns
        default_features = {
            'text_length': 0, 'title_length': 0, 'security_score': 0.0,
            'text_embedding_mean': 0.0, 'sentence_embedding': 0.0,
            'image_present': 0, 'image_embedding': 0.0,
            'logo_similarity': 0.0, 'text_similarity': 0.0
        }
        row.update(default_features)

    rows.append(row)

# Save results
features_df = pd.DataFrame(rows)
final_df = pd.concat([df.reset_index(drop=True), features_df.reset_index(drop=True)], axis=1)
final_df.to_csv(OUTPUT_PATH, index=False)
print(f"‚úÖ Feature extraction complete!")
print(f"üìä Final dataset shape: {final_df.shape}")
print(f"üíæ Saved to: {OUTPUT_PATH}")

# Show results
print("\nFirst 3 rows with new features:")
final_df.head(3)

Starting feature extraction pipeline...
Processing 2 URLs
Error processing https://www.google.com: name 're' is not defined
Error processing https://www.github.com: name 're' is not defined
‚úÖ Feature extraction complete!
üìä Final dataset shape: (2, 17)
üíæ Saved to: final_dataset.csv

First 3 rows with new features:


Unnamed: 0,url,label,url.1,url_length,num_dots,num_hyphens,num_digits,num_letters,text_length,title_length,security_score,text_embedding_mean,sentence_embedding,image_present,image_embedding,logo_similarity,text_similarity
0,https://www.google.com,legitimate,https://www.google.com,22,2,0,0,17,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0
1,https://www.github.com,legitimate,https://www.github.com,22,2,0,0,17,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0


In [17]:
# Download the final dataset
from google.colab import files
files.download(OUTPUT_PATH)
print("üì• File download initiated!")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

üì• File download initiated!
