In [94]:
!pip install liac-arff xgboost tldextract beautifulsoup4 requests python-whois




In [1]:
import arff
import pandas as pd
import numpy as np
import re
import requests
import tldextract
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import whois
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import xgboost as xgb
import pickle


In [6]:
# Load the ARFF training set; column names come from the ARFF attribute list.
with open('phishing+websites/Training Dataset.arff', 'r') as f:
    arff_data = arff.load(f)
df = pd.DataFrame(arff_data['data'], columns=[attr[0] for attr in arff_data['attributes']])

# Clean and prepare.
# Order matters: values are parsed to numbers *before* any filling, so a missing
# or unparseable label becomes NaN and the row is dropped by the isin() filter,
# instead of being turned into 0 and silently counted as class 0.
df = (
    df.replace('?', np.nan)
    .apply(pd.to_numeric, errors='coerce')
    # Remap the label -1 -> 0 so the target is binary {0, 1}.
    .assign(Result=lambda frame: frame['Result'].replace(-1, 0))
    .loc[lambda frame: frame['Result'].isin([0, 1])]
    # Only feature cells can still be NaN here; treat them as 0.
    .fillna(0)
    .astype({'Result': 'int64'})
)

X = df.drop(columns=['Result'])
y = df['Result']
feature_columns = X.columns.tolist()


In [7]:
def _first_naive_utc(value):
    """Return ``value`` (or its first element if it is a list) as a naive UTC datetime.

    Returns None when there is no usable datetime (None, empty list, or a
    non-datetime value such as an unparsed string).
    """
    from datetime import timezone  # local import: the file only imports `datetime`

    if isinstance(value, (list, tuple)):
        value = value[0] if value else None
    if not isinstance(value, datetime):
        return None
    if value.tzinfo is not None:
        # Aware and naive datetimes cannot be subtracted from each other;
        # normalise everything to naive UTC.
        value = value.astimezone(timezone.utc).replace(tzinfo=None)
    return value


def get_whois_info(domain):
    """Look up WHOIS data for ``domain`` and derive two day counts.

    Parameters
    ----------
    domain : str
        Domain name passed to ``whois.whois``.

    Returns
    -------
    tuple of (int, int)
        ``(age, reg_length)``: days since the creation date, and days between
        creation and expiration. Both are 0 when the lookup fails or no
        creation date is available; ``reg_length`` is 0 when there is no
        expiration date.
    """
    from datetime import timezone  # local import: the file only imports `datetime`

    try:
        w = whois.whois(domain)
        creation = _first_naive_utc(w.creation_date)
        expiration = _first_naive_utc(w.expiration_date)
    except Exception:
        # Best effort: any lookup/parsing failure yields neutral values.
        # (Unlike a bare `except:`, this lets KeyboardInterrupt through.)
        return 0, 0

    if creation is None:
        return 0, 0

    # NOTE(review): naive WHOIS timestamps are assumed to be UTC - confirm.
    now = datetime.now(timezone.utc).replace(tzinfo=None)
    age = (now - creation).days
    reg_length = (expiration - creation).days if expiration is not None else 0
    return age, reg_length


In [8]:
def extract_arff_features(url, expected_columns):
    """Fetch ``url`` and derive a single row of heuristic phishing features.

    The page is downloaded on a best-effort basis (a failed request yields an
    empty document, so every HTML-derived feature becomes 0), parsed with
    BeautifulSoup, and combined with URL-string and WHOIS heuristics.

    Parameters
    ----------
    url : str
        Absolute URL to analyse.
    expected_columns : list of str
        Column names, in order, for the returned frame. Columns that are not
        computed here are filled with 0.

    Returns
    -------
    pandas.DataFrame
        Exactly one row whose columns are ``expected_columns``.

    Notes
    -----
    NOTE(review): ``URL_Length``, ``Domain_registeration_length``,
    ``age_of_domain`` and ``Links_pointing_to_page`` are raw magnitudes and the
    remaining features are 0/1/2 flags. Presumably the ARFF training columns
    use a -1/0/1 encoding - confirm that the two encodings agree before
    trusting predictions made from these rows.
    """
    # Hosts of known URL shorteners; matched against the hostname so that e.g.
    # "microsoft.com" is not mistaken for "t.co".
    shortener_hosts = ("bit.ly", "goo.gl", "tinyurl.com", "ow.ly", "t.co", "x.co")

    def has_ip(url):
        return 1 if re.match(r"^(http[s]?://)?\d+\.\d+\.\d+\.\d+", url) else 0

    def shortening_service(url):
        host = (urlparse(url).hostname or "").lower()
        return 1 if any(host == s or host.endswith("." + s) for s in shortener_hosts) else 0

    def has_at_symbol(url):
        return 1 if '@' in url else 0

    def double_slash_redirect(url):
        # A "//" beyond the scheme separator ("https://" puts it at index 6).
        return 1 if url.rfind('//') > 6 else 0

    def prefix_suffix(domain):
        return 1 if '-' in domain else 0

    def sub_domain_count(url):
        dots = (urlparse(url).hostname or '').count('.')
        if dots > 2:
            return 2
        return 1 if dots == 2 else 0

    def ssl_state(url):
        return 1 if url.startswith("https") else 0

    # Download and parse the page; any failure falls back to an empty document.
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=10)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
    except Exception:
        html = ""
        soup = BeautifulSoup("", 'html.parser')
    html_lower = html.lower()  # computed once; several checks are case-insensitive

    parsed = urlparse(url)
    ext = tldextract.extract(url)
    domain = parsed.netloc
    # Skip empty parts so a host without a public suffix does not end in ".".
    full_domain = ".".join(part for part in (ext.domain, ext.suffix) if part)
    site_name = ext.domain.lower()

    def is_external(link):
        """True when ``link`` names a host that does not contain the site's domain.

        Relative links ("/img/a.png", "#top") have no host and therefore belong
        to the page's own site.
        """
        try:
            host = urlparse(link).netloc.lower()
        except ValueError:
            # Malformed authority part: treat as foreign.
            return True
        return bool(host) and site_name not in host

    def favicon_check(soup):
        icons = soup.find_all("link", rel=lambda x: x and 'icon' in x.lower())
        return 1 if any(is_external(icon.get("href", "")) for icon in icons) else 0

    age, reg_length = get_whois_info(full_domain)

    features = {}
    features['having_IP_Address'] = has_ip(url)
    features['URL_Length'] = len(url)
    features['Shortining_Service'] = shortening_service(url)
    features['having_At_Symbol'] = has_at_symbol(url)
    features['double_slash_redirecting'] = double_slash_redirect(url)
    features['Prefix_Suffix'] = prefix_suffix(full_domain)
    features['having_Sub_Domain'] = sub_domain_count(url)
    features['SSLfinal_State'] = ssl_state(url)
    features['Domain_registeration_length'] = reg_length
    features['Favicon'] = favicon_check(soup)
    features['port'] = 0
    features['HTTPS_token'] = 1 if 'https' in domain.lower() else 0

    # Share of images loaded from another host.
    imgs = soup.find_all('img', src=True)
    total_imgs = len(imgs)
    external_imgs = sum(1 for img in imgs if is_external(img['src']))
    features['Request_URL'] = 1 if total_imgs > 0 and external_imgs / total_imgs >= 0.5 else 0

    # Share of anchors that go nowhere or leave the site.
    anchors = soup.find_all('a', href=True)
    total_anchors = len(anchors)
    null_anchors = sum(
        1 for a in anchors
        if a['href'] in ['#', 'javascript:void(0);', ''] or is_external(a['href'])
    )
    features['URL_of_Anchor'] = 1 if total_anchors > 0 and null_anchors / total_anchors > 0.5 else 0

    # Only tags that actually carry an href attribute are considered here.
    tags = soup.find_all(['link', 'script', 'meta'], href=True)
    external_tags = sum(1 for tag in tags if is_external(tag.get('href', '')))
    features['Links_in_tags'] = 1 if external_tags > 2 else 0

    # Forms with an empty action or an action on another host.
    forms = soup.find_all('form', action=True)
    suspicious_forms = sum(1 for form in forms if form['action'] == "" or is_external(form['action']))
    features['SFH'] = 1 if len(forms) > 0 and suspicious_forms / len(forms) >= 0.5 else 0

    features['Submitting_to_email'] = 1 if re.search(r"mailto:", html_lower) else 0
    # NOTE(review): `domain` is the netloc of `url`, so this looks like it is always 0.
    features['Abnormal_URL'] = 1 if domain not in url else 0
    features['Redirect'] = 1 if soup.find('meta', attrs={'http-equiv': re.compile("refresh", re.I)}) or re.search(r"window\.location|location\.replace", html_lower) else 0
    features['on_mouseover'] = 1 if re.search(r"onmouseover\s*=", html_lower) else 0
    # Dot escaped so only a literal "event.button" matches.
    features['RightClick'] = 1 if re.search(r"event\.button\s*==\s*2", html_lower) else 0
    features['popUpWidnow'] = 1 if re.search(r"alert\s*\(", html_lower) else 0
    features['Iframe'] = 1 if "<iframe" in html_lower else 0

    features['age_of_domain'] = age
    # The next features are fixed placeholders, not measured values.
    features['DNSRecord'] = 1
    features['web_traffic'] = 0
    features['Page_Rank'] = 0
    features['Google_Index'] = 1 if "google" in html_lower else 0
    features['Links_pointing_to_page'] = total_anchors
    features['Statistical_report'] = 0

    # Any expected column that was not computed above defaults to 0.
    row = [features.get(col, 0) for col in expected_columns]
    return pd.DataFrame([row], columns=expected_columns)


In [14]:
# Known phishing URLs used to augment the training data.
phish_urls = [
    "http://toncoinsp.com",
    "http://www.paypa1.com",
    "http://www.faceb00k-login.com",
    "http://www.apple-support-security-alert.com",
    "http://www.bankofamerica-login.com",
    "http://www.g00gle-login-attempt.com",
    "http://paypal-opladen.be",
    "http://login.microsoftonline.ccisystems.us",
    "http://login-amazon-account.com",
    "http://apple-grx-support-online.com",
    "http://renew-netflix.com",
    "http://allegrolokalnie.pl-1751.cfd",
    "http://mondiai-rps.com/index.php"
]

# Extract features and label them as phishing (Result = 0)
new_phish_samples = []
for url in phish_urls:
    try:
        feat = extract_arff_features(url, feature_columns)
        feat["Result"] = 0
        new_phish_samples.append(feat)
    except Exception as e:
        print(f"❌ Failed to process {url}: {e}")

# Build the augmented frame under its own name instead of appending to `df`
# in place: `df` stays untouched, so re-running this cell cannot add the same
# samples a second time.
if new_phish_samples:
    df_phish = pd.concat(new_phish_samples, ignore_index=True)
    df_augmented = pd.concat([df, df_phish], ignore_index=True)
else:
    df_augmented = df

# Rebuild feature and label sets from the augmented data
X = df_augmented.drop(columns=['Result'])
y = df_augmented['Result']


In [10]:
# Fit an XGBoost binary classifier on the current feature matrix X and labels y.
xgb_settings = {'objective': 'binary:logistic', 'eval_metric': 'logloss'}
model = xgb.XGBClassifier(**xgb_settings)
model.fit(X, y)


In [15]:
# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Evaluate with a separate estimator that has the same hyper-parameters, so the
# `model` fitted on the full dataset is not overwritten by a model that has
# only seen the training split.
eval_model = xgb.XGBClassifier(**model.get_params())
eval_model.fit(X_train, y_train)

# Predict on the held-out split
y_pred = eval_model.predict(X_test)

# Evaluate
print("📈 Model Evaluation Metrics:\n")
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n🧮 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred, target_names=["Phishing (0)", "Legitimate (1)"]))


📈 Model Evaluation Metrics:

✅ Accuracy: 0.9742895805142084

🧮 Confusion Matrix:
 [[ 945   40]
 [  17 1215]]

📋 Classification Report:
                 precision    recall  f1-score   support

  Phishing (0)       0.98      0.96      0.97       985
Legitimate (1)       0.97      0.99      0.98      1232

      accuracy                           0.97      2217
     macro avg       0.98      0.97      0.97      2217
  weighted avg       0.97      0.97      0.97      2217



In [16]:
# Ask for the URL to classify; surrounding whitespace is discarded.
raw_url = input("🔍 Enter a URL to check: ")
test_url = raw_url.strip()

def is_valid_url(url):
    """Return True when ``url`` is an absolute http(s) URL with a host part.

    Other schemes (ftp, javascript, ...) and strings without a network
    location are rejected, as are strings that ``urlparse`` refuses to parse.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # e.g. an unbalanced "[" in the authority part
        return False
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)

# Decision thresholds on the probability of the "legitimate" class.
LEGIT_HIGH_CONFIDENCE = 0.85
LEGIT_MIN = 0.5

if is_valid_url(test_url):
    try:
        features_df = extract_arff_features(test_url, feature_columns)
        print("\n✅ Extracted Features:")
        print(features_df.T)

        # Labels in this notebook: 0 = phishing, 1 = legitimate.
        # predict_proba columns follow that class order.
        class_probs = model.predict_proba(features_df)[0]
        proba = class_probs[1]  # probability of "legitimate", used for the verdict
        # Report the phishing-class probability, as the label says.
        print(f"\n🎯 Phishing Probability: {class_probs[0]:.4f}")

        if proba > LEGIT_HIGH_CONFIDENCE:
            print("✅ Legitimate (High confidence)")
        elif proba > LEGIT_MIN:
            print("⚠️ Suspicious – Further analysis recommended")
        else:
            print("❌ Phishing (High confidence)")

    except Exception as e:
        print("❌ Error processing URL:", e)
else:
    print("⚠ Invalid URL format. Please include http:// or https://")



✅ Extracted Features:
                               0
having_IP_Address              0
URL_Length                    84
Shortining_Service             0
having_At_Symbol               0
double_slash_redirecting       0
Prefix_Suffix                  0
having_Sub_Domain              0
SSLfinal_State                 1
Domain_registeration_length  365
Favicon                        0
port                           0
HTTPS_token                    0
Request_URL                    1
URL_of_Anchor                  0
Links_in_tags                  1
SFH                            1
Submitting_to_email            0
Abnormal_URL                   0
Redirect                       1
on_mouseover                   0
RightClick                     0
popUpWidnow                    0
Iframe                         1
age_of_domain                235
DNSRecord                      1
web_traffic                    0
Page_Rank                      0
Google_Index                   1
Links_pointing_to_pa