In [133]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
import json

# Load CSV file
df = pd.read_csv('PhiUSIIL_Phishing_URL_Dataset.csv/PhiUSIIL_Phishing_URL_Dataset.csv')
# Display first 10 rows
print(df.head(10))

                                  URL  URLLength                      Domain  \
0    https://www.southbankmosaics.com         31    www.southbankmosaics.com   
1            https://www.uni-mainz.de         23            www.uni-mainz.de   
2      https://www.voicefmradio.co.uk         29      www.voicefmradio.co.uk   
3         https://www.sfnmjournal.com         26         www.sfnmjournal.com   
4  https://www.rewildingargentina.org         33  www.rewildingargentina.org   
5     https://www.globalreporting.org         30     www.globalreporting.org   
6          https://www.saffronart.com         25          www.saffronart.com   
7          https://www.nerdscandy.com         25          www.nerdscandy.com   
8      https://www.hyderabadonline.in         29      www.hyderabadonline.in   
9                 https://www.aap.org         18                 www.aap.org   

   DomainLength  IsDomainIP  TLD  URLSimilarityIndex  CharContinuationRate  \
0            24           0  com         

In [134]:
columns_to_drop = [
    "URL", "Domain", "TLD", "Title",
    "URLSimilarityIndex", "HasSocialNet", "HasCopyrightInfo", 
    "HasDescription", "IsHTTPS", "DomainTitleMatchScore", 
    "URLTitleMatchScore", "HasSubmitButton", "HasHiddenFields",
    "IsResponsive", "NoOfJS", "HasTitle", "HasFavicon", 
    "URLCharProb", "CharContinuationRate", "NoOfSelfRef",
    "NoOfLettersInURL", "LetterRatioInURL", "NoOfDegitsInURL", "DegitRatioInURL",
    "Robots", "NoOfCSS", "NoOfExternalRef", "Pay", "NoOfImage", 
    "NoOfSubDomain", "NoOfObfuscatedChar", "NoOfURLRedirect",
    "NoOfiFrame", "Bank", "HasExternalFormSubmit", "HasPasswordField", 
    "Crypto", "TLDLegitimateProb", "NoOfPopup",
    "IsDomainIP", "HasObfuscation", "ObfuscationRatio", 
    "NoOfEqualsInURL", "NoOfQMarkInURL", "NoOfAmpersandInURL"
]
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

X = df.drop(columns=["label"])
y = df["label"].astype(int)

print("Label distribution:\n", y.value_counts())

# Drop the specified features
features_to_drop = ['URLSimilarityIndex', 'LineOfCode']
df_filtered = df.drop(columns=[col for col in features_to_drop if col in df.columns])
X_filtered = df_filtered.drop(columns=["label"])
y = df_filtered["label"].astype(int)

feature_columns = X_filtered.columns.tolist()

# Save for later use
with open('feature_columns.json', 'w') as f:
    json.dump(feature_columns, f)


Label distribution:
 label
1    134850
0    100945
Name: count, dtype: int64


In [135]:
# Recalculate correlation after dropping additional features
correlation_with_label = df_filtered[X_filtered.columns.tolist() + ["label"]].corr()["label"].sort_values(ascending=False)
print("Correlation with label after dropping additional features:\n", correlation_with_label)
print("Remaining features:", X_filtered.columns.tolist())

Correlation with label after dropping additional features:
 label                         1.000000
NoOfEmptyRef                  0.109235
LargestLineLength            -0.041111
NoOfSelfRedirect             -0.076463
TLDLength                    -0.079159
URLLength                    -0.233445
DomainLength                 -0.283152
NoOfOtherSpecialCharsInURL   -0.358891
SpacialCharRatioInURL        -0.533537
Name: label, dtype: float64
Remaining features: ['URLLength', 'DomainLength', 'TLDLength', 'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'LargestLineLength', 'NoOfSelfRedirect', 'NoOfEmptyRef']


In [136]:
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
X_filtered, y, test_size=0.4, stratify=y, random_state=42)
model_filtered = lgb.LGBMClassifier()
model_filtered.fit(X_train_f, y_train_f)
y_pred_f = model_filtered.predict(X_test_f)
print("Baseline Accuracy:", accuracy_score(y_test_f, y_pred_f))
print(confusion_matrix(y_test_f, y_pred_f))
print(classification_report(y_test_f, y_pred_f))

[LightGBM] [Info] Number of positive: 80910, number of negative: 60567
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003908 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1046
[LightGBM] [Info] Number of data points in the train set: 141477, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.571895 -> initscore=0.289587
[LightGBM] [Info] Start training from score 0.289587
Baseline Accuracy: 0.9831421361776119
[[39270  1108]
 [  482 53458]]
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     40378
           1       0.98      0.99      0.99     53940

    accuracy                           0.98     94318
   macro avg       0.98      0.98      0.98     94318
weighted avg       0.98      0.98      0.98     94318



In [137]:
import requests
import re
import pandas as pd
import tldextract
from bs4 import BeautifulSoup

def extract_features(url, expected_columns):
    try:
        headers = {'Accept-Encoding': 'identity'}
        response = requests.get(url, headers=headers, timeout=10)
        html = response.text
    except Exception as e:
        print("Error fetching URL:", e)
        html = ""

    soup = BeautifulSoup(html, 'html.parser')
    ext = tldextract.extract(url)
    domain = ext.domain + '.' + ext.suffix if ext.suffix else ext.domain

    def safe_div(a, b):
        return a / b if b != 0 else 0

    # Extract features based on the current feature list
    features = {
        'URLLength': len(url),
        'DomainLength': len(domain),
        'LargestLineLength': max([len(line) for line in html.split('\n')]) if html else 0,
        'SpacialCharRatioInURL': safe_div(sum(not c.isalnum() for c in url), len(url)),
        'TLDLength': len(ext.suffix),
        'NoOfEmptyRef': len([a for a in soup.find_all("a", href=True) if a["href"] in ["#", ""]]),
        'NoOfOtherSpecialCharsInURL': sum(not c.isalnum() and c not in "=?&" for c in url),
        'NoOfSelfRedirect': 0  # Placeholder, as no clear definition is provided
    }

    # Ensure all expected columns are included
    for col in expected_columns:
        if col not in features:
            features[col] = 0

    return pd.DataFrame([[features[col] for col in expected_columns]], columns=expected_columns)

In [None]:
known_legit_urls = [
    "https://www.facebook.com/",
    "https://accounts.google.com/",
    "https://twitter.com/",
    "https://www.linkedin.com/",
    "https://github.com/",
    "https://login.microsoftonline.com/",
    "https://www.paypal.com/",
    "https://www.instagram.com/",
    "https://www.apple.com/",
    "https://www.amazon.com/",
    "https://www.netflix.com/",
    "https://www.youtube.com/",
    "https://www.wikipedia.org/"
]

new_legit_samples = []
for url in known_legit_urls:
    try:
        feat = extract_features(url, feature_columns)
        feat["label"] = 1
        new_legit_samples.append(feat)
    except Exception as e:
        print(f"Failed to process {url}: {e}")

df_new_legit = pd.concat(new_legit_samples, ignore_index=True)

In [None]:
df_filtered["label"] = y
df_combined = pd.concat([df_filtered, df_new_legit], ignore_index=True)
X_new = df_combined.drop(columns=["label"])
y_new = df_combined["label"].astype(int)

X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_new, y_new, test_size=0.25, stratify=y_new, random_state=42
)

model_improved = lgb.LGBMClassifier()
model_improved.fit(X_train_new, y_train_new)

[LightGBM] [Info] Number of positive: 101147, number of negative: 75709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003411 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1113
[LightGBM] [Info] Number of data points in the train set: 176856, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.571917 -> initscore=0.289678
[LightGBM] [Info] Start training from score 0.289678


In [None]:
with open('feature_columns.json', 'r') as f:
    feature_columns = json.load(f)

url = input("Enter a URL to check: ")
features_df = extract_features(url, feature_columns)

print("Extracted Features:")
print(features_df.T)

pred = model_improved.predict(features_df)[0]
print("\nPrediction:", "Legitimate" if pred == 1 else "Malicious")

Error fetching URL: Invalid URL 'reddit.com': No scheme supplied. Perhaps you meant https://reddit.com?
Extracted Features:
                               0
URLLength                   10.0
DomainLength                10.0
TLDLength                    3.0
NoOfOtherSpecialCharsInURL   1.0
SpacialCharRatioInURL        0.1
LargestLineLength            0.0
NoOfSelfRedirect             0.0
NoOfEmptyRef                 0.0

Prediction: Malicious
