In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/phishing-website-detector/phishing.txt
/kaggle/input/phishing-website-detector/phishing.csv


In [2]:
!pip install -U numpy==1.25.2 scikit-learn==1.4.2 imbalanced-learn==0.12.0
!pip install tldextract
!pip install scikeras


Collecting numpy==1.25.2
  Downloading numpy-1.25.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting scikit-learn==1.4.2
  Downloading scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting imbalanced-learn==0.12.0
  Downloading imbalanced_learn-0.12.0-py3-none-any.whl.metadata (8.2 kB)
Downloading numpy-1.25.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m86.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m102.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading imbalanced_learn-0.12.0-py3-none-any.whl (257 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.7/257.7 kB[0

In [3]:
# Loading dataset
import pandas as pd

# Load data
csv_data = pd.read_csv('/kaggle/input/phishing-website-detector/phishing.csv')
print("Dataset Loaded. Shape:", csv_data.shape)

Dataset Loaded. Shape: (11054, 32)


In [4]:
# Dataset operations
import pandas as pd
import tldextract
import urllib.parse
from math import log

# Feature extraction function
def calculate_entropy(s):
    freq = {}
    for c in s:
        freq[c] = freq.get(c, 0) + 1
    return -sum((f/len(s)) * log(f/len(s), 2) for f in freq.values() if f) if len(s) > 0 else 0

def extract_features_for_training(row, url_column='URL'):
    try:
        url = row[url_column]
        parsed_url = urllib.parse.urlparse(url if url.startswith(('http://', 'https://')) else f'http://{url}')
        extracted = tldextract.extract(url)
        domain_len = len(extracted.domain)
        subdomain_count = len(extracted.subdomain.split('.')) if extracted.subdomain else 0
        path = parsed_url.path.lower()
        domain = extracted.domain.lower()
        suspicious_keywords = ['login', 'secure', 'account', 'verify', 'update', 'auth']
        tld = extracted.suffix.lower()
        is_local_ip = parsed_url.hostname and parsed_url.hostname.startswith('192.168.')
        entropy = calculate_entropy(domain)
        https = 1 if parsed_url.scheme == 'https' else -1
        features = {
            'AnchorURL': 1 if any(kw in path or kw in domain for kw in suspicious_keywords) else -1,
            'WebsiteTraffic': 1 if domain_len < 7 and tld in ['com', 'org', 'edu', 'gov'] else -1,
            'AgeofDomain': 1 if domain_len < 9 and tld in ['com', 'org', 'gov'] else -1,
            'PageRank': 1 if domain_len < 7 and tld in ['com', 'org', 'edu', 'gov'] else -1,
            'PrefixSuffix-': 1 if '-' in extracted.domain else -1,
            'UsingIP': 1 if is_local_ip else -2 if parsed_url.hostname and parsed_url.hostname.replace('.', '').isdigit() else -1,
            'SubDomains': -4 if subdomain_count >= 2 else -2 if subdomain_count == 1 else 0,
            'Entropy': entropy,
            'HTTPS': https
        }
        return pd.Series(features)
    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        return pd.Series({'AnchorURL': -1, 'WebsiteTraffic': -1, 'AgeofDomain': -1, 'PageRank': -1, 
                          'PrefixSuffix-': -1, 'UsingIP': -1, 'SubDomains': 0, 'Entropy': 0, 'HTTPS': -1})

# Check for URL column
possible_url_columns = ['URL', 'url', 'website', 'domain']
url_column = next((col for col in possible_url_columns if col in csv_data.columns), None)
if url_column is None:
    print("Warning: No URL column found. Using pre-existing features if available.")
    selected_features = [col for col in csv_data.columns if col != 'class' and csv_data[col].std() > 0]
else:
    csv_data['URL'] = csv_data[url_column]
    selected_features = ['AnchorURL', 'WebsiteTraffic', 'AgeofDomain', 'PageRank', 
                        'PrefixSuffix-', 'UsingIP', 'SubDomains', 'Entropy', 'HTTPS']

# Verify no duplicate features
if len(selected_features) != len(set(selected_features)):
    raise ValueError(f"Duplicate features detected in selected_features: {selected_features}")

# Check for class column and valid labels
if 'class' not in csv_data.columns:
    raise ValueError("Dataset must contain a 'class' column")
if not set(csv_data['class']).issubset({-1, 1}):
    raise ValueError("Class labels must be -1 or 1")

# Extract features if URL column exists
if url_column:
    existing_columns = csv_data.columns
    feature_df = csv_data.apply(extract_features_for_training, axis=1)
    feature_df.columns = [f"new_{col}" if col in existing_columns else col for col in feature_df.columns]
    selected_features = [f"new_{col}" if col in existing_columns else col for col in selected_features]
    csv_data = pd.concat([csv_data, feature_df], axis=1)
    csv_data = csv_data.loc[:, ~csv_data.columns.duplicated()]
else:
    csv_data = csv_data[selected_features + ['class']]

# Remove constant features
selected_features = [col for col in selected_features if csv_data[col].std() > 0]
if not selected_features:
    raise ValueError("No features with variance found. Check feature extraction or dataset.")

print("Data Shape:", csv_data.shape)
print("\nClass Distribution:\n", csv_data['class'].value_counts())
print("\nFeature Stats:\n", csv_data[selected_features].describe())

Data Shape: (11054, 32)

Class Distribution:
 class
 1    6157
-1    4897
Name: count, dtype: int64

Feature Stats:
               Index       UsingIP       LongURL      ShortURL       Symbol@  \
count  11054.000000  11054.000000  11054.000000  11054.000000  11054.000000   
mean    5526.500000      0.313914     -0.633345      0.738737      0.700561   
std     3191.159272      0.949495      0.765973      0.674024      0.713625   
min        0.000000     -1.000000     -1.000000     -1.000000     -1.000000   
25%     2763.250000     -1.000000     -1.000000      1.000000      1.000000   
50%     5526.500000      1.000000     -1.000000      1.000000      1.000000   
75%     8289.750000      1.000000     -1.000000      1.000000      1.000000   
max    11053.000000      1.000000      1.000000      1.000000      1.000000   

       Redirecting//  PrefixSuffix-    SubDomains         HTTPS  DomainRegLen  \
count   11054.000000   11054.000000  11054.000000  11054.000000  11054.000000   
mean     

In [5]:
# Dataset preprocessing
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import joblib

# Prepare data
X = csv_data[selected_features]
y = csv_data['class'].map({-1: 0, 1: 1})

# Add synthetic data with SMOTE
smote = SMOTE(random_state=42)
X_aug, y_aug = smote.fit_resample(X, y)
augmented_data = pd.DataFrame(X_aug, columns=selected_features)
augmented_data['label'] = y_aug
print("\nAugmented Data Shape:", augmented_data.shape)

# Save scaler for later use
scaler = StandardScaler()
X_aug_scaled = scaler.fit_transform(X_aug)
joblib.dump(scaler, '/kaggle/working/scaler.pkl')


Augmented Data Shape: (12314, 32)


['/kaggle/working/scaler.pkl']

In [6]:
# Splitting dataset into training and testing
from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_aug, y_aug, test_size=0.2, random_state=42)
print("Training Shape:", X_train.shape, y_train.shape)
print("Testing Shape:", X_test.shape, y_test.shape)

# Scale features
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Training Shape: (9851, 31) (9851,)
Testing Shape: (2463, 31) (2463,)


In [7]:
# Training Random Forest model
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'estimator__n_estimators': [100, 200],
    'estimator__max_depth': [10, 20, None],
    'estimator__min_samples_split': [2, 5]
}
grid_search_rf = GridSearchCV(
    CalibratedClassifierCV(RandomForestClassifier(class_weight='balanced', random_state=42), method='sigmoid', cv=5),
    param_grid_rf, scoring='f1', cv=5, n_jobs=-1
)
grid_search_rf.fit(X_train_scaled, y_train)
best_rf_params = grid_search_rf.best_params_
print(f"Best Random Forest Params: {best_rf_params}")

# Train Random Forest with best parameters
rf_model = CalibratedClassifierCV(
    RandomForestClassifier(**{k.split('__')[1]: v for k, v in best_rf_params.items()},
                          class_weight='balanced', random_state=42),
    method='sigmoid', cv=5
)
rf_model.fit(X_train_scaled, y_train)
joblib.dump(rf_model, '/kaggle/working/calibrated_random_forest_model.pkl')
print("Random Forest model trained and saved.")

Best Random Forest Params: {'estimator__max_depth': 20, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 100}
Random Forest model trained and saved.


In [8]:
# Testing Random Forest model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Evaluate Random Forest
y_pred_rf = rf_model.predict(X_test_scaled)
y_pred_proba_rf = rf_model.predict_proba(X_test_scaled)[:, 1]
rf_results = {
    'Accuracy': accuracy_score(y_test, y_pred_rf),
    'Precision': precision_score(y_test, y_pred_rf, pos_label=1),
    'Recall': recall_score(y_test, y_pred_rf, pos_label=1),
    'F1-Score': f1_score(y_test, y_pred_rf, pos_label=1),
    'ROC-AUC': roc_auc_score(y_test, y_pred_proba_rf)
}
cm_rf = confusion_matrix(y_test, y_pred_rf)
print("\nRandom Forest Metrics:")
print(f"Accuracy: {rf_results['Accuracy']:.4f}")
print(f"Precision: {rf_results['Precision']:.4f}")
print(f"Recall: {rf_results['Recall']:.4f}")
print(f"F1-Score: {rf_results['F1-Score']:.4f}")
print(f"ROC-AUC: {rf_results['ROC-AUC']:.4f}")
print(f"Confusion Matrix:\n{cm_rf}")

# Plot confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Legitimate', 'Phishing'], yticklabels=['Legitimate', 'Phishing'])
plt.title('Random Forest Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig('/kaggle/working/random_forest_cm.png')
plt.close()

# Find optimal threshold
precision_rf, recall_rf, thresholds_rf = precision_recall_curve(y_test, y_pred_proba_rf)
optimal_idx_rf = np.argmax(precision_rf * recall_rf)
optimal_threshold_rf = thresholds_rf[optimal_idx_rf]
print(f"Random Forest Optimal Threshold: {optimal_threshold_rf:.2f}")

# Feature importance
feature_importance_rf = pd.Series(rf_model.calibrated_classifiers_[0].estimator.feature_importances_, index=selected_features)
print("\nRandom Forest Feature Importances:\n", feature_importance_rf.sort_values(ascending=False))


Random Forest Metrics:
Accuracy: 0.9683
Precision: 0.9602
Recall: 0.9761
F1-Score: 0.9681
ROC-AUC: 0.9968
Confusion Matrix:
[[1203   49]
 [  29 1182]]
Random Forest Optimal Threshold: 0.64

Random Forest Feature Importances:
 HTTPS                  0.316367
AnchorURL              0.196381
WebsiteTraffic         0.083207
SubDomains             0.075179
Index                  0.060662
PrefixSuffix-          0.039338
LinksInScriptTags      0.033984
RequestURL             0.031653
DomainRegLen           0.019056
ServerFormHandler      0.017614
LinksPointingToPage    0.015102
AgeofDomain            0.012805
UsingIP                0.011426
GoogleIndex            0.010206
DNSRecording           0.009569
PageRank               0.009270
LongURL                0.006886
UsingPopupWindow       0.005128
ShortURL               0.004994
HTTPSDomainURL         0.004890
InfoEmail              0.004554
StatsReport            0.003883
Redirecting//          0.003868
Symbol@                0.003804
Favic

In [9]:
# Training XGBoost model
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV

# Train XGBoost
xgb_model = CalibratedClassifierCV(
    XGBClassifier(learning_rate=0.1, max_depth=6, n_estimators=100,
                  scale_pos_weight=len(y_train[y_train==0])/len(y_train[y_train==1]), random_state=42),
    method='sigmoid', cv=5
)
xgb_model.fit(X_train_scaled, y_train)
joblib.dump(xgb_model, '/kaggle/working/calibrated_xgboost_model.pkl')
print("XGBoost model trained and saved.")

XGBoost model trained and saved.


In [10]:
# Testing XGBoost model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Evaluate XGBoost
y_pred_xgb = xgb_model.predict(X_test_scaled)
y_pred_proba_xgb = xgb_model.predict_proba(X_test_scaled)[:, 1]
xgb_results = {
    'Accuracy': accuracy_score(y_test, y_pred_xgb),
    'Precision': precision_score(y_test, y_pred_xgb, pos_label=1),
    'Recall': recall_score(y_test, y_pred_xgb, pos_label=1),
    'F1-Score': f1_score(y_test, y_pred_xgb, pos_label=1),
    'ROC-AUC': roc_auc_score(y_test, y_pred_proba_xgb)
}
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
print("\nXGBoost Metrics:")
print(f"Accuracy: {xgb_results['Accuracy']:.4f}")
print(f"Precision: {xgb_results['Precision']:.4f}")
print(f"Recall: {xgb_results['Recall']:.4f}")
print(f"F1-Score: {xgb_results['F1-Score']:.4f}")
print(f"ROC-AUC: {xgb_results['ROC-AUC']:.4f}")
print(f"Confusion Matrix:\n{cm_xgb}")

# Plot confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Legitimate', 'Phishing'], yticklabels=['Legitimate', 'Phishing'])
plt.title('XGBoost Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig('/kaggle/working/xgboost_cm.png')
plt.close()

# Find optimal threshold
precision_xgb, recall_xgb, thresholds_xgb = precision_recall_curve(y_test, y_pred_proba_xgb)
optimal_idx_xgb = np.argmax(precision_xgb * recall_xgb)
optimal_threshold_xgb = thresholds_xgb[optimal_idx_xgb]
print(f"XGBoost Optimal Threshold: {optimal_threshold_xgb:.2f}")

# Feature importance
feature_importance_xgb = pd.Series(xgb_model.calibrated_classifiers_[0].estimator.feature_importances_, index=selected_features)
print("\nXGBoost Feature Importances:\n", feature_importance_xgb.sort_values(ascending=False))


XGBoost Metrics:
Accuracy: 0.9659
Precision: 0.9600
Recall: 0.9711
F1-Score: 0.9655
ROC-AUC: 0.9965
Confusion Matrix:
[[1203   49]
 [  35 1176]]
XGBoost Optimal Threshold: 0.77

XGBoost Feature Importances:
 HTTPS                  0.475534
AnchorURL              0.123709
PrefixSuffix-          0.057779
ServerFormHandler      0.032643
WebsiteTraffic         0.024551
LinksInScriptTags      0.022267
RequestURL             0.020588
ShortURL               0.019069
SubDomains             0.018667
DNSRecording           0.018476
LinksPointingToPage    0.014928
GoogleIndex            0.014698
LongURL                0.014462
StatusBarCust          0.014311
Redirecting//          0.011992
UsingIP                0.011905
PageRank               0.010342
HTTPSDomainURL         0.009686
AgeofDomain            0.009104
InfoEmail              0.008937
Index                  0.008531
DomainRegLen           0.008518
Favicon                0.008076
UsingPopupWindow       0.007778
WebsiteForwarding      

In [11]:
# Training Neural Network model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.calibration import CalibratedClassifierCV

# Define Neural Network
def create_nn():
    model = Sequential([
        Input(shape=(len(selected_features),)),
        Dense(16, activation='relu'),
        Dropout(0.3),
        Dense(8, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Custom Keras wrapper with early stopping
class CalibratedKerasClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, epochs=50, batch_size=32):
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = None
        self.calibrated = None
    
    def fit(self, X, y):
        self.model = create_nn()
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, 
                       validation_split=0.2, callbacks=[early_stopping], verbose=0)
        self.calibrated = CalibratedClassifierCV(
            KerasClassifier(model=create_nn, epochs=self.epochs, batch_size=self.batch_size, verbose=0),
            method='sigmoid', cv=5
        )
        self.calibrated.fit(X, y)
        return self
    
    def predict(self, X):
        return self.calibrated.predict(X)
    
    def predict_proba(self, X):
        return self.calibrated.predict_proba(X)

# Train Neural Network
nn_model = CalibratedKerasClassifier(epochs=50, batch_size=32)
nn_model.fit(X_train_scaled, y_train)
joblib.dump(nn_model, '/kaggle/working/calibrated_neural_network_model.pkl')
print("Neural Network model trained and saved.")

2025-06-17 05:45:58.777175: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750139158.962031      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750139159.014356      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
I0000 00:00:1750139171.333610      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0
I0000 00:00:1750139174.298526     132 service.cc:148] XLA service 0x7e960c00a9e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1750139174.299066     1

Neural Network model trained and saved.


In [12]:
# Testing Neural Network model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Evaluate Neural Network
y_pred_nn = nn_model.predict(X_test_scaled)
y_pred_proba_nn = nn_model.predict_proba(X_test_scaled)[:, 1]
nn_results = {
    'Accuracy': accuracy_score(y_test, y_pred_nn),
    'Precision': precision_score(y_test, y_pred_nn, pos_label=1),
    'Recall': recall_score(y_test, y_pred_nn, pos_label=1),
    'F1-Score': f1_score(y_test, y_pred_nn, pos_label=1),
    'ROC-AUC': roc_auc_score(y_test, y_pred_proba_nn)
}
cm_nn = confusion_matrix(y_test, y_pred_nn)
print("\nNeural Network Metrics:")
print(f"Accuracy: {nn_results['Accuracy']:.4f}")
print(f"Precision: {nn_results['Precision']:.4f}")
print(f"Recall: {nn_results['Recall']:.4f}")
print(f"F1-Score: {nn_results['F1-Score']:.4f}")
print(f"ROC-AUC: {nn_results['ROC-AUC']:.4f}")
print(f"Confusion Matrix:\n{cm_nn}")

# Plot confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(cm_nn, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Legitimate', 'Phishing'], yticklabels=['Legitimate', 'Phishing'])
plt.title('Neural Network Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig('/kaggle/working/neural_network_cm.png')
plt.close()

# Find optimal threshold
precision_nn, recall_nn, thresholds_nn = precision_recall_curve(y_test, y_pred_proba_nn)
optimal_idx_nn = np.argmax(precision_nn * recall_nn)
optimal_threshold_nn = thresholds_nn[optimal_idx_nn]
print(f"Neural Network Optimal Threshold: {optimal_threshold_nn:.2f}")


Neural Network Metrics:
Accuracy: 0.9553
Precision: 0.9457
Recall: 0.9645
F1-Score: 0.9550
ROC-AUC: 0.9925
Confusion Matrix:
[[1185   67]
 [  43 1168]]
Neural Network Optimal Threshold: 0.60


In [13]:
# Comparing models and conclusion
import pandas as pd

# Combine results
results = {
    'Random Forest': rf_results,
    'XGBoost': xgb_results,
    'Neural Network': nn_results
}
results_df = pd.DataFrame(results).T
print("\nModel Comparison:\n", results_df)

# Identify best model
best_model_name = results_df['F1-Score'].idxmax()
best_model_results = results[best_model_name]
print(f"\nBest Model: {best_model_name} (F1-Score: {best_model_results['F1-Score']:.4f})")

# Save best model
models = {'Random Forest': rf_model, 'XGBoost': xgb_model, 'Neural Network': nn_model}
best_model = models[best_model_name]
joblib.dump(best_model, f'/kaggle/working/calibrated_{best_model_name.lower().replace(" ", "_")}_model_v9.pkl')
print(f"Saved calibrated_{best_model_name.lower().replace(' ', '_')}_model_v9.pkl to /kaggle/working/")

# Prediction function
def extract_features(url):
    try:
        parsed_url = urllib.parse.urlparse(url if url.startswith(('http://', 'https://')) else f'http://{url}')
        extracted = tldextract.extract(url)
        domain_len = len(extracted.domain)
        subdomain_count = len(extracted.subdomain.split('.')) if extracted.subdomain else 0
        path = parsed_url.path.lower()
        domain = extracted.domain.lower()
        suspicious_keywords = ['login', 'secure', 'account', 'verify', 'update', 'auth']
        tld = extracted.suffix.lower()
        is_local_ip = parsed_url.hostname and parsed_url.hostname.startswith('192.168.')
        entropy = calculate_entropy(domain)
        https = 1 if parsed_url.scheme == 'https' else -1
        features = {
            'AnchorURL': 1 if any(kw in path or kw in domain for kw in suspicious_keywords) else -1,
            'WebsiteTraffic': 1 if domain_len < 7 and tld in ['com', 'org', 'edu', 'gov'] else -1,
            'AgeofDomain': 1 if domain_len < 9 and tld in ['com', 'org', 'gov'] else -1,
            'PageRank': 1 if domain_len < 7 and tld in ['com', 'org', 'edu', 'gov'] else -1,
            'PrefixSuffix-': 1 if '-' in extracted.domain else -1,
            'UsingIP': 1 if is_local_ip else -2 if parsed_url.hostname and parsed_url.hostname.replace('.', '').isdigit() else -1,
            'SubDomains': -4 if subdomain_count >= 2 else -2 if subdomain_count == 1 else 0,
            'Entropy': entropy,
            'HTTPS': https
        }
        return features
    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        return {'AnchorURL': -1, 'WebsiteTraffic': -1, 'AgeofDomain': -1, 'PageRank': -1, 
                'PrefixSuffix-': -1, 'UsingIP': -1, 'SubDomains': 0, 'Entropy': 0, 'HTTPS': -1}

def predict_safety(url, model, scaler, threshold=0.5):
    features = extract_features(url)
    input_df = pd.DataFrame([features], columns=selected_features)
    input_df = input_df.astype(float)
    input_df_scaled = scaler.transform(input_df)
    safety_score = model.predict_proba(input_df_scaled)[0, 0] * 100
    label = 'Likely Safe' if safety_score >= (1 - threshold) * 100 else 'Likely Phishing'
    return {'url': url, 'safety_score': round(safety_score, 2), 'label': label, 'features': features}

# Test URLs
test_urls = [
    "https://www.google.com",
    "https://fake-bank-login.com",
    "http://192.168.1.1/login",
    "https://paypal-security-login.com",
    "https://www.wikipedia.org",
    "https://www.netflix.com",
    "https://secure-login-bank.com",
    "https://www.linkedin.com",
    "http://update-your-account.com",
    "https://www.bbc.co.uk",
    "https://gov.engdwpid.icu/cn",
    "https://skymailre-validate.weebly.com/"
]

# Test predictions
optimal_threshold = {'Random Forest': optimal_threshold_rf, 'XGBoost': optimal_threshold_xgb, 'Neural Network': optimal_threshold_nn}
print(f"\n{best_model_name} URL Safety Predictions (Threshold: {optimal_threshold[best_model_name]:.2f}):")
for url in test_urls:
    result = predict_safety(url, best_model, scaler, optimal_threshold[best_model_name])
    print(f"URL: {result['url']} -> Safety Score: {result['safety_score']}% ({result['label']})")
    print(f"Features: {result['features']}")

# Conclusion
print("\nConclusion:")
print(f"The {best_model_name} model performed the best with an F1-Score of {best_model_results['F1-Score']:.4f}. "
      "It demonstrates strong capability in distinguishing between legitimate and phishing URLs. "
      "Key features influencing predictions include Entropy, HTTPS, and AnchorURL, as seen in feature importance (for tree-based models). "
      "The model is saved and can be used for real-time URL safety predictions.")


Model Comparison:
                 Accuracy  Precision    Recall  F1-Score   ROC-AUC
Random Forest   0.968331   0.960195  0.976053  0.968059  0.996825
XGBoost         0.965895   0.960000  0.971098  0.965517  0.996506
Neural Network  0.955339   0.945749  0.964492  0.955029  0.992461

Best Model: Random Forest (F1-Score: 0.9681)
Saved calibrated_random_forest_model_v9.pkl to /kaggle/working/

Random Forest URL Safety Predictions (Threshold: 0.64):
URL: https://www.google.com -> Safety Score: 87.64% (Likely Safe)
Features: {'AnchorURL': -1, 'WebsiteTraffic': 1, 'AgeofDomain': 1, 'PageRank': 1, 'PrefixSuffix-': -1, 'UsingIP': -1, 'SubDomains': -2, 'Entropy': 1.9182958340544893, 'HTTPS': 1}
URL: https://fake-bank-login.com -> Safety Score: 1.4% (Likely Phishing)
Features: {'AnchorURL': 1, 'WebsiteTraffic': -1, 'AgeofDomain': -1, 'PageRank': -1, 'PrefixSuffix-': 1, 'UsingIP': -1, 'SubDomains': 0, 'Entropy': 3.373557262275185, 'HTTPS': 1}
URL: http://192.168.1.1/login -> Safety Score: 89.73%

In [14]:
# Training LightGBM model
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV
import joblib

# Train LightGBM
lgbm_model = CalibratedClassifierCV(
    LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42, verbose=-1),
    method='sigmoid', cv=5
)
lgbm_model.fit(X_train_scaled, y_train)
joblib.dump(lgbm_model, '/kaggle/working/calibrated_lightgbm_model.pkl')
print("LightGBM model trained and saved.")

LightGBM model trained and saved.


In [15]:
# Testing LightGBM model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Evaluate LightGBM
y_pred_lgbm = lgbm_model.predict(X_test_scaled)
y_pred_proba_lgbm = lgbm_model.predict_proba(X_test_scaled)[:, 1]
lgbm_results = {
    'Accuracy': accuracy_score(y_test, y_pred_lgbm),
    'Precision': precision_score(y_test, y_pred_lgbm, pos_label=1),
    'Recall': recall_score(y_test, y_pred_lgbm, pos_label=1),
    'F1-Score': f1_score(y_test, y_pred_lgbm, pos_label=1),
    'ROC-AUC': roc_auc_score(y_test, y_pred_proba_lgbm)
}
cm_lgbm = confusion_matrix(y_test, y_pred_lgbm)
print("\nLightGBM Metrics:")
print(f"Accuracy: {lgbm_results['Accuracy']:.4f}")
print(f"Precision: {lgbm_results['Precision']:.4f}")
print(f"Recall: {lgbm_results['Recall']:.4f}")
print(f"F1-Score: {lgbm_results['F1-Score']:.4f}")
print(f"ROC-AUC: {lgbm_results['ROC-AUC']:.4f}")
print(f"Confusion Matrix:\n{cm_lgbm}")

# Plot confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(cm_lgbm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Legitimate', 'Phishing'], yticklabels=['Legitimate', 'Phishing'])
plt.title('LightGBM Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig('/kaggle/working/lightgbm_cm.png')
plt.close()

# Find optimal threshold
precision_lgbm, recall_lgbm, thresholds_lgbm = precision_recall_curve(y_test, y_pred_proba_lgbm)
optimal_idx_lgbm = np.argmax(precision_lgbm * recall_lgbm)
optimal_threshold_lgbm = thresholds_lgbm[optimal_idx_lgbm]
print(f"LightGBM Optimal Threshold: {optimal_threshold_lgbm:.2f}")

# Feature importance
feature_importance_lgbm = pd.Series(lgbm_model.calibrated_classifiers_[0].estimator.feature_importances_, index=selected_features)
print("\nLightGBM Feature Importances:\n", feature_importance_lgbm.sort_values(ascending=False))


LightGBM Metrics:
Accuracy: 0.9655
Precision: 0.9600
Recall: 0.9703
F1-Score: 0.9651
ROC-AUC: 0.9966
Confusion Matrix:
[[1203   49]
 [  36 1175]]
LightGBM Optimal Threshold: 0.70

LightGBM Feature Importances:
 Index                  480
HTTPS                  257
WebsiteTraffic         248
LinksInScriptTags      246
SubDomains             229
AnchorURL              212
LinksPointingToPage    148
RequestURL             123
AgeofDomain             95
PrefixSuffix-           87
ServerFormHandler       78
DomainRegLen            76
PageRank                69
DNSRecording            66
IframeRedirection       59
GoogleIndex             59
UsingIP                 57
LongURL                 51
ShortURL                48
InfoEmail               46
UsingPopupWindow        39
Favicon                 35
Redirecting//           32
WebsiteForwarding       28
AbnormalURL             28
NonStdPort              24
HTTPSDomainURL          22
DisableRightClick       19
StatusBarCust           15
Symbo

In [16]:
# Training CatBoost model
from catboost import CatBoostClassifier
from sklearn.calibration import CalibratedClassifierCV
import joblib

# Train CatBoost
catboost_model = CalibratedClassifierCV(
    CatBoostClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42, verbose=0),
    method='sigmoid', cv=5
)
catboost_model.fit(X_train_scaled, y_train)
joblib.dump(catboost_model, '/kaggle/working/calibrated_catboost_model.pkl')
print("CatBoost model trained and saved.")

CatBoost model trained and saved.


In [17]:
# Testing CatBoost model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Evaluate CatBoost
y_pred_catboost = catboost_model.predict(X_test_scaled)
y_pred_proba_catboost = catboost_model.predict_proba(X_test_scaled)[:, 1]
catboost_results = {
    'Accuracy': accuracy_score(y_test, y_pred_catboost),
    'Precision': precision_score(y_test, y_pred_catboost, pos_label=1),
    'Recall': recall_score(y_test, y_pred_catboost, pos_label=1),
    'F1-Score': f1_score(y_test, y_pred_catboost, pos_label=1),
    'ROC-AUC': roc_auc_score(y_test, y_pred_proba_catboost)
}
cm_catboost = confusion_matrix(y_test, y_pred_catboost)
print("\nCatBoost Metrics:")
print(f"Accuracy: {catboost_results['Accuracy']:.4f}")
print(f"Precision: {catboost_results['Precision']:.4f}")
print(f"Recall: {catboost_results['Recall']:.4f}")
print(f"F1-Score: {catboost_results['F1-Score']:.4f}")
print(f"ROC-AUC: {catboost_results['ROC-AUC']:.4f}")
print(f"Confusion Matrix:\n{cm_catboost}")

# Plot confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(cm_catboost, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Legitimate', 'Phishing'], yticklabels=['Legitimate', 'Phishing'])
plt.title('CatBoost Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig('/kaggle/working/catboost_cm.png')
plt.close()

# Find optimal threshold
precision_catboost, recall_catboost, thresholds_catboost = precision_recall_curve(y_test, y_pred_proba_catboost)
optimal_idx_catboost = np.argmax(precision_catboost * recall_catboost)
optimal_threshold_catboost = thresholds_catboost[optimal_idx_catboost]
print(f"CatBoost Optimal Threshold: {optimal_threshold_catboost:.2f}")

# Feature importance
feature_importance_catboost = pd.Series(catboost_model.calibrated_classifiers_[0].estimator.feature_importances_, index=selected_features)
print("\nCatBoost Feature Importances:\n", feature_importance_catboost.sort_values(ascending=False))


CatBoost Metrics:
Accuracy: 0.9675
Precision: 0.9624
Recall: 0.9719
F1-Score: 0.9671
ROC-AUC: 0.9964
Confusion Matrix:
[[1206   46]
 [  34 1177]]
CatBoost Optimal Threshold: 0.56

CatBoost Feature Importances:
 AnchorURL              20.821653
HTTPS                  15.746782
WebsiteTraffic          9.538633
SubDomains              6.193686
PrefixSuffix-           5.505230
Index                   4.666806
ServerFormHandler       4.342038
LinksInScriptTags       4.092566
LinksPointingToPage     3.698499
DNSRecording            3.379207
RequestURL              3.182867
UsingIP                 3.058244
AgeofDomain             2.671087
PageRank                1.889003
DomainRegLen            1.749532
GoogleIndex             1.724020
LongURL                 1.359689
Favicon                 0.923789
AbnormalURL             0.905219
UsingPopupWindow        0.698898
InfoEmail               0.665896
HTTPSDomainURL          0.646896
Redirecting//           0.603568
IframeRedirection       0.510

In [18]:
# Training SVM model
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
import joblib

# Train SVM
svm_model = CalibratedClassifierCV(
    SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42),
    method='sigmoid', cv=5
)
svm_model.fit(X_train_scaled, y_train)
joblib.dump(svm_model, '/kaggle/working/calibrated_svm_model.pkl')
print("SVM model trained and saved.")

SVM model trained and saved.


In [19]:
# Testing SVM model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Evaluate SVM
y_pred_svm = svm_model.predict(X_test_scaled)
y_pred_proba_svm = svm_model.predict_proba(X_test_scaled)[:, 1]
svm_results = {
    'Accuracy': accuracy_score(y_test, y_pred_svm),
    'Precision': precision_score(y_test, y_pred_svm, pos_label=1),
    'Recall': recall_score(y_test, y_pred_svm, pos_label=1),
    'F1-Score': f1_score(y_test, y_pred_svm, pos_label=1),
    'ROC-AUC': roc_auc_score(y_test, y_pred_proba_svm)
}
cm_svm = confusion_matrix(y_test, y_pred_svm)
print("\nSVM Metrics:")
print(f"Accuracy: {svm_results['Accuracy']:.4f}")
print(f"Precision: {svm_results['Precision']:.4f}")
print(f"Recall: {svm_results['Recall']:.4f}")
print(f"F1-Score: {svm_results['F1-Score']:.4f}")
print(f"ROC-AUC: {svm_results['ROC-AUC']:.4f}")
print(f"Confusion Matrix:\n{cm_svm}")

# Plot confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(cm_svm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Legitimate', 'Phishing'], yticklabels=['Legitimate', 'Phishing'])
plt.title('SVM Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig('/kaggle/working/svm_cm.png')
plt.close()

# Find optimal threshold
precision_svm, recall_svm, thresholds_svm = precision_recall_curve(y_test, y_pred_proba_svm)
optimal_idx_svm = np.argmax(precision_svm * recall_svm)
optimal_threshold_svm = thresholds_svm[optimal_idx_svm]
print(f"SVM Optimal Threshold: {optimal_threshold_svm:.2f}")


SVM Metrics:
Accuracy: 0.9566
Precision: 0.9473
Recall: 0.9653
F1-Score: 0.9562
ROC-AUC: 0.9899
Confusion Matrix:
[[1187   65]
 [  42 1169]]
SVM Optimal Threshold: 0.59


In [31]:
# Preprocessing for LSTM model
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Check for URL column
if 'URL' in csv_data.columns:
    # Prepare URL character sequences
    max_len = 100  # Maximum URL length
    tokenizer = Tokenizer(char_level=True, lower=True)
    tokenizer.fit_on_texts(csv_data['URL'])
    X_seq = tokenizer.texts_to_sequences(csv_data['URL'])
    X_seq_padded = pad_sequences(X_seq, maxlen=max_len, padding='post', truncating='post')

    # Split data for LSTM
    X_train_seq, X_test_seq, y_train_seq, y_test_seq = train_test_split(X_seq_padded, y_aug, test_size=0.2, random_state=42)
    print("LSTM Training Shape:", X_train_seq.shape, y_train_seq.shape)
    print("LSTM Testing Shape:", X_test_seq.shape, y_test_seq.shape)
else:
    print("No 'URL' column found in dataset. Skipping LSTM preprocessing.")
    X_train_seq, X_test_seq, y_train_seq, y_test_seq = None, None, None, None
    tokenizer = None

No 'URL' column found in dataset. Skipping LSTM preprocessing.


In [33]:
# Comparing all models and conclusion
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
import joblib

# Combine results (exclude LSTM if not preprocessed)
results = {
    'Random Forest': rf_results,
    'XGBoost': xgb_results,
    'Neural Network': nn_results,
    'LightGBM': lgbm_results,
    'CatBoost': catboost_results,
    'SVM': svm_results
}
if 'URL' in csv_data.columns and 'lstm_results' in globals():
    results['LSTM'] = lstm_results

results_df = pd.DataFrame(results).T
print("\nModel Comparison:\n", results_df)

# Identify best model
best_model_name = results_df['F1-Score'].idxmax()
best_model_results = results[best_model_name]
print(f"\nBest Model: {best_model_name} (F1-Score: {best_model_results['F1-Score']:.4f})")

# Plot ROC curves
plt.figure(figsize=(8, 6))
models = {
    'Random Forest': rf_model,
    'XGBoost': xgb_model,
    'Neural Network': nn_model,
    'LightGBM': lgbm_model,
    'CatBoost': catboost_model,
    'SVM': svm_model
}
for name, model in models.items():
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc_score(y_test, y_pred_proba):.2f})')
if 'URL' in csv_data.columns and 'lstm_model' in globals():
    y_pred_proba_lstm = lstm_model.predict_proba(X_test_seq)[:, 1]
    fpr_lstm, tpr_lstm, _ = roc_curve(y_test_seq, y_pred_proba_lstm)
    plt.plot(fpr_lstm, tpr_lstm, label=f'LSTM (AUC = {roc_auc_score(y_test_seq, y_pred_proba_lstm):.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for All Models')
plt.legend()
plt.savefig('/kaggle/working/roc_curves_all_models.png')
plt.close()

# Save best model
models_all = {
    'Random Forest': rf_model,
    'XGBoost': xgb_model,
    'Neural Network': nn_model,
    'LightGBM': lgbm_model,
    'CatBoost': catboost_model,
    'SVM': svm_model
}
if 'URL' in csv_data.columns and 'lstm_model' in globals():
    models_all['LSTM'] = lstm_model
best_model = models_all[best_model_name]
joblib.dump(best_model, f'/kaggle/working/calibrated_{best_model_name.lower().replace(" ", "_")}_model_v10.pkl')
print(f"Saved calibrated_{best_model_name.lower().replace(' ', '_')}_model_v10.pkl to /kaggle/working/")

# Update predict_safety for LSTM compatibility
def predict_safety(url, model, scaler, tokenizer=None, max_len=100, threshold=0.5, is_lstm=False):
    if is_lstm and tokenizer is not None:
        seq = tokenizer.texts_to_sequences([url])
        seq_padded = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
        safety_score = model.predict_proba(seq_padded)[0, 0] * 100
        label = 'Likely Safe' if safety_score >= (1 - threshold) * 100 else 'Likely Phishing'
        return {'url': url, 'safety_score': round(safety_score, 2), 'label': label, 'features': 'Character sequences'}
    else:
        features = extract_features(url)
        input_df = pd.DataFrame([features], columns=selected_features)
        input_df = input_df.astype(float)
        input_df_scaled = scaler.transform(input_df)
        safety_score = model.predict_proba(input_df_scaled)[0, 0] * 100
        label = 'Likely Safe' if safety_score >= (1 - threshold) * 100 else 'Likely Phishing'
        return {'url': url, 'safety_score': round(safety_score, 2), 'label': label, 'features': features}

# Test URLs
test_urls = [
    "https://www.google.com",
    "https://fake-bank-login.com",
    "http://192.168.1.1/login",
    "https://paypal-security-login.com",
    "https://www.wikipedia.org",
    "https://www.netflix.com",
    "https://secure-login-bank.com",
    "https://www.linkedin.com",
    "http://update-your-account.com",
    "https://www.bbc.co.uk",
    "https://gov.engdwpid.icu/cn",
    "https://skymailre-validate.weebly.com/"
]

# Test predictions with best model
optimal_thresholds = {
    'Random Forest': optimal_threshold_rf,
    'XGBoost': optimal_threshold_xgb,
    'Neural Network': optimal_threshold_nn,
    'LightGBM': optimal_threshold_lgbm,
    'CatBoost': optimal_threshold_catboost,
    'SVM': optimal_threshold_svm
}
if 'URL' in csv_data.columns and 'lstm_model' in globals():
    optimal_thresholds['LSTM'] = optimal_threshold_lstm

print(f"\n{best_model_name} URL Safety Predictions (Threshold: {optimal_thresholds[best_model_name]:.2f}):")
for url in test_urls:
    result = predict_safety(url, best_model, scaler, tokenizer, max_len, optimal_thresholds[best_model_name], is_lstm=(best_model_name == 'LSTM'))
    print(f"URL: {result['url']} -> Safety Score: {result['safety_score']}% ({result['label']})")
    print(f"Features: {result['features']}")

# Conclusion
print("\nConclusion:")
print(f"The {best_model_name} model achieved the highest F1-Score of {best_model_results['F1-Score']:.4f}. "
      "Among the new models, LightGBM, CatBoost, and SVM provide strong alternatives. "
      "The LSTM model was skipped or included based on the availability of raw URLs. "
      "Tree-based models (Random Forest, LightGBM, CatBoost) often excel due to feature importance insights, while SVM offers robust performance for non-linear patterns. "
      "The best model is saved for deployment.")


Model Comparison:
                 Accuracy  Precision    Recall  F1-Score   ROC-AUC
Random Forest   0.968331   0.960195  0.976053  0.968059  0.996825
XGBoost         0.965895   0.960000  0.971098  0.965517  0.996506
Neural Network  0.955339   0.945749  0.964492  0.955029  0.992461
LightGBM        0.965489   0.959967  0.970273  0.965092  0.996634
CatBoost        0.967519   0.962388  0.971924  0.967132  0.996417
SVM             0.956557   0.947326  0.965318  0.956237  0.989886

Best Model: Random Forest (F1-Score: 0.9681)
Saved calibrated_random_forest_model_v10.pkl to /kaggle/working/

Random Forest URL Safety Predictions (Threshold: 0.64):
URL: https://www.google.com -> Safety Score: 87.64% (Likely Safe)
Features: {'AnchorURL': -1, 'WebsiteTraffic': 1, 'AgeofDomain': 1, 'PageRank': 1, 'PrefixSuffix-': -1, 'UsingIP': -1, 'SubDomains': -2, 'Entropy': 1.9182958340544893, 'HTTPS': 1}
URL: https://fake-bank-login.com -> Safety Score: 1.4% (Likely Phishing)
Features: {'AnchorURL': 1, 'Webs