In [2]:
import numpy as np

X = np.load("X_features.npy")
y = np.load("y_labels.npy")


In [None]:
X


array([[225,  58,  32, ...,   1,   0,   0],
       [ 81,   1,  15, ...,   0,   0,   0],
       [177,  47,  19, ...,   0,   1,   1],
       ...,
       [ 17,   0,   3, ...,   0,   0,   0],
       [ 18,   0,   3, ...,   0,   0,   0],
       [ 17,   0,   2, ...,   0,   0,   0]])

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
X_scaled = X_scaled.reshape((X_scaled.shape[0], X_scaled.shape[1], 1))


In [None]:
print(X_scaled.shape)


(549346, 21, 1)


In [None]:
# If labels are strings, map them; if they are already 0/1, this won’t break anything
if y.dtype.type is np.str_ or y.dtype.type is np.object_:
    print("Converting labels 'good'/'bad' to 0/1...")
    y = np.where(y == "bad", 1, 0)

# Make sure it's int
y = y.astype(int)

unique, counts = np.unique(y, return_counts=True)
print("Label distribution:", dict(zip(unique, counts)))


Label distribution: {np.int64(0): np.int64(392924), np.int64(1): np.int64(156422)}


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)


Train shape: (439476, 21) (439476,)
Test shape: (109870, 21) (109870,)


In [None]:
scaler = StandardScaler()

# Fit on train only, transform both train & test
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

print("After scaling:")
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)


After scaling:
X_train_scaled shape: (439476, 21)
X_test_scaled shape: (109870, 21)


In [None]:
X_train_scaled = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_test_scaled  = X_test_scaled.reshape((X_test_scaled.shape[0],  X_test_scaled.shape[1],  1))

print("Final train shape:", X_train_scaled.shape)
print("Final test shape:", X_test_scaled.shape)


Final train shape: (439476, 21, 1)
Final test shape: (109870, 21, 1)


In [None]:
from sklearn.utils.class_weight import compute_class_weight


In [None]:
classes = np.unique(y_train)
class_weights_vals = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train
)

class_weights = {int(c): w for c, w in zip(classes, class_weights_vals)}

print("Class weights:", class_weights)


Class weights: {0: np.float64(0.6990478432520304), 1: np.float64(1.7559794465266068)}


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, LSTM, Dense, Dropout, BatchNormalization, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


In [None]:
timesteps = X_train_scaled.shape[1]   # 23
channels  = X_train_scaled.shape[2]   # 1

inputs = Input(shape=(timesteps, channels))

# --- CNN Block ---
x = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same')(inputs)
x = BatchNormalization()(x)
x = MaxPooling1D(pool_size=2)(x)
x = Dropout(0.3)(x)

x = Conv1D(filters=128, kernel_size=3, activation='relu', padding='same')(x)
x = BatchNormalization()(x)
x = MaxPooling1D(pool_size=2)(x)
x = Dropout(0.3)(x)

# --- LSTM Block ---
x = LSTM(64, return_sequences=False)(x)
x = Dropout(0.4)(x)

# --- Dense Block ---
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)

outputs = Dense(1, activation='sigmoid')(x)

model = Model(inputs=inputs, outputs=outputs)

model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc'),
    ]
)

model.summary()


In [None]:
checkpoint_path = "best_cnn_lstm_phishing.keras"

callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    ),
    ModelCheckpoint(
        filepath=checkpoint_path,
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
        verbose=1,
        min_lr=1e-6
    )
]


In [None]:
batch_size = 512   # you can try 256 or 1024 depending on GPU/CPU
epochs = 20

history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.1,          # 10% of train data for validation
    epochs=epochs,
    batch_size=batch_size,
    class_weight=class_weights,
    callbacks=callbacks,
    verbose=1
)


Epoch 1/20
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7640 - auc: 0.8251 - loss: 0.4986 - precision: 0.5722 - recall: 0.6862
Epoch 1: val_loss improved from inf to 0.36697, saving model to best_cnn_lstm_phishing.keras
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 11ms/step - accuracy: 0.7640 - auc: 0.8252 - loss: 0.4986 - precision: 0.5723 - recall: 0.6863 - val_accuracy: 0.8418 - val_auc: 0.9099 - val_loss: 0.3670 - val_precision: 0.6975 - val_recall: 0.7846 - learning_rate: 0.0010
Epoch 2/20
[1m771/773[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - accuracy: 0.8309 - auc: 0.9000 - loss: 0.3957 - precision: 0.6768 - recall: 0.7790
Epoch 2: val_loss improved from 0.36697 to 0.33197, saving model to best_cnn_lstm_phishing.keras
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.8309 - auc: 0.9000 - loss: 0.3957 - precision: 0.6769 - recall: 0.7790 - val_accuracy

In [None]:
test_loss, test_acc, test_prec, test_rec, test_auc = model.evaluate(X_test_scaled, y_test, verbose=1)

print(f"Test Loss     : {test_loss:.4f}")
print(f"Test Accuracy : {test_acc:.4f}")
print(f"Test Precision: {test_prec:.4f}")
print(f"Test Recall   : {test_rec:.4f}")
print(f"Test AUC      : {test_auc:.4f}")


[1m3434/3434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 4ms/step - accuracy: 0.8879 - auc: 0.9456 - loss: 0.2757 - precision: 0.7863 - recall: 0.8330
Test Loss     : 0.2767
Test Accuracy : 0.8873
Test Precision: 0.7853
Test Recall   : 0.8313
Test AUC      : 0.9454


In [None]:
# Predicted probabilities
y_proba = model.predict(X_test_scaled, batch_size=1024)
y_pred = (y_proba >= 0.5).astype(int).reshape(-1)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

print("ROC AUC:", roc_auc_score(y_test, y_proba))


[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
Confusion Matrix:
[[71475  7110]
 [ 5277 26008]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9312    0.9095    0.9203     78585
           1     0.7853    0.8313    0.8077     31285

    accuracy                         0.8873    109870
   macro avg     0.8583    0.8704    0.8640    109870
weighted avg     0.8897    0.8873    0.8882    109870

ROC AUC: 0.9454484395559305


In [None]:
# Save final model
model.save("cnn_lstm_phishing_final.keras")

# Save scaler
import joblib
joblib.dump(scaler, "feature_scaler.pkl")

print("Saved cnn_lstm_phishing_final.keras and feature_scaler.pkl")


Saved cnn_lstm_phishing_final.keras and feature_scaler.pkl


In [None]:
import tensorflow as tf
import joblib
import numpy as np

# Load trained model
model = tf.keras.models.load_model("cnn_lstm_phishing_final.keras")

# Load scaler
scaler = joblib.load("feature_scaler.pkl")


In [None]:
!pip install tldextract

Collecting tldextract
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-3.0.1-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.3.0-py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-3.0.1-py2.py3-none-any.whl (4.5 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-3.0.1 tldextract-5.3.0


In [None]:
import re
import tldextract
from urllib.parse import urlparse, parse_qs

SUSPICIOUS_WORDS = [
    "secure", "account", "verify",
    "login", "update", "bank", "confirm"
]

def extract_features(url):
    features = {}

    parsed = urlparse(url)

    # ---- Lexical Features ----
    features["url_length"] = len(url)
    features["num_digits"] = sum(c.isdigit() for c in url)
    features["num_special_chars"] = sum(not c.isalnum() for c in url)
    features["num_dots"] = url.count(".")
    features["has_ip"] = int(bool(re.search(r"\d+\.\d+\.\d+\.\d+", url)))
    features["has_at_symbol"] = int("@" in url)
    features["has_double_slash"] = int(url.count("//") > 1)
    features["has_hyphen"] = int("-" in url)
    features["protocol_http"] = int(parsed.scheme == "http")
    features["protocol_https"] = int(parsed.scheme == "https")

    # ---- Domain & Subdomain ----
    ext = tldextract.extract(url)
    domain = ext.domain or ""
    subdomain = ext.subdomain or ""
    suffix = ext.suffix or ""

    features["domain_length"] = len(domain)
    features["subdomain_length"] = len(subdomain)
    features["num_subdomains"] = subdomain.count(".") + 1 if subdomain else 0
    features["tld_length"] = len(suffix)

    features["suspicious_subdomain"] = int(
        any(word in subdomain.lower() for word in SUSPICIOUS_WORDS)
    ) if subdomain else 0

    # ---- Path & Query ----
    features["path_length"] = len(parsed.path)
    features["query_length"] = len(parsed.query)

    params = parse_qs(parsed.query)
    features["num_params"] = len(params)

    lower_url = url.lower()
    features["has_login_keyword"] = int("login" in lower_url)
    features["has_secure_keyword"] = int("secure" in lower_url)
    features["has_update_keyword"] = int("update" in lower_url)

    # Dummy WHOIS (needed to keep input shape same)
   # features["domain_age_days"] = 0
   # features["domain_expiry_days"] = 0

    return np.array(list(features.values()))


In [None]:
def predict_url(url):
    # Extract features
    feats = extract_features(url)

    # Convert to 2D for scaler (1,23)
    feats = feats.reshape(1, -1)

    # Scale
    feats_scaled = scaler.transform(feats)

    # Reshape for CNN + LSTM (1, 23, 1)
    feats_scaled = feats_scaled.reshape((1, feats_scaled.shape[1], 1))

    # Predict
    prob = model.predict(feats_scaled)[0][0]

    # Threshold = 0.5
    label = "BAD (Phishing)" if prob >= 0.5 else "GOOD (Safe)"

    return label, prob


In [None]:
url = "https://secure-login-paypal.com/update-info"
label, prob = predict_url(url)

print("URL:", url)
print("Prediction:", label)
print("Probability:", prob)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 347ms/step
URL: https://secure-login-paypal.com/update-info
Prediction: BAD (Phishing)
Probability: 0.99894613


In [None]:
urls = [
    "https://www.google.com",
    "http://198.23.44.12/login",
    "https://secure-paypal-verification.com/update",
    "https://github.com",
]

for u in urls:
    label, prob = predict_url(u)
    print(f"{u}  -->  {label} ({prob:.4f})")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
https://www.google.com  -->  BAD (Phishing) (0.8820)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
http://198.23.44.12/login  -->  BAD (Phishing) (0.9999)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step
https://secure-paypal-verification.com/update  -->  BAD (Phishing) (0.9966)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
https://github.com  -->  BAD (Phishing) (0.8828)


In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print("Train:", X_train.shape, y_train.shape)
print("Test :", X_test.shape, y_test.shape)


Train: (439476, 21) (439476,)
Test : (109870, 21) (109870,)


In [4]:
X_train

array([[22,  0,  2, ...,  0,  0,  0],
       [66,  1, 12, ...,  0,  0,  0],
       [90, 10, 13, ...,  0,  0,  0],
       ...,
       [65,  6, 13, ...,  0,  0,  0],
       [66,  0,  9, ...,  0,  0,  0],
       [24,  0,  5, ...,  0,  0,  0]])

In [5]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric='logloss',
    n_jobs=-1
)

xgb.fit(X_train, y_train)


In [7]:
!pip install catboost scikit-learn


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [8]:
from catboost import CatBoostClassifier

cat = CatBoostClassifier(
    iterations=300,
    depth=8,
    learning_rate=0.1,
    loss_function="Logloss",
    verbose=False,
)

cat.fit(X_train, y_train)
cat.save_model("cat_model.cbm")
print("cat_model.cbm saved!")



cat_model.cbm saved!


In [9]:
X_train

array([[22,  0,  2, ...,  0,  0,  0],
       [66,  1, 12, ...,  0,  0,  0],
       [90, 10, 13, ...,  0,  0,  0],
       ...,
       [65,  6, 13, ...,  0,  0,  0],
       [66,  0,  9, ...,  0,  0,  0],
       [24,  0,  5, ...,  0,  0,  0]])

In [10]:
# Train predictions
xgb_train_pred = xgb.predict_proba(X_train)[:,1]
cat_train_pred = cat.predict_proba(X_train)[:,1]

stack_train = np.column_stack((xgb_train_pred, cat_train_pred))

# Test predictions
xgb_test_pred = xgb.predict_proba(X_test)[:,1]
cat_test_pred = cat.predict_proba(X_test)[:,1]

stack_test = np.column_stack((xgb_test_pred, cat_test_pred))


In [11]:
from sklearn.linear_model import LogisticRegression

meta = LogisticRegression()
meta.fit(stack_train, y_train)


In [13]:
!pip install joblib



In [16]:
import joblib


In [17]:
# Final stacked output
final_pred = meta.predict(stack_test)
final_prob = meta.predict_proba(stack_test)[:,1]
joblib.dump(meta, "meta_model.pkl")
print("Saved meta_model.pkl (correct model)!")




Saved meta_model.pkl (correct model)!


In [18]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print(confusion_matrix(y_test, final_pred))
print(classification_report(y_test, final_pred, digits=4))
print("AUC:", roc_auc_score(y_test, final_prob))


[[75319  3266]
 [ 6696 24589]]
              precision    recall  f1-score   support

           0     0.9184    0.9584    0.9380     78585
           1     0.8827    0.7860    0.8316     31285

    accuracy                         0.9093    109870
   macro avg     0.9006    0.8722    0.8848    109870
weighted avg     0.9082    0.9093    0.9077    109870

AUC: 0.9553839491332983


In [19]:
import joblib

joblib.dump(xgb, "xgb_model.pkl")
joblib.dump(cat, "cat_model.pkl")
joblib.dump(meta, "stacked_meta.pkl")

print("Models saved!")


Models saved!


In [20]:
def predict_url(url):
    feats = extract_features(url).reshape(1, -1)



    # Base model predictions
    xgb_p = xgb.predict_proba(feats)[:,1]
    cat_p = cat.predict_proba(feats)[:,1]

    stack_input = np.column_stack((xgb_p, cat_p))

    final_p = meta.predict(stack_input)[0]
    prob = meta.predict_proba(stack_input)[0][1]

    label = "BAD (Phishing)" if final_p == 1 else "GOOD (Safe)"
    return label, prob


In [22]:
urls = [
    "https://www.google.com",
    "https://github.com",
    "http://198.23.44.12/login",
    "https://secure-paypal-verification.com/update"
]

for u in urls:
    label, prob = predict_url(u)
    print(f"{u}  →  {label} ({prob:.4f})")


NameError: name 'extract_features' is not defined