In [2]:
import numpy as np

X = np.load("X_features.npy")
y = np.load("y_labels.npy")


In [None]:
X


array([[225,  58,  32, ...,   1,   0,   0],
       [ 81,   1,  15, ...,   0,   0,   0],
       [177,  47,  19, ...,   0,   1,   1],
       ...,
       [ 17,   0,   3, ...,   0,   0,   0],
       [ 18,   0,   3, ...,   0,   0,   0],
       [ 17,   0,   2, ...,   0,   0,   0]])

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print("Train:", X_train.shape, y_train.shape)
print("Test :", X_test.shape, y_test.shape)


Train: (439476, 21) (439476,)
Test : (109870, 21) (109870,)


In [4]:
X_train

array([[22,  0,  2, ...,  0,  0,  0],
       [66,  1, 12, ...,  0,  0,  0],
       [90, 10, 13, ...,  0,  0,  0],
       ...,
       [65,  6, 13, ...,  0,  0,  0],
       [66,  0,  9, ...,  0,  0,  0],
       [24,  0,  5, ...,  0,  0,  0]])

In [5]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric='logloss',
    n_jobs=-1
)

xgb.fit(X_train, y_train)


In [7]:
!pip install catboost scikit-learn


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [8]:
from catboost import CatBoostClassifier

cat = CatBoostClassifier(
    iterations=300,
    depth=8,
    learning_rate=0.1,
    loss_function="Logloss",
    verbose=False,
)

cat.fit(X_train, y_train)
cat.save_model("cat_model.cbm")
print("cat_model.cbm saved!")



cat_model.cbm saved!


In [9]:
X_train

array([[22,  0,  2, ...,  0,  0,  0],
       [66,  1, 12, ...,  0,  0,  0],
       [90, 10, 13, ...,  0,  0,  0],
       ...,
       [65,  6, 13, ...,  0,  0,  0],
       [66,  0,  9, ...,  0,  0,  0],
       [24,  0,  5, ...,  0,  0,  0]])

In [10]:
# Train predictions
xgb_train_pred = xgb.predict_proba(X_train)[:,1]
cat_train_pred = cat.predict_proba(X_train)[:,1]

stack_train = np.column_stack((xgb_train_pred, cat_train_pred))

# Test predictions
xgb_test_pred = xgb.predict_proba(X_test)[:,1]
cat_test_pred = cat.predict_proba(X_test)[:,1]

stack_test = np.column_stack((xgb_test_pred, cat_test_pred))


In [11]:
from sklearn.linear_model import LogisticRegression

meta = LogisticRegression()
meta.fit(stack_train, y_train)


In [13]:
!pip install joblib



In [16]:
import joblib


In [17]:
# Final stacked output
final_pred = meta.predict(stack_test)
final_prob = meta.predict_proba(stack_test)[:,1]
joblib.dump(meta, "meta_model.pkl")
print("Saved meta_model.pkl (correct model)!")




Saved meta_model.pkl (correct model)!


In [18]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print(confusion_matrix(y_test, final_pred))
print(classification_report(y_test, final_pred, digits=4))
print("AUC:", roc_auc_score(y_test, final_prob))


[[75319  3266]
 [ 6696 24589]]
              precision    recall  f1-score   support

           0     0.9184    0.9584    0.9380     78585
           1     0.8827    0.7860    0.8316     31285

    accuracy                         0.9093    109870
   macro avg     0.9006    0.8722    0.8848    109870
weighted avg     0.9082    0.9093    0.9077    109870

AUC: 0.9553839491332983


In [19]:
import joblib

joblib.dump(xgb, "xgb_model.pkl")
joblib.dump(cat, "cat_model.pkl")
joblib.dump(meta, "stacked_meta.pkl")

print("Models saved!")


Models saved!


In [20]:
def predict_url(url):
    feats = extract_features(url).reshape(1, -1)



    # Base model predictions
    xgb_p = xgb.predict_proba(feats)[:,1]
    cat_p = cat.predict_proba(feats)[:,1]

    stack_input = np.column_stack((xgb_p, cat_p))

    final_p = meta.predict(stack_input)[0]
    prob = meta.predict_proba(stack_input)[0][1]

    label = "BAD (Phishing)" if final_p == 1 else "GOOD (Safe)"
    return label, prob
