In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Train and persist the phishing classifier: read the labelled dataset,
# fit an XGBoost model on the selected features, report hold-out accuracy,
# and save the fitted model to disk.

# Load dataset
df = pd.read_csv("/content/updated_dataset.csv")

# Define features and target
feature_columns = ["ip", "nb_qm", "nb_www", "ratio_digits_url", "phish_hints",
                   "nb_hyperlinks", "domain_in_title", "domain_age",
                   "google_index", "page_rank"]
X = df[feature_columns]
y = df["label"]  # 0 = benign, 1 = phishing

# Split dataset: 20% is held out for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model.
# `use_label_encoder` is deliberately not passed: the installed XGBoost ignores it
# and prints 'Parameters: { "use_label_encoder" } are not used.' on every fit.
model = xgb.XGBClassifier(eval_metric="logloss")
model.fit(X_train, y_train)

# Evaluate model on the held-out rows
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

# Save model (relative path, i.e. the kernel's current working directory)
import joblib
joblib.dump(model, "xgboost_phishing_model.pkl")


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.95


['xgboost_phishing_model.pkl']

In [None]:
!pip install lime


Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=9153fc4cff4f75241c3b9feef6eee2f47ae4390ab60f468070d5022783c0fd3d
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [None]:
import pandas as pd
import lime.lime_tabular
import numpy as np
import joblib

# Prepare everything the explanation loop needs: the dataset, the trained model,
# and a LIME tabular explainer built over the feature matrix.

# Load dataset
df = pd.read_csv("/content/updated_dataset.csv")

# Load trained model from the same relative path the training cell saves it to.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that you produced yourself or otherwise trust.
model = joblib.load("xgboost_phishing_model.pkl")

# Define feature columns (same names and order as used when fitting the model)
feature_columns = ["ip", "nb_qm", "nb_www", "ratio_digits_url", "phish_hints",
                   "nb_hyperlinks", "domain_in_title", "domain_age",
                   "google_index", "page_rank"]
X = df[feature_columns]

# Initialize LIME Explainer.
# random_state fixes LIME's perturbation sampling so the generated explanations
# are the same on every run of the notebook.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X.values,
    feature_names=feature_columns,
    class_names=['Benign', 'Phishing'],
    mode="classification",
    random_state=42,
)

# Sentence emitted for each known feature. Order matters: when a condition string
# mentions more than one known feature, the first entry listed here wins.
_FEATURE_SENTENCES = (
    ("ip", "The presence of an IP address in the URL increases the likelihood of phishing."),
    ("nb_qm", "A high number of question marks in the URL suggests suspicious activity."),
    ("nb_www", "The occurrence of 'www' in an unusual place within the URL may indicate phishing."),
    ("ratio_digits_url", "A high proportion of digits in the URL is often associated with phishing websites."),
    ("phish_hints", "Certain words in the URL, commonly used in phishing attacks, increase the risk of being a phishing site."),
    ("nb_hyperlinks", "An unusually high number of hyperlinks on the page may suggest phishing behavior."),
    ("domain_in_title", "If the domain name is present in the page title, it is less likely to be phishing."),
    ("domain_age", "A newly registered domain is more likely to be used for phishing."),
    ("google_index", "If Google has indexed the site, it is more likely to be legitimate."),
    ("page_rank", "A higher page rank generally indicates a more trustworthy website."),
)


# Function to convert LIME explanation into readable text
def generate_text_explanation(lime_explanation):
    """Convert a LIME explanation into a single string of readable sentences.

    Parameters
    ----------
    lime_explanation : iterable of (str, number)
        Pairs of (feature condition, weight); the caller passes the result of
        ``exp.as_list()``. A condition is expected to contain the feature name
        as a standalone token next to comparison operators and thresholds,
        e.g. ``"ip <= 0.00"`` or ``"0.00 < nb_qm <= 1.00"``.

    Returns
    -------
    str
        One fixed sentence per recognised pair, in input order, joined by single
        spaces. Pairs naming no known feature contribute nothing; an empty input
        yields ``""``.

    Notes
    -----
    The weight is not used: the sentence depends only on which feature is named,
    not on the sign or size of its contribution.
    """
    explanation_text = []

    for feature, _weight in lime_explanation:
        # Blank out the comparison operators so the condition splits into bare tokens.
        # Matching whole tokens (rather than substrings) keeps a short name such as
        # "ip" from matching unrelated features that merely contain those letters.
        tokens = feature.replace("<", " ").replace(">", " ").replace("=", " ").split()
        for name, sentence in _FEATURE_SENTENCES:
            if name in tokens:
                explanation_text.append(sentence)
                break  # at most one sentence per (feature, weight) pair

    return " ".join(explanation_text)

# Build one readable explanation per dataset row.
# Each row triggers its own explain_instance call (top 5 features), so the
# running time of this cell grows with the number of rows in X.
explanations = []
for i in range(len(X)):
    instance = X.iloc[i].values
    exp = explainer.explain_instance(
        instance,
        model.predict_proba,
        num_features=5,
    )
    readable_explanation = generate_text_explanation(exp.as_list())
    explanations.append(readable_explanation)

# Attach the explanations as a new column, aligned row-for-row with the dataset
df["lime_explanation"] = explanations

# Write the augmented dataset to CSV without the index column
df.to_csv("dataset_with_readable_explanations.csv", index=False)
