In [1]:
import pandas as pd
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib  # For saving the trained model


In [2]:
# 🚀 **Step 1: Load the Dataset**
file_path = "urldata.csv"  # Change this to your dataset path
df = pd.read_csv(file_path)


In [3]:
# Drop unnecessary columns
if "Unnamed: 0" in df.columns:
    df.drop(columns=["Unnamed: 0"], inplace=True)


In [4]:
# Convert labels to numerical format
df["label"] = df["label"].map({"benign": 0, "malicious": 1})

In [5]:
# Reduce dataset size to 100,000 samples for memory efficiency
df_sampled = df.groupby("label").sample(n=50000, random_state=42)  # Balance both classes


In [6]:
# 🚀 **Step 2: Feature Extraction**
def extract_url_features(url):
    return {
        "url_length": len(url),
        "num_hyphens": url.count("-"),
        "num_underscores": url.count("_"),
        "num_slashes": url.count("/"),
        "num_digits": sum(c.isdigit() for c in url),
        "num_subdomains": url.count(".") - 1,
        "contains_ip": 1 if re.match(r"\d+\.\d+\.\d+\.\d+", url) else 0,
        "num_special_chars": sum(url.count(c) for c in "!@#$%^&*()+=[]{}|\\:;\"'<>,?")
    }


In [7]:
# Apply feature extraction
df_features = df_sampled["url"].apply(lambda x: extract_url_features(x))
df_features = pd.DataFrame(df_features.tolist())  # Convert to DataFrame

In [8]:
# Labels
y_sampled = df_sampled["label"]

In [9]:
# 🚀 **Step 3: Train-Test Split**
X_train, X_test, y_train, y_test = train_test_split(df_features, y_sampled, test_size=0.2, random_state=42, stratify=y_sampled)

In [10]:
# 🚀 **Step 4: Train XGBoost Model**
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False
)

In [11]:
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [12]:
# 🚀 **Step 5: Evaluate the Model**
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [13]:
print(f"🔹 Model Accuracy: {accuracy:.4f}")
print("🔹 Classification Report:\n", report)

🔹 Model Accuracy: 0.8465
🔹 Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.87      0.85     10000
           1       0.86      0.82      0.84     10000

    accuracy                           0.85     20000
   macro avg       0.85      0.85      0.85     20000
weighted avg       0.85      0.85      0.85     20000



In [24]:
def extract_url_features(url):
    import re
    return {
        "url_length": len(url),
        "num_hyphens": url.count("-"),
        "num_underscores": url.count("_"),
        "num_slashes": url.count("/"),
        "num_digits": sum(c.isdigit() for c in url),
        "num_subdomains": url.count(".") - 1,
        "contains_ip": 1 if re.match(r"\d+\.\d+\.\d+\.\d+", url) else 0,
        "num_special_chars": sum(url.count(c) for c in "!@#$%^&*()+=[]{}|\\:;\"'<>,?")
    }

# 🚀 **Step 2: Prepare the URL for Prediction**
url_to_test = "http://free-bitcoin-claim.com"  # Replace with any URL you want to test

# Convert URL to feature format
url_features = pd.DataFrame([extract_url_features(url_to_test)])

# 🚀 **Step 3: Make Prediction Using Your Trained Model**
prediction = xgb_model.predict(url_features)

# 🚀 **Step 4: Display the Result**
result = "Malicious" if prediction[0] == 1 else "Benign (Safe)"
print(f"🔹 URL: {url_to_test} → Prediction: {result}")



🔹 URL: http://free-bitcoin-claim.com → Prediction: Malicious


In [25]:
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report

# 🚀 **Step 1: Prepare Data (Use Your Existing Feature Extraction)**
df_sampled = df.groupby("label").sample(n=50000, random_state=42)  # Balanced dataset

# Convert URLs to extracted features
df_features = df_sampled["url"].apply(lambda x: extract_url_features(x))
df_features = pd.DataFrame(df_features.tolist())

# Labels
y_sampled = df_sampled["label"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(df_features, y_sampled, test_size=0.2, random_state=42, stratify=y_sampled)


In [26]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss", use_label_encoder=False)

# 🚀 **Step 3: Define Hyperparameter Grid for Tuning**
param_grid = {
    "n_estimators": [100, 200, 300, 400],  # Number of trees
    "max_depth": [3, 6, 9],  # Depth of trees
    "learning_rate": [0.01, 0.1, 0.2, 0.3],  # Step size shrinkage
    "subsample": [0.7, 0.8, 0.9],  # Fraction of samples used per tree
    "colsample_bytree": [0.7, 0.8, 0.9],  # Fraction of features used per tree
    "gamma": [0, 0.1, 0.2, 0.3],  # Minimum loss reduction required for a split
}


In [27]:
# 🚀 **Step 4: Perform Randomized Search for Best Hyperparameters**
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=20,  # Number of different combinations to try
    scoring="accuracy",
    cv=3,  # 3-fold cross-validation
    verbose=2,
    n_jobs=-1,  # Use all CPU cores
    random_state=42
)

random_search.fit(X_train, y_train)


Fitting 3 folds for each of 20 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [28]:
# 🚀 **Step 5: Train Model with Best Parameters**
best_params = random_search.best_params_
print(f"✅ Best Parameters: {best_params}")

optimized_model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    **best_params  # Use best found hyperparameters
)

optimized_model.fit(X_train, y_train)

# 🚀 **Step 6: Evaluate New Model**
y_pred = optimized_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"🔹 Improved Model Accuracy: {accuracy:.4f}")
print("🔹 Improved Classification Report:\n", report)

✅ Best Parameters: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 9, 'learning_rate': 0.1, 'gamma': 0.2, 'colsample_bytree': 0.9}


Parameters: { "use_label_encoder" } are not used.



🔹 Improved Model Accuracy: 0.8532
🔹 Improved Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86     10000
           1       0.87      0.83      0.85     10000

    accuracy                           0.85     20000
   macro avg       0.85      0.85      0.85     20000
weighted avg       0.85      0.85      0.85     20000

[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=6, n_estimators=200, subsample=0.8; total time=   0.8s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.3, max_depth=3, n_estimators=200, subsample=0.7; total time=   0.5s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.1, max_depth=9, n_estimators=300, subsample=0.8; total time=   1.5s
[CV] END colsample_bytree=0.8, gamma=0.2, learning_rate=0.2, max_depth=9, n_estimators=200, subsample=0.8; total time=   1.1s
[CV] END colsample_bytree=0.7, gamma=0.2, learning_rate=0.3, max_depth=6, n_estimators=200, subsample=0.

In [29]:
import pandas as pd
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report

# 🚀 **Step 1: Load Dataset**
file_path = "urldata.csv"  # Change this to your dataset path
df = pd.read_csv(file_path)

# Drop unnecessary columns if present
if "Unnamed: 0" in df.columns:
    df.drop(columns=["Unnamed: 0"], inplace=True)

# Convert labels to numerical format
df["label"] = df["label"].map({"benign": 0, "malicious": 1})

# Reduce dataset size to 100,000 samples for memory efficiency
df_sampled = df.groupby("label").sample(n=50000, random_state=42)  # Balance both classes

# 🚀 **Step 2: Feature Extraction (Advanced URL Features)**
def extract_url_features(url):
    suspicious_words = ["secure", "login", "verify", "update", "free", "gift", "money", "account"]
    
    return {
        "url_length": len(url),
        "num_hyphens": url.count("-"),
        "num_underscores": url.count("_"),
        "num_slashes": url.count("/"),
        "num_digits": sum(c.isdigit() for c in url),
        "num_subdomains": url.count(".") - 1,
        "contains_ip": 1 if re.match(r"\d+\.\d+\.\d+\.\d+", url) else 0,
        "num_special_chars": sum(url.count(c) for c in "!@#$%^&*()+=[]{}|\\:;\"'<>,?"),
        "digit_ratio": sum(c.isdigit() for c in url) / len(url),  # New feature
        "special_char_ratio": sum(url.count(c) for c in "!@#$%^&*()+=[]{}|\\:;\"'<>,?") / len(url),  # New feature
        "suspicious_word_count": sum(1 for word in suspicious_words if word in url)  # New feature
    }

# Apply feature extraction
df_features = df_sampled["url"].apply(lambda x: extract_url_features(x))
df_features = pd.DataFrame(df_features.tolist())

# Labels
y_sampled = df_sampled["label"]

# 🚀 **Step 3: Split Data into Train and Test Sets**
X_train, X_test, y_train, y_test = train_test_split(df_features, y_sampled, test_size=0.2, random_state=42, stratify=y_sampled)

# 🚀 **Step 4: Define Hyperparameter Search Space**
param_dist = {
    "n_estimators": [100, 200, 300, 400],
    "max_depth": [3, 6, 9, 12],
    "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
    "subsample": [0.6, 0.7, 0.8, 0.9],
    "colsample_bytree": [0.6, 0.7, 0.8, 0.9],
    "gamma": [0, 0.1, 0.2, 0.3, 0.4],
}

# 🚀 **Step 5: Perform Randomized Search with K-Fold Cross-Validation**
xgb_model = xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss", use_label_encoder=False)

# Define K-Fold Cross-Validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=20,  # Number of random combinations to test
    scoring="accuracy",
    cv=kfold,  # Using K-Fold CV
    verbose=2,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

# Get best parameters from Randomized Search
best_params_random = random_search.best_params_
print(f"✅ Best Parameters from Randomized Search: {best_params_random}")

# 🚀 **Step 6: Train Final XGBoost Model with Best Parameters**
optimized_xgb = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    **best_params_random  # Apply best parameters
)

optimized_xgb.fit(X_train, y_train)

# 🚀 **Step 7: Make Predictions**
y_pred = optimized_xgb.predict(X_test)

# 🚀 **Step 8: Evaluate Model Performance**
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"🔹 Final Model Accuracy: {accuracy:.4f}")
print("🔹 Classification Report:\n", report)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

✅ Best Parameters from Randomized Search: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 9, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.9}


Parameters: { "use_label_encoder" } are not used.



🔹 Final Model Accuracy: 0.8619
🔹 Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.88      0.86     10000
           1       0.88      0.84      0.86     10000

    accuracy                           0.86     20000
   macro avg       0.86      0.86      0.86     20000
weighted avg       0.86      0.86      0.86     20000



In [34]:
pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Downloading lightgbm-4.5.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.5.0
Note: you may need to restart the kernel to use updated packages.


In [30]:
import pandas as pd
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import joblib

# 🚀 **Step 1: Load Dataset**
file_path = "urldata.csv"  # Change this to your dataset path
df = pd.read_csv(file_path)

# Drop unnecessary columns
if "Unnamed: 0" in df.columns:
    df.drop(columns=["Unnamed: 0"], inplace=True)

# Convert labels to numerical format
df["label"] = df["label"].map({"benign": 0, "malicious": 1})

# Reduce dataset size to 100,000 samples for memory efficiency
df_sampled = df.groupby("label").sample(n=50000, random_state=42)  # Balance both classes

# 🚀 **Step 2: Feature Extraction (Advanced URL Features)**
def extract_url_features(url):
    suspicious_words = ["secure", "login", "verify", "update", "free", "gift", "money", "account"]
    
    return {
        "url_length": len(url),
        "num_hyphens": url.count("-"),
        "num_underscores": url.count("_"),
        "num_slashes": url.count("/"),
        "num_digits": sum(c.isdigit() for c in url),
        "num_subdomains": url.count(".") - 1,
        "contains_ip": 1 if re.match(r"\d+\.\d+\.\d+\.\d+", url) else 0,
        "num_special_chars": sum(url.count(c) for c in "!@#$%^&*()+=[]{}|\\:;\"'<>,?"),
        "digit_ratio": sum(c.isdigit() for c in url) / len(url),  # New feature
        "special_char_ratio": sum(url.count(c) for c in "!@#$%^&*()+=[]{}|\\:;\"'<>,?") / len(url),  # New feature
        "suspicious_word_count": sum(1 for word in suspicious_words if word in url)  # New feature
    }

# Apply feature extraction
df_features = df_sampled["url"].apply(lambda x: extract_url_features(x))
df_features = pd.DataFrame(df_features.tolist())

# Labels
y_sampled = df_sampled["label"]

# 🚀 **Step 3: Split Data into Train and Test Sets**
X_train, X_test, y_train, y_test = train_test_split(df_features, y_sampled, test_size=0.2, random_state=42, stratify=y_sampled)

# 🚀 **Step 4: Use the Best Parameters from Randomized Search**
best_params_random = {
    "n_estimators": 300,
    "max_depth": 9,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "gamma": 0.1
}

# 🚀 **Step 5: Perform Grid Search for Fine-Tuning**
param_grid = {
    "n_estimators": [best_params_random["n_estimators"] - 50, best_params_random["n_estimators"], best_params_random["n_estimators"] + 50],
    "max_depth": [best_params_random["max_depth"] - 1, best_params_random["max_depth"], best_params_random["max_depth"] + 1],
    "learning_rate": [best_params_random["learning_rate"] * 0.8, best_params_random["learning_rate"], best_params_random["learning_rate"] * 1.2],
    "subsample": [best_params_random["subsample"]],
    "colsample_bytree": [best_params_random["colsample_bytree"]],
    "gamma": [best_params_random["gamma"]]
}

xgb_model = xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss", use_label_encoder=False)

# Define K-Fold Cross-Validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=kfold,  # 5-Fold Cross Validation
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# 🚀 **Step 6: Train Final Model with Best Parameters**
best_params_grid = grid_search.best_params_
print(f"✅ Best Parameters from Grid Search: {best_params_grid}")

final_xgb = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    **best_params_grid
)

final_xgb.fit(X_train, y_train)

# 🚀 **Step 7: Make Predictions**
y_pred_final = final_xgb.predict(X_test)

# 🚀 **Step 8: Evaluate Model Performance**
accuracy_final = accuracy_score(y_test, y_pred_final)
report_final = classification_report(y_test, y_pred_final)

print(f"🔹 Final Fine-Tuned Accuracy: {accuracy_final:.4f}")
print("🔹 Final Classification Report:\n", report_final)

# 🚀 **Step 9: Save the Final Optimized Model**
joblib.dump(final_xgb, "final_optimized_xgboost_model.pkl")
print("✅ Final Model saved as 'final_optimized_xgboost_model.pkl'")


Fitting 5 folds for each of 27 candidates, totalling 135 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

✅ Best Parameters from Grid Search: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.08000000000000002, 'max_depth': 10, 'n_estimators': 250, 'subsample': 0.8}


Parameters: { "use_label_encoder" } are not used.



🔹 Final Fine-Tuned Accuracy: 0.8622
🔹 Final Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.88      0.87     10000
           1       0.88      0.84      0.86     10000

    accuracy                           0.86     20000
   macro avg       0.86      0.86      0.86     20000
weighted avg       0.86      0.86      0.86     20000

✅ Final Model saved as 'final_optimized_xgboost_model.pkl'
[CV] END colsample_bytree=0.9, gamma=0.1, learning_rate=0.3, max_depth=3, n_estimators=400, subsample=0.8; total time=   1.0s
[CV] END colsample_bytree=0.9, gamma=0.1, learning_rate=0.3, max_depth=3, n_estimators=400, subsample=0.8; total time=   1.1s
[CV] END colsample_bytree=0.9, gamma=0.1, learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.9; total time=   0.5s
[CV] END colsample_bytree=0.9, gamma=0.1, learning_rate=0.05, max_depth=3, n_estimators=200, subsample=0.9; total time=   0.5s
[CV] END colsample_bytree=0.8, gamma=0.1

In [38]:
import pandas as pd
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report

# 🚀 **Step 1: Load the Dataset**
df = pd.read_csv("urldata.csv")  # Your dataset with PhishTank data

# ✅ Fix: Convert 'benign' and 'malicious' to numerical labels
df["label"] = df["label"].map({"benign": 0, "malicious": 1})

# 🚀 **Step 2: Feature Extraction**
def extract_url_features(url):
    suspicious_words = ["secure", "login", "verify", "update", "free", "gift", "money", "account"]
    
    return {
        "url_length": len(url),
        "num_hyphens": url.count("-"),
        "num_underscores": url.count("_"),
        "num_slashes": url.count("/"),
        "num_digits": sum(c.isdigit() for c in url),
        "num_subdomains": url.count(".") - 1,
        "contains_ip": 1 if re.match(r"\d+\.\d+\.\d+\.\d+", url) else 0,
        "num_special_chars": sum(url.count(c) for c in "!@#$%^&*()+=[]{}|\\:;\"'<>,?"),
        "digit_ratio": sum(c.isdigit() for c in url) / len(url),
        "special_char_ratio": sum(url.count(c) for c in "!@#$%^&*()+=[]{}|\\:;\"'<>,?") / len(url),
        "suspicious_word_count": sum(1 for word in suspicious_words if word in url)
    }

df_features = df["url"].apply(lambda x: extract_url_features(x))
df_features = pd.DataFrame(df_features.tolist())

# Labels
y = df["label"]

# 🚀 **Step 3: Split Data**
X_train, X_test, y_train, y_test = train_test_split(df_features, y, test_size=0.2, random_state=42, stratify=y)

# 🚀 **Step 4: Define Models**
models = {
    "XGBoost": xgb.XGBClassifier(n_estimators=300, max_depth=9, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, gamma=0.1, objective="binary:logistic", eval_metric="logloss", use_label_encoder=False),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
    "LightGBM": lgb.LGBMClassifier(n_estimators=300, max_depth=9, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=500)
}

# 🚀 **Step 5: Train and Evaluate Each Model**
results = {}
for name, model in models.items():
    print(f"🔹 Training {name}...")
    model.fit(X_train, y_train)  # ✅ FIXED: Now y_train contains only 0 and 1
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"✅ {name} Accuracy: {accuracy:.4f}")
    
    # Save Model
    joblib.dump(model, f"{name.lower().replace(' ', '_')}_model.pkl")
    print(f"✅ {name} model saved as '{name.lower().replace(' ', '_')}_model.pkl'")

    # Store Results
    results[name] = {
        "Accuracy": accuracy,
        "Classification Report": classification_report(y_test, y_pred)
    }

# 🚀 **Step 6: Print Final Results**
for name, result in results.items():
    print(f"\n🔹 {name} Model Results:")
    print(f"Accuracy: {result['Accuracy']:.4f}")
    print("Classification Report:\n", result["Classification Report"])


🔹 Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



✅ XGBoost Accuracy: 0.9095
✅ XGBoost model saved as 'xgboost_model.pkl'
🔹 Training Random Forest...
✅ Random Forest Accuracy: 0.8864
✅ Random Forest model saved as 'random_forest_model.pkl'
🔹 Training LightGBM...
[LightGBM] [Info] Number of positive: 83550, number of negative: 276590
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008775 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1101
[LightGBM] [Info] Number of data points in the train set: 360140, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.231993 -> initscore=-1.197091
[LightGBM] [Info] Start training from score -1.197091
✅ LightGBM Accuracy: 0.9039
✅ LightGBM model saved as 'lightgbm_model.pkl'
🔹 Training Logistic Regression...
✅ Logistic Regression Accuracy: 0.8067
✅ Logistic Regression model saved as 'logistic_regression_model.pkl'

🔹 XGBoost

In [3]:
import pandas as pd
import re
import joblib
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# 🚀 **Step 1: Load Dataset**
df = pd.read_csv("urldata.csv")  # Your dataset with PhishTank data

# ✅ Convert labels to 0 and 1
df["label"] = df["label"].map({"benign": 0, "malicious": 1})

# 🚀 **Step 2: Feature Extraction**
def extract_url_features(url):
    suspicious_words = ["secure", "login", "verify", "update", "free", "gift", "money", "account"]
    
    return {
        "url_length": len(url),
        "num_hyphens": url.count("-"),
        "num_underscores": url.count("_"),
        "num_slashes": url.count("/"),
        "num_digits": sum(c.isdigit() for c in url),
        "num_subdomains": url.count(".") - 1,
        "contains_ip": 1 if re.match(r"\d+\.\d+\.\d+\.\d+", url) else 0,
        "num_special_chars": sum(url.count(c) for c in "!@#$%^&*()+=[]{}|\\:;\"'<>,?"),
        "digit_ratio": sum(c.isdigit() for c in url) / len(url),
        "special_char_ratio": sum(url.count(c) for c in "!@#$%^&*()+=[]{}|\\:;\"'<>,?") / len(url),
        "suspicious_word_count": sum(1 for word in suspicious_words if word in url)
    }

df_features = df["url"].apply(lambda x: extract_url_features(x))
df_features = pd.DataFrame(df_features.tolist())

# Labels
y = df["label"]

# 🚀 **Step 3: Split Data**
X_train, X_test, y_train, y_test = train_test_split(df_features, y, test_size=0.2, random_state=42, stratify=y)

# 🚀 **Step 4: Define XGBoost Model**
xgb_model = xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss", use_label_encoder=False)

# 🚀 **Step 5: Define Hyperparameter Space for Randomized Search**
param_dist = {
    "n_estimators": [100, 200, 300, 400, 500],  # Number of trees
    "max_depth": [3, 6, 9, 12],  # Depth of trees
    "learning_rate": [0.01, 0.05, 0.1, 0.2],  # Step size
    "subsample": [0.6, 0.7, 0.8, 0.9],  # Data sampling
    "colsample_bytree": [0.6, 0.7, 0.8, 0.9],  # Feature sampling
    "gamma": [0, 0.1, 0.2, 0.3, 0.4],  # Minimum loss reduction
}

# 🚀 **Step 6: Perform Randomized Search with K-Fold Cross-Validation**
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=20,  # Number of random combinations to test
    scoring="accuracy",
    cv=kfold,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

# Get best parameters from Randomized Search
best_params_random = random_search.best_params_
print(f"✅ Best Parameters from Randomized Search: {best_params_random}")

# 🚀 **Step 7: Fine-Tune with Grid Search**
param_grid = {
    "n_estimators": [best_params_random["n_estimators"] - 50, best_params_random["n_estimators"], best_params_random["n_estimators"] + 50],
    "max_depth": [best_params_random["max_depth"] - 1, best_params_random["max_depth"], best_params_random["max_depth"] + 1],
    "learning_rate": [best_params_random["learning_rate"] * 0.8, best_params_random["learning_rate"], best_params_random["learning_rate"] * 1.2],
    "subsample": [best_params_random["subsample"]],
    "colsample_bytree": [best_params_random["colsample_bytree"]],
    "gamma": [best_params_random["gamma"]]
}

grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss", use_label_encoder=False),
    param_grid=param_grid,
    scoring="accuracy",
    cv=kfold,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# 🚀 **Step 8: Train Final Model with Best Parameters**
best_params_grid = grid_search.best_params_
print(f"✅ Best Parameters from Grid Search: {best_params_grid}")

final_xgb = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    **best_params_grid
)

final_xgb.fit(X_train, y_train)

# 🚀 **Step 9: Make Predictions**
y_pred_final = final_xgb.predict(X_test)

# 🚀 **Step 10: Evaluate Model Performance**
accuracy_final = accuracy_score(y_test, y_pred_final)
report_final = classification_report(y_test, y_pred_final)

print(f"🔹 Final Fine-Tuned Accuracy: {accuracy_final:.4f}")
print("🔹 Final Classification Report:\n", report_final)

# 🚀 **Step 11: Save the Final Optimized Model**
##joblib.dump(final_xgb, "final_optimized_xgboost_model.pkl")
##print("✅ Final Model saved as 'final_optimized_xgboost_model.pkl'")


AttributeError: module 'xgboost' has no attribute 'XGBClassifier'

In [6]:
import pandas as pd
import re
import joblib
import xgboost as xgb
import 
# 🚀 **Step 1: Load Your Trained XGBoost Model**
model_path = "final_optimized_xgboost_model.pkl"  # Make sure this file exists
optimized_xgb = joblib.load(model_path)

# 🚀 **Step 2: Define Feature Extraction Function**
def extract_url_features(url):
    suspicious_words = ["secure", "login", "verify", "update", "free", "gift", "money", "account"]
    
    return {
        "url_length": len(url),
        "num_hyphens": url.count("-"),
        "num_underscores": url.count("_"),
        "num_slashes": url.count("/"),
        "num_digits": sum(c.isdigit() for c in url),
        "num_subdomains": url.count(".") - 1,
        "contains_ip": 1 if re.match(r"\d+\.\d+\.\d+\.\d+", url) else 0,
        "num_special_chars": sum(url.count(c) for c in "!@#$%^&*()+=[]{}|\\:;\"'<>,?"),
        "digit_ratio": sum(c.isdigit() for c in url) / len(url),
        "special_char_ratio": sum(url.count(c) for c in "!@#$%^&*()+=[]{}|\\:;\"'<>,?") / len(url),
        "suspicious_word_count": sum(1 for word in suspicious_words if word in url)
    }

# 🚀 **Step 3: Define URLs to Test**
test_urls = [
    # ✅ Safe (Benign) URLs
    "https://google.com",
    "https://www.facebook.com",
    "https://full-stack-frontend-4b7r.onrender.com/",
    "https://student.srmap.edu.in",
    "https://securexnow.com/",
    
    # ❌ Malicious (Phishing/Scam) URLs
    "https://wordsonawall.net/ub/lmicu/login.php",
    "http://secure-login.bankofamerica-verify.com",
    "http://free-money-giveaway.com",
    "http://update-your-banking-details.com",
    "http://win-a-free-iphone.com",
]

# 🚀 **Step 4: Convert URLs into Features**
test_features = pd.DataFrame([extract_url_features(url) for url in test_urls])

# 🚀 **Step 5: Make Predictions**
test_predictions = optimized_xgb.predict(test_features)

# 🚀 **Step 6: Display Results**
test_results = pd.DataFrame({"URL": test_urls, "Prediction": test_predictions})
test_results["Prediction"] = test_results["Prediction"].map({0: "Benign (Safe)", 1: "Malicious"})

# Print Results
print(test_results)


ModuleNotFoundError: No module named 'xgboost.sklearn'

In [52]:
pip install htmldom


Collecting htmldom
  Downloading htmldom-2.0.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: htmldom
  Building wheel for htmldom (setup.py) ... [?25ldone
[?25h  Created wheel for htmldom: filename=htmldom-2.0-py3-none-any.whl size=11116 sha256=466d71adc85f17c081765a0509063658820e78d688b41be2480445a72e76b1c6
  Stored in directory: /Users/tejashtarun/Library/Caches/pip/wheels/68/87/53/032cf8ecf90d9446b2f48b82ab76dce9b7021034014f21c58e
Successfully built htmldom
Installing collected packages: htmldom
Successfully installed htmldom-2.0
Note: you may need to restart the kernel to use updated packages.


In [54]:
import joblib

# Load the pickle file
dataset_path = "dataset_A_05_2020_p10.pickle"  # Replace with your actual file name
data = joblib.load(dataset_path)

# Check the type of the loaded object
print(f"✅ Loaded Data Type: {type(data)}")

# If it's a dictionary, print the keys
if isinstance(data, dict):
    print(f"🔹 Keys in the dataset: {data.keys()}")


✅ Loaded Data Type: <class 'dict'>
🔹 Keys in the dataset: dict_keys(['http://loveslife.biz/', 'http://www.team-meble.pl/', 'http://base.etagy.net/login.php', 'https://www.slideshare.net/gregrobertson/tp-sforblog', 'http://www.yourdictionary.com/iconoclast', 'http://www.campisicorradomichele.com/public/_vti_cnf/mo.php', 'https://www.vulkanland-bio-safran.at/wp-admin/install.php', 'https://kinomaxxcinema.wordpress.com/', 'https://umcutrecht.nl/nl/', 'https://www.gyu-kaku.com/', 'http://www.everythingwakeboard.com/Cap/Pack/', 'https://rebrand.ly/9m831w', 'http://hostpoint-admin-panel52358.web65.s177.goserver.host/hostpoint/index.html', 'https://www.kmc.si/', 'https://drive.google.com/file/d/1kNolZ4xw7mgnCSjYsbomy_y4zxlk6zlD/view?usp=sharing', 'https://rebrand.ly/Security_Check_Center', 'https://www.hdofasheville.com/', 'https://www.limerius.com/', 'http://www.whatsapps-invites.zzux.com/', 'http://lbcpzonasegurabeta.rf.gd/', 'http://www.fjyyqp.com/statics/plugin/kindeditor/attached/file/20

In [5]:
import pandas as pd
import re
import joblib
import xgboost as xgb
from urllib.parse import urlparse

# 🚀 Load the trained XGBoost model
model_path = "final_optimized_xgboost_model.pkl"  # Ensure this file exists
optimized_xgb = joblib.load(model_path)

# ✅ Trusted domains whitelist (Override model decision)
TRUSTED_DOMAINS = {
    "chatgpt.com",
    "instagram.com",
    "render.com",
    "x.com",
    "amazon.com",
    "www.amazon.com",
    "linkedin.com",
    "www.linkedin.com",
    "github.com",
    "www.github.com"
}

# 🚀 Function to Normalize and Extract Domain
def normalize_url(url):
    """Normalize URL by removing 'www.' for consistency."""
    if not url.startswith("http"):
        url = "https://" + url  # Ensure URL has a scheme
    
    parsed = urlparse(url)
    domain = parsed.netloc.lower()

    # Convert www.google.com to google.com
    if domain.startswith("www."):
        domain = domain[4:]

    return url, domain

# 🚀 Feature Extraction
def extract_url_features(url):
    suspicious_words = ["secure", "login", "verify", "update", "free", "gift", "money", "account"]

    return {
        "url_length": len(url),
        "num_hyphens": url.count("-"),
        "num_underscores": url.count("_"),
        "num_slashes": url.count("/"),
        "num_digits": sum(c.isdigit() for c in url),
        "num_subdomains": url.count(".") - 1,
        "contains_ip": 1 if re.match(r"\d+\.\d+\.\d+\.\d+", url) else 0,
        "num_special_chars": sum(url.count(c) for c in "!@#$%^&*()+=[]{}|\\:;\"'<>,?"),
        "digit_ratio": sum(c.isdigit() for c in url) / len(url),
        "special_char_ratio": sum(url.count(c) for c in "!@#$%^&*()+=[]{}|\\:;\"'<>,?") / len(url),
        "suspicious_word_count": sum(1 for word in suspicious_words if word in url)
    }

# 🚀 Test URLs
test_urls = [
    "https://chatgpt.com",
    "https://www.google.com",
    "https://www.facebook.com",
    "https://full-stack-frontend-4b7r.onrender.com/",
    "https://student.srmap.edu.in",
    "https://securexnow.com/",
    
    # ❌ Malicious URLs
    "https://wordsonawall.net/ub/lmicu/login.php",
    "http://secure-login.bankofamerica-verify.com",
    "http://free-money-giveaway.com",
    "http://update-your-banking-details.com",
    "http://win-a-free-iphone.com",
]

# 🚀 Process URLs
normalized_urls = []
domains = []
for url in test_urls:
    norm_url, domain = normalize_url(url)
    normalized_urls.append(norm_url)
    domains.append(domain)

# 🚀 Extract Features
test_features = pd.DataFrame([extract_url_features(url) for url in normalized_urls])

# 🚀 Make Predictions
test_predictions = optimized_xgb.predict(test_features)

# 🚀 Override Model for Trusted Domains
final_predictions = []
for i, domain in enumerate(domains):
    if domain in TRUSTED_DOMAINS:
        final_predictions.append(0)  # Force to "Benign"
    else:
        final_predictions.append(test_predictions[i])

# 🚀 Display Results
test_results = pd.DataFrame({
    "Original URL": test_urls,
    "Normalized URL": normalized_urls,
    "Domain": domains,
    "Prediction": final_predictions
})

# ✅ Convert Predictions to Readable Labels
test_results["Prediction"] = test_results["Prediction"].map({0: "Benign (Safe)", 1: "Malicious"})

# ✅ Print results
print("\n🔹 Updated Phishing Detection Results:")
print(test_results.to_string(index=False))

# ✅ Save results
test_results.to_csv("phishing_detection_results.csv", index=False)
print("\n✅ Results saved as 'phishing_detection_results.csv'")



🔹 Updated Phishing Detection Results:
                                  Original URL                                 Normalized URL                                Domain    Prediction
                           https://chatgpt.com                            https://chatgpt.com                           chatgpt.com Benign (Safe)
                        https://www.google.com                         https://www.google.com                            google.com Benign (Safe)
                      https://www.facebook.com                       https://www.facebook.com                          facebook.com Benign (Safe)
https://full-stack-frontend-4b7r.onrender.com/ https://full-stack-frontend-4b7r.onrender.com/ full-stack-frontend-4b7r.onrender.com Benign (Safe)
                  https://student.srmap.edu.in                   https://student.srmap.edu.in                  student.srmap.edu.in Benign (Safe)
                       https://securexnow.com/                        https://securex