In [12]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Load your datasets
train = pd.read_csv("../data/train_encoded.csv")
test = pd.read_csv("../data/test_encoded.csv")

# Separate features and target
X_train = train.drop(columns=['Label'])
y_train = train['Label']

X_test = test.drop(columns=['Label'])
y_test = test['Label']

# Train the model
xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
xgb_model.fit(X_train, y_train)

# Evaluate accuracy
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# ---- New Link Prediction ---- #

# Function to manually extract features from a new URL
def extract_features(url):
    features = {}
    features['url_length'] = len(url)
    features['num_dots'] = url.count('.')
    features['num_hyphens'] = url.count('-')
    features['has_at'] = int('@' in url)
    features['has_https'] = int('https' in url.lower())
    features['has_ip'] = int(any(char.isdigit() for char in url.split('/')[2])) if '://' in url else 0
    features['count_suspicious_words'] = sum(word in url.lower() for word in ['secure', 'account', 'update', 'login', 'verify'])
    
    return pd.DataFrame([features])

# Take a new URL input
new_url = input("Enter a new website URL: ")

# Extract features
new_url_features = extract_features(new_url)

# Ensure same columns as training data (fill missing ones with 0)
missing_cols = set(X_train.columns) - set(new_url_features.columns)
for col in missing_cols:
    new_url_features[col] = 0

new_url_features = new_url_features[X_train.columns]  # Reorder columns

# Predict
new_prediction = xgb_model.predict(new_url_features)

# Display result
print("\nPrediction Result:")
if new_prediction[0] == 1:
    print("⚠️ Warning: This URL might be a PHISHING site!")
else:
    print("✅ This URL seems SAFE.")


Model Accuracy: 0.84


Enter a new website URL:  https://briefingday.com/n/20200618/m#commentform



Prediction Result:


In [1]:
import joblib
import pandas as pd

# 1. Load the saved Random Forest model
xg_model = joblib.load("../model/XGboost_model_new.pkl")

# 2. Define a function to extract features from a new link
def extract_features(url):
    features = {}
    features['URL'] = url  # <<< Add this line to fix the missing column
    features['url_length'] = len(url)
    features['num_dots'] = url.count('.')
    features['num_hyphens'] = url.count('-')
    features['has_at'] = int('@' in url)
    features['has_https'] = int('https' in url.lower())
    features['has_ip'] = int(any(char.isdigit() for char in url.split('/')[2])) if '://' in url else 0
    features['count_suspicious_words'] = sum(word in url.lower() for word in ['login', 'secure', 'account', 'update', 'free', 'verify'])
    return features

# 3. Ask the user to input a new web address (URL)
new_url = input("Enter the web address (URL) to check: ")

# 4. Extract features from the entered URL
url_features = extract_features(new_url)

# 5. Convert the features into a DataFrame
input_df = pd.DataFrame([url_features])

# 6. Drop 'URL' column before prediction (since model expects features only)
input_df = input_df.drop(columns=['URL'])

# 7. Make a prediction
prediction = xg_model.predict(input_df)[0]

# 8. Output the result
if prediction == 1:
    print("⚠️ The URL is predicted to be **Phishing**.")
else:
    print("✅ The URL is predicted to be **Legitimate**.")


configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.



Enter the web address (URL) to check: 3souls.us/123H/b4rky/b4rky/P1.html?rAtm1d=;8dcd50af3804821120990f23500e12418dcd50af380482


ValueError: feature_names mismatch: ['col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'url_length', 'num_dots', 'num_hyphens', 'has_at', 'has_https', 'has_ip', 'count_suspicious_words'] ['url_length', 'num_dots', 'num_hyphens', 'has_at', 'has_https', 'has_ip', 'count_suspicious_words']
expected col_4, col_2, col_0, col_3, col_1 in input data