In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Load dataset from local file path
filename = 'PhiUSIIL_Phishing_URL_Dataset.csv/PhiUSIIL_Phishing_URL_Dataset.csv'  # Replace with your actual filename
df = pd.read_csv(filename)

# Display first 10 rows
print(df.head(10))

                                  URL  URLLength                      Domain  \
0    https://www.southbankmosaics.com         31    www.southbankmosaics.com   
1            https://www.uni-mainz.de         23            www.uni-mainz.de   
2      https://www.voicefmradio.co.uk         29      www.voicefmradio.co.uk   
3         https://www.sfnmjournal.com         26         www.sfnmjournal.com   
4  https://www.rewildingargentina.org         33  www.rewildingargentina.org   
5     https://www.globalreporting.org         30     www.globalreporting.org   
6          https://www.saffronart.com         25          www.saffronart.com   
7          https://www.nerdscandy.com         25          www.nerdscandy.com   
8      https://www.hyderabadonline.in         29      www.hyderabadonline.in   
9                 https://www.aap.org         18                 www.aap.org   

   DomainLength  IsDomainIP  TLD  URLSimilarityIndex  CharContinuationRate  \
0            24           0  com         

In [36]:
# Extract simple features from the URL string
df['num_digits'] = df['URL'].str.count(r'\d')
df['num_specials'] = df['URL'].str.count(r'[\W_]')  # non-alphanumeric

columns_to_drop = [
    'Domain', 'DomainLength', 'IsDomainIP', 'TLD', 'URLSimilarityIndex', 'CharContinuationRate',
    'TLDLegitimateProb', 'URLCharProb', 'TLDLength', 'NoOfSubDomain', 'HasObfuscation',
    'NoOfObfuscatedChar', 'ObfuscationRatio', 'NoOfLettersInURL', 'LetterRatioInURL',
    'NoOfDegitsInURL', 'DegitRatioInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL',
    'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'IsHTTPS',
    'LineOfCode', 'LargestLineLength', 'HasTitle', 'Title', 'DomainTitleMatchScore',
    'URLTitleMatchScore', 'HasFavicon', 'Robots', 'IsResponsive', 'NoOfURLRedirect',
    'NoOfSelfRedirect', 'HasDescription', 'NoOfPopup', 'NoOfiFrame', 'HasExternalFormSubmit',
    'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay',
    'Crypto', 'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef',
    'NoOfEmptyRef', 'NoOfExternalRef'
]

df = df.drop(columns=columns_to_drop, errors='ignore')

# Example: Assuming you have the features num_digits, num_specials, etc.
features = df[['num_digits', 'num_specials', 'URLLength']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Update the dataframe with the scaled features
df[['num_digits', 'num_specials', 'URLLength']] = scaled_features

# You can also add more features later like domain extraction, entropy, etc.
X = df[['URLLength','num_digits', 'num_specials']]
y = df['label']  # Assuming 'label' is the target variable

In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [38]:
from xgboost import XGBClassifier

model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [39]:
from sklearn.metrics import classification_report, accuracy_score

# Make predictions
y_pred = model.predict(X_test)

# Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=['Not Phishing', 'Phishing']))


Accuracy: 0.8197374838312941
              precision    recall  f1-score   support

Not Phishing       0.93      0.63      0.75     20189
    Phishing       0.78      0.96      0.86     26970

    accuracy                           0.82     47159
   macro avg       0.85      0.80      0.80     47159
weighted avg       0.84      0.82      0.81     47159



In [40]:
import pandas as pd

# Function to extract features from a URL
def extract_url_features(url):
    return {
        'URLLength': len(url),
        'num_digits': sum(c.isdigit() for c in url),
        'num_specials': sum(not c.isalnum() for c in url)
    }

# Ask user to input a URL
user_url = input("Enter a URL to check: ")

# Extract features and convert to DataFrame
features = extract_url_features(user_url)
X_new = pd.DataFrame([features])

# Predict using the trained model
predicted_label = model.predict(X_new)[0]

# Map label to human-readable class
predicted_class = "Phishing" if predicted_label == 1 else "Not Phishing"

print(f"Prediction for URL '{user_url}': {predicted_class}")


Prediction for URL ' https://bafybeihnhj2m4rzirdb24gycb4shd53tachcuyqlt4wepo6qr7sdefl7ti.ipfs.dweb.link/ ': Not Phishing
