In [35]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV file inside the Kaggle dataset that should be loaded
file_path = "PhiUSIIL_Phishing_URL_Dataset.csv"

# Load the latest version of the dataset into a pandas DataFrame.
# `kagglehub.dataset_load` replaces the deprecated `kagglehub.load_dataset`
# (same arguments), which avoids the deprecation warning on every run.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "ndarvind/phiusiil-phishing-url-dataset",
    file_path,
    # Additional arguments such as sql_query or pandas_kwargs can be passed
    # here. See the documentation for more information:
    # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", df.head())

  df = kagglehub.load_dataset(


First 5 records:                                   URL  URLLength                      Domain  \
0    https://www.southbankmosaics.com         31    www.southbankmosaics.com   
1            https://www.uni-mainz.de         23            www.uni-mainz.de   
2      https://www.voicefmradio.co.uk         29      www.voicefmradio.co.uk   
3         https://www.sfnmjournal.com         26         www.sfnmjournal.com   
4  https://www.rewildingargentina.org         33  www.rewildingargentina.org   

   DomainLength  IsDomainIP  TLD  URLSimilarityIndex  CharContinuationRate  \
0            24           0  com               100.0              1.000000   
1            16           0   de               100.0              0.666667   
2            22           0   uk               100.0              0.866667   
3            19           0  com               100.0              1.000000   
4            26           0  org               100.0              1.000000   

   TLDLegitimateProb  URLCharProb

# Feature Selection

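Let's build the feature set: drop potentially leaky columns, remove one feature from each highly correlated pair, filter out near-constant features, and then use a random forest to rank the remaining features by importance: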

In [36]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Step 1: Collect the names of the int64 / float64 columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
print('Original numeric features:', list(numeric_cols))

# Step 2: List columns suspected of leaking the target.
# Names that are not present in `df` are harmless: the list is only used
# as an exclusion filter below.
leaky_features = [
    # Direct indicators of phishing
    'IsPhishing',
    'PhishingProb',
    'NoOfPhishyCharacters',
    'HasPhishingTerms',
    'PhishyURLPattern',
    'IsBlacklisted',
    'IsWhitelisted',
    # Possibly derived from the label
    'URLSimilarityIndex',  # may compare against known phishing URLs
    'TLDLegitimateProb',   # may be based on known phishing TLDs
    'URLCharProb',         # may be based on phishing patterns
    # Possibly leaking label information
    'HasHiddenFields',     # strong indicator of phishing
    'Bank',                # keywords likely used to create the dataset
    'Pay',
    'Crypto',
]

# Candidate features: every numeric column except the target and the
# suspected leaks, in the original column order.
excluded_cols = set(leaky_features) | {'label'}
feature_cols = [name for name in numeric_cols if name not in excluded_cols]
print('\nFeatures after removing leaky ones:', feature_cols)

# Step 3: Prune one feature out of every highly correlated pair
X_initial = df[feature_cols]
correlation_matrix = X_initial.corr()

# Pairs whose absolute correlation exceeds this value count as redundant
correlation_threshold = 0.7

# Walk the strict lower triangle of the matrix (row by row) so that every
# unordered pair is inspected exactly once.
corr_feature_names = list(correlation_matrix.columns)
highly_correlated = [
    (corr_feature_names[row], corr_feature_names[col], correlation_matrix.iat[row, col])
    for row in range(len(corr_feature_names))
    for col in range(row)
    if abs(correlation_matrix.iat[row, col]) > correlation_threshold
]

# Report the pairs, strongest absolute correlation first
print('\nHighly correlated feature pairs:')
ranked_pairs = sorted(highly_correlated, key=lambda pair: abs(pair[2]), reverse=True)
for left, right, strength in ranked_pairs:
    print(f'{left} - {right}: {strength:.3f}')

# Greedy pruning in discovery order: a pair is skipped once either member
# has already been dropped; otherwise the lower-variance member is dropped
# (on a tie the first member of the pair is kept).
features_to_remove = set()
for left, right, _ in highly_correlated:
    if left in features_to_remove or right in features_to_remove:
        continue
    left_variance = X_initial[left].var()
    right_variance = X_initial[right].var()
    if left_variance >= right_variance:
        features_to_remove.add(right)
    else:
        features_to_remove.add(left)

# Surviving features, still in their original order
final_features = [name for name in feature_cols if name not in features_to_remove]
print('\nFinal features after removing correlations:', final_features)

# Step 4: Check feature distributions and remove any suspicious ones
X = df[final_features]

# A feature is flagged when it is near-constant in either sense below
low_variance_threshold = 0.01   # variance below this counts as "very low"
dominant_share_threshold = 0.95  # share of rows taken by the most common value

suspicious_features = []
for col in X.columns:
    # Very low variance
    has_low_variance = X[col].var() < low_variance_threshold
    # At most two distinct values, with one of them covering almost every row
    unique_vals = X[col].nunique()
    is_dominated = (
        unique_vals <= 2
        and X[col].value_counts().max() / len(X) > dominant_share_threshold
    )
    # Both checks are combined into one test so that a feature matching both
    # criteria is recorded once instead of being appended twice.
    if has_low_variance or is_dominated:
        suspicious_features.append(col)

final_features = [f for f in final_features if f not in suspicious_features]
print('\nFeatures removed due to suspicious distributions:', suspicious_features)
print('\nFinal feature set:', final_features)

# Step 5: Assemble the model inputs (selected features) and the target column
X = df[final_features]
y = df['label']

# Step 6: Hold out 20% of the rows for testing, stratified on the target,
# with a fixed seed so the split is reproducible
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Step 7: Standardize the features. The scaler is fitted on the training
# split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 8: Train and evaluate with cross-validation
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=10,
    class_weight='balanced',
    random_state=42
)

# Cross-validate on the *unscaled* training data with the scaler inside a
# pipeline, so every fold fits its own scaler on that fold's training part.
# Cross-validating on X_train_scaled instead would reuse a scaler that was
# fitted on all of X_train, letting each held-out fold influence its own
# preprocessing (data leakage).
# cross_val_score works on clones of the estimator, so `rf` itself is still
# unfitted after this call.
cv_pipeline = make_pipeline(StandardScaler(), rf)
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5)
print('\nCross-validation scores:', cv_scores)
print('Average CV score:', cv_scores.mean())

# Step 9: Train final model on the full (scaled) training split and get
# feature importances
rf.fit(X_train_scaled, y_train)
importances = pd.DataFrame({
    'feature': final_features,
    'importance': rf.feature_importances_
})
importances = importances.sort_values('importance', ascending=False)

print('\nTop 10 most important features:')
print(importances.head(10))

# Step 10: Evaluate on the held-out test set
y_pred = rf.predict(X_test_scaled)
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Original numeric features: ['URLLength', 'DomainLength', 'IsDomainIP', 'URLSimilarityIndex', 'CharContinuationRate', 'TLDLegitimateProb', 'URLCharProb', 'TLDLength', 'NoOfSubDomain', 'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio', 'NoOfLettersInURL', 'LetterRatioInURL', 'NoOfDegitsInURL', 'DegitRatioInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength', 'HasTitle', 'DomainTitleMatchScore', 'URLTitleMatchScore', 'HasFavicon', 'Robots', 'IsResponsive', 'NoOfURLRedirect', 'NoOfSelfRedirect', 'HasDescription', 'NoOfPopup', 'NoOfiFrame', 'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto', 'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef', 'NoOfExternalRef', 'label']

Features after removing leaky ones: ['URLLength', 'DomainLength', 'IsDomainIP', 'CharContinuationR