In [2]:
# Import necessary libraries
import pandas as pd

# Read the phishing-URL dataset from a CSV file located in the working directory
url_data = pd.read_csv('URL_dataset_phishing.csv')

# Summarise the structure of the frame: a preview of the first rows, the column
# index, the dtype of every column, and the overall (rows, columns) shape.
# Each (label, value) pair is passed to print() as two positional arguments, so
# the label and the value are separated by print's default single space.
for summary_label, summary_value in (
    ("Dataset head:\n", url_data.head()),
    ("\nColumns:\n", url_data.columns),
    ("\nData types:\n", url_data.dtypes),
    ("\nDataset shape:", url_data.shape),
):
    print(summary_label, summary_value)

Dataset head:
                                                  url  length_url  \
0              http://www.crestonwood.com/router.php          37   
1  http://shadetreetechnology.com/V4/validation/a...          77   
2  https://support-appleld.com.secureupdate.duila...         126   
3                                 http://rgipt.ac.in          18   
4  http://www.iracing.com/tracks/gateway-motorspo...          55   

   length_hostname  ip  nb_dots  nb_hyphens  nb_at  nb_qm  nb_and  nb_or  ...  \
0               19   0        3           0      0      0       0      0  ...   
1               23   1        1           0      0      0       0      0  ...   
2               50   1        4           1      0      1       2      0  ...   
3               11   0        2           0      0      0       0      0  ...   
4               15   0        2           2      0      0       0      0  ...   

   domain_in_title  domain_with_copyright  whois_registered_domain  \
0                0 

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Convert the 'status' target into integer codes and write them back over the
# original column. LabelEncoder assigns codes following the sorted order of the
# class labels; the intended mapping is 1 for phishing and 0 for legitimate.
# NOTE(review): the raw label strings are not shown in this notebook's output --
# confirm the mapping by inspecting label_encoder.classes_.
label_encoder = LabelEncoder()
label_encoder.fit(url_data['status'])
url_data['status'] = label_encoder.transform(url_data['status'])

# Target vector: the encoded 'status' column
y = url_data['status']

# Feature matrix: every column except the target and the raw 'url' string,
# which is not needed for training
X = url_data.drop(['url', 'status'], axis=1)

# Hold out 20% of the rows for testing (80% training); the fixed random_state
# makes the shuffle, and therefore the split, repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition and the class balance of the full target
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)
print("Target variable distribution:\n", y.value_counts())

Training set shape: (9144, 87)
Testing set shape: (2286, 87)
Target variable distribution:
 status
0    5715
1    5715
Name: count, dtype: int64


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Build a 100-tree random forest with a fixed seed and train it on the training
# split; fit() returns the estimator itself, so the fitted model is bound directly
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Class predictions for the held-out test rows
y_pred = rf_model.predict(X_test)

# Score the predictions against the true test labels: overall accuracy,
# the per-class precision/recall/F1 report, and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy of the Random Forest model:", accuracy)
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", confusion)

# Persist the fitted model to disk. This stays the last expression of the cell
# so the return value of joblib.dump is still shown as the cell's output.
joblib.dump(rf_model, "phishing_url_classifier.joblib")

Accuracy of the Random Forest model: 0.9693788276465442

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      1157
           1       0.97      0.96      0.97      1129

    accuracy                           0.97      2286
   macro avg       0.97      0.97      0.97      2286
weighted avg       0.97      0.97      0.97      2286


Confusion Matrix:
 [[1129   28]
 [  42 1087]]


['phishing_url_classifier.joblib']