In [24]:
pip install pandas scikit-learn jupyterlab



In [25]:
# Import our libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset.
# The CSV file is expected to be in the same folder as this notebook.
data = None  # stays None when the file cannot be found, so the check below can detect it
try:
    data = pd.read_csv('Phishing_Legitimate_full.csv')
except FileNotFoundError:
    print("Error: Dataset file not found. Make sure 'Phishing_Legitimate_full.csv' is in the same folder.")
    print("Please upload the 'Phishing_Legitimate_full.csv' file to proceed.")

# Preview the data. The section header is printed only after a successful
# load, so a failed load does not show a "First 5 Rows" heading with no rows.
if data is not None:
    print("--- First 5 Rows ---")
    print(data.head())
    print("\n--- Data Info ---")
    data.info()
else:
    # Tell the user why no preview is shown.
    print("Data not loaded due to previous error.")

--- First 5 Rows ---
   id  NumDots  SubdomainLevel  PathLevel  UrlLength  NumDash  \
0   1        3               1          5         72        0   
1   2        3               1          3        144        0   
2   3        3               1          2         58        0   
3   4        3               1          6         79        1   
4   5        3               0          4         46        0   

   NumDashInHostname  AtSymbol  TildeSymbol  NumUnderscore  ...  \
0                  0         0            0              0  ...   
1                  0         0            0              2  ...   
2                  0         0            0              0  ...   
3                  0         0            0              0  ...   
4                  0         0            0              0  ...   

   IframeOrFrame  MissingTitle  ImagesOnlyInForm  SubdomainLevelRT  \
0              0             0                 1                 1   
1              0             0               

In [26]:
 # Check for any missing (null) values
print("\n--- Missing Values ---")
# Per-column tally of null entries.
null_counts = data.isnull().sum()
print(null_counts)

# Class balance: number of phishing (1) vs. legitimate (0) rows.
print("\n--- Label Count ---")
label_counts = data['CLASS_LABEL'].value_counts()
print(label_counts)


--- Missing Values ---
id                                    0
NumDots                               0
SubdomainLevel                        0
PathLevel                             0
UrlLength                             0
NumDash                               0
NumDashInHostname                     0
AtSymbol                              0
TildeSymbol                           0
NumUnderscore                         0
NumPercent                            0
NumQueryComponents                    0
NumAmpersand                          0
NumHash                               0
NumNumericChars                       0
NoHttps                               0
RandomString                          0
IpAddress                             0
DomainInSubdomains                    0
DomainInPaths                         0
HttpsInHostname                       0
HostnameLength                        0
PathLength                            0
QueryLength                           0
DoubleSlashInPat

In [27]:
# Build the feature matrix (X) and the target vector (y).
# Every column is a feature except 'id' and the target 'CLASS_LABEL'.
non_feature_cols = ('id', 'CLASS_LABEL')
feature_names = [name for name in data.columns if name not in non_feature_cols]
X = data[feature_names]

# Target: 0 = legitimate, 1 = phishing
y = data['CLASS_LABEL']

# Hold out 20% of the rows for testing and train on the remaining 80%,
# so the model is evaluated on rows it was not trained on.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

n_train, n_test = len(X_train), len(X_test)
print(f"\nTraining set has {n_train} samples")
print(f"Testing set has {n_test} samples")


Training set has 8000 samples
Testing set has 2000 samples


In [28]:
# Random Forest settings:
#   n_estimators=100 -> the forest consists of 100 decision trees
#   random_state=42  -> repeated runs give the same result
forest_settings = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_settings)

print("\n--- Training the Model ---")

# Fit the forest on the training split only.
model.fit(X_train, y_train)

print("--- Model Training Complete ---")


--- Training the Model ---
--- Model Training Complete ---


In [29]:
# Predict a label for every row of the held-out test set.
y_pred = model.predict(X_test)

# --- Evaluate the predictions ---

# 1. Accuracy: share of test predictions that match the true label.
accuracy = accuracy_score(y_test, y_pred)
accuracy_pct = accuracy * 100
print(f"\n--- Accuracy --- \n{accuracy_pct:.2f}%")
print("(This is the percentage of predictions the model got right)")

# 2. Confusion matrix (the key view for a security use case).
#    Rows are the true class, columns the predicted class;
#    index 0 = legitimate, index 1 = phishing.
print("\n--- Confusion Matrix ---")
cm = confusion_matrix(y_test, y_pred)
print(cm)
true_legit = cm[0, 0]
print(f"True Legitimate (Predicted Legitimate): {true_legit}")
false_phish = cm[0, 1]
print(f"False Phishing (Predicted Phishing, but was Legitimate): {false_phish}  <- (False Positive)")
false_legit = cm[1, 0]
print(f"False Legitimate (Predicted Legitimate, but was Phishing): {false_legit}  <- (False Negative) **This is the dangerous one!**")
true_phish = cm[1, 1]
print(f"True Phishing (Predicted Phishing): {true_phish}")


# 3. Classification report: precision, recall and F1 for each class.
print("\n--- Classification Report ---")
class_labels = ['Legitimate (0)', 'Phishing (1)']
report = classification_report(y_test, y_pred, target_names=class_labels)
print(report)
print("Recall (for Phishing): This is the one to watch. It answers 'Of all the real phishing sites, what percentage did we catch?'")

# 4. Feature importance: which input columns the forest relied on most,
#    listed from most to least important.
print("\n--- Feature Importance ---")
importances = model.feature_importances_
feature_imp_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
ranked_features = feature_imp_df.sort_values(by='importance', ascending=False)
print(ranked_features)


--- Accuracy --- 
98.20%
(This is the percentage of predictions the model got right)

--- Confusion Matrix ---
[[970  18]
 [ 18 994]]
True Legitimate (Predicted Legitimate): 970
False Phishing (Predicted Phishing, but was Legitimate): 18  <- (False Positive)
False Legitimate (Predicted Legitimate, but was Phishing): 18  <- (False Negative) **This is the dangerous one!**
True Phishing (Predicted Phishing): 994

--- Classification Report ---
                precision    recall  f1-score   support

Legitimate (0)       0.98      0.98      0.98       988
  Phishing (1)       0.98      0.98      0.98      1012

      accuracy                           0.98      2000
     macro avg       0.98      0.98      0.98      2000
  weighted avg       0.98      0.98      0.98      2000

Recall (for Phishing): This is the one to watch. It answers 'Of all the real phishing sites, what percentage did we catch?'

--- Feature Importance ---
                               feature  importance
47  PctExtNul