<a href="https://colab.research.google.com/github/surya211099/DetectingPhishingEmail/blob/main/Stack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, log_loss, precision_score,
    recall_score, f1_score, classification_report,
    confusion_matrix
)

import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from xgboost import XGBClassifier
df = pd.read_csv("/content/drive/MyDrive/Dessertation/phishing_numeric_dataset.csv")

features = ['sender', 'receiver', 'subject', 'body', 'urls', 'timestamp']
X = df[features].values
y = df['label'].values  # 0=legit, 1=phishing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

lr_train_prob = lr.predict_proba(X_train)[:,1]
lr_test_prob = lr.predict_proba(X_test)[:,1]
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

rf_train_prob = rf.predict_proba(X_train)[:,1]
rf_test_prob = rf.predict_proba(X_test)[:,1]
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42
)
xgb.fit(X_train, y_train)

xgb_train_prob = xgb.predict_proba(X_train)[:,1]
xgb_test_prob = xgb.predict_proba(X_test)[:,1]
mlp_base = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

mlp_base.compile(
    optimizer=Adam(0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

mlp_base.fit(
    X_train, y_train,
    epochs=40,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)

mlp_train_prob = mlp_base.predict(X_train).flatten()
mlp_test_prob = mlp_base.predict(X_test).flatten()
X_train_stack = np.column_stack([
    lr_train_prob,
    rf_train_prob,
    xgb_train_prob,
    mlp_train_prob
])

X_test_stack = np.column_stack([
    lr_test_prob,
    rf_test_prob,
    xgb_test_prob,
    mlp_test_prob
])
meta_model = LogisticRegression()
meta_model.fit(X_train_stack, y_train)
y_pred_prob = meta_model.predict_proba(X_test_stack)[:,1]
y_pred = (y_pred_prob > 0.5).astype(int)
acc = accuracy_score(y_test, y_pred)
ll = log_loss(y_test, y_pred_prob)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", acc)
print("Log Loss:", ll)
print("Precision:", prec)
print("Recall:", rec)
print("F1 Score:", f1)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", cm)


Epoch 1/40


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.6220 - loss: 0.6552 - val_accuracy: 0.6833 - val_loss: 0.6075
Epoch 2/40
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6908 - loss: 0.6017 - val_accuracy: 0.7139 - val_loss: 0.5822
Epoch 3/40
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7199 - loss: 0.5742 - val_accuracy: 0.7223 - val_loss: 0.5603
Epoch 4/40
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7229 - loss: 0.5607 - val_accuracy: 0.7297 - val_loss: 0.5410
Epoch 5/40
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7360 - loss: 0.5406 - val_accuracy: 0.7406 - val_loss: 0.5109
Epoch 6/40
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7614 - loss: 0.5064 - val_accuracy: 0.7954 - val_loss: 0.4731
Epoch 7/40
[1m516/516[0m [32m━━━━━━━