In [1]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, SpatialDropout1D



In [4]:
# Load dataset
df = pd.read_csv('data/phishing_email.csv')  # Adjust the file path as necessary

# Preprocess the data
X = df['text_combined'].values  # Assuming the first column is named 'text'
y = df['label'].values  # Assuming the second column is named 'label'

# Tokenization and Padding
tokenizer = Tokenizer(num_words=5000, lower=True, oov_token='<OOV>')
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=100)  # Adjust maxlen as necessary

# Encoding labels
le = LabelEncoder()
y = le.fit_transform(y)

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the model with RNN
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=100))
model.add(SpatialDropout1D(0.2))
model.add(SimpleRNN(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_val, y_val), verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val, verbose=1)
print(f'Validation Accuracy: {accuracy*100:.2f}%')


Epoch 1/5




[1m1032/1032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 15ms/step - accuracy: 0.6896 - loss: 0.5399 - val_accuracy: 0.9568 - val_loss: 0.1260
Epoch 2/5
[1m1032/1032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 15ms/step - accuracy: 0.9364 - loss: 0.1757 - val_accuracy: 0.9467 - val_loss: 0.1432
Epoch 3/5
[1m1032/1032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 15ms/step - accuracy: 0.9558 - loss: 0.1249 - val_accuracy: 0.9404 - val_loss: 0.1591
Epoch 4/5
[1m1032/1032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 15ms/step - accuracy: 0.9596 - loss: 0.1234 - val_accuracy: 0.9658 - val_loss: 0.1122
Epoch 5/5
[1m1032/1032[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 19ms/step - accuracy: 0.9622 - loss: 0.1147 - val_accuracy: 0.9702 - val_loss: 0.0958
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9695 - loss: 0.0946
Validation Accuracy: 97.02%


In [None]:
model.save("phishing_email_detector_2.keras")