In [113]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from sklearn.metrics import classification_report

In [114]:
# Load the scam-email CSV. ISO-8859-1 is used to avoid a UnicodeDecodeError;
# the first CSV column becomes the index and the remaining one is named 'text'.
file_path = 'nigerian_fraud_email.csv'
df_nigerian_scam = pd.read_csv(file_path, index_col=0, names=['text'], header=0, encoding='ISO-8859-1')

# Report how many emails were loaded. This must reference the frame created
# above: a bare `df` is never defined in this notebook and raises NameError on
# a fresh kernel.
print(len(df_nigerian_scam))


5187


In [115]:
# Tag every row of the scam corpus with class 1
# (1 = phishing, 0 = non-phishing)
df_nigerian_scam = df_nigerian_scam.assign(label=1)

# Preview the labelled frame
print(df_nigerian_scam.head(n=5))

                                                text  label
1  /nURGENT BUSINESS ASSISTANCE AND PARTNERSHIP./...      1
2  /nDear Friend,/n/nI am Mr. Ben Suleman a custo...      1
3  /nFROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF ...      1
4  /nFROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF ...      1
5  /nDear sir, /n /nIt is with a heart full of ho...      1


In [116]:
# Randomly keep 865 scam emails (seeded for reproducibility).
# NOTE(review): 865 equals the size printed for the AI-generated set further
# down, so this presumably balances the two classes — confirm, and keep the
# two numbers in sync if either dataset changes.
n_scam_samples = 865
df_nigerian_scam = df_nigerian_scam.sample(n=n_scam_samples, random_state=42)

# Column count after sampling (text + label)
num_columns = df_nigerian_scam.shape[1]

# Display the first few rows to confirm
print(df_nigerian_scam.head())

                                                   text  label
3363  /nMy name is Mrs=2E hilda Ahmed from Iraq=2E I...      1
4672  /nHello dear/n/nFrom, Miss ZACHI Medine/nAbidj...      1
3543  /nRepublique de Cote d'Ivoire/nAbidjan,Rue 12,...      1
473   /n/n/nDear Sir./n /n May I seek your indulgenc...      1
1789  /nFROM MRS SUZAN SANKOH/nABIDJAN, IVORY COAST/...      1


In [117]:
# Directory that holds the AI-generated emails, one .txt file per email
directory_path = 'AI_phishing_emails/'

# Collected email bodies, one string per file
email_data = []

# os.listdir() returns entries in arbitrary order, so sort the names: this
# keeps the row order of the resulting frame (and therefore the seeded
# train/test split built from it later) identical across machines and runs.
for file_name in sorted(os.listdir(directory_path)):
    if not file_name.endswith('.txt'):  # Only .txt files are emails
        continue

    file_path = os.path.join(directory_path, file_name)

    # Read the whole file and trim leading/trailing whitespace
    with open(file_path, 'r', encoding='utf-8') as file:
        email_content = file.read().strip()

    email_data.append(email_content)

# One row per email, text stored under 'email_text'
df_ai_generated = pd.DataFrame(email_data, columns=['email_text'])

# Display the first few rows of the DataFrame to confirm
print(df_ai_generated.head())


                                          email_text
0  🏋️‍♀️ Fitness Bootcamp Challenge - Sweat, Burn...
1  🎨 Digital Art Contest: Showcase Your Talent!\n...
2  Reminder: Student Account Verification Needed\...
3  Talent Show Audition Reminder\n\nHello Dakota ...
4  Attention Required: Confirm Your Student Accou...


In [118]:
# Class convention: 1 = human-written phishing, 0 = AI-generated.
# Every email in this frame is AI-generated, so the whole column is 0.
df_ai_generated = df_ai_generated.assign(label=0)

In [119]:
# Preview the first five AI-generated emails together with their label
print(df_ai_generated.head(n=5))

                                          email_text  label
0  🏋️‍♀️ Fitness Bootcamp Challenge - Sweat, Burn...      0
1  🎨 Digital Art Contest: Showcase Your Talent!\n...      0
2  Reminder: Student Account Verification Needed\...      0
3  Talent Show Audition Reminder\n\nHello Dakota ...      0
4  Attention Required: Confirm Your Student Accou...      0


In [120]:
# Number of AI-generated emails (row count of the frame)
total_data = len(df_ai_generated)
print(f'dataset size:{total_data}')

dataset size:865


In [121]:
# Give the AI-generated frame the same text column name as df_nigerian_scam.
# rename() returns a new frame and ignores keys that are absent, so re-running
# this cell after the column has already been renamed is harmless.
df_ai_generated = df_ai_generated.rename(columns={'email_text': 'text'})

# Stack both corpora into one frame with a fresh 0..n-1 index
df_combined = pd.concat([df_nigerian_scam, df_ai_generated], ignore_index=True)

# Drop rows with missing text, then renumber the index. Without the reset,
# any dropped row leaves a gap in the index, and later positional lookups
# against the label Series (e.g. y[train_index] in cross-validation) would
# raise KeyError or pick the wrong rows.
df_combined = df_combined.dropna(subset=['text']).reset_index(drop=True)

# Raw features and labels as NumPy arrays
texts = df_combined['text'].values
labels = df_combined['label'].values

# The train/test split is performed in the modelling cell on the padded
# sequences; splitting the raw strings here as well would only produce
# X_train/X_test values that are overwritten before they are ever used.

# Set parameters
max_words = 20000  # Maximum number of words kept by the tokenizer
embedding_dim = 100  # Embedding dimensions
max_length = 50  # Maximum sequence length (in tokens) fed to the model

In [122]:
# Prepare the data and train the LSTM classifier.
# Tokenizer, pad_sequences, Sequential and the layers come from the import
# cell at the top of the notebook; they are not re-imported here.

# Labels
y = df_combined['label']

# Split the raw texts first so the tokenizer vocabulary can be learned from the
# training portion only; fitting it on the full corpus would leak information
# about the held-out emails into the features.
train_texts, test_texts, y_train, y_test = train_test_split(
    df_combined['text'], y, test_size=0.2, random_state=42
)

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_texts)

# Convert texts to integer sequences and pad/truncate them to max_length
X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=max_length)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=max_length)

# Full padded matrix, in df_combined row order, for cells that need all rows
sequences = tokenizer.texts_to_sequences(df_combined['text'])
X = pad_sequences(sequences, maxlen=max_length)

# Define the model
model = Sequential()

# Embedding Layer (input_length is not passed)
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim))

# LSTM Layer: returns only the final state, with input and recurrent dropout
model.add(LSTM(100, return_sequences=False, dropout=0.2, recurrent_dropout=0.2))

# Output Layer
model.add(Dense(1, activation='sigmoid'))  # For binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model Summary
model.summary()

# Fit the model (train on training data and validate on test data).
# NOTE(review): the scam texts previewed earlier carry literal '/n' markers;
# if the AI-generated emails lack them, near-perfect accuracy may reflect
# formatting artefacts rather than writing style — confirm before relying on it.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.9141 - loss: 0.4346 - val_accuracy: 1.0000 - val_loss: 0.0047
Epoch 2/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.9978 - loss: 0.0112 - val_accuracy: 1.0000 - val_loss: 0.0018
Epoch 3/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.9998 - loss: 0.0026 - val_accuracy: 1.0000 - val_loss: 0.0040
Epoch 4/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - accuracy: 0.9993 - loss: 0.0057 - val_accuracy: 1.0000 - val_loss: 7.3710e-04
Epoch 5/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - accuracy: 1.0000 - loss: 7.4339e-04 - val_accuracy: 1.0000 - val_loss: 2.9499e-04
Epoch 6/10
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 1.0000 - loss: 5.0419e-04 - val_accuracy: 1.0000 - val_loss: 2.1051e-04
Epoch 7/10
[1

In [123]:
# K-fold cross-validation of the Embedding -> LSTM -> Dense classifier.
# Uses X (padded sequences) and y (labels) built in the previous cell;
# KFold and np come from the import cell at the top of the notebook.

# Define number of folds
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# KFold yields positional indices. y may be a pandas Series, where
# y[int_array] is a label-based lookup that only matches positions when the
# index is a gap-free 0..n-1 range, so index a plain NumPy array instead.
y_values = np.asarray(y)

# Arrays to hold the results of each fold
accuracy_per_fold = []
loss_per_fold = []

# K-Fold Cross Validation
for train_index, val_index in kf.split(X):
    # Split the data into training and validation sets for this fold
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y_values[train_index], y_values[val_index]

    # Build a fresh model per fold so no weights carry over between folds
    model = Sequential()
    model.add(Embedding(input_dim=max_words, output_dim=embedding_dim))
    model.add(LSTM(100, return_sequences=False, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model on this fold's training data
    history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val), verbose=0)

    # Evaluate on this fold's validation data; scores is [loss, accuracy]
    scores = model.evaluate(X_val, y_val, verbose=0)
    # Name the metric explicitly: model.metrics_names[1] printed the generic
    # 'compile_metrics' placeholder instead of 'accuracy'.
    print(f"Score for fold {len(accuracy_per_fold) + 1}: accuracy = {scores[1]}, loss = {scores[0]}")

    accuracy_per_fold.append(scores[1])
    loss_per_fold.append(scores[0])

# Print the average accuracy and loss across all folds
print(f"Average accuracy: {np.mean(accuracy_per_fold)}")
print(f"Average loss: {np.mean(loss_per_fold)}")


Score for fold 1: compile_metrics = 1.0, loss = 0.0006355492514558136
Score for fold 2: compile_metrics = 1.0, loss = 9.029968350660056e-05
Score for fold 3: compile_metrics = 1.0, loss = 0.0029084773268550634
Score for fold 4: compile_metrics = 1.0, loss = 0.0012048076605424285
Score for fold 5: compile_metrics = 1.0, loss = 8.031608740566298e-05
Average accuracy: 1.0
Average loss: 0.0009838900019531137
