In [None]:
# ========== IMPORTS ==========

import pandas as pd
import random
import re

In [None]:
# ========== HANDLE ENRON DATA ==========

# Load the raw Enron e-mails and the Fortune 500 table whose 'company' column
# supplies the replacement names used by replace_enron().
df_enron_old = pd.read_csv('Phishing-Email-Dataset/Enron.csv')
fortune = pd.read_csv('Phishing-Email-Dataset/fortune500.csv')

# Keep the replacement names as a plain list of strings:
#   * random.choice() picks by position, whereas a Series is indexed by label;
#   * a missing (NaN) name would make the re.sub() callback return a float,
#     which raises TypeError.
companies = fortune['company'].dropna().astype(str).tolist()
# Fail early with a clear message instead of an IndexError inside random.choice().
assert companies, 'fortune500.csv contains no usable company names'

df_enron_old.info()
df_enron_old.head()

In [None]:
# ========== FUNCTIONS ==========

def replace_enron(text: str) -> str:
    """Swap every occurrence of 'enron' in ``text`` for a random company name.

    Matching is case-insensitive and has no word boundaries, so 'enron' is
    also replaced when it appears inside a longer token. Each occurrence gets
    its own independent draw from the module-level ``companies`` collection.

    Values that are not ``str`` are returned unchanged.
    """
    if isinstance(text, str):
        def pick_company(_match):
            # A fresh draw per match, so one text may end up naming several companies.
            return random.choice(companies)

        return re.sub(r'enron', pick_company, text, flags=re.IGNORECASE)
    # Non-string cells (e.g. NaN floats) pass through untouched.
    return text


In [None]:
# ========== REPLACE 'ENRON' ==========

# Seed the RNG so the randomly chosen replacement names are the same on every
# "Restart & Run All".
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Apply the replacement to every object-dtype (text) column of the Enron frame.
text_columns = df_enron_old.select_dtypes(include='object').columns
for col in text_columns:
    df_enron_old[col] = df_enron_old[col].apply(replace_enron)

# Save next to the other datasets: the "READ DATASETS" cell loads this file from
# './Phishing-Email-Dataset/Enron_New.csv', so writing it to the working
# directory would leave that read failing (or picking up a stale copy).
df_enron_old.to_csv('Phishing-Email-Dataset/Enron_New.csv', index=False)

In [None]:
# ========== READ DATASETS ==========

# Load the six source datasets; the order of the paths matches the order of
# the names on the left-hand side.
(
    df_ceas,
    df_enron_new,
    df_ling,
    df_nazario,
    df_nigerian,
    df_assassin,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        './Phishing-Email-Dataset/CEAS_08.csv',
        './Phishing-Email-Dataset/Enron_New.csv',
        './Phishing-Email-Dataset/Ling.csv',
        './Phishing-Email-Dataset/Nazario.csv',
        './Phishing-Email-Dataset/Nigerian_Fraud.csv',
        './Phishing-Email-Dataset/SpamAssasin.csv',
    )
]

In [None]:
# ========== DATASET INFO ==========

# Print a banner followed by the column/dtype/non-null summary of each raw dataset.
for banner, frame in (
    ('=============== CEAS 08  ===============', df_ceas),
    ('=============== ENRON    ===============', df_enron_new),
    ('=============== LING     ===============', df_ling),
    ('=============== NAZARIO  ===============', df_nazario),
    ('=============== NIGERIAN ===============', df_nigerian),
    ('=============== ASSASSIN ===============', df_assassin),
):
    print(banner)
    frame.info()


In [None]:
# ========== TRIM COLUMNS ==========

# Reduce every dataset to the same three columns.
KEPT_COLUMNS = ['subject', 'body', 'label']

df_trimmed_ceas = df_ceas[KEPT_COLUMNS]
df_trimmed_enron = df_enron_new[KEPT_COLUMNS]
df_trimmed_ling = df_ling[KEPT_COLUMNS]
df_trimmed_nazario = df_nazario[KEPT_COLUMNS]
df_trimmed_nigerian = df_nigerian[KEPT_COLUMNS]
df_trimmed_assassin = df_assassin[KEPT_COLUMNS]

# Report each trimmed frame under its banner.
for banner, trimmed in (
    ('=============== CEAS 08  ===============', df_trimmed_ceas),
    ('=============== ENRON    ===============', df_trimmed_enron),
    ('=============== LING     ===============', df_trimmed_ling),
    ('=============== NAZARIO  ===============', df_trimmed_nazario),
    ('=============== NIGERIAN ===============', df_trimmed_nigerian),
    ('=============== ASSASSIN ===============', df_trimmed_assassin),
):
    print(banner)
    trimmed.info()


In [None]:
# ========== CONCATENATE ==========

# Stack the trimmed frames row-wise with a fresh 0..n-1 index. The list order
# is the row order of the result.
frames_in_order = [
    df_trimmed_ceas,
    df_trimmed_enron,
    df_trimmed_assassin,
    df_trimmed_ling,
    df_trimmed_nazario,
    df_trimmed_nigerian,
]
df_complete = pd.concat(frames_in_order, axis=0, ignore_index=True)
df_complete.info()

In [None]:
# ========== REMOVE INVALID DATA ==========

# Drop rows with any missing value first, then rows that exactly duplicate an
# earlier row (the first occurrence is kept).
df_complete = df_complete.dropna().drop_duplicates()
df_complete.info()

In [None]:
# ========== COUNT PHISHING/LEGITIMATE ==========

# Label distribution of the combined data, followed by the Enron subset alone.
for counted_frame in (df_complete, df_trimmed_enron):
    label_counts = counted_frame['label'].value_counts()
    print(label_counts)

In [None]:
# ========== EXPORT DATA ==========

# Write the combined frame, without the index column, to the directory above
# the current working directory.
output_path = '../emails.csv'
df_complete.to_csv(output_path, index=False)