### Jose Nazario's phishing dataset

Perform operations: 
- load as mbox
- extract header and body if needed
- data cleaning


1. Load the dataset from mbox.

In [197]:
import mailbox
import os
import pandas as pd
import re
import cleantext

from bs4 import BeautifulSoup, NavigableString
from functools import wraps
from collections import defaultdict

In [2]:
def retry_args(func):
    @wraps(func)
    def wrapper(first_arg, second_arg, *args, **kwargs):
        try:
            result = func(first_arg, second_arg, *args, **kwargs)
            return result
        except Exception as err:
            print(f"Error: {err}. Retrying with argument.")   
            arg_list = [('iso-8859-1',), ('utf-8',)]
            for arg in arg_list:
                try:
                    result = func(first_arg, second_arg, *arg, *args, **kwargs)
                    return result
                except Exception as err:
                    print(f"Error: {err}. Trying next argument: {arg}.")
            print(f'All attempts failed for: {second_arg}')
            return 'decoding_error'
    return wrapper

In [159]:
@retry_args
def extract_message_body(msg, key, content_charset=''):
    print(msg)
    for part in msg.walk():
    # this way multipart is decoded at second time (first iteration is header+payload, which results in None)
        if part.is_multipart():
            pass
        elif not part.is_multipart():
            try:   
                if not content_charset:
                    charset_to_decode = part.get_content_charset()
                    if charset_to_decode is None:
                        content_charset = 'utf-8'
                    else:
                        content_charset = charset_to_decode
                try:
                    msg_body = part.get_payload(decode=True).decode(content_charset)
                    if msg_body:
                        # sometimes multipart returns False but payload is empty
                        return msg_body
                except LookupError as lerr:
                    print(lerr)
                    print(f'error at: {key}')
                    raise Exception
            except UnicodeDecodeError as uderr:
                print(uderr)
                print(f'error at: {key}')
                raise Exception

In [160]:
print(extract_message_body(mailbox.mbox('lstm_datasets\jose_phishing_dataset\\20051114.mbox')[258], 258))

Return-Path: <tom@empal.com>
X-Original-To: username@login.domain.com
Delivered-To: username@login.domain.com
Received: from mail2.domain.com (mail2.domain.com [192.168.3.4])
	by naughty.domain.com (Postfix) with ESMTP id 0B0EC536E1F
	for <username@login.domain.com>; Mon, 26 Sep 2005 03:59:36 -0400 (EDT)
Received: from mail2.domain.com (localhost.domain.com [127.0.0.1])
	by mail2.domain.com (Postfix) with ESMTP id 7D5626FA38C
	for <username@domain.com>; Mon, 26 Sep 2005 03:54:46 -0400 (EDT)
Received: from empal.com (unknown [221.0.207.43])
	by mail2.domain.com (Postfix) with ESMTP
	for <username@domain.com>; Mon, 26 Sep 2005 03:54:45 -0400 (EDT)
From: tom@empal.com
To: username@domain.com
Subject: please give me a kiss
Date: Mon, 26 Sep 2005 16:54:17 +0900
MIME-Version: 1.0
Content-Type: multipart/mixed;
	boundary="----=_NextPart_000_0004_2A135258.962D08FE"
X-Priority: 3
X-MSMail-Priority: Normal
X-Spam-Checker-Version: SpamAssassin 3.0.2 (2004-11-16)
X-Spam-Level: *
X-Spam-Status: No,

In [4]:
def extract_message_header(mbox_msg, values_to_extract):
    temp_dict = {}
    for value_to_extract in values_to_extract:
        extracted_val = mbox_msg.get(value_to_extract)
        temp_dict[value_to_extract] = extracted_val
    return temp_dict    

In [90]:
def mbox_file_to_pd(files_dir):
    res = defaultdict(list)
    for file_ in os.listdir(files_dir):
        mbox_files = mailbox.mbox(files_dir + file_)
        print(f"Current file: {file_}")
        for key in mbox_files.iterkeys():
            try:
                mbox_msg = mbox_files[key]
            except UnicodeDecodeError as uderr:
                print(uderr)
                print(f'Malformed key: {key} at mbox_files: {mbox_files}')
                continue
            msg_body = extract_message_body(mbox_msg, key)
            #msg_header_dict = extract_message_header()
            res['filename'].append(file_)
            res['email_body'].append(msg_body)
            res['file_key'].append(key)
    df = pd.DataFrame(res)
    return df

In [117]:
df_phishing_raw = mbox_file_to_pd('lstm_datasets\jose_phishing_dataset\\')

Current file: 20051114.mbox
'utf-8' codec can't decode byte 0xa9 in position 4650: invalid start byte
error at: 0
Error: . Retrying with argument.
'utf-8' codec can't decode byte 0xa0 in position 1646: invalid start byte
error at: 4
Error: . Retrying with argument.
'utf-8' codec can't decode byte 0xff in position 5100: invalid start byte
error at: 5
Error: . Retrying with argument.
'utf-8' codec can't decode byte 0xff in position 5100: invalid start byte
error at: 6
Error: . Retrying with argument.
'utf-8' codec can't decode byte 0xff in position 5100: invalid start byte
error at: 7
Error: . Retrying with argument.
'ascii' codec can't decode byte 0xad in position 2415: ordinal not in range(128)
error at: 8
Error: . Retrying with argument.
'utf-8' codec can't decode byte 0xff in position 5100: invalid start byte
error at: 10
Error: . Retrying with argument.
'ascii' codec can't decode byte 0xad in position 2412: ordinal not in range(128)
error at: 16
Error: . Retrying with argument.
'utf

In [121]:
df_phishing_raw[df_phishing_raw['email_body'] == 'decoding_error']

Unnamed: 0,filename,email_body,file_key


In [122]:
df_phishing_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10706 entries, 0 to 10705
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   filename    10706 non-null  object
 1   email_body  10705 non-null  object
 2   file_key    10706 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 251.0+ KB


In [123]:
df_phishing_raw['filename'].unique()

array(['20051114.mbox', 'phishing-2015', 'phishing-2016', 'phishing-2017',
       'phishing-2018', 'phishing-2019', 'phishing-2020', 'phishing-2021',
       'phishing-2022', 'phishing0.mbox', 'phishing1.mbox',
       'phishing2.mbox', 'phishing3.mbox', 'private-phishing4.mbox'],
      dtype=object)

Check for duplicates text, corrupted data.

In [124]:
df_phishing_raw[df_phishing_raw['email_body'].isna()]

Unnamed: 0,filename,email_body,file_key
2007,phishing-2019,,155


In [125]:
df_phishing_raw = df_phishing_raw[~df_phishing_raw['email_body'].isna()]

In [126]:
df_phishing_raw.head(10)

Unnamed: 0,filename,email_body,file_key
0,20051114.mbox,<html>\n<head>\n<!-- extraneous meta tag rem...,0
1,20051114.mbox,"&nbsp;<body bgcolor=#ffffff><div align=""left"">...",1
2,20051114.mbox,"&nbsp;<body bgcolor=#ffffff><div align=""left"">...",2
3,20051114.mbox,"<html><p><font face=""Arial""><A HREF=""https://w...",3
4,20051114.mbox,"\n\n<xbody bgcolor=""#ffffff""><!--Header code s...",4
5,20051114.mbox,<DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><...,5
6,20051114.mbox,<DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><...,6
7,20051114.mbox,<DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><...,7
8,20051114.mbox,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T...",8
9,20051114.mbox,To report spam please visit our site at\nhttp:...,9


In [127]:
df_phishing_raw.reset_index(drop=True, inplace=True)

In [128]:
df_phishing_raw[df_phishing_raw['email_body'].duplicated()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2057 entries, 2 to 10704
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   filename    2057 non-null   object
 1   email_body  2057 non-null   object
 2   file_key    2057 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 64.3+ KB


In [129]:
df_phishing_raw[df_phishing_raw['email_body'] == '']

Unnamed: 0,filename,email_body,file_key


In [131]:
df_phishing_raw[df_phishing_raw['email_body'].isna()]

Unnamed: 0,filename,email_body,file_key


Leaving duplicates, as they might have different header features that could be used in future work.

Save raw df for future purpose.

In [130]:
from joblib import load, dump
dump(df_phishing_raw, 'backup_dumps\df_phishing_raw')

['backup_dumps\\df_phishing_raw']

Perform preprocessing:
* extract message from HTML
* lowercasing all characters 
* removing nonalphanumeric characters, special characters, punctuation marks, whitespaces caused by removal, stopwords, HTML elements.
* replacing URLs with fixed string
* replacing e-mail with fixed string
  
Perform tokenization:
* tokenization based on white spaces

In [199]:
def extract_from_html(body):
    try:
        soup = BeautifulSoup(body, 'html.parser')
        for tag in soup.find_all(True):
            if tag.has_attr('href'):
                tag.insert_before(NavigableString(' '))
                tag.replace_with(tag['href'])
            elif tag.has_attr('src'):
                tag.insert_before(NavigableString(' '))
                tag.replace_with(tag['src'])
            elif tag.has_attr('data'):
                tag.insert_before(NavigableString(' '))
                tag.replace_with(tag['data'])
        text = soup.get_text()
        return text
    except Exception as e:
        print(f'Exception occured at: {e} with {body}')
        return "to_manual_extraction"

In [181]:
body = """<HTML><HEAD><META content="MSHTML 6.00.2900.2627" name=GENERATOR></HEAD><BODY vLink=#ff0000 aLink=#0000ff link=#ff0000 bgColor=#ffffff leftMargin=0topMargin=0 marginwidth="0" marginheight="0"><tablebackground="http://www.ncua.gov/images/prop/bannerStabwide2.jpg"border=0 cellpadding=0 cellspacing=0 width="600" align="left">  <TR>    <TD vAlign=top width="18%">  <TR>    <TD><A href="http://www.ncua.gov/index.html"><IMG height=98      alt="NCUA Seal" src="http://www.ncua.gov/images/prop/blueseal98.gif" width=100      align=left border=0></A></TD>    <TD width="550">      <H2>National Credit Union Administration </H2></TD></TR>  <TR class=Outline>    <TD colSpan=2>      <DIV class=OutlineArea align=right><A name=TopBar></A><A      href="http://www.ncua.gov/#SiteNavigation"><IMG height=8      alt="Skip to Site Navigation" src="primapaginafcu_files/invisible.gif"      width=8 border=0></A><A href="http://www.ncua.gov/#PageNavigation"><IMG      height=8 alt="Skip to Page Navigation"       src="primapaginafcu_files/invisible.gif" width=8 border=0></A>      <A href="http://www.ncua.gov/CreditUnionResources/index.htm">Resources for      Credit Unions </A>| <A       href="http://www.ncua.gov/ConsumerInformation/index.htm">Resources for       Consumers</A> | <A href="http://www.ncua.gov/indexnews.html">News </A>| <A       href="http://search.ncua.gov/">Search</A></DIV></TD></TR></TBODY></TABLE>      <br clear="all">      <hr width="600" style="height: 1px" color="#00000" align="left">      <TABLE cellSpacing=0 cellPadding=0 width=600 align=left border=0>        <TBODY>        <TR vAlign=top>          <TD width=400>            <TABLE cellSpacing=0 cellPadding=5 width="100%" border=0>              <TBODY>              <TR vAlign=top>                <TD>                  <TABLE cellSpacing=0 cellPadding=0 width="100%" border=0>                    <TBODY>                    <TR>                      <TD class=pp_heading align=left><font face="Arial" style="font-size: 15px"><b>Account Info                      Verification</b></font></TD></TR></TBODY></TABLE></TD></TR>              <TR>                <TD><font face="Arial" style="font-size: 12px">Dear FCU holder account,</P><BR><P>As part of our security measures,                  we regularly screen activity in Federal Credit Unions (FCU) network.<BR>We recently                   noticed the following issue on your account: A recent review                   of your account determined that we require some additional                   information from you in order to provide you with secure                   service. Case ID Number: PP-065-617-349 For your protection,                   we have limited access to your account until additional                   security measures can be completed. We apologize for any                   inconvenience this may cause. Please log in to your FCU account to                   restore your access as soon as possible.</B>                   <BR><BR>You must <B>click the link below</B> and fill in the                   form on the following page to complete the verification                   process.<BR><BR>                  <TABLE cellSpacing=0 cellPadding=1 width="75%" align=left                   bgColor=#ffe65c border=0>                    <TBODY>                    <TR>                      <TD>                        <TABLE cellSpacing=0 cellPadding=4 width="100%"                         align=center bgColor=#fffecd border=0>                          <TBODY>                          <TR>                            <TD class=pp_sansserif align=middle><A                               href="http://www.intracon.com/www.ncua.gov/update.php">Click                              here to update your                      account</A></TD></TR></TBODY></TABLE></TD></TR></TBODY></TABLE><BR><BR><BR>In                   accordance with NCUA User Agreement, your account access                  will remain limited until the issue has been resolved.                   Unfortunately, if access to your account remains limited for                   an extended period of time, it may result in further                   limitations or eventual account closure. We encourage you to                   log in to your FCU account as soon as possible to help                   avoid this. We thank you for your prompt attention to this matter. Please                   understand that this is a security measure intended to help                   protect you and your account.<BR>                  <P>We apologize for any inconvenience.                  <P>Sincerely, NCUA Account Review Department </P></TD></TR>              <TR>                <TD>                  <HR class=dotted>                </TD></TR>              <TR>                <TD>                  <TABLE cellSpacing=0 cellPadding=0 width="100%" border=0>                    <TBODY>                    <TR>                      <TD class=pp_footer><P>Please do not reply to this e-mail.                         Mail sent to this address cannot be answered.</P><BR>                    <TR>                      <TD><IMG height=10 src="" width=1                   border=0></TD></TR></TBODY></TABLE></TD></TR>              <TR>                <TD><BR><SPAN             class=pp_footer><BR><BR></SPAN></TD></TR></TBODY></TABLE></TD>          <TD><IMG height=1 src="http://www.ncua.gov/images/prop/logo.gif" width=10             border=0></TD>          <TD vAlign=top width=190>            <TABLE cellSpacing=0 cellPadding=1 width="100%" bgColor=#cccccc             border=0>              <TBODY>              <TR>                <TD>                  <TABLE cellSpacing=0 cellPadding=0 width="100%"                   bgColor=#ffffff border=0>                    <TBODY>                    <TR>                      <TD>                        <TABLE cellSpacing=0 cellPadding=5 width="100%"                         bgColor=#eeeeee border=0>                          <TBODY>                          <TR>                            <TD class=pp_sidebartextbold align=middle>About                               NCUA</TD></TR></TBODY></TABLE>                        <TABLE cellSpacing=0 cellPadding=5 width="100%" border=0>                          <TBODY>                          <TR>                            <TD class=pp_sidebartext>                            <font face="Arial" style="font-size: 12px">The National Credit Union                              Administration (NCUA) is the independent federal                              agency that charters and supervises federal credit                              unions. NCUA, backed of the full faith and credit                              of the U.S. government, operates the National                              Credit Union Share Insurance Fund (NCUSIF)                              insuring the savings of 80 million account holders                              in all federal credit unions and many                              state-chartered credit unions. During the 1990s                              and into the 21st century, credit unions have been                              healthy and growing. Credit union failures remain                              low and the Share Insurance Fund maintains a                              healthy equity level. The National Credit Union                              Administration (NCUA) is comitted to maintain a                              safe environment for over 80 million account                              holders in all federal credit unions and many                              state-chartered credit unions. Protecting the                              security of holders account and of the Federal                              Credit Unions (FCU) network is our primary                              concern.</TD></TR></TBODY></TABLE></body></html>"""
extract_from_html(body)

'    http://www.ncua.gov/index.html  National Credit Union Administration    http://www.ncua.gov/#SiteNavigationhttp://www.ncua.gov/#PageNavigation http://www.ncua.gov/CreditUnionResources/index.htm| http://www.ncua.gov/ConsumerInformation/index.htm | http://www.ncua.gov/indexnews.html| http://search.ncua.gov/              Account Info                      Verification  Dear FCU holder account,As part of our security measures,                  we regularly screen activity in Federal Credit Unions (FCU) network.We recently                   noticed the following issue on your account: A recent review                   of your account determined that we require some additional                   information from you in order to provide you with secure                   service. Case ID Number: PP-065-617-349 For your protection,                   we have limited access to your account until additional                   security measures can be completed. We apologize for any              

In [198]:
body = """<DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><A href="http://pages.ebay.com/" target=_blank> <IMG alt="Register for eBay"src="http://pics.ebay.com/aw/pics/register/HeaderRegister_387x40.gif"border=0 width=387 height=40></A><DIV><FONT face=Arial size=2><TABLE cellSpacing=0 cellPadding=0 width=600 border=0><TBODY><TR><TDcolSpan=2><IMG height=10 alt=" " src="http://pics.ebay.com/aw/pics/spacer.gif" width=1></TD></TR><TR><TD bgColor=#ffcc00 colSpan=2><IMGheight=2 alt=" " src="http://pics.ebay.com/aw/pics/spacer.gif" width=1></TD></TR><TR bgColor=#ffe580><TD width=25><IMG height=3 alt="" src="http://pics.ebay.com/aw/pics/sitewide/leftLine_16x3.gif" width=16 align=middle></TD><TD vAlign=center width=575><TABLE cellSpacing=0 cellPadding=1 width="100%" border=0><TBODY><TR><TD vAlign=center noWrap><B>Dear valued customer</B></TD><TD vAlign=center noWrap align=right><A href="http://pages.ebay.com/help/new/signin.html" target=_blank onfiltered="return openHelpWindow(this.href);"><IMG height=14 src="http://pics.ebay.com/aw/pics/listings/questionMark_14x14.gif" width=14 border=0></A><IMG height=1 alt=" " src="http://pics.ebay.com/aw/pics/spacer.gif" width=4><FONT face="Arial, Helvetica, sans-serif" size=2><A href="http://pages.ebay.com/help/new/signin.html" target=_blank onfiltered="return openHelpWindow(this.href);">Need Help?</A></FONT><IMG height=1 alt=" " src="http://pics.ebay.com/aw/pics/spacer.gif"width=2></TD></TR></TBODY></TABLE></TD></TR><TR><TD bgColor=#ffcc00 colSpan=2><IMG height=2 alt=" " src="http://pics.ebay.com/aw/pics/spacer.gif" width=1></TD></TR></TBODY></TABLE></FONT><DIV></DIV><DIV><FONT face=Arial size=2></FONT></DIV><DIV><FONT face=Arial size=2><DIV><B><FONT face=Arial size=2><DIV><DIV style="width: 605; height: 224"><B><FONT face=arial> We regret to inform you that your eBay account could be suspended if you don't re-update your account information. To resolve this problems please </FONT> <A target="_blank" href="http://confirm-ebaymain.com/?eBaylSAPI.dll&VerifyRegistration"><FONT face=arial color=#0000ff>click here</FONT></a></B><FONT face=arial> and re-enter your account information. If your problems could not be resolved your account will be suspended for a period of 3-4 days, after this period your account will be terminated.<br><BR>For the User Agreement, Section 9, we may immediately issue a warning, temporarily suspend, indefinitely suspend or terminate your membership and refuse to provide our services to you if we believe that your actions may cause financial loss or legal liability for you, our users or us. We may also take these actions if we are unable to verify or authenticate any information you provide to us.<br><BR>Due to the suspension of this account, please be advised you are prohibited from using eBay in any way. This includes the registering of a new account. Please note that this suspension does not relieve you of your agreed-upon obligation to pay any fees you may owe to eBay.</FONT></DIV><B></B></DIV><B></B></FONT></B></DIV><B><FONT face=Arial size=2><B></B></FONT></B></FONT></DIV><FONT face=Arial size=2><B><FONT face=Arial size=2><B></B></FONT></B></FONT></DIV><DIV><FONT face=Arial size=2><B><FONT face=Arial size=2><B><DIV></DIV><DIV></DIV><DIV><BR><FONT face=arial>Regards,Safeharbor Department eBay, Inc</FONT></B></DIV></FONT></B><FONT size=2></FONT><B></B><DIV><B><FONT face=arial size=2>The eBay team.</FONT></B></DIV><DIV><FONT face=arial><FONT size=2></FONT><B></B></FONT></DIV><DIV><B><FONT face=arial size=2>This is an automatic message. Please do not reply.</FONT></B></DIV><DIV><FONT face=Arial size=2></FONT><B></B></DIV><DIV><B><BR><TABLE cellSpacing=0 cellPadding=0 width=599 bgColor=#ffcc00 border=0><TBODY><TR><TD height=2><IMG height=2 src="http://pics.ebaycom/aw/pics/spacer.gif" width=2></TD></TR></TBODY></TABLE><CURSIVEsrc="http://include.ebay.com/aw/pics/js/stats/ss.js"></SCRIPT><IMG height=1 width=1 border=0 name=s_i_ebay> <CURSIVE src="http://include.ebay.com/aw/pics/js/stats/ss2.js"></SCRIPT><P><TABLE cellSpacing=0 cellPadding=0 width=600 border=0><TBODY><TR><TD colSpan=2><BR><HR align=center width=500><BR><DIV align=center><FONT face="Arial, Verdana, Helvetica, sans-serif" size=2><A href="http://pages.ebay.com/community/index.html?ssPageName=f:f:ann:US" target=_blank>Announcements</A>   |   <A href="http://cgi4.ebay.com/aw-cgi/eBayISAPI.dll?RegisterShow&ssPageName=f:f:reg:US" target=_blank>Register</A>   |   <A href="http://pages.ebay.com/help/confidence/hub.html?ssPageName=f:f:stips:US" target=_blank>Safe Trading Tips</A>   |   <A href="http://pages.ebay.com/help/policies/hub.html?ssPageName=f:f:policy:US" target=_blank>Policies</A>   |   <A href="http://pages.ebay.com/help/new/feedback.html" target=_blank onfiltered="return openHelpWindow(this.href);">Feedback Forum</A>   |   <A href="http://pages.ebay.com/community/aboutebay/index.html?ssPageName=f:f:ebayinc:US" target=_blank>About eBay</A></FONT></DIV><BR></TD></TR><TR><TD vAlign=top align=left width=450 height=31><FONTface="Arial, Verdana, Helvetica, sans-serif" size=1>Copyright ÿFFFFA91995-2004 eBay Inc. All Rights Reserved.<BR>Designated trademarks and brands are the property of their respective owners.<BR>Use of this Web site constitutes acceptance of the eBay <A href="http://pages.ebay.com/help/policies/user-agreement.html" target=_blank onfiltered="return openHelpWindow(this.href);">User Agreement</A> and <A href="http://pages.ebay.com/help/policies/privacy-policy.html" target=_blank onfiltered="return openHelpWindow(this.href);">Privacy Policy</A>.</FONT><BR></TD><TD vAlign=topalign=right width=150 height=31><FONT face="Arial, Verdana, Helvetica, sans-serif" size=1><A href="http://pages.ebay.com/help/policies/privacy-policy.html" target=_blank onfiltered="return openHelpWindow(this.href);"><IMG height=31 alt=TrustE src="http://pics.ebay.com/aw/pics/truste_button.gif" width=116 align=middle border=0></A></FONT></TD></TR></TBODY></TABLE></P><CURSIVE language=_JavaScript><!-- var cbc,cbf;if (cbc){ writeFooter(); if (cbf){  fullCB(); }}//--></SCRIPT></STRONG></FONT></DIV></DIV></DIV></DIV></DIV></DIV></DIV></DIV></DIV></DIV></DIV></DIV></DIV></DIV></tr></table></font>"""
extract_from_html(body)



In [7]:
from nltk.corpus import stopwords
def tokenize(body, stopwords=stopwords):
    body = body.split(" ")
    body = [token for token in body if not token in stopwords.words('english')]
    return body

In [200]:
def preprocess_body(body):
    body = body.lower()
    body = extract_from_html(body)
    if body == 'to_manual_extraction':
        return body
    try:
        body = cleantext.replace_urls(body, replace_with="fixedstringurl")
        body = re.sub(r'\S+@\S+', 'fixedstringemails', body)
        body = re.sub(r"[^a-zA-Z0-9\s]", '', body) 
        body = " ".join(body.split())
    except Exception as e:
        print(f'Exception occured at: {e} with {body}')
        return 'to_manual_extraction'
    return body    

In [201]:
df_phishing_raw['preprocessed_body'] = df_phishing_raw['email_body'].apply(preprocess_body)



In [202]:
df_phishing_raw.head(10)

Unnamed: 0,filename,email_body,file_key,preprocessed_body
0,20051114.mbox,<html>\n<head>\n<!-- extraneous meta tag rem...,0,ebay suspension fixedstringurl fixedstringurl ...
1,20051114.mbox,"&nbsp;<body bgcolor=#ffffff><div align=""left"">...",1,dear lasalle member as part of our continuing ...
2,20051114.mbox,"&nbsp;<body bgcolor=#ffffff><div align=""left"">...",2,dear lasalle member as part of our continuing ...
3,20051114.mbox,"<html><p><font face=""Arial""><A HREF=""https://w...",3,fixedstringurl i dont pokemon in 1878 in 1874 ...
4,20051114.mbox,"\n\n<xbody bgcolor=""#ffffff""><!--Header code s...",4,fixedstringurl fixedstringurl fixedstringurl f...
5,20051114.mbox,<DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><...,5,fixedstringurl fixedstringurl fixedstringurl f...
6,20051114.mbox,<DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><...,6,fixedstringurl fixedstringurl fixedstringurl f...
7,20051114.mbox,<DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><...,7,fixedstringurl fixedstringurl fixedstringurl f...
8,20051114.mbox,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T...",8,untitled document fixedstringurl urgent safeha...
9,20051114.mbox,To report spam please visit our site at\nhttp:...,9,to report spam please visit our site at fixeds...


In [203]:
df_phishing_raw[df_phishing_raw['preprocessed_body'] == 'to_manual_extraction']

Unnamed: 0,filename,email_body,file_key,preprocessed_body


In [204]:
df_phishing_raw[df_phishing_raw['preprocessed_body'] == ''].head(20)

Unnamed: 0,filename,email_body,file_key,preprocessed_body


In [191]:
df_phishing_raw = df_phishing_raw[df_phishing_raw['preprocessed_body'] != '']
df_phishing_raw.reset_index(drop=True, inplace=True)

In [206]:
df_phishing_raw['tokenized_body'] = df_phishing_raw['preprocessed_body'].apply(tokenize)

In [208]:
df_phishing_raw.head(10)

Unnamed: 0,filename,email_body,file_key,preprocessed_body,tokenized_body
0,20051114.mbox,<html>\n<head>\n<!-- extraneous meta tag rem...,0,ebay suspension fixedstringurl fixedstringurl ...,"[ebay, suspension, fixedstringurl, fixedstring..."
1,20051114.mbox,"&nbsp;<body bgcolor=#ffffff><div align=""left"">...",1,dear lasalle member as part of our continuing ...,"[dear, lasalle, member, part, continuing, comm..."
2,20051114.mbox,"&nbsp;<body bgcolor=#ffffff><div align=""left"">...",2,dear lasalle member as part of our continuing ...,"[dear, lasalle, member, part, continuing, comm..."
3,20051114.mbox,"<html><p><font face=""Arial""><A HREF=""https://w...",3,fixedstringurl i dont pokemon in 1878 in 1874 ...,"[fixedstringurl, dont, pokemon, 1878, 1874, sh..."
4,20051114.mbox,"\n\n<xbody bgcolor=""#ffffff""><!--Header code s...",4,fixedstringurl fixedstringurl fixedstringurl f...,"[fixedstringurl, fixedstringurl, fixedstringur..."
5,20051114.mbox,<DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><...,5,fixedstringurl fixedstringurl fixedstringurl f...,"[fixedstringurl, fixedstringurl, fixedstringur..."
6,20051114.mbox,<DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><...,6,fixedstringurl fixedstringurl fixedstringurl f...,"[fixedstringurl, fixedstringurl, fixedstringur..."
7,20051114.mbox,<DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><DIV><...,7,fixedstringurl fixedstringurl fixedstringurl f...,"[fixedstringurl, fixedstringurl, fixedstringur..."
8,20051114.mbox,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T...",8,untitled document fixedstringurl urgent safeha...,"[untitled, document, fixedstringurl, urgent, s..."
9,20051114.mbox,To report spam please visit our site at\nhttp:...,9,to report spam please visit our site at fixeds...,"[report, spam, please, visit, site, fixedstrin..."


In [209]:
dump(df_phishing_raw, 'backup_dumps\df_phishing_tokenized')

['backup_dumps\\df_phishing_tokenized']

Attempt to use spacy for tokenization and stopword removal.

In [2]:
from joblib import load
df_phishing_tokenized_spacy = load('backup_dumps\df_phishing_tokenized')

In [4]:
import spacy
nlp = spacy.load('en_core_web_sm')
nlp.disable_pipes(["tagger", "parser", "ner"])
def tokenize(body):
    doc = nlp(body)
    tokens = [token.text for token in doc if not token.is_stop]
    return tokens

In [12]:
import pandas as pd

def split_text(text, chunk_size=1000000):
    text_len = len(text)
    chunks = []

    for i in range(0, text_len, chunk_size):
        chunk = text[i:i + chunk_size]
        chunks.append(chunk)

    return chunks

def tokenize_and_remove_stopwords(text, nlp):
    tokens = []
    text_chunks = split_text(text)
    
    for doc in nlp.pipe(text_chunks):
        chunk_tokens = [token.text for token in doc if not token.is_stop]
        tokens.extend(chunk_tokens)
    
    return tokens






In [13]:
df_phishing_tokenized_spacy["tokenized_body_spacy_preprocessed"] = df_phishing_tokenized_spacy["preprocessed_body"].apply(lambda x: tokenize_and_remove_stopwords(x, nlp))




In [6]:
df_phishing_tokenized_spacy['tokenized_body_spacy'] = df_phishing_tokenized_spacy['preprocessed_body'].apply(tokenize)



In [14]:
df_phishing_tokenized_spacy

Unnamed: 0,filename,email_body,file_key,preprocessed_body,tokenized_body,tokenized_body_spacy,tokenized_body_spacy_preprocessed
0,20051114.mbox,<html>\n<head>\n<!-- extraneous meta tag rem...,0,ebay suspension fixedstringurl fixedstringurl ...,"[ebay, suspension, fixedstringurl, fixedstring...","[ebay, suspension, fixedstringurl, fixedstring...","[ebay, suspension, fixedstringurl, fixedstring..."
1,20051114.mbox,"&nbsp;<body bgcolor=#ffffff><div align=""left"">...",1,dear lasalle member as part of our continuing ...,"[dear, lasalle, member, part, continuing, comm...","[dear, lasalle, member, continuing, commitment...","[dear, lasalle, member, continuing, commitment..."
2,20051114.mbox,"&nbsp;<body bgcolor=#ffffff><div align=""left"">...",2,dear lasalle member as part of our continuing ...,"[dear, lasalle, member, part, continuing, comm...","[dear, lasalle, member, continuing, commitment...","[dear, lasalle, member, continuing, commitment..."
3,20051114.mbox,"<html><p><font face=""Arial""><A HREF=""https://w...",3,fixedstringurl i dont pokemon in 1878 in 1874 ...,"[fixedstringurl, dont, pokemon, 1878, 1874, sh...","[fixedstringurl, nt, pokemon, 1878, 1874, shall]","[fixedstringurl, nt, pokemon, 1878, 1874, shall]"
4,20051114.mbox,"\n\n<xbody bgcolor=""#ffffff""><!--Header code s...",4,fixedstringurl fixedstringurl fixedstringurl f...,"[fixedstringurl, fixedstringurl, fixedstringur...","[fixedstringurl, fixedstringurl, fixedstringur...","[fixedstringurl, fixedstringurl, fixedstringur..."
...,...,...,...,...,...,...,...
10564,private-phishing4.mbox,\nYour email Address require \nsecurity update...,3529,your email address require security updates an...,"[email, address, require, security, updates, v...","[email, address, require, security, updates, v...","[email, address, require, security, updates, v..."
10565,private-phishing4.mbox,"<p> Dear Customer, <p>\n<p> Your account have ...",3530,dear customer your account have been detected ...,"[dear, customer, account, detected, fraud, ale...","[dear, customer, account, detected, fraud, ale...","[dear, customer, account, detected, fraud, ale..."
10566,private-phishing4.mbox,Dear jose@monkey.org \n Your two incoming mai...,3531,dear fixedstringemails your two incoming mails...,"[dear, fixedstringemails, two, incoming, mails...","[dear, fixedstringemails, incoming, mails, pla...","[dear, fixedstringemails, incoming, mails, pla..."
10567,private-phishing4.mbox,"<p> Dear Customer, </p\n<p> Your Online accoun...",3532,dear customer your online account has been tem...,"[dear, customer, online, account, temporary, l...","[dear, customer, online, account, temporary, l...","[dear, customer, online, account, temporary, l..."
