### Jose Nazario's phishing dataset

This notebook performs the following operations:
- load as mbox
- extract header and body if needed
- data cleaning


1. Load the dataset from mbox.

In [1]:
import mailbox
import os
import pandas as pd
from bs4 import BeautifulSoup
from functools import wraps
from collections import defaultdict

In [2]:
def retry_args(func):
    """Decorator that re-invokes ``func`` with fallback charset arguments.

    The wrapped callable is first called exactly as requested. If that call
    raises any ``Exception``, it is called again with one extra positional
    argument inserted right after the first two: ``'iso-8859-1'`` first, then
    ``'utf-8'``. The first successful result is returned; if every attempt
    fails, the sentinel string ``'decoding_error'`` is returned instead of
    raising.
    """
    # Each entry is the tuple of extra positional arguments for one retry.
    fallback_arguments = (('iso-8859-1',), ('utf-8',))

    @wraps(func)
    def wrapper(first_arg, second_arg, *args, **kwargs):
        try:
            return func(first_arg, second_arg, *args, **kwargs)
        except Exception as initial_error:
            print(f"Error: {initial_error}. Retrying with argument.")

        for extra in fallback_arguments:
            try:
                return func(first_arg, second_arg, *extra, *args, **kwargs)
            except Exception as retry_error:
                print(f"Error: {retry_error}. Trying next argument: {extra}.")

        # Nothing worked: report the second argument and hand back the sentinel.
        print(f'All attempts failed for: {second_arg}')
        return 'decoding_error'

    return wrapper

In [3]:
@retry_args
def extract_message_body(msg, key, content_charset=''):
    """Decode and return the payload of the first non-multipart part of ``msg``.

    Args:
        msg: message object exposing ``walk()``; each part must provide
            ``is_multipart()``, ``get_content_charset()`` and ``get_payload()``.
        key: identifier of the message; used only in diagnostic output.
        content_charset: codec name to decode the payload with. When empty,
            the charset declared by the part is used, or ``'utf-8'`` if the
            part declares none.

    Returns:
        The decoded body as ``str``, or ``None`` if ``walk()`` yields no
        non-multipart part.

    Raises:
        Exception: if the codec name is unknown (``LookupError``) or the bytes
            cannot be decoded (``UnicodeDecodeError``). The message carries the
            original error text and the original exception is chained as
            ``__cause__`` so the caller's log is not empty.
    """
    for part in msg.walk():
        # Multipart containers are skipped; only leaf parts are decoded.
        if part.is_multipart():
            continue
        try:
            charset = content_charset
            if not charset:
                declared_charset = part.get_content_charset()
                charset = 'utf-8' if declared_charset is None else declared_charset
            return part.get_payload(decode=True).decode(charset)
        except (LookupError, UnicodeDecodeError) as err:
            print(err)
            print(f'error at: {key}')
            # Keep the generic Exception type, but preserve the reason and
            # the original traceback instead of raising an empty exception.
            raise Exception(f'{err} (message key: {key})') from err
    return None

In [4]:
def extract_message_header(mbox_msg, values_to_extract):
    """Collect the requested header values from a message into a dict.

    Args:
        mbox_msg: object exposing ``get(name)``.
        values_to_extract: iterable of header names to look up.

    Returns:
        Dict mapping each requested name to whatever ``mbox_msg.get(name)``
        returned for it, in the order the names were given.
    """
    return {
        header_name: mbox_msg.get(header_name)
        for header_name in values_to_extract
    }

In [9]:
def mbox_file_to_pd(files_dir):
    """Read every mbox file in ``files_dir`` into one DataFrame of message bodies.

    Args:
        files_dir: directory containing the mbox files. A trailing path
            separator is accepted but no longer required.

    Returns:
        ``pandas.DataFrame`` with one row per readable message and the columns
        ``filename`` (name of the mbox file the message came from) and
        ``email_body`` (value returned by ``extract_message_body``). Both
        columns are present even when no message was read.
    """
    res = defaultdict(list)
    # Sort the listing so the row order does not depend on the file system.
    for file_ in sorted(os.listdir(files_dir)):
        file_path = os.path.join(files_dir, file_)
        # Directory entries that are not regular files cannot be opened as mbox.
        if not os.path.isfile(file_path):
            continue
        mbox_files = mailbox.mbox(file_path)
        print(f"Current file: {file_}")
        try:
            for key in mbox_files.iterkeys():
                try:
                    mbox_msg = mbox_files[key]
                except UnicodeDecodeError as uderr:
                    # The message itself could not be loaded; report and skip it.
                    print(uderr)
                    print(f'Malformed key: {key} at mbox_files: {mbox_files}')
                    continue
                msg_body = extract_message_body(mbox_msg, key)
                # TODO: header extraction (extract_message_header) is not wired in yet.
                res['filename'].append(file_)
                res['email_body'].append(msg_body)
        finally:
            # Release the underlying file handle even if extraction fails.
            mbox_files.close()
    # Explicit columns keep the schema stable when no message was collected.
    df = pd.DataFrame(res, columns=['filename', 'email_body'])
    return df

In [10]:
# Build the dataset path with os.path.join instead of a backslash literal
# ('\j' is an invalid escape sequence). The trailing '' keeps a trailing
# separator, so the path works whether the loader joins or concatenates.
df_phishing_raw = mbox_file_to_pd(os.path.join('lstm_datasets', 'jose_phishing_dataset', ''))

Current file: 20051114.mbox
'utf-8' codec can't decode byte 0xa9 in position 4650: invalid start byte
error at: 0
Error: . Retrying with argument.
'utf-8' codec can't decode byte 0xa0 in position 1646: invalid start byte
error at: 4
Error: . Retrying with argument.
'utf-8' codec can't decode byte 0xff in position 5100: invalid start byte
error at: 5
Error: . Retrying with argument.
'utf-8' codec can't decode byte 0xff in position 5100: invalid start byte
error at: 6
Error: . Retrying with argument.
'utf-8' codec can't decode byte 0xff in position 5100: invalid start byte
error at: 7
Error: . Retrying with argument.
'ascii' codec can't decode byte 0xad in position 2415: ordinal not in range(128)
error at: 8
Error: . Retrying with argument.
'utf-8' codec can't decode byte 0xff in position 5100: invalid start byte
error at: 10
Error: . Retrying with argument.
'ascii' codec can't decode byte 0xad in position 2412: ordinal not in range(128)
error at: 16
Error: . Retrying with argument.
'utf

In [11]:
# Show the rows whose body is the 'decoding_error' sentinel.
df_phishing_raw.loc[df_phishing_raw['email_body'].eq('decoding_error')]

Unnamed: 0,filename,email_body


In [13]:
# Print a summary of the raw frame: row count, columns, non-null counts, dtypes.
df_phishing_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10706 entries, 0 to 10705
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   filename    10706 non-null  object
 1   email_body  10706 non-null  object
dtypes: object(2)
memory usage: 167.4+ KB


In [12]:
# List the distinct values of the 'filename' column of the raw frame.
df_phishing_raw['filename'].unique()

array(['20051114.mbox', 'phishing-2015', 'phishing-2016', 'phishing-2017',
       'phishing-2018', 'phishing-2019', 'phishing-2020', 'phishing-2021',
       'phishing-2022', 'phishing0.mbox', 'phishing1.mbox',
       'phishing2.mbox', 'phishing3.mbox', 'private-phishing4.mbox'],
      dtype=object)

2. Save the raw DataFrame for later reuse.

In [14]:
from joblib import load, dump
# Portable path instead of a backslash literal ('\d' is an invalid escape
# sequence in a non-raw string).
dump(df_phishing_raw, os.path.join('backup_dumps', 'df_phishing_raw'))

['backup_dumps\\df_phishing_raw']

In [22]:
def extract_from_html(body):
    """Return the text content of an HTML string.

    ``body`` is parsed with BeautifulSoup's ``'html.parser'`` backend and the
    result of ``get_text()`` is returned. If any ``Exception`` is raised
    along the way, the error and the offending body are printed and the
    sentinel ``"html_failed_extraction"`` is returned instead.
    """
    try:
        return BeautifulSoup(body, 'html.parser').get_text()
    except Exception as exc:
        print(f'Exception occured at: {exc} with {body}')
    return "html_failed_extraction"

In [18]:
# Portable path instead of a backslash literal ('\j' and '\p' are invalid
# escape sequences in a non-raw string).
mbox_files = mailbox.mbox(os.path.join('lstm_datasets', 'jose_phishing_dataset', 'phishing0.mbox'))

In [19]:
# NOTE(review): `mbox_file_extract` is not defined in any cell visible here —
# presumably it lived in a since-removed cell; confirm it exists before
# relying on Restart & Run All.
mbox_file_extract(mbox_files)

<aw-confirm@ebay.com>
<info@info-mrktng.com>
<eBay.705443.53897.0@reply3.ebay.com>
<exchange-robot@paypal.com>
<aw-confirm@ebay.com>
<aw-confirm@ebay.com>
<exchange-robot@paypal.com>
<exchange-robot@paypal.com>
<cqrpbrtp@mspring.net>
<service@paypal.com>
<root@mail.tps.edu.hk>
<service@paypal.com>
'utf-8' codec can't decode byte 0xa9 in position 2841: invalid start byte
error at: 12
Error: . Retrying with argument.
<aw-confirm@ebay.com>
<aw-confirm@ebay.com>
<CandiseROK3413@triggroup.com>
'utf-8' codec can't decode byte 0xa9 in position 1842: invalid start byte
error at: 15
Error: . Retrying with argument.
<service@suntrust.com>
<Bailey.ThereseWV9718@ezysurf.com>
<username-errors@umich.edu>
<customer.support@wamu.com>
<supprefnum059665394980068@suntrust.com>
<username-errors@umich.edu>
<Helmsjmy@ecompare.com>
<exchange-robot@paypal.com>
'utf-8' codec can't decode byte 0xa9 in position 2855: invalid start byte
error at: 23
Error: . Retrying with argument.
<aw-confirm@ebay.com>
<support@

In [34]:
# Inspect message 47: for every non-multipart part, show its content type,
# its declared charset and its payload decoded as UTF-8.
message = mbox_files[47]
for element in message.walk():
    if element.is_multipart():
        continue
    print(element.get_content_type())
    print(element.get_content_charset())
    raw_payload = element.get_payload(decode=True)
    print(raw_payload.decode('UTF-8'))

text/html
iso-7163-9
<html>
<head>
<title>basel constrict abacus shape</title>
</head>
<body>
Choose the life and freedom you deserve and live the life of your dreams.
<br>
<a href="http://loaf.gcusatwo.info/?aid=460">more info here...</a><br><br>
<a href="http://shagging.gcusatwo.info/?aid=460"><img src="http://fibrosis.gcusatwo.info/i/a02armour" border="0"></a><br><br>

Sincerely,
<p>
Mayra Purcell
<br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br>
paine quahog agnew velours seeing beast indemnify hobble aides trouser
irritable debugger diversionary expelled nelson schlesinger rapacious loon elm
electrolyte ammerman lunacy veracity shipmen inclement spawn bode vacuole dunce mynah
formant ny peoria gloom qualified slavonic counterattack enunciate
sunny bantam perpendicular discriminant windsor mediocre crowberry glasgow
pendulum b's chromic inapt bettor penance pentane enamel
</body>
</html>



In [None]:
# NOTE(review): scratch cell — only prints an empty line.
print()