In [1]:

import tensorflow as tf
from bert.tokenization import FullTokenizer
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm



# Preprocessing

In [2]:
# Params for bert model and tokenization
Nsamp = 3500 # number of samples to generate in each class - 'spam', 'not spam'
maxtokens = 300 # the maximum number of tokens per document
maxtokenlen = 300 # the maximum length of each token

## Tokenization

In [3]:
def tokenize(row):
    if row is None or row is '':
        tokens = ""
    else:
        try:
            tokens = row.split(" ")[:maxtokens]
        except:
            tokens=""
    return tokens

  if row is None or row is '':


## Utilizing regular expressions to remove unnecessary characters

In [4]:
def reg_expressions(row):
    tokens = []
    try:
        for token in row:
            token = token.lower()
            token = re.sub(r'[\W\d]', "", token)
            token = token[:maxtokenlen] # truncate token
            tokens.append(token)
    except:
        token = ""
        tokens.append(token)
    return tokens

## Stop-word removal

In [5]:
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')    
print(stopwords) # see default stopwords

def stop_word_removal(row):
    token = [token for token in row if token not in stopwords]
    token = filter(None, token)
    return token

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhenliu15471/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
emails = pd.read_csv('../raw_data/emails.csv')

In [7]:
emails.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [8]:
print("Successfully loaded {} rows and {} columns!".format(emails.shape[0], emails.shape[1]))
print(emails.head())

Successfully loaded 517401 rows and 2 columns!
                       file                                            message
0     allen-p/_sent_mail/1.  Message-ID: <18782981.1075855378110.JavaMail.e...
1    allen-p/_sent_mail/10.  Message-ID: <15464986.1075855378456.JavaMail.e...
2   allen-p/_sent_mail/100.  Message-ID: <24216240.1075855687451.JavaMail.e...
3  allen-p/_sent_mail/1000.  Message-ID: <13505866.1075863688222.JavaMail.e...
4  allen-p/_sent_mail/1001.  Message-ID: <30922949.1075863688243.JavaMail.e...


In [9]:
# take a closer look at the first email
print(emails.loc[0]["message"])

Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>
Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)
From: phillip.allen@enron.com
To: tim.belden@enron.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: Tim Belden <Tim Belden/Enron@EnronXGate>
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Here is our forecast

 


In [10]:
# Separate headers from the message bodies
import email

def extract_messages(df):
    messages = []
    for item in df["message"]:
        # Return a message object structure from a string
        e = email.message_from_string(item)    
        # get message body  
        message_body = e.get_payload()
        messages.append(message_body)
    print("Successfully retrieved message body from e-mails!")
    return messages

bodies = extract_messages(emails)

Successfully retrieved message body from e-mails!


In [11]:
# extract random 10000 enron email bodies for building dataset
import random
bodies_df = pd.DataFrame(random.sample(bodies, 10000))

# expand default pandas display options to make emails more clearly visible when printed
pd.set_option('display.max_colwidth', 300)

bodies_df.head() # you could do print(bodies_df.head()), but Jupyter displays this nicer for pandas DataFrames

Unnamed: 0,0
0,"---------------------- Forwarded by Darron C Giron/HOU/ECT on 03/21/2001 \n11:13 AM ---------------------------\n\n\nCarole Frank\n03/21/2001 11:05 AM\nTo: Darron C Giron/HOU/ECT@ECT\ncc: \nSubject: Re: 12/29/2000 - MTM \n\nDarron,\n\nThe following are the totals for Tudor BVI and Ospraie. Th..."
1,"Ted,\n\nGetting back to work has been a bit of a challenge after a weekend of \nindulgence and excess. It took everything I had to get in the car and start \nthe drive back to Houston Tuesday night (especially since I stopped by my \nsister's and tooled around on Lake Austin in the boat for a w..."
2,"Girl, you know he's lying. He's a trip, but it was a good sermon."
3,FYI\n----- Forwarded by Mark Taylor/HOU/ECT on 08/04/2000 10:58 AM -----\n\n\tJon Barrett@MGLTD\n\t08/04/2000 10:58 AM\n\t\t\n\t\t To: Justin Boyd/LON/ECT@ECT\n\t\t cc: Mark Taylor/HOU/ECT@ECT\n\t\t Subject: Re: MG - US Regulatory Status...\n\nJustin\n\nSorry for the delay in catching up with my...
4,"Attached is a draft proposal for the Chairman, pls review and share comments! \nThank you in advance for your anticipated cooperation. \n"


# Fraudulent email corpus

In [12]:
filepath = "../raw_data/fradulent_emails.txt"
with open(filepath, 'r',encoding="latin1") as file:
    data = file.read()
    
# split on a code word appearing close to the beginning of each email
fraud_emails = data.split("From r")

print("Successfully loaded {} spam emails!".format(len(fraud_emails)))

Successfully loaded 3978 spam emails!


In [13]:
fraud_bodies = extract_messages(pd.DataFrame(fraud_emails,columns=["message"],dtype=str))
fraud_bodies_df = pd.DataFrame(fraud_bodies[1:])

fraud_bodies_df.head() # you could do print(fraud_bodies_df.head()), but Jupyter displays this nicer for pandas DataFrames

Successfully retrieved message body from e-mails!


Unnamed: 0,0
0,"FROM:MR. JAMES NGOLA.\nCONFIDENTIAL TEL: 233-27-587908.\nE-MAIL: (james_ngola2002@maktoob.com).\n\nURGENT BUSINESS ASSISTANCE AND PARTNERSHIP.\n\n\nDEAR FRIEND,\n\nI AM ( DR.) JAMES NGOLA, THE PERSONAL ASSISTANCE TO THE LATE CONGOLESE (PRESIDENT LAURENT KABILA) WHO WAS ASSASSINATED BY HIS BODY G..."
1,"Dear Friend,\n\nI am Mr. Ben Suleman a custom officer and work as Assistant controller of the Customs and Excise department Of the Federal Ministry of Internal Affairs stationed at the Murtala Mohammed International Airport, Ikeja, Lagos-Nigeria.\n\nAfter the sudden death of the former Head of s..."
2,"FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF ELEME KINGDOM \nCHIEF DANIEL ELEME, PHD, EZE 1 OF ELEME.E-MAIL \nADDRESS:obong_715@epatra.com \n\nATTENTION:PRESIDENT,CEO Sir/ Madam. \n\nThis letter might surprise you because we have met\nneither in person nor by correspondence. But I believe\nit is..."
3,"FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF ELEME KINGDOM \nCHIEF DANIEL ELEME, PHD, EZE 1 OF ELEME.E-MAIL \nADDRESS:obong_715@epatra.com \n\nATTENTION:PRESIDENT,CEO Sir/ Madam. \n\nThis letter might surprise you because we have met\nneither in person nor by correspondence. But I believe\nit is..."
4,"Dear sir, \n \nIt is with a heart full of hope that I write to seek your help in respect of the context below. I am Mrs. Maryam Abacha the former first lady of the former Military Head of State of Nigeria General Sani Abacha whose sudden death occurred on 8th of June 1998 as a result of cardiac ..."


# Dataset 

In [57]:
import random

# Convert everything to lower-case, truncate to maxtokens and truncate each token to maxtokenlen
EnronEmails = bodies_df.iloc[:,0].apply(tokenize)
EnronEmails = EnronEmails.apply(stop_word_removal)
EnronEmails = EnronEmails.apply(reg_expressions)
EnronEmails = EnronEmails.sample(Nsamp)
EnronEmails_df = EnronEmails.to_frame()
EnronEmails_df['target'] = 1



phishingEmails = fraud_bodies_df.iloc[:,0].apply(tokenize)
phishingEmails = phishingEmails.apply(stop_word_removal)
phishingEmails = phishingEmails.apply(reg_expressions)
phishingEmails = phishingEmails.sample(Nsamp)
phishingEmails_df = phishingEmails.to_frame()
phishingEmails_df['target'] = 0


raw_data = pd.concat([EnronEmails_df, phishingEmails_df], ignore_index=True)

In [58]:
raw_data = raw_data.rename(columns={raw_data.columns[0]: 'text'})

In [59]:
raw_data

Unnamed: 0,text,target
0,"[august, st, around, corner, thats, deadline, entering, the, private, drawing, years, sporting, clays, tournament, if, your, registration, payment, next, tuesday, name, goes, hat, a, great, shotgunvisit, website, wwwhoustonenergyorg, details, awesome, event, send, registration, today, look, forw...",1
1,"[the, following, expense, report, ready, approvalemployee, name, michelle, y, lokaystatus, last, changed, by, michelle, y, lokayexpense, report, name, expenses, report, total, amount, due, employee, to, approve, expense, report, click, following, link, concur, expensehttpexpensexmsenroncom]",1
2,"[the, gisb, constellation, impasse, regarding, ucc, issues, cannot, agree, to, so, vehicle, interruptible, deals, i, keep, you, adviseddebra, perlingiereenron, north, america, corplegal, department, smith, street, eb, houston, texas, dperlinenroncomphone, fax, ]",1
3,"[attached, enfolio, gisb, spot, contracts, lets, discuss]",1
4,"[best, wishes, aep, markshawna, flynn, , pm, , to, mark, e, haedickehouectect, barbara, n, grayhouectect, julia, murrayhouectect, cc, , subject, farewell, thank, youdear, mark, barbara, julia, thank, much, opportunity, work, ews, legal, my, seven, years, here, wonderful, learning, experience, th...",1
...,...,...
6995,"[attnmy, name, mr, kelvin, taylor, i, first, son, ofliberias, former, president, charles, taylor, before, heleft, office, instruct, look, capable, handwho, help, us, invest, money, foreign, countryfor, period, five, years, trustee, moneywill, , intrest, share, each, contract, renewed, fiveyears,...",0
6996,"[mr, timi, alaibeexecutive, director, projectcniger, delta, development, commissioncold, secretariat, complexcvictorialislandc, lagoseteladear, sirci, timi, alaibec, executive, director, finance, niger, delta, development, commission, , nddc, emy, commssion, , the, niger, delta, development, com...",0
6997,"[table, border, width, cellpadding, cellpaddingtrtd, bgcolorffffffpplot, , victoria, garden, citybremail, a, hrefmailtopaulltsamailcozapaulltsamailcozaappintroduction, my, name, mike, paul, i, know, proposal, come, youbras, surprise, havenot, met, either, physically, anybrcorrespondenceppnbspi, ...",0
6998,"[from, , mr, dave, bentondate, , , , kindest, attentionmy, name, mr, dave, benton, i, portugal, , i, diagnosed, esophageal, cancer, it, defiled, forms, medical, treatment, right, i, months, live, according, medical, experts, i, particularly, lived, life, well, i, never, really, cared, anyone, no...",0


In [30]:
raw_data.to_csv('my_data.csv', index=False)

In [61]:
data_df = raw_data.sample(frac=1,random_state=42).reset_index(drop=True)

In [67]:
data_df['text'] = data_df['text'].apply(' '.join)

In [68]:
data_df

Unnamed: 0,text,target
0,dear friendmy proposal surprising personal contact before however i sincerely seek confidence transactionwhich i propose person transparencyhonesty high caliber let first start introducing properly you my name ron sinclear personal assistance haitian president i apologize i infringed privacy it...,0
1,forwarded john arnoldhouect am matthew arnold pmto john arnoldhouectect tom mcquadehouectectcc subject wv love story forwarded matthew arnoldhouect pm jonathon pielop amto mo bawahouectect matthew arnoldhouectect brian orourkehouectectcc subject,1
2,weekly meeting withrick buybill bradforddebbie brackettphilippe bibited murphydavid portstephen stockvince kaminskibeth perlmansally beck inline attachment follows from date subject inline attachment follows from date subject inline attachment follows from date subject inline attachment follows ...,1
3,brad richter jay webb charge enrononline brad responsible commercial side things jay handles operations original messagefrom yoho lisa sentfriday august amtotaylor mark e legalsubjectandy zippermarkwho replaced andy zipper is andy still enronthankslisa,1
4,,0
...,...,...
6995,,0
6996,dear sir assistance required for acquisition of estate i write inform desire acquire estatesor landed properties country behalf thedirector contracts finance allocations thefederal ministry works housing nigeriaconsidering strategic influentialposition would want transaction asstrictly confident...,0
6997,,0
6998,fromamrseaminaea belloe attnathe managing director before i proceedc may i humbly introduce goodselfc my name mrse aminaeaebelloc iraqi refugee cmy husband recentlyc one personal aid president iraq formerly overthrown power american government e prior last serious crisis still ravaging countrycw...,0


In [69]:
data_df['text'] = data_df['text'].str.replace("[\[\]_]", "", regex=True)

In [75]:
data_df = data_df[data_df['text'] != '']

In [92]:
data_df

Unnamed: 0,text,target
0,dear friendmy proposal surprising personal contact before however i sincerely seek confidence transactionwhich i propose person transparencyhonesty high caliber let first start introducing properly you my name ron sinclear personal assistance haitian president i apologize i infringed privacy it...,0
1,forwarded john arnoldhouect am matthew arnold pmto john arnoldhouectect tom mcquadehouectectcc subject wv love story forwarded matthew arnoldhouect pm jonathon pielop amto mo bawahouectect matthew arnoldhouectect brian orourkehouectectcc subject,1
2,weekly meeting withrick buybill bradforddebbie brackettphilippe bibited murphydavid portstephen stockvince kaminskibeth perlmansally beck inline attachment follows from date subject inline attachment follows from date subject inline attachment follows from date subject inline attachment follows ...,1
3,brad richter jay webb charge enrononline brad responsible commercial side things jay handles operations original messagefrom yoho lisa sentfriday august amtotaylor mark e legalsubjectandy zippermarkwho replaced andy zipper is andy still enronthankslisa,1
5,htmlhtmlheadtitleuntitledtitlemeta httpequivcontenttype contenttexthtml charsetisoheadbody bgcolorffffff text link vlinkcc alinkffdiv aligncentertable border cellpadding cellspacing width tr tdimg srchttpimagespostdirectcommasterimagescleargif width height bordertd tdimg srchttpimagespostdirectc...,1
...,...,...
6993,contenttype textplaincontenttransferencoding bitfrommiss joan ahmedemail ahmedjohnsonyahoopltel greetings i sincerely hope message meet well i joan ahmed daughter eng kamal ahmeda construction engineer head balad regional reconstruction projecthe assassinated last january balad north baghdad ...,0
6994,ryan thomas office tomorrow morning interview real time trading position e power hour desk please confirm availability interview ryan don baughman juan hernandez kayne coulter john forneythanksjohnny,1
6996,dear sir assistance required for acquisition of estate i write inform desire acquire estatesor landed properties country behalf thedirector contracts finance allocations thefederal ministry works housing nigeriaconsidering strategic influentialposition would want transaction asstrictly confident...,0
6998,fromamrseaminaea belloe attnathe managing director before i proceedc may i humbly introduce goodselfc my name mrse aminaeaebelloc iraqi refugee cmy husband recentlyc one personal aid president iraq formerly overthrown power american government e prior last serious crisis still ravaging countrycw...,0


In [93]:
from sklearn.model_selection import train_test_split
train_dataset, tv_dataset = train_test_split(data_df, test_size=0.4, shuffle=True, stratify=None, random_state=2021)

In [94]:
test_dataset_raw, valid_dataset = train_test_split(tv_dataset, test_size=0.5, shuffle=True, stratify=None, random_state=2021)

In [107]:
test_dataset_raw

Unnamed: 0,text,target
2738,from dr rasheed s abubakardear friendmy compliment youi guess letter may come surprise since i previous correspondence youi sending mail behalf chairman tender board independent national electoral commission inec mr settley daze we got contact search reliable person handle confidential transacti...,0
4514,from mrs mariama taylor dakarsenegalewest africaegood dayci would like apply medium cooperation secure opportunity invest joint business countryei substantial capital i honourably intend invest country lucrative business venture advise execute said venture mutual benefits useyour able cooperatio...,0
51,markthanks comments i forwarded email lead attorneythat originally reviewed nda last week made changes to ihave asked call asap two work residual knowledgeclause i believe tied afternoon hear inthe morningthanksbrendamarkgreenbergenroncom wrote brenda john allario forwarded sabre comments nda i...,1
5610,fyi forwarded richard b sandershouect am susan j maraees pmto james b fallonhouectect greg whalleyhouectect richard b sandershouectect richard shapirohoueesees christian yoderhouectect tim beldenhouectect elizabeth sagerhouectect joe hartsoeenron mary hainhouectectcc david parquetsfectect sub...,1
4591,from mrsjane abbas phone attndirectorceo iam mrs jane abbas wife former advisersuliman h abbas charge arms andacquisition current president sierrelonehis excellency ahmed kabbahfollowing civil warin country head state delegated husbandto arrange arm purchase republic southafrica independent arms...,0
...,...,...
1636,are still getting demand charge bill equitrans contract i am still estimating per month,1
1212,that fast received agreement awaiting signature vp he office till wed in meantime im field ops start pulling maintenance files easement records ill let know timing i get it if think anything else like review please let knowbest regardstalk soonkevin,1
3262,my dearit heartfelt hope i write seek cooperation assistance context stated belowmay i first introduce self i drmrs mariam abachathe wife late general sani abacha former military head state president federal republic nigeria died suddenly th june i got contact help sisterinlaw works canadian cha...,0
5293,whadup im happy re love thing love sooooo air lately molly fell head heels cowboy dude evan introduced her to theyve attached hips literally last three weeks amber spends four quality hours day phone marky cries hang up seriously romance sounds much interesting absolutely keep posted unfolds wh...,1


# BERT model

In [90]:
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from sklearn.metrics import classification_report

from sklearn.preprocessing import LabelEncoder

In [80]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [95]:
# Encode Labels
encoder = LabelEncoder()
train_labels = encoder.fit_transform(train_dataset['target'])
val_labels = encoder.transform(valid_dataset['target'])
test_labels = encoder.transform(test_dataset_raw['target'])

In [96]:
# Tokenize the data
train_encodings = tokenizer(train_dataset['text'].tolist(), truncation=True, padding=True, max_length=256)
valid_encodings = tokenizer(valid_dataset['text'].tolist(), truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(test_dataset_raw['text'].tolist(), truncation=True, padding=True, max_length=256)

In [98]:
# Convert to TensorFlow Datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(1000).batch(16)

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(valid_encodings),
    val_labels
)).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(16)

In [83]:
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [102]:
# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [103]:
from tensorflow.keras.callbacks import EarlyStopping


# Define early stopping
early_stopping = EarlyStopping(
    monitor='val_loss', # usually val_loss or val_acc
    mode='min', # the direction that you want to monitor (min for loss, max for accuracy)
    patience=3, # number of epochs to wait before stopping the training
    verbose=1 # verbosity mode
)

In [104]:
# Train the model
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=1,)



<tensorflow.python.keras.callbacks.History at 0x1b5c6a520>

In [105]:
# Get the model predictions
predictions = tf.argmax(model.predict(test_dataset)[0], axis=-1)

In [106]:
from sklearn.metrics import classification_report


# Decode the label numbers back into original labels
y_test_inverse = encoder.inverse_transform(test_dataset_raw['target'])
predictions_inverse = encoder.inverse_transform(predictions.numpy())

# Generate the classification report
report = classification_report(y_test_inverse, predictions_inverse,
                               target_names=encoder.classes_.astype(str))
print(report)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       519
           1       0.99      1.00      1.00       725

    accuracy                           1.00      1244
   macro avg       1.00      0.99      1.00      1244
weighted avg       1.00      1.00      1.00      1244

