In [1]:
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Params for bert model and tokenization
Nsamp = 1000 # number of samples to generate in each class - 'spam', 'not spam'
maxtokens = 200 # the maximum number of tokens per document
maxtokenlen = 100 # the maximum length of each token

In [3]:
def tokenize(row):
    if row is None or row is '':
        tokens = ""
    else:
        try:
            tokens = row.split(" ")[:maxtokens]
        except:
            tokens=""
    return tokens

  if row is None or row is '':


In [4]:
def reg_expressions(row):
    tokens = []
    try:
        for token in row:
            token = token.lower()
            token = re.sub(r'[\W\d]', "", token)
            token = token[:maxtokenlen] # truncate token
            tokens.append(token)
    except:
        token = ""
        tokens.append(token)
    return tokens

In [5]:
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
print(stopwords) # see default stopwords

def stop_word_removal(row):
    token = [token for token in row if token not in stopwords]
    token = filter(None, token)
    return token

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
bodies_df = pd.read_csv('./bodies.csv')

In [7]:
fraud_bodies_df = pd.read_csv('./fraud_bodies_df.csv')

In [8]:
import random

# Convert everything to lower-case, truncate to maxtokens and truncate each token to maxtokenlen
EnronEmails = bodies_df.iloc[:,0].apply(tokenize)
EnronEmails = EnronEmails.apply(stop_word_removal)
EnronEmails = EnronEmails.apply(reg_expressions)
EnronEmails = EnronEmails.sample(Nsamp)

SpamEmails = fraud_bodies_df.iloc[:,0].apply(tokenize)
SpamEmails = SpamEmails.apply(stop_word_removal)
SpamEmails = SpamEmails.apply(reg_expressions)
SpamEmails = SpamEmails.sample(Nsamp)

raw_data = pd.concat([SpamEmails,EnronEmails], axis=0).values

In [9]:
print("Shape of combined data represented as numpy array is:")
print(raw_data.shape)
print("Data represented as numpy array is:")
print(raw_data)

# corresponding labels
Categories = ['spam','notspam']
header = ([1]*Nsamp)
header.extend(([0]*Nsamp))

Shape of combined data represented as numpy array is:
(2000,)
Data represented as numpy array is:
[list(['dear', 'i', 'am', 'mrs', 'sussy', 'ander', 'kolingba', 'i', 'submite', 'my', 'life', 'to', 'you', 'tackling', 'our', 'immediatelyporblem', 'iam', 'the', 'wife', 'of', 'former', 'military', 'head', 'of', 'state', 'in', 'the', 'centeral', 'africanrepublicgeneral', 'ander', 'kolingba', 'the', 'leader', 'of', 'the', 'failed', 'coup', 'in', 'centralafricanam', 'under', 'hiding', 'with', 'my', 'son', 'patrcie', 'which', 'i', 'dont', 'want', 'enybody', 'toknow', 'my', 'way', 'about', 'beacuse', 'of', 'the', 'renent', 'indiscriminiate', 'arrest', 'of', 'top', 'militaryofficersmostly', 'for', 'my', 'husband', 'tribe', 'which', 'was', 'widely', 'regarded', 'as', 'anethnicvendta', 'in', 'the', 'central', 'african', 'rupeblic', 'and', 'on', 'the', 'world', 'serviceof', 'thatmy', 'futher', 'staying', 'in', 'the', 'country', 'which', 'will', 'no', 'longer', 'savebefore', 'the', 'coupof', 'my', '

In [10]:
# function for shuffling data in unison with labels/header
def unison_shuffle(a, b):
    p = np.random.permutation(len(b))
    data = a[p]
    header = np.asarray(b)[p]
    return data, header

# function for converting data into the right format, due to the difference in required format from sklearn models
# we expect a single string per email here, versus a list of tokens for the sklearn models previously explored
def convert_data(raw_data,header):
    converted_data, labels = [], []
    for i in range(raw_data.shape[0]):
        out = ' '.join(raw_data[i])
        converted_data.append(out)
        labels.append(header[i])
        #print(i)
    converted_data = np.array(converted_data, dtype=object)[:, np.newaxis]

    return converted_data, np.array(labels)

raw_data, header = unison_shuffle(raw_data, header)

# split into independent 70% training and 30% testing sets
#idx = int(0.6*raw_data.shape[0])
# 70% of data for training
#train_x, train_y = convert_data(raw_data[:idx],header[:idx])
# remaining 30% for testing
#valid_x, valid_y = convert_data(raw_data[idx:],header[idx:])

total_size = raw_data.shape[0]

# Calculate indices
idx_train = int(0.6 * total_size)  # end of training set
idx_val = int(0.8 * total_size)  # end of validation set

# Split the data
train_x, train_y = convert_data(raw_data[:idx_train], header[:idx_train])
val_x, val_y = convert_data(raw_data[idx_train:idx_val], header[idx_train:idx_val])
test_x, test_y = convert_data(raw_data[idx_val:], header[idx_val:])

print("train_x/train_y list details, to make sure it is of the right form:")
print(len(train_x))
print(train_x)
print(train_y[:5])
print(train_y.shape)

train_x/train_y list details, to make sure it is of the right form:
1200
[['mr peter langpostfach  bern switzerlandimperative and private i contacting business transfer huge sum money deceased account though i know transaction magnitude make one apprehensive worried i assuring will well end day we decided contact due urgency transaction proposition we discovered abandoned sum us seven million five hundred thousand united states dollars account belongs one foreign customers died along entire family since death none nextofkin relations come forward lay claims money heir we cannot release fund account unless someone applies claim as nextofkin deceased indicated banking guidelines upon discovery seek permission stand next kin deceased documentations carefully worked us funds us released favour beneficiarys next kin it may interest']
 ['jeff confirming initiating call tonightmark d guinney cfaconsultantwatson wyatt investment consulting california street ste san francisco ca   ph  fax']
 ['

In [11]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m48.4 MB/s[0m eta [36m0:00:0

In [12]:
import tensorflow as tf
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [13]:
# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [32]:
train_x_list = train_x.flatten().tolist()
train_encodings = tokenizer(train_x_list, truncation=True, padding=True, max_length=256)


In [45]:
val_x_list = val_x.flatten().tolist()
valid_encodings = tokenizer(val_x_list, truncation=True, padding=True, max_length=256)


In [46]:
test_x_list = test_x.flatten().tolist()
test_encodings = tokenizer(test_x_list, truncation=True, padding=True, max_length=256)


In [47]:
print(len(train_encodings['input_ids']))  # The length of encoded texts
print(len(train_y))  # The length of labels


1200
1200


In [48]:
print(len(valid_encodings['input_ids']))  # The length of encoded texts
print(len(val_y))  # The length of labels


400
400


In [49]:
print(len(test_encodings['input_ids']))  # The length of encoded texts
print(len(test_y))  # The length of labels


400
400


In [50]:
# Convert the data to TensorFlow tensors
train_dataset_tf = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_y
))
valid_dataset_tf = tf.data.Dataset.from_tensor_slices((
    dict(valid_encodings),
    val_y
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_y
))

In [51]:
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [52]:

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [54]:

# Train the model
model.fit(train_dataset_tf.shuffle(1000).batch(16), epochs=1, batch_size=16, validation_data=valid_dataset_tf.batch(16) )

Cause: Unable to locate the source code of <function Model.make_train_function.<locals>.train_function at 0x791ace6bdcf0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


Cause: Unable to locate the source code of <function Model.make_train_function.<locals>.train_function at 0x791ace6bdcf0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code

Cause: Unable to locate the source code of <function Model.make_test_function.<locals>.test_function at 0x791aab259990>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


Cause: Unable to locate the source code of <function Model.make_test_function.<locals>.test_function at 0x791aab259990>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


<keras.callbacks.History at 0x791ace73cd00>

In [57]:
test_dataset_batched = test_dataset.batch(16)

In [58]:
predict = model.predict(test_dataset_batched)



In [61]:
predict

TFSequenceClassifierOutput(loss=None, logits=array([[-2.3925953 ,  2.519008  ],
       [ 2.4782257 , -2.119537  ],
       [-2.4543688 ,  2.5545893 ],
       [ 2.0464957 , -1.8648783 ],
       [-2.4362664 ,  2.567041  ],
       [ 2.3234994 , -2.1407754 ],
       [ 2.5624871 , -2.2931182 ],
       [ 2.5615883 , -2.22689   ],
       [ 2.4788928 , -2.1881914 ],
       [-2.4242406 ,  2.5370417 ],
       [ 2.5244892 , -2.2806783 ],
       [-2.5116146 ,  2.595423  ],
       [-2.5371218 ,  2.5298042 ],
       [ 2.4076655 , -2.0619328 ],
       [-2.2886238 ,  2.357355  ],
       [ 2.230729  , -1.9780084 ],
       [-2.404131  ,  2.5036783 ],
       [-2.3335953 ,  2.3759427 ],
       [ 2.501865  , -2.1430643 ],
       [ 2.0564075 , -1.8866465 ],
       [ 1.9700632 , -1.8024687 ],
       [-2.4690537 ,  2.598548  ],
       [-2.4728963 ,  2.5641425 ],
       [ 2.5039816 , -2.1805983 ],
       [ 2.3484223 , -2.0966306 ],
       [ 1.8033315 , -1.690463  ],
       [-2.4962683 ,  2.5482616 ],
       [ 2

In [59]:
from sklearn.metrics import classification_report

In [62]:
# Convert raw predictions to class predictions
y_pred = np.argmax(predict.logits, axis=-1)

# Get the actual class labels
y_true = []
for features, label in test_dataset:
    y_true.append(label.numpy())
y_true = np.array(y_true)


# Generate a classification report
report = classification_report(y_true, y_pred, target_names=["phishing", "not phishing"])  # replace target_names with your actual class names
print(report)


              precision    recall  f1-score   support

    phishing       0.98      1.00      0.99       202
not phishing       0.99      0.97      0.98       198

    accuracy                           0.98       400
   macro avg       0.99      0.98      0.98       400
weighted avg       0.99      0.98      0.98       400

