In [1]:
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Params for bert model and tokenization
Nsamp = 1000 # number of samples to generate in each class - 'spam', 'not spam'
maxtokens = 200 # the maximum number of tokens per document
maxtokenlen = 100 # the maximum length of each token

In [3]:
def tokenize(row):
    if row is None or row is '':
        tokens = ""
    else:
        try:
            tokens = row.split(" ")[:maxtokens]
        except:
            tokens=""
    return tokens

  if row is None or row is '':


In [4]:
def reg_expressions(row):
    tokens = []
    try:
        for token in row:
            token = token.lower()
            token = re.sub(r'[\W\d]', "", token)
            token = token[:maxtokenlen] # truncate token
            tokens.append(token)
    except:
        token = ""
        tokens.append(token)
    return tokens

In [5]:
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
print(stopwords) # see default stopwords

def stop_word_removal(row):
    token = [token for token in row if token not in stopwords]
    token = filter(None, token)
    return token

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
bodies_df = pd.read_csv('./bodies.csv')

In [7]:
fraud_bodies_df = pd.read_csv('./fraud_bodies_df.csv')

In [8]:
import random

# Convert everything to lower-case, truncate to maxtokens and truncate each token to maxtokenlen
EnronEmails = bodies_df.iloc[:,0].apply(tokenize)
EnronEmails = EnronEmails.apply(stop_word_removal)
EnronEmails = EnronEmails.apply(reg_expressions)
EnronEmails = EnronEmails.sample(Nsamp)

SpamEmails = fraud_bodies_df.iloc[:,0].apply(tokenize)
SpamEmails = SpamEmails.apply(stop_word_removal)
SpamEmails = SpamEmails.apply(reg_expressions)
SpamEmails = SpamEmails.sample(Nsamp)

raw_data = pd.concat([SpamEmails,EnronEmails], axis=0).values

In [9]:
print("Shape of combined data represented as numpy array is:")
print(raw_data.shape)
print("Data represented as numpy array is:")
print(raw_data)

# corresponding labels
Categories = ['spam','notspam']
header = ([1]*Nsamp)
header.extend(([0]*Nsamp))

Shape of combined data represented as numpy array is:
(2000,)
Data represented as numpy array is:
[list(['dear', 'i', 'am', 'mrs', 'sussy', 'ander', 'kolingba', 'i', 'submite', 'my', 'life', 'to', 'you', 'tackling', 'our', 'immediatelyporblem', 'iam', 'the', 'wife', 'of', 'former', 'military', 'head', 'of', 'state', 'in', 'the', 'centeral', 'africanrepublicgeneral', 'ander', 'kolingba', 'the', 'leader', 'of', 'the', 'failed', 'coup', 'in', 'centralafricanam', 'under', 'hiding', 'with', 'my', 'son', 'patrcie', 'which', 'i', 'dont', 'want', 'enybody', 'toknow', 'my', 'way', 'about', 'beacuse', 'of', 'the', 'renent', 'indiscriminiate', 'arrest', 'of', 'top', 'militaryofficersmostly', 'for', 'my', 'husband', 'tribe', 'which', 'was', 'widely', 'regarded', 'as', 'anethnicvendta', 'in', 'the', 'central', 'african', 'rupeblic', 'and', 'on', 'the', 'world', 'serviceof', 'thatmy', 'futher', 'staying', 'in', 'the', 'country', 'which', 'will', 'no', 'longer', 'savebefore', 'the', 'coupof', 'my', '

In [10]:
# function for shuffling data in unison with labels/header
def unison_shuffle(a, b):
    p = np.random.permutation(len(b))
    data = a[p]
    header = np.asarray(b)[p]
    return data, header

# function for converting data into the right format, due to the difference in required format from sklearn models
# we expect a single string per email here, versus a list of tokens for the sklearn models previously explored
def convert_data(raw_data,header):
    converted_data, labels = [], []
    for i in range(raw_data.shape[0]):
        out = ' '.join(raw_data[i])
        converted_data.append(out)
        labels.append(header[i])
        #print(i)
    converted_data = np.array(converted_data, dtype=object)[:, np.newaxis]

    return converted_data, np.array(labels)

raw_data, header = unison_shuffle(raw_data, header)

# split into independent 70% training and 30% testing sets
#idx = int(0.6*raw_data.shape[0])
# 70% of data for training
#train_x, train_y = convert_data(raw_data[:idx],header[:idx])
# remaining 30% for testing
#valid_x, valid_y = convert_data(raw_data[idx:],header[idx:])

total_size = raw_data.shape[0]

# Calculate indices
idx_train = int(0.6 * total_size)  # end of training set
idx_val = int(0.8 * total_size)  # end of validation set

# Split the data
train_x, train_y = convert_data(raw_data[:idx_train], header[:idx_train])
val_x, val_y = convert_data(raw_data[idx_train:idx_val], header[idx_train:idx_val])
test_x, test_y = convert_data(raw_data[idx_val:], header[idx_val:])

print("train_x/train_y list details, to make sure it is of the right form:")
print(len(train_x))
print(train_x)
print(train_y[:5])
print(train_y.shape)

train_x/train_y list details, to make sure it is of the right form:
1200
[['mr peter langpostfach  bern switzerlandimperative and private i contacting business transfer huge sum money deceased account though i know transaction magnitude make one apprehensive worried i assuring will well end day we decided contact due urgency transaction proposition we discovered abandoned sum us seven million five hundred thousand united states dollars account belongs one foreign customers died along entire family since death none nextofkin relations come forward lay claims money heir we cannot release fund account unless someone applies claim as nextofkin deceased indicated banking guidelines upon discovery seek permission stand next kin deceased documentations carefully worked us funds us released favour beneficiarys next kin it may interest']
 ['jeff confirming initiating call tonightmark d guinney cfaconsultantwatson wyatt investment consulting california street ste san francisco ca   ph  fax']
 ['

In [11]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m48.4 MB/s[0m eta [36m0:00:0

In [12]:
import tensorflow as tf
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [13]:
# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [32]:
train_x_list = train_x.flatten().tolist()
train_encodings = tokenizer(train_x_list, truncation=True, padding=True, max_length=256)


In [45]:
val_x_list = val_x.flatten().tolist()
valid_encodings = tokenizer(val_x_list, truncation=True, padding=True, max_length=256)


In [46]:
test_x_list = test_x.flatten().tolist()
test_encodings = tokenizer(test_x_list, truncation=True, padding=True, max_length=256)


In [47]:
print(len(train_encodings['input_ids']))  # The length of encoded texts
print(len(train_y))  # The length of labels


1200
1200


In [48]:
print(len(valid_encodings['input_ids']))  # The length of encoded texts
print(len(val_y))  # The length of labels


400
400


In [49]:
print(len(test_encodings['input_ids']))  # The length of encoded texts
print(len(test_y))  # The length of labels


400
400


In [50]:
# Convert the data to TensorFlow tensors
train_dataset_tf = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_y
))
valid_dataset_tf = tf.data.Dataset.from_tensor_slices((
    dict(valid_encodings),
    val_y
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_y
))

In [51]:
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [52]:

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [54]:

# Train the model
model.fit(train_dataset_tf.shuffle(1000).batch(16), epochs=1, batch_size=16, validation_data=valid_dataset_tf.batch(16) )

Cause: Unable to locate the source code of <function Model.make_train_function.<locals>.train_function at 0x791ace6bdcf0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


Cause: Unable to locate the source code of <function Model.make_train_function.<locals>.train_function at 0x791ace6bdcf0>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code

Cause: Unable to locate the source code of <function Model.make_test_function.<locals>.test_function at 0x791aab259990>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


Cause: Unable to locate the source code of <function Model.make_test_function.<locals>.test_function at 0x791aab259990>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


<keras.callbacks.History at 0x791ace73cd00>

In [57]:
test_dataset_batched = test_dataset.batch(16)

In [58]:
predict = model.predict(test_dataset_batched)



In [61]:
predict

TFSequenceClassifierOutput(loss=None, logits=array([[-2.3925953 ,  2.519008  ],
       [ 2.4782257 , -2.119537  ],
       [-2.4543688 ,  2.5545893 ],
       [ 2.0464957 , -1.8648783 ],
       [-2.4362664 ,  2.567041  ],
       [ 2.3234994 , -2.1407754 ],
       [ 2.5624871 , -2.2931182 ],
       [ 2.5615883 , -2.22689   ],
       [ 2.4788928 , -2.1881914 ],
       [-2.4242406 ,  2.5370417 ],
       [ 2.5244892 , -2.2806783 ],
       [-2.5116146 ,  2.595423  ],
       [-2.5371218 ,  2.5298042 ],
       [ 2.4076655 , -2.0619328 ],
       [-2.2886238 ,  2.357355  ],
       [ 2.230729  , -1.9780084 ],
       [-2.404131  ,  2.5036783 ],
       [-2.3335953 ,  2.3759427 ],
       [ 2.501865  , -2.1430643 ],
       [ 2.0564075 , -1.8866465 ],
       [ 1.9700632 , -1.8024687 ],
       [-2.4690537 ,  2.598548  ],
       [-2.4728963 ,  2.5641425 ],
       [ 2.5039816 , -2.1805983 ],
       [ 2.3484223 , -2.0966306 ],
       [ 1.8033315 , -1.690463  ],
       [-2.4962683 ,  2.5482616 ],
       [ 2

In [59]:
from sklearn.metrics import classification_report

In [63]:
# Convert raw predictions to class predictions
y_pred = np.argmax(predict.logits, axis=-1)

# Get the actual class labels
y_true = []
for features, label in test_dataset:
    y_true.append(label.numpy())
y_true = np.array(y_true)


# Generate a classification report
report = classification_report(y_true, y_pred, target_names=["not phishing", "phishing"])  # replace target_names with your actual class names
print(report)


              precision    recall  f1-score   support

not phishing       0.98      1.00      0.99       202
    phishing       0.99      0.97      0.98       198

    accuracy                           0.98       400
   macro avg       0.99      0.98      0.98       400
weighted avg       0.99      0.98      0.98       400



In [64]:
test_new = test_x.reshape(-1)

In [65]:
test_new_df = pd.DataFrame(test_new)

In [66]:
test_new_df = pd.DataFrame(test_new, columns=['text'])

In [67]:
test_new_df['target'] = test_y

In [68]:
test_dataset_raw_0 = test_new_df[test_new_df['target']==0]

In [69]:
random_25_rows = test_dataset_raw_0.sample(n=100)

# Build the new DataFrame using the randomly selected rows
test_dataset_raw_0_25 = pd.DataFrame(random_25_rows)

In [70]:
test_dataset_raw_1 = test_new_df[test_new_df['target']==1]

In [71]:
random_1_rows = test_dataset_raw_1.sample(n=25)

# Build the new DataFrame using the randomly selected rows
test_dataset_raw_1_100 = pd.DataFrame(random_1_rows)

In [72]:
merged_df = pd.concat([test_dataset_raw_0_25, test_dataset_raw_1_100], ignore_index=True)

# Shuffle the index
test_125_df = merged_df.sample(frac=1).reset_index(drop=True)

In [74]:
test_encodings_125 = tokenizer(test_125_df['text'].tolist(), truncation=True, padding=True, max_length=256)

test_dataset_125 = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings_125),
    test_125_df['target'].values
))


In [75]:
test_dataset_125_batched = test_dataset_125.batch(16)
y_pred_raw_125 = model.predict(test_dataset_125_batched)

# Convert raw predictions to class predictions
y_pred_125 = np.argmax(y_pred_raw_125.logits, axis=1)

# Get the actual class labels
y_true_125 = []
for features, label in test_dataset_125:
    y_true_125.append(label.numpy())
y_true_125 = np.array(y_true_125)


# Generate a classification report
report_125 = classification_report(y_true_125, y_pred_125, target_names=["not phishing", "phishing"])  # replace target_names with your actual class names
print(report_125)

              precision    recall  f1-score   support

not phishing       1.00      0.99      0.99       100
    phishing       0.96      1.00      0.98        25

    accuracy                           0.99       125
   macro avg       0.98      0.99      0.99       125
weighted avg       0.99      0.99      0.99       125



# chatgpt

In [88]:
test_dataset_raw_1_100.index

Int64Index([315, 321,  34, 290, 150, 173, 393, 371,  46, 390, 398, 392, 154,
            294,  96, 285, 266, 386, 349,  74, 145, 282, 381, 186, 140],
           dtype='int64')

In [89]:
test_dataset_raw_gpt4 = test_dataset_raw_1_100

In [90]:
test_dataset_raw_gpt4['text'][295] = "Hi, I'm Bhamini Lele from Centrafest Technologies Pvt. Ltd. in Mumbai, India, reaching out in my role within Customer Reach and Support. We're thrilled to present our latest product, the Business Card Reader Cum Photo Scanner, Model Plus. This device offers a host of features including dual-sided scanning, memo writing, label printing, customized categories, online category setting, reminder system, import/export to third-party packages, and the capacity to scan different sizes of business cards, maintaining their colour and detail. This product has been designed considering feedback from our key corporate clients, ensuring it meets diverse document handling needs. Our records indicate that you, along with Ms. Sneha and Ms. Shefali, have shown interest in such a device. If you have already purchased and are content with a scanner from us, please disregard this message. I look forward to your response and stand ready to answer any questions or provide further information. Best, Bhamini Lele, Centrafest Technologies Pvt. Ltd."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][295] = "Hi, I'm Bhamini Lele from Centrafest Technologies Pvt. Ltd. in Mumbai, India, reaching out in my role within Customer Reach and Support. We're thrilled to present our latest product, the Business Card Reader Cum Photo Scanner, Model Plus. This device offers a host of features including dual-sided scanning, memo writing, label printing, customized categories, online category setting, reminder system, import/export to third-party packages, and the capacity to scan different sizes of business cards, maintaining their colour and detail. This product has been designed considering feedback from our key corporate clients, ensuring it meets diverse document handling needs. Our records indicate that you, along with Ms. Sneha and Ms. Shefali, have shown 

In [91]:
test_dataset_raw_gpt4['text'][170] = "Attention, I am Mr. Moni Millat, a member of the Independent Committee of Eminent Persons (ICEP) in Switzerland. The ICEP is tasked with identifying dormant bank accounts in Switzerland, originally owned by non-Swiss citizens, that have been inactive since World War II. Intriguingly, in July, the Swiss Bankers Association disclosed a list of such accounts, a majority of which belonged to victims of the Holocaust. The relentless efforts of the ICEP have led to the identification of additional dormant accounts as of December. These accounts range from interest-bearing savings accounts, securities accounts, safe deposit boxes, custody accounts, non-interest-bearing transaction accounts, to numbered accounts. Accounts that are interest-bearing have been accumulating interest since their inception. Claims on these accounts by non-Swiss citizens are managed by the Claims Resolution Tribunal (CRT). Recently, a dormant account named 'Ordner Adele' was discovered, holding a credit balance of US dollars along with accumulated interest. Unfortunately, the intended recipient was a victim of the Holocaust era, leaving no will."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][170] = "Attention, I am Mr. Moni Millat, a member of the Independent Committee of Eminent Persons (ICEP) in Switzerland. The ICEP is tasked with identifying dormant bank accounts in Switzerland, originally owned by non-Swiss citizens, that have been inactive since World War II. Intriguingly, in July, the Swiss Bankers Association disclosed a list of such accounts, a majority of which belonged to victims of the Holocaust. The relentless efforts of the ICEP have led to the identification of additional dormant accounts as of December. These accounts range from interest-bearing savings accounts, securities accounts, safe deposit boxes, custody accounts, non-interest-bearing transaction accounts, to numbered accounts. Accounts that are interest-bearing have

In [92]:
test_dataset_raw_gpt4['text'][54] = "Good day,I'm John Williams, an auditor and computing staff member at a bank here in England. I am reaching out because of an urgent matter concerning an inactive account opened in 2000. No activity has been recorded since 2003, and without immediate action, the substantial funds within will be forfeit. The account holder was Mr. Maxwell Brown, a foreigner and manager of a petrol chemical service in England. A chemical engineer by profession, Mr. Brown passed away in 2003, and since then, no claims have been made on his account, which holds £18,000,000. My investigation confirms that his company is unaware of this account. To prevent these funds from being lost, I am seeking a trustworthy individual to assist in the transfer of this significant amount to a secure foreign account."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][54] = "Good day,I'm John Williams, an auditor and computing staff member at a bank here in England. I am reaching out because of an urgent matter concerning an inactive account opened in 2000. No activity has been recorded since 2003, and without immediate action, the substantial funds within will be forfeit. The account holder was Mr. Maxwell Brown, a foreigner and manager of a petrol chemical service in England. A chemical engineer by profession, Mr. Brown passed away in 2003, and since then, no claims have been made on his account, which holds £18,000,000. My investigation confirms that his company is unaware of this account. To prevent these funds from being lost, I am seeking a trustworthy individual to assist in the transfer of this significant a

In [93]:
test_dataset_raw_gpt4['text'][155] ="Greetings, In the name of the Almighty, I, Engapet Kayce from New Sweden, write to you with sincere intentions. Previously, I was associated with Shell Petroleum in Iraq and was a major offshore oil rig investor, but recent health challenges have significantly altered my circumstances. I've been affected by a stroke that has left half of my body paralyzed, and extensive eye surgery has resulted in poor vision. Despite seeking spiritual and medical remedies over the last three years, my situation remains dire, with my doctor predicting permanent immobilization. Regrettably, I lack close relatives to support me during this challenging time; my only known kin, a distant cousin named Mrs. Linda Lefler, emigrated to Australia ten years ago, and I have lost contact with her. Having surrendered my life to God's will, I am inspired to perform a good deed before my time comes, which is why I am reaching out to you today."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][155] ="Greetings, In the name of the Almighty, I, Engapet Kayce from New Sweden, write to you with sincere intentions. Previously, I was associated with Shell Petroleum in Iraq and was a major offshore oil rig investor, but recent health challenges have significantly altered my circumstances. I've been affected by a stroke that has left half of my body paralyzed, and extensive eye surgery has resulted in poor vision. Despite seeking spiritual and medical remedies over the last three years, my situation remains dire, with my doctor predicting permanent immobilization. Regrettably, I lack close relatives to support me during this challenging time; my only known kin, a distant cousin named Mrs. Linda Lefler, emigrated to Australia ten years ago, and I hav

In [94]:
test_dataset_raw_gpt4['text'][344] ="Subject: Investment Relationship Request from Tabel D. Michael Hello, I'm Tabel Djedje Michael, a 20-year-old, and the younger brother of Maria Michael, who is 24 years old. We are the children of the late Dr. and Mrs. Djedje Michael, our father being a former Minister of the Interior in Ivory Coast, a respected merchant, politician, and a figure of repute in Abidjan, the economic capital of the Ivory Coast. Unfortunately, our father fell victim to political unrest in our country and was assassinated on September 20th, amidst the violence. We were at church when our house was set ablaze by malevolent government agents. Our mother passed away 10 years earlier, and since then, our father took special care of us and never remarried. We currently live with our 90-year-old grandmother. Our father's friend, Mrs. Lim S. Tsing, who works at an information center in Japan, recommended reaching out to a group of contacts for assistance, and from those, we chose you for a business relationship. The purpose of this message is to discuss a potential investment opportunity."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][344] ="Subject: Investment Relationship Request from Tabel D. Michael Hello, I'm Tabel Djedje Michael, a 20-year-old, and the younger brother of Maria Michael, who is 24 years old. We are the children of the late Dr. and Mrs. Djedje Michael, our father being a former Minister of the Interior in Ivory Coast, a respected merchant, politician, and a figure of repute in Abidjan, the economic capital of the Ivory Coast. Unfortunately, our father fell victim to political unrest in our country and was assassinated on September 20th, amidst the violence. We were at church when our house was set ablaze by malevolent government agents. Our mother passed away 10 years earlier, and since then, our father took special care of us and never remarried. We currently li

In [95]:
test_dataset_raw_gpt4['text'][25] ="Subject: Investment and Resettlement Request from Mrs. Hassane Dear Managing Director, Allow me to introduce myself - I am Mrs. Amina Hassane, an Iraqi refugee. My late husband was a personal aide to the former Iraqi president, overthrown by the American government. Amidst the crisis that continues to plague my country, my husband was able to secure a large sum of US $30 million before his untimely passing. With his contacts, we managed to safeguard these funds in Thailand. I am reaching out because I am keen on investing in your country, given its friendly investment environment. I would greatly appreciate your guidance and assistance in making the right investment decisions, as well as purchasing a residential property, as I am planning to move my family there as soon as all logistical details are sorted. It's important to me that we approach this with complete mutual satisfaction in mind."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][25] ="Subject: Investment and Resettlement Request from Mrs. Hassane Dear Managing Director, Allow me to introduce myself - I am Mrs. Amina Hassane, an Iraqi refugee. My late husband was a personal aide to the former Iraqi president, overthrown by the American government. Amidst the crisis that continues to plague my country, my husband was able to secure a large sum of US $30 million before his untimely passing. With his contacts, we managed to safeguard these funds in Thailand. I am reaching out because I am keen on investing in your country, given its friendly investment environment. I would greatly appreciate your guidance and assistance in making the right investment decisions, as well as purchasing a residential property, as I am planning to move

In [96]:
test_dataset_raw_gpt4['text'][63] = "Subject: Urgent Matter Concerning Unclaimed AssetsDear Sir,My name is Dr. Richard Shubane, and I am the Operational Manager in charge of Credit and Foreign Bills at one of South Africa's leading banks, located in the main city of Johannesburg. I'm reaching out to discuss a significant matter concerning a deceased foreign customer of our bank who, along with his entire family, tragically perished in the Concorde plane crash (Flight AF 4590) on July 25, 2000. This client, a prominent industrialist, opened an account with us in 1995, but unfortunately, left no written or oral will associated with the account. Since his death, we've been unable to locate any next of kin to claim the funds, which have remained unclaimed due to the secrecy surrounding the account. The account, worth $106 million USD, was solely in his name and under the management of his company until his demise. Our plan is to initiate a transfer of $6 million USD to ensure a smooth, disappointment-free transaction before proceeding with the transfer of the remaining balance. I've confidentially discussed this matter with the General Manager of the bank and we are prepared to move forward."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][63] = "Subject: Urgent Matter Concerning Unclaimed AssetsDear Sir,My name is Dr. Richard Shubane, and I am the Operational Manager in charge of Credit and Foreign Bills at one of South Africa's leading banks, located in the main city of Johannesburg. I'm reaching out to discuss a significant matter concerning a deceased foreign customer of our bank who, along with his entire family, tragically perished in the Concorde plane crash (Flight AF 4590) on July 25, 2000. This client, a prominent industrialist, opened an account with us in 1995, but unfortunately, left no written or oral will associated with the account. Since his death, we've been unable to locate any next of kin to claim the funds, which have remained unclaimed due to the secrecy surrounding

In [97]:
test_dataset_raw_gpt4['text'][325] = "Subject: Request for Assistance in Business Venture Dear Recipient,My name is Gordon Blackwell, writing to you from Amsterdam, Netherlands. I found your contact via a private internet search, and I am confident in your honesty and commitment to assist in a sensitive business matter. This venture involves funds originally owned by a foreign oil merchant and contractor with the government of the Netherlands who tragically passed away in a Kenya Airways Flight KQ plane crash several years ago. As of July, the deceased's account balance was $4.5 million USD, and the bank is currently seeking a next of kin to claim these funds. Despite the bank's efforts, no family or relative of the deceased has been located. Given the unsuccessful search and the possibility that no next of kin can be found, the management, under the influence of the chairman of the board of directors, is arranging for the funds to be claimed and thus requires your assistance."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][325] = "Subject: Request for Assistance in Business Venture Dear Recipient,My name is Gordon Blackwell, writing to you from Amsterdam, Netherlands. I found your contact via a private internet search, and I am confident in your honesty and commitment to assist in a sensitive business matter. This venture involves funds originally owned by a foreign oil merchant and contractor with the government of the Netherlands who tragically passed away in a Kenya Airways Flight KQ plane crash several years ago. As of July, the deceased's account balance was $4.5 million USD, and the bank is currently seeking a next of kin to claim these funds. Despite the bank's efforts, no family or relative of the deceased has been located. Given the unsuccessful search and the p

In [98]:
test_dataset_raw_gpt4['text'][330] ="Subject: Business Investment Proposal Dear Recipient,I am Mrs. Femia Bangura and I wish to propose a collaborative business venture in your country. Despite my limited knowledge of international business and investment, I have substantial capital that I intend to invest into a lucrative venture in your country, as you advise. My vision is to make you my business partner, trustee, and representative, creating ideas together for the money to be invested, and probably managed. Your assistance is crucial in transferring the money and safeguarding my interests. Upon your acceptance of this proposal, I will provide you with detailed information, the procedures involved, the exact amount, and agree on a mutual percentage interest for your involvement in helping secure the release of the deposit and investing the money. I highly appreciate your immediate attention to this matter, and I look forward to your positive response."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][330] ="Subject: Business Investment Proposal Dear Recipient,I am Mrs. Femia Bangura and I wish to propose a collaborative business venture in your country. Despite my limited knowledge of international business and investment, I have substantial capital that I intend to invest into a lucrative venture in your country, as you advise. My vision is to make you my business partner, trustee, and representative, creating ideas together for the money to be invested, and probably managed. Your assistance is crucial in transferring the money and safeguarding my interests. Upon your acceptance of this proposal, I will provide you with detailed information, the procedures involved, the exact amount, and agree on a mutual percentage interest for your involvement i

In [99]:
test_dataset_raw_gpt4['text'][59] ="Subject: Urgent Assistance Needed Dear Friend,My name is Mrs. Elita Ngoni Tawanda, the widow of the late Ndinonzi Kajengo Tawanda, a farmer from Zimbabwe who was tragically murdered in a land dispute. I found your contact through an online network and felt compelled to reach out. Before my husband's untimely demise, he had safely deposited a consignment with a security company at an undisclosed location, which I can reveal upon receiving your positive response. Although the consignment was declared to contain gemstones for security reasons, it actually holds $12.5 million USD. My husband had anticipated the potential dangers tied to Zimbabwe's land policies and made this precautionary measure. Please note, the security company is unaware of the consignment's true contents. The funds were initially intended for purchasing new agricultural machinery, chemicals for our farms, and establishing new farms in Swaziland prior to the land issues provoked by President Robert Mugabe of Zimbabwe. I am reaching out for your assistance in this matter."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][59] ="Subject: Urgent Assistance Needed Dear Friend,My name is Mrs. Elita Ngoni Tawanda, the widow of the late Ndinonzi Kajengo Tawanda, a farmer from Zimbabwe who was tragically murdered in a land dispute. I found your contact through an online network and felt compelled to reach out. Before my husband's untimely demise, he had safely deposited a consignment with a security company at an undisclosed location, which I can reveal upon receiving your positive response. Although the consignment was declared to contain gemstones for security reasons, it actually holds $12.5 million USD. My husband had anticipated the potential dangers tied to Zimbabwe's land policies and made this precautionary measure. Please note, the security company is unaware of the c

In [100]:
test_dataset_raw_gpt4['text'][114] = "Subject: Confidential Business Proposal Dear Sir/Madam,I am Engr. Ben Maluleke, a native of Cape Town, South Africa, currently serving as the Executive Accountant in the South Africa Department of Mining & Natural Resources. I apologize for using this medium to contact you regarding a transaction of significant importance, but the need for confidentiality and immediate access prompted me to do so. As a member of the South Africa Export Promotion Council (SAEPC) and a representative in a government trade exhibition, my credentials and particulars are solidly established. I am seeking your confidential cooperation to execute a deal that could be beneficial for all involved, and I kindly ask that you keep this transaction a top secret due to its nature. Within our department, my four high-ranking colleagues and I hold overdue payment bills amounting to $27.1 million USD, and we are seeking a trustworthy individual to assist us with this matter."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][114] = "Subject: Confidential Business Proposal Dear Sir/Madam,I am Engr. Ben Maluleke, a native of Cape Town, South Africa, currently serving as the Executive Accountant in the South Africa Department of Mining & Natural Resources. I apologize for using this medium to contact you regarding a transaction of significant importance, but the need for confidentiality and immediate access prompted me to do so. As a member of the South Africa Export Promotion Council (SAEPC) and a representative in a government trade exhibition, my credentials and particulars are solidly established. I am seeking your confidential cooperation to execute a deal that could be beneficial for all involved, and I kindly ask that you keep this transaction a top secret due to its n

In [101]:
test_dataset_raw_gpt4['text'][381] = "Subject: Personal and Confidential Business Proposal Dear Recipient,I am Rev. Dr. Thomas Bella from the Nigerian National Petroleum Corporation based in Victoria Island, Lagos. Following consultations with colleagues and information from the Nigerian Chambers of Commerce and Industry, I am privileged to seek your assistance in transferring $15.5 million USD into your account. This sum resulted from an overinvoiced contract that was executed, commissioned, and paid for three years ago by a foreign contractor. The fund is currently held in a suspense account at the Central Bank of Nigeria. As civil servants, we are not permitted to operate foreign accounts, hence our need for your assistance. The total sum will be divided as follows: 70% for us, 30% for you, with additional deductions for any local or international expenses related to the transfer. I assure you the transfer is risk-free on both sides. If you find this proposal acceptable, please respond promptly."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][381] = "Subject: Personal and Confidential Business Proposal Dear Recipient,I am Rev. Dr. Thomas Bella from the Nigerian National Petroleum Corporation based in Victoria Island, Lagos. Following consultations with colleagues and information from the Nigerian Chambers of Commerce and Industry, I am privileged to seek your assistance in transferring $15.5 million USD into your account. This sum resulted from an overinvoiced contract that was executed, commissioned, and paid for three years ago by a foreign contractor. The fund is currently held in a suspense account at the Central Bank of Nigeria. As civil servants, we are not permitted to operate foreign accounts, hence our need for your assistance. The total sum will be divided as follows: 70% for us, 

In [102]:
test_dataset_raw_gpt4['text'][363] = "Subject: Proposal for Beneficial Collaboration Dear Friend,My name is Mr. Frederick Andrew and I am an externally trained auditor for the Development Bank of Singapore (DBS). I have painstakingly located your contact for personal initiatives related to an investor who shares your last name and left approximately $30 million USD in our bank. No next of kin has stepped forward to claim this amount for the past twelve years. Banking regulations in Singapore require me to notify fiscal authorities about such dormant accounts after a statutory period of twelve years, to prevent them from being marked as unclaimed funds. Given these circumstances, I am reaching out with a proposal to leverage my insider knowledge to help us secure these funds. My proposition is based on the fact that your last name matches the late investor's, thus simplifying the process of securing the funds instead of allowing them to fall into the hands of the Singaporean government. I look forward to discussing this opportunity further."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][363] = "Subject: Proposal for Beneficial Collaboration Dear Friend,My name is Mr. Frederick Andrew and I am an externally trained auditor for the Development Bank of Singapore (DBS). I have painstakingly located your contact for personal initiatives related to an investor who shares your last name and left approximately $30 million USD in our bank. No next of kin has stepped forward to claim this amount for the past twelve years. Banking regulations in Singapore require me to notify fiscal authorities about such dormant accounts after a statutory period of twelve years, to prevent them from being marked as unclaimed funds. Given these circumstances, I am reaching out with a proposal to leverage my insider knowledge to help us secure these funds. My pro

In [103]:
test_dataset_raw_gpt4['text'][197] = "Subject: Urgent Confidential Business Proposal Dear President/CEO,I am Kizie Mulumba, the Finance Director of Standard Bank Ltd., and I'm writing to you with an urgent and confidential business proposition. On June 19th, a foreign oil consultant/contractor with South Africa Mining Corporation, Mr. Richard Moss, deposited a fixed sum of USD 25 million for twelve calendar months in our branch. However, upon maturity, all attempts to reach Mr. Moss were unsuccessful, and further investigation revealed his unfortunate demise in an automobile accident. Our research showed Mr. Moss left no will, and we were unable to locate any next of kin. His official records, including bank deposit paperwork, did not indicate any kin or relations. Consequently, the sum of USD 25 million remains unclaimed in the bank, with the interest being added to the principal amount annually. Given the circumstances, it's unlikely anyone will step forward to claim it. As per South African laws, I am reaching out to you for a possible collaboration to secure these funds. I look forward to discussing this further."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][197] = "Subject: Urgent Confidential Business Proposal Dear President/CEO,I am Kizie Mulumba, the Finance Director of Standard Bank Ltd., and I'm writing to you with an urgent and confidential business proposition. On June 19th, a foreign oil consultant/contractor with South Africa Mining Corporation, Mr. Richard Moss, deposited a fixed sum of USD 25 million for twelve calendar months in our branch. However, upon maturity, all attempts to reach Mr. Moss were unsuccessful, and further investigation revealed his unfortunate demise in an automobile accident. Our research showed Mr. Moss left no will, and we were unable to locate any next of kin. His official records, including bank deposit paperwork, did not indicate any kin or relations. Consequently, th

In [104]:
test_dataset_raw_gpt4['text'][336] ="Subject: Request for Assistance in Funds Transfer Dear Managing Director/CEO, I am Lt. Col. Richard Egwu, former personal security to the late President Laurent Kabila of the Democratic Republic of Congo (DRC). Currently, I am in the Republic of South Africa seeking asylum. I acquired your contact information from the Johannesburg Chamber of Commerce and Industry's business directory. Given your profile, I felt compelled to request your assistance in transferring a sum of USD 25 million to your company or personal account for investment purposes outside Africa. Before President Kabila's untimely death, we knew that rebels would make significant efforts to overthrow the government, which led to widespread looting of the country's treasures. In May, I was tasked with transporting a box marked as 'diplomatic documents' to Zimbabwe, but I diverted it to the Republic of South Africa instead. Upon inspection, I discovered that the box contained USD 25 million. I have since deposited the box for safekeeping and now seek your help in securing these funds."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][336] ="Subject: Request for Assistance in Funds Transfer Dear Managing Director/CEO, I am Lt. Col. Richard Egwu, former personal security to the late President Laurent Kabila of the Democratic Republic of Congo (DRC). Currently, I am in the Republic of South Africa seeking asylum. I acquired your contact information from the Johannesburg Chamber of Commerce and Industry's business directory. Given your profile, I felt compelled to request your assistance in transferring a sum of USD 25 million to your company or personal account for investment purposes outside Africa. Before President Kabila's untimely death, we knew that rebels would make significant efforts to overthrow the government, which led to widespread looting of the country's treasures. In Ma

In [105]:
test_dataset_raw_gpt4['text'][173] ="I'm Mr. Pascal Ibeethe, the manager of United Bank of Africa PLC, UBA. I'm seeking a reliable and trustworthy partner for an important transaction involving a dormant account opened in 1996, which holds twenty-six million USD. The account belonged to Mr. Clayton Reidfield, a foreigner and a chemical engineer by profession, who was the manager of Petro Chemical Services. Sadly, he passed away in 2003. This account has not been operated since then and has no nominated beneficiary. My investigations confirmed that his company remains unaware of this account. If the funds are not claimed soon, they will be forfeited. Therefore, I'm reaching out to you with an urgent proposition to assist in the transfer of this significant sum, trusting in your discretion and integrity now and in the future."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][173] ="I'm Mr. Pascal Ibeethe, the manager of United Bank of Africa PLC, UBA. I'm seeking a reliable and trustworthy partner for an important transaction involving a dormant account opened in 1996, which holds twenty-six million USD. The account belonged to Mr. Clayton Reidfield, a foreigner and a chemical engineer by profession, who was the manager of Petro Chemical Services. Sadly, he passed away in 2003. This account has not been operated since then and has no nominated beneficiary. My investigations confirmed that his company remains unaware of this account. If the funds are not claimed soon, they will be forfeited. Therefore, I'm reaching out to you with an urgent proposition to assist in the transfer of this significant sum, trusting in your disc

In [106]:
test_dataset_raw_gpt4['text'][148] = "Dear Friend,As a practicing doctor in Ivory Coast, West Africa, I'm seeking your assistance regarding a sensitive issue involving a patient of mine from Sierra Leone. She and her son, refugees due to a deadly family conflict, possess a certificate of deposit related to a trunk box stored in a security company, declared as family valuables. They fled their home country and resettled in Ivory Coast. The patient has entrusted me with this confidential information and I believe you can provide us with the necessary assistance in this matter, which could potentially benefit all parties involved. For security purposes, please respond via my alternative email: nandodaniel@yahoo.it."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][148] = "Dear Friend,As a practicing doctor in Ivory Coast, West Africa, I'm seeking your assistance regarding a sensitive issue involving a patient of mine from Sierra Leone. She and her son, refugees due to a deadly family conflict, possess a certificate of deposit related to a trunk box stored in a security company, declared as family valuables. They fled their home country and resettled in Ivory Coast. The patient has entrusted me with this confidential information and I believe you can provide us with the necessary assistance in this matter, which could potentially benefit all parties involved. For security purposes, please respond via my alternative email: nandodaniel@yahoo.it."


In [107]:
test_dataset_raw_gpt4['text'][153] ="I'm Isaac Baka, the Site Inspection Director for the Ministry of Works and Construction in the Republic of Sudan. While overseeing a major contract, we discovered a significant gold deposit at one of the construction sites. As the site director, I reached a mutual agreement with the firm to mine the gold and divide the proceeds. From this, I received $20 million. However, as a civil servant in a war-torn country, I'm unable to introduce these funds into our banking system due to my limited income and the strict civil service guidelines in Sudan. As the civil crisis in Sudan intensifies daily, I've decided it's crucial to move these funds out of the country."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][153] ="I'm Isaac Baka, the Site Inspection Director for the Ministry of Works and Construction in the Republic of Sudan. While overseeing a major contract, we discovered a significant gold deposit at one of the construction sites. As the site director, I reached a mutual agreement with the firm to mine the gold and divide the proceeds. From this, I received $20 million. However, as a civil servant in a war-torn country, I'm unable to introduce these funds into our banking system due to my limited income and the strict civil service guidelines in Sudan. As the civil crisis in Sudan intensifies daily, I've decided it's crucial to move these funds out of the country."


In [108]:
test_dataset_raw_gpt4['text'][399] ="I hope this message finds you well. You may recall that I, Dr. Mr. Michael Howard from Nigeria, once reached out to you for assistance in securing the release of funds accrued from a contract inheritance awarded by our government during a military regime. Although you were unable to assist me at that time, I'm pleased to inform you that I've successfully transferred the funds with the help of a new partner from Brazil. Currently, I am in the Netherlands investing in projects with my share of the sum.However, I haven't forgotten your past efforts and attempts to assist me. As a token of my gratitude, I have set aside five hundred thousand United States dollars as a draft compensation for you. Besides this, I urge you to take care of those in need - the poor, orphans, and the disabled in your community. This gesture would indeed serve a great cause. Once again, thank you for your time and past efforts."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][399] ="I hope this message finds you well. You may recall that I, Dr. Mr. Michael Howard from Nigeria, once reached out to you for assistance in securing the release of funds accrued from a contract inheritance awarded by our government during a military regime. Although you were unable to assist me at that time, I'm pleased to inform you that I've successfully transferred the funds with the help of a new partner from Brazil. Currently, I am in the Netherlands investing in projects with my share of the sum.However, I haven't forgotten your past efforts and attempts to assist me. As a token of my gratitude, I have set aside five hundred thousand United States dollars as a draft compensation for you. Besides this, I urge you to take care of those in need

In [109]:
test_dataset_raw_gpt4['text'][270] ="I am Mrs. Xian, a bank manager at a respected bank in China, and I respectfully request your discretion regarding the content of this message. My reaching out is the result of an independent investigation, not yet disclosed to anyone else. The subject at hand relates to a deceased client of our bank, who held investments worth millions of dollars, which have now reached maturity. This client maintained a discreet relationship with us, never nominated any beneficiaries for his investments, and died intestate. If left unclaimed, these funds would be confiscated by the government or likely misappropriated by unethical executives. Hence, I propose that you, as a foreigner, stand in as the next of kin of the deceased, which is a simple procedure I will guide you through."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][270] ="I am Mrs. Xian, a bank manager at a respected bank in China, and I respectfully request your discretion regarding the content of this message. My reaching out is the result of an independent investigation, not yet disclosed to anyone else. The subject at hand relates to a deceased client of our bank, who held investments worth millions of dollars, which have now reached maturity. This client maintained a discreet relationship with us, never nominated any beneficiaries for his investments, and died intestate. If left unclaimed, these funds would be confiscated by the government or likely misappropriated by unethical executives. Hence, I propose that you, as a foreigner, stand in as the next of kin of the deceased, which is a simple procedure I wi

In [110]:
test_dataset_raw_gpt4['text'][187] ="I am Jeff Yukon, an external auditor for a well-known bank in the United Kingdom. I wish to share an urgent and confidential matter, for which I must first apologize if it breaches your personal ethics. Through a recent audit, I discovered a dormant account holding fifty-two million British pounds, untouched for the past three years. The account owner, a foreigner named Mr. Gregory B. Wilson, tragically died in a plane crash in July. No claim has been made on this account as neither his family members are aware of its existence, nor the fund itself. After discreet discussions with a senior minister official from the federal ministry of finance, we've agreed to seek a reliable foreign partner for this endeavor. Due to my position, I cannot take an active part, but I assure you of success if you follow my instructions, working hand in hand."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][187] ="I am Jeff Yukon, an external auditor for a well-known bank in the United Kingdom. I wish to share an urgent and confidential matter, for which I must first apologize if it breaches your personal ethics. Through a recent audit, I discovered a dormant account holding fifty-two million British pounds, untouched for the past three years. The account owner, a foreigner named Mr. Gregory B. Wilson, tragically died in a plane crash in July. No claim has been made on this account as neither his family members are aware of its existence, nor the fund itself. After discreet discussions with a senior minister official from the federal ministry of finance, we've agreed to seek a reliable foreign partner for this endeavor. Due to my position, I cannot take a

In [111]:
test_dataset_raw_gpt4['text'][196] ="I must apologize if this message doesn't align with your personal ethics. I am a staff member in the Accounts Management section of a well-known bank in the United Kingdom. I've found a dormant account in our system holding a balance of fifteen million British pounds, unoperated for the past several years. Investigations confirmed that the account owner, a foreigner named Austin Martins, unfortunately passed away in August. Neither his family members nor anyone else has claimed this money, mainly because no one is aware of the account's existence. Information from the National Immigration office indicates Mr. Martins was single upon entry to the UK. After a discreet discussion with bank officials, we've decided to seek a reliable foreign partner to handle this matter. We propose a business partnership where you'd stand as the next of kin to the deceased, which would allow the release of funds once due processes are followed. This transaction is risk-free and free of legal troubles as the funds are legitimate, not originating from drug money, money laundering, terrorism or any other illegal activities."

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset_raw_gpt4['text'][196] ="I must apologize if this message doesn't align with your personal ethics. I am a staff member in the Accounts Management section of a well-known bank in the United Kingdom. I've found a dormant account in our system holding a balance of fifteen million British pounds, unoperated for the past several years. Investigations confirmed that the account owner, a foreigner named Austin Martins, unfortunately passed away in August. Neither his family members nor anyone else has claimed this money, mainly because no one is aware of the account's existence. Information from the National Immigration office indicates Mr. Martins was single upon entry to the UK. After a discreet discussion with bank officials, we've decided to seek a reliable foreign partner

In [112]:
test_dataset_raw_gpt4.head()

Unnamed: 0,text,target
315,emailmessagemessage object xcffd emailmessagem...,1
321,dear friendmy name edward moore qcprincipal pa...,1
34,from mr williams kabilatelfax sir urgent bus...,1
290,my name sandra williams united kingdom i year...,1
150,from mrchris kolade marinalagosnigeriaalternat...,1


In [113]:
merged_df_gpt4 = pd.concat([test_dataset_raw_0_25, test_dataset_raw_gpt4], ignore_index=True)

# Shuffle the index
test_125_df_gpt4 = merged_df_gpt4.sample(frac=1).reset_index(drop=True)

In [114]:
test_encodings_125_gpt4 = tokenizer(test_125_df_gpt4['text'].tolist(), truncation=True, padding=True, max_length=256)

test_dataset_125_gpt4 = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings_125_gpt4),
    test_125_df_gpt4['target'].values
))


In [115]:
test_encodings_125_gpt4_batched = test_dataset_125_gpt4.batch(16)
y_pred_raw_125_gpt4 = model.predict(test_encodings_125_gpt4_batched)

# Convert raw predictions to class predictions
y_pred_125_gpt4 = np.argmax(y_pred_raw_125_gpt4.logits, axis=1)

# Get the actual class labels
y_true_125_gpt4 = []
for features, label in test_dataset_125_gpt4:
    y_true_125_gpt4.append(label.numpy())
y_true_125_gpt4 = np.array(y_true_125_gpt4)


# Generate a classification report
report_125_gpt4 = classification_report(y_true_125_gpt4, y_pred_125_gpt4, target_names=["not phishing", "phishing"])
print(report_125_gpt4)

              precision    recall  f1-score   support

not phishing       1.00      0.99      0.99       100
    phishing       0.96      1.00      0.98        25

    accuracy                           0.99       125
   macro avg       0.98      0.99      0.99       125
weighted avg       0.99      0.99      0.99       125



# chatgpt4 random

In [79]:

data1 = {'text': [
    "I hope this message finds you in good health. I am [Your Name], a Human Rights activist from [Your Location]. I am writing to seek your assistance for a mission I am working on, focusing on providing educational materials and healthcare support to underserved communities in our region. The lack of resources is our biggest challenge, and your generous contribution could go a long way to help. I would be extremely grateful for any financial support you can offer.",
    "I am reaching out to you with a unique and potentially lucrative business proposal. I am looking for a trustworthy individual who can assist me in a profitable business venture. Due to my position as a government official, I am currently unable to move a considerable sum of money to a foreign account. I am requesting your assistance in this operation, promising a sizable percentage of the sum for your cooperation.",
    "I am writing this email with a heavy heart, burdened by the financial obstacles preventing me from pursuing my dream of a college education. My name is [Your Name], and I was recently accepted into [College/University Name]. Unfortunately, due to my financial circumstances, I am struggling to pay my tuition fees. I humbly ask for your assistance in helping me continue my education.",
    "I am [Your Name], a friend of [Patient's Name]. Unfortunately, [Patient's Name] has recently been diagnosed with [Disease Name] and the treatment cost is beyond what we can afford. The financial burden is overwhelming and I am asking for your help to raise funds for [Patient's Name]'s medical expenses.",
    "I am writing on behalf of a Non-Governmental Organization that assists refugees. We are currently facing a severe funding crisis and are unable to extend the much-needed support to the displaced victims. I am reaching out to ask for your help. Your donation will help us provide essential supplies, medical care, and educational resources.",
    "I am the coach of a local youth sports team, and we're preparing for the upcoming national championship. Unfortunately, we are struggling with a lack of resources to provide the necessary equipment for our young athletes. I am writing to ask for your generous contribution to help us equip these promising athletes.",
    "I run an animal shelter that provides a safe haven for stray and abandoned animals. We are currently facing a financial crisis and are struggling to provide necessary medical care and shelter for these helpless animals. Your financial support can help us feed, treat, and find homes for these lovely animals.",
    "As a researcher at [Your Institute's Name], I am leading a project on [Briefly Describe the Project]. Despite the promising nature of this research, we are currently facing financial constraints that are hindering our progress. I am writing to ask for your financial support to help us continue our groundbreaking work.",
    "Our charity organization is actively involved in disaster relief work for the victims of the recent [Natural Disaster]. However, due to the scale of the disaster, we are struggling to keep up with the needs of the affected people. Any financial assistance you could provide would greatly help us in our mission.",
    "I am a local artist working on a community art project. The aim is to use art as a medium to promote cultural awareness and unity in our community. Unfortunately, I lack the necessary funds to bring this project to life. Any financial support from your end would help immensely in executing this project."
]}

df1 = pd.DataFrame(data1)

In [80]:
data2 = {'text': [
    "Greetings! I am writing to you from the heart of a small, impoverished school in a rural area. Our school is the only source of education for the children in our community, who display an undeniable eagerness to learn and broaden their horizons. Unfortunately, we lack the basic facilities needed to provide these children with the quality education they deserve. Our classrooms are deteriorating, educational resources are scarce, and our hardworking teachers are struggling with the limited tools at their disposal. We humbly request your generous contributions to help us rebuild our school and create an environment conducive to learning for these deserving children.",
    "Dear friend, this letter finds its way to you with a heavy heart. My name is [Your Name], and I served as a professor at [University Name] for over two decades. Recently, due to a series of unfortunate circumstances, I have found myself grappling with a dire financial situation. From health problems leading to skyrocketing medical bills to the financial market downturn affecting my savings, the road has been challenging. In these difficult times, I find myself turning to the kindness of individuals who may have the means to provide assistance. Any financial help, no matter how small, would go a long way in helping me navigate through these tough times.",
    "Dear Sir/Madam, I am a single parent blessed with three beautiful children. Our world turned upside down when I recently lost my job due to company-wide layoffs as a result of the economic downturn. Since then, our days have been marred by uncertainty and struggles to make ends meet. I am doing my best to provide for my family and keep a roof over our heads, but it is becoming increasingly difficult with each passing day. I am writing to you today, in hope that you might find it in your heart to assist us during this difficult period. Your generous assistance, in any form or amount, will help us greatly.",
    "Hello, I am writing on behalf of our local church, an establishment that has stood as a cornerstone of our community for over a century. Sadly, due to years of wear and tear, the church is in dire need of repairs. The roof is leaking, the paint is peeling, and our heating system is on its last legs. Without sufficient funds, we face the heartbreaking possibility of closing the church. As such, we are seeking donations to help us preserve this important community institution. We believe that with your help, we can restore the church to its former glory and continue serving our community.",
    "Dear friend, I am an independent filmmaker currently working on a project that highlights the effects of climate change on indigenous communities. This documentary aims to bring attention to the often-overlooked victims of our changing environment, showcasing their resilience and struggle in the face of adversity. However, without the necessary funds, it has been challenging to provide this project the justice it deserves. Travel expenses, equipment rental, and post-production costs are proving to be overwhelming. I write to you in hopes of garnering financial support to help complete this crucial project. Any amount, big or small, will bring us closer to our goal.",
]}

df2 = pd.DataFrame(data2)


In [81]:
data3 = {'text': [
    "Greetings! I am writing to you from the heart of a small, impoverished school in a rural area. Our school is the only source of education for the children in our community, who display an undeniable eagerness to learn and broaden their horizons. Unfortunately, we lack the basic facilities needed to provide these children with the quality education they deserve. Our classrooms are deteriorating, educational resources are scarce, and our hardworking teachers are struggling with the limited tools at their disposal. We humbly request your generous contributions to help us rebuild our school and create an environment conducive to learning for these deserving children.",
    "Dear friend, this letter finds its way to you with a heavy heart. My name is [Your Name], and I served as a professor at [University Name] for over two decades. Recently, due to a series of unfortunate circumstances, I have found myself grappling with a dire financial situation. From health problems leading to skyrocketing medical bills to the financial market downturn affecting my savings, the road has been challenging. In these difficult times, I find myself turning to the kindness of individuals who may have the means to provide assistance. Any financial help, no matter how small, would go a long way in helping me navigate through these tough times.",
    "Dear Sir/Madam, I am a single parent blessed with three beautiful children. Our world turned upside down when I recently lost my job due to company-wide layoffs as a result of the economic downturn. Since then, our days have been marred by uncertainty and struggles to make ends meet. I am doing my best to provide for my family and keep a roof over our heads, but it is becoming increasingly difficult with each passing day. I am writing to you today, in hope that you might find it in your heart to assist us during this difficult period. Your generous assistance, in any form or amount, will help us greatly.",
    "Hello, I am writing on behalf of our local church, an establishment that has stood as a cornerstone of our community for over a century. Sadly, due to years of wear and tear, the church is in dire need of repairs. The roof is leaking, the paint is peeling, and our heating system is on its last legs. Without sufficient funds, we face the heartbreaking possibility of closing the church. As such, we are seeking donations to help us preserve this important community institution. We believe that with your help, we can restore the church to its former glory and continue serving our community.",
    "Dear friend, I am an independent filmmaker currently working on a project that highlights the effects of climate change on indigenous communities. This documentary aims to bring attention to the often-overlooked victims of our changing environment, showcasing their resilience and struggle in the face of adversity. However, without the necessary funds, it has been challenging to provide this project the justice it deserves. Travel expenses, equipment rental, and post-production costs are proving to be overwhelming. I write to you in hopes of garnering financial support to help complete this crucial project. Any amount, big or small, will bring us closer to our goal.",
    "Hello, my name is [Your Name] and I am an aspiring entrepreneur. I have developed an innovative product that I believe has the potential to make a significant impact in the [industry name]. I have devoted a great deal of time and personal savings into this project, but I now find myself at a standstill due to insufficient funding. I'm writing to request financial assistance to push this product to market. Your support would greatly help in covering manufacturing, marketing, and distribution costs. I sincerely believe in the potential of this product and I hope you would consider investing in its success.",
    "Dear friend, I am a researcher at [Institution Name] and our team has been working tirelessly to find solutions for environmental challenges. However, scientific research often requires substantial funding. I'm writing to ask for your support. Your generous contribution would allow us to invest in essential equipment, expand our research team, and fund testing and experiments. Every donation brings us one step closer to realizing our goals for a cleaner, more sustainable world.",
    "Hello, my name is [Your Name], and I am a struggling artist. I've dedicated my life to creating art that communicates the human experience, evokes emotion, and sparks thought. Unfortunately, art supplies and exhibition expenses are cost-prohibitive, often creating a barrier to my creative process. I'm writing to kindly ask for your financial assistance, which would provide much-needed support for purchasing materials and showcasing my work. Your generosity would significantly impact my ability to continue creating and sharing my art.",
    "Dear Sir/Madam, I am a recent graduate carrying a substantial student loan. While I am actively seeking employment, my financial situation is precarious at best. Until I can secure a stable job, I am struggling to manage my day-to-day expenses and loan repayments. I am writing to ask for your help during this difficult time. Any assistance, financial or otherwise, would greatly alleviate the burden I am currently facing.",
    "Hello, I am writing to you on behalf of a local animal shelter. We provide a safe haven for abused, abandoned, and stray animals, but operating costs are high and our resources are dwindling. From food and medical supplies to maintenance and staffing, every aspect of running the shelter needs funding. If you are in a position to help, we would greatly appreciate your support. Your donation could give an innocent animal a second chance at life."
]}

df3 = pd.DataFrame(data3)


In [82]:
data_gpt4_random = pd.concat([df1,df2,df3], ignore_index=True)

In [83]:
data_gpt4_random['target'] = 1


In [84]:
merged_df_gpt4_random = pd.concat([test_dataset_raw_0_25, data_gpt4_random], ignore_index=True)

# Shuffle the index
test_125_df_gpt4_random = merged_df_gpt4_random.sample(frac=1).reset_index(drop=True)

In [85]:
test_encodings_125_gpt4_random = tokenizer(test_125_df_gpt4_random['text'].tolist(), truncation=True, padding=True, max_length=256)

test_dataset_125_gpt4_random = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings_125_gpt4_random),
    test_125_df_gpt4_random['target'].values
))


In [87]:
test_encodings_125_gpt4_random_batched = test_dataset_125_gpt4_random.batch(16)
y_pred_raw_125_gpt4_random = model.predict(test_encodings_125_gpt4_random_batched)

# Convert raw predictions to class predictions
y_pred_125_gpt4_random = np.argmax(y_pred_raw_125_gpt4_random.logits, axis=1)

# Get the actual class labels
y_true_125_gpt4_random = []
for features, label in test_dataset_125_gpt4_random:
    y_true_125_gpt4_random.append(label.numpy())
y_true_125_gpt4_random = np.array(y_true_125_gpt4_random)


# Generate a classification report
report_125_gpt4_random = classification_report(y_true_125_gpt4_random, y_pred_125_gpt4_random, target_names=["not phishing", "phishing"])  # replace target_names with your actual class names
print(report_125_gpt4_random)

              precision    recall  f1-score   support

not phishing       0.98      0.99      0.99       100
    phishing       0.96      0.92      0.94        25

    accuracy                           0.98       125
   macro avg       0.97      0.96      0.96       125
weighted avg       0.98      0.98      0.98       125

