<a href="https://colab.research.google.com/github/swguo/Generating-Personalized-Phishing-Emails/blob/main/Bart_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%time
%%capture
!pip install transformers==4.5.0

CPU times: user 82.1 ms, sys: 10 ms, total: 92.1 ms
Wall time: 9.18 s


In [2]:
!nvidia-smi

Wed Mar 16 08:25:29 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    23W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import BartTokenizer, BartConfig, BartForConditionalGeneration

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler
import pprint
pp = pprint.PrettyPrinter(indent=4)
from IPython.display import clear_output

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"PyTorch version: {torch.__version__}")

PyTorch version: 1.10.0+cu111


# Configurations

In [4]:
# ---- General ----
DEBUG           = False
INPUT_DIR       = 'articles'

# ---- Mixed-precision switches (batch sizes below depend on USE_APEX) ----
USE_APEX        = True
APEX_OPT_LEVEL  = 'O1'

# ---- Model ----
MODEL           = 'facebook/bart-large'  # pretrained checkpoint name
UNFREEZE_LAST_N = 6                      # the last N layers to unfreeze for training

# Special tokens registered with the tokenizer.
SPECIAL_TOKENS  = dict(
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    sep_token="<sep>",
)

MAXLEN          = 128
TRAIN_SIZE      = 0.8

# Per-step batch size and accumulation count, chosen by the USE_APEX switch.
TRAIN_BATCHSIZE, BATCH_UPDATE = (4, 16) if USE_APEX else (2, 32)

# ---- Optimisation ----
EPOCHS          = 4
LR              = 5e-4
EPS             = 1e-8
WARMUP_STEPS    = 1e2

SEED            = 2020

In [5]:
def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: integer seed shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed any additional GPUs; this is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so it has to stay off.
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

# Loading Tokenizer, Config and Model

In [6]:
def get_tokenier(special_tokens=None):
    """Load the pretrained BART tokenizer for ``MODEL``.

    When ``special_tokens`` is truthy it is passed to
    ``tokenizer.add_special_tokens`` and the resulting tokenizer length is
    printed, followed by a confirmation message.

    Args:
        special_tokens: optional mapping handed to ``add_special_tokens``.

    Returns:
        The ``BartTokenizer`` instance.
    """
    tokenizer = BartTokenizer.from_pretrained(MODEL)

    # Nothing to register: hand back the stock tokenizer.
    if not special_tokens:
        return tokenizer

    tokenizer.add_special_tokens(special_tokens)
    print(len(tokenizer))
    print("Special tokens added")
    return tokenizer

def get_model(tokenizer, special_tokens=None, load_model_path=None):
    """Build a ``BartForConditionalGeneration`` model that matches ``tokenizer``.

    Args:
        tokenizer: tokenizer whose special-token ids are copied into the
            model config and whose length sizes the embedding matrix.
        special_tokens: when truthy, the bos/eos/sep/pad ids of ``tokenizer``
            are written into the config and the token embeddings are resized
            to ``len(tokenizer)``. Otherwise only ``pad_token_id`` is set, to
            the tokenizer's eos id.
        load_model_path: optional path to a saved ``state_dict`` that is
            loaded on top of the pretrained weights.

    Returns:
        The model, moved to the GPU when CUDA is available and left on the
        CPU otherwise.
    """
    if special_tokens:
        config = BartConfig.from_pretrained(MODEL,
                          bos_token_id=tokenizer.bos_token_id,
                          eos_token_id=tokenizer.eos_token_id,
                          sep_token_id=tokenizer.sep_token_id,
                          pad_token_id=tokenizer.pad_token_id,
                          output_hidden_states=False)
    else:
        config = BartConfig.from_pretrained(MODEL,
                          pad_token_id=tokenizer.eos_token_id,
                          output_hidden_states=False)

    #----------------------------------------------------------------#
    model = BartForConditionalGeneration.from_pretrained(MODEL, config=config)

    if special_tokens:
        # Special tokens added, model needs to be resized accordingly
        model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        # map_location='cpu' lets a checkpoint saved on a GPU load on CPU-only
        # hosts and avoids a temporary second copy of the weights on the GPU.
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # obtained from a trusted source.
        state_dict = torch.load(load_model_path, map_location='cpu')
        model.load_state_dict(state_dict)

    # Mirror the notebook-level device fallback instead of hard-coding CUDA.
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(target_device)
    return model

In [7]:
def join_keywords(keywords, randomize=True):
    """Join keywords into one comma-separated string.

    Args:
        keywords: list of keyword strings.
        randomize: when true, keep a random-length prefix of ``keywords``
            (possibly empty, possibly all of it) and shuffle that prefix
            before joining. The caller's list is not modified because the
            prefix is a slice copy.

    Returns:
        The selected keywords joined with ``','``.
    """
    if not randomize:
        return ','.join(keywords)

    # Prefix length is drawn from 0..len(keywords) inclusive.
    prefix_len = random.choice(range(len(keywords) + 1))
    selected = keywords[:prefix_len]
    random.shuffle(selected)
    return ','.join(selected)

# Download Spear Email model - BART


In [8]:
!pip install --upgrade --no-cache-dir gdown

Collecting gdown
  Downloading gdown-4.4.0.tar.gz (14 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: gdown
  Building wheel for gdown (PEP 517) ... [?25l[?25hdone
  Created wheel for gdown: filename=gdown-4.4.0-py3-none-any.whl size=14774 sha256=a81832f35e3415dfee3f7cabd81731b04d9f136d4ef4577485884b1fa08f6ac1
  Stored in directory: /tmp/pip-ephem-wheel-cache-h56se6p4/wheels/fb/c3/0e/c4d8ff8bfcb0461afff199471449f642179b74968c15b7a69c
Successfully built gdown
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.2.2
    Uninstalling gdown-4.2.2:
      Successfully uninstalled gdown-4.2.2
Successfully installed gdown-4.4.0


In [9]:
!gdown --id 1b5t7O18zEL1AJxhiwMjKTqcOOGkMWVte

Downloading...
From: https://drive.google.com/uc?id=1b5t7O18zEL1AJxhiwMjKTqcOOGkMWVte
To: /content/pytorch_model.bin
100% 1.63G/1.63G [00:07<00:00, 220MB/s]


In [10]:
# Build the tokenizer with the notebook's special tokens registered.
tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)

# Build the BART model for that tokenizer and load the weights stored in the
# local 'pytorch_model.bin' checkpoint on top of the pretrained ones.
model = get_model(tokenizer, 
          special_tokens=SPECIAL_TOKENS,
          load_model_path='pytorch_model.bin')

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

50266
Special tokens added


Downloading:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

# Generation

## Introduction to input fields

Title => Email Subject 

Type => has two choices
 * Fraud  : Fraud type
 * Normal  : BEC type

Category => Topic
 * BUSINESS
 * MONEY

Formats => Body formats 
 * News
 * Email

Keywords => Expected keywords (list type)

## Define Function

In [11]:
import re
def remove_Symbol(s):
    """Return ``s`` with every non-word character removed.

    Word characters are those matched by ``\\w`` (letters, digits and the
    underscore); spaces and punctuation are dropped.
    """
    return re.sub(r'[^\w]', '', s)


In [12]:
def generation_token(title,text,keywords,types,category,formats):
    """Build the control prompt and run ``model.generate`` on it.

    The prompt is the bos token followed by ``types``, ``category``,
    ``formats``, ``title`` and the comma-joined ``keywords``, each field
    terminated by the sep token.

    Args:
        title: subject text placed in the prompt.
        text: not used by this function; kept so existing callers still work.
        keywords: list of keyword strings, joined in the given order.
        types: type field of the prompt.
        category: category field of the prompt.
        formats: format field of the prompt.

    Returns:
        A tuple ``(sample_outputs, prompt_length)``: the token-id tensor
        returned by ``model.generate`` and the character length of the
        prompt string, special-token text included.
    """
    kw = join_keywords(keywords, randomize=False)

    sep = SPECIAL_TOKENS['sep_token']
    fields = [types, category, formats, title, kw]
    prompt = SPECIAL_TOKENS['bos_token'] + sep.join(fields) + sep

    # truncation=True makes the MAXLEN cap explicit rather than relying on the
    # tokenizer's implicit fallback (which also logs a warning).
    generated = tokenizer([prompt],
                          max_length=MAXLEN,
                          truncation=True,
                          return_tensors='pt')['input_ids'].to(device)

    model.eval()

    # NOTE(review): top_k/top_p are sampling filters; with do_sample=False
    # they presumably have no effect. Confirm before relying on them.
    sample_outputs = model.generate(
        generated,
        do_sample=False,
        max_length=MAXLEN,
        top_k=1,
        top_p=0.75,
        num_return_sequences=3
    )

    return sample_outputs,len(prompt)

## 1. Generating phishing emails for fraud

## Topic: COVID-19 (Fraud emails)

In [None]:
# Email subject used as the title field of the prompt.
title = "please reply to me soon "
# Alternative subjects; uncomment one to use it instead.
#title = "please complete as soon as possible "
#title = 'I will get back to you in the am '
#title = 'We are keenly interested in setting up a new hospital '
#title = 'How about was that payment? reply to me soon '
#title = "I will get back to you in the am "

# Keywords joined into the keyword field of the prompt.
keywords = ['covid19', 'case', 'hospital','CDC']

In [None]:
types = "Fraud"
category = "BUSINESS"
formats = "Email"
kw = join_keywords(keywords, randomize=False)

# Prompt layout: <s>types<sep>category<sep>formats<sep>title<sep>keywords
prompt = SPECIAL_TOKENS['bos_token'] + types + \
                SPECIAL_TOKENS['sep_token'] + category + \
                SPECIAL_TOKENS['sep_token'] + formats + \
                SPECIAL_TOKENS['sep_token'] + title + \
                SPECIAL_TOKENS['sep_token'] + kw
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

# Place the input on the model's own device instead of hard-coding CUDA, so
# the two always agree and the CPU fallback keeps working.
device = model.device
generated = generated.to(device)

model.eval();

In [None]:
from termcolor import colored

# Sampled generation (top-k / top-p filtered), 3 sequences per prompt.
sample_outputs = model.generate(generated,
                do_sample=True,
                min_length=20,
                max_length=200,
                top_k=10,
                top_p=0.5,
                repetition_penalty=2.0,
                num_return_sequences=3
                )

# Tokens are lower-cased before matching, so the keywords must be lower-cased
# too; otherwise mixed-case keywords such as 'CDC' are never highlighted.
keywords_lower = {k.lower() for k in keywords}

predt_email = []
for i, sample_output in enumerate(sample_outputs):
    # NOTE(review): the first len(title) characters of the decoded text are
    # dropped; presumably the output is expected to start with the title.
    predit_text = tokenizer.decode(sample_output, skip_special_tokens=True)[len(title):]
    predt_email.append([predit_text,keywords])
    result = " ".join(colored(t,'white','on_red') if remove_Symbol(t) in keywords_lower else remove_Symbol(t) for t in predit_text.lower().split())
    print('Phishing Email : {}'.format(i+1))
    print("="*100)
    # Print the words, breaking the line after every 20th word index.
    for j,t in enumerate(result.split(' ')):
        print(t+' ',end='')
        if j%20==0 and j!=0:
            print('\n')
    print('\n')
    

 

  next_indices = next_tokens // vocab_size


Phishing Email : 1
i am dr david [41m[37mcovid19,[0m a senior staff in the [41m[37mhospital[0m where i work as a research scientist for the cdci 

have decided to contact you on behalf of my department on a business transfer that will be very beneficial to 

both of us at the end of the transaction this is necessitated by the urgency of this transaction which involves 

the transfer of the sum of us1800000000 fifteen million five hundred thousand united states dollars into your custodythis money was 

originally gotten from a deceased person who died in a plane crash along with his wife and their only daughter 

in a ghastly car crash all occupants of the vehicle unfortunately lost their livessince then i personally have made several 

enquiries to locate any of them but none has come back to me i have not heard from any of 

those involved my department has also made no reply from anyof them after going through all the 

Phishing Email : 2
i am dr david [41m[37mcovid19,[0m a 

## Topic: Finance (Fraud emails)

In [None]:
# Email subject used as the title field of the prompt.
title = 'How was that payment? reply to me soon '
# Alternative subjects; uncomment one to use it instead.
#title = 'I will get back to you in the am '
#title = 'please, reply to me soon '
# Keywords joined into the keyword field of the prompt.
keywords = ['bank','account','number','reply','credit']

In [None]:
types = "Fraud" 
category = "BUSINESS"
formats = "Email"
kw = join_keywords(keywords, randomize=False)

# Prompt layout: <s>types<sep>category<sep>formats<sep>title<sep>keywords
prompt = SPECIAL_TOKENS['bos_token'] + types + \
                SPECIAL_TOKENS['sep_token'] + category + \
                SPECIAL_TOKENS['sep_token'] + formats + \
                SPECIAL_TOKENS['sep_token'] + title + \
                SPECIAL_TOKENS['sep_token'] + kw
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

# Place the input on the model's own device instead of hard-coding CUDA, so
# the two always agree and the CPU fallback keeps working.
device = model.device
generated = generated.to(device)

model.eval();

In [None]:
from termcolor import colored

# Sampled generation (top-k / top-p filtered), 3 sequences per prompt.
sample_outputs = model.generate(generated,
                do_sample=True,
                min_length=20,
                max_length=200,
                top_k=10,
                top_p=0.5,
                repetition_penalty=2.0,
                num_return_sequences=3
                )

# Tokens are lower-cased before matching, so compare against lower-cased
# keywords; this keeps highlighting correct for mixed-case keywords.
keywords_lower = {k.lower() for k in keywords}

predt_email = []
for i, sample_output in enumerate(sample_outputs):
    # NOTE(review): the first len(title) characters of the decoded text are
    # dropped; presumably the output is expected to start with the title.
    predit_text = tokenizer.decode(sample_output, skip_special_tokens=True)[len(title):]
    predt_email.append([predit_text,keywords])
    result = " ".join(colored(t,'white','on_red') if remove_Symbol(t) in keywords_lower else remove_Symbol(t) for t in predit_text.lower().split())
    print('Phishing Email : {}'.format(i+1))
    print("="*100)
    # Print the words, breaking the line after every 20th word index.
    for j,t in enumerate(result.split(' ')):
        print(t+' ',end='')
        if j%20==0 and j!=0:
            print('\n')
    print('\n')
    

 

  next_indices = next_tokens // vocab_size


Phishing Email : 1
i will send you the following messagei have decided to contact you on a business transaction that will be very beneficial 

to both of us at the end of the transactionon june 6 2000 my client his wife and their two 

children were involved in a car accident along sagamu express road all occupants of the vehicle unfortunately lost their livessince 

then i too have made several enquiries to locate any of my late fathers forwarding address but have not heard 

from him or his [41m[37maccount[0m [41m[37mnumber.[0m this is due to confidentiality and prompt access given to me by the security 

company where he deposited the sum of us1500000000 fifteen million five hundred thousand united states dollars for twelve calendar months 

without successthis money came from a dormant [41m[37maccount[0m that belongs to one of our foreign customers who died in a 

ghastly car crash the [41m[37maccount[0m [41m[37mnumber[0m has yet to be sent to me 

Phishing Email : 2

## 2. Generating phishing emails for BEC

## Topic: COVID-19 (BEC emails)

In [None]:
# Email subject used as the title field of the prompt.
title = "please reply to me soon "
# Alternative subjects; uncomment one to use it instead.
#title = "please complete as soon as possible "
#title = 'I will get back to you in the am '
#title = 'We are keenly interested in setting up a new hospital '
#title = 'How about was that payment? reply to me soon '
#title = "I will get back to you in the am "
types = "Normal"
category = "BUSINESS"
formats = "Email"

keywords = ['covid19', 'case', 'hospital','CDC']
kw = join_keywords(keywords, randomize=False)

# Prompt layout: <s>types<sep>category<sep>formats<sep>title<sep>keywords
prompt = SPECIAL_TOKENS['bos_token'] + types + \
                SPECIAL_TOKENS['sep_token'] + category + \
                SPECIAL_TOKENS['sep_token'] + formats + \
                SPECIAL_TOKENS['sep_token'] + title + \
                SPECIAL_TOKENS['sep_token'] + kw
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

# Place the input on the model's own device instead of hard-coding CUDA, so
# the two always agree and the CPU fallback keeps working.
device = model.device
generated = generated.to(device)

model.eval();

In [None]:
from termcolor import colored

# Sampled generation (top-k / top-p filtered), 3 sequences per prompt.
sample_outputs = model.generate(generated,
                do_sample=True,
                min_length=20,
                max_length=200,
                top_k=10,
                top_p=0.5,
                repetition_penalty=2.0,
                num_return_sequences=3
                )

# Tokens are lower-cased before matching, so the keywords must be lower-cased
# too; otherwise mixed-case keywords such as 'CDC' are never highlighted.
keywords_lower = {k.lower() for k in keywords}

predt_email = []
for i, sample_output in enumerate(sample_outputs):
    # NOTE(review): the first len(title) characters of the decoded text are
    # dropped; presumably the output is expected to start with the title.
    predit_text = tokenizer.decode(sample_output, skip_special_tokens=True)[len(title):]
    predt_email.append([predit_text,keywords])
    result = " ".join(colored(t,'white','on_red') if remove_Symbol(t) in keywords_lower else remove_Symbol(t) for t in predit_text.lower().split())
    print('Phishing Email : {}'.format(i+1))
    print("="*100)
    # Print the words, breaking the line after every 20th word index.
    for j,t in enumerate(result.split(' ')):
        print(t+' ',end='')
        if j%20==0 and j!=0:
            print('\n')
    print('\n')
    

 

  next_indices = next_tokens // vocab_size


Phishing Email : 1
i will be in the [41m[37mhospital[0m for a few days but you can reach me directly through this following emaildavid_covid19stategovhkdear siri 

am dr david covidid19 an executive director of civid19 [41m[37mhospital[0m in washington dc i have decided to contact you on 

a business transaction that will be very beneficial to both of us at the end of the transactionduring our investigation 

and auditing i discovered that the [41m[37mhospital[0m has been able to locate a dormant sum of us1900000000 fifteen million five 

hundred thousand united states dollars in an account that belongs to a deceased person who died in a plane crash 

along with his wifes car crash this money was originally gotten from my [41m[37mhospital[0m where he deposited it by way 

of safe keeping upon maturity i sent out a routine notification to him unfortunately no response is not received 

Phishing Email : 2
i will be in the [41m[37mhospital[0m for a few days but you can reac

## Topic: Finance (BEC emails)

In [None]:
# Email subject used as the title field of the prompt.
title = 'How was that payment? reply to me soon '
# Alternative subjects; uncomment one to use it instead.
#title = 'I will get back to you in the am '
#title = 'please, reply to me soon '
# Keywords joined into the keyword field of the prompt.
keywords = ['bank','account','number','reply','credit']
# Alternative keyword order.
#keywords = ['bank','account','reply','number','credit']

In [None]:
types = "Normal"
category = "BUSINESS"
formats = "Email"

kw = join_keywords(keywords, randomize=False)

# Prompt layout: <s>types<sep>category<sep>formats<sep>title<sep>keywords
prompt = SPECIAL_TOKENS['bos_token'] + types + \
                SPECIAL_TOKENS['sep_token'] + category + \
                SPECIAL_TOKENS['sep_token'] + formats + \
                SPECIAL_TOKENS['sep_token'] + title + \
                SPECIAL_TOKENS['sep_token'] + kw
print(prompt)
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

# Place the input on the model's own device instead of hard-coding CUDA, so
# the two always agree and the CPU fallback keeps working.
device = model.device
generated = generated.to(device)
print(generated)
model.eval();

<s>Normal<sep>BUSINESS<sep>Email<sep>How was that payment? reply to me soon <sep>bank,account,number,reply,credit
tensor([[    0,     0, 45647, 50265, 22295, 28275, 50265, 46084, 50265,  6179,
            21,    14,  3207,   116, 10418,     7,   162,  1010, 50265,  5760,
             6, 36617,     6, 30695,     6, 48317,     6, 32598, 50265]],
       device='cuda:0')


In [None]:
from termcolor import colored

# Sampled generation (top-k / top-p filtered), 3 sequences per prompt.
sample_outputs = model.generate(generated,
                do_sample=True,
                min_length=20,
                max_length=200,
                top_k=10,
                top_p=0.5,
                repetition_penalty=2.0,
                num_return_sequences=3
                )

# Tokens are lower-cased before matching, so compare against lower-cased
# keywords; this keeps highlighting correct for mixed-case keywords.
keywords_lower = {k.lower() for k in keywords}

predt_email = []
for i, sample_output in enumerate(sample_outputs):
    # NOTE(review): the first len(title) characters of the decoded text are
    # dropped; presumably the output is expected to start with the title.
    predit_text = tokenizer.decode(sample_output, skip_special_tokens=True)[len(title):]
    predt_email.append([predit_text,keywords])
    result = " ".join(colored(t,'white','on_red') if remove_Symbol(t) in keywords_lower else remove_Symbol(t) for t in predit_text.lower().split())
    print('Phishing Email : {}'.format(i+1))
    print("="*100)
    # Print the words, breaking the line after every 20th word index.
    for j,t in enumerate(result.split(' ')):
        print(t+' ',end='')
        if j%20==0 and j!=0:
            print('\n')
    print('\n')
    

 

  next_indices = next_tokens // vocab_size


Phishing Email : 1
i will call you a [41m[37mnumber[0m of times to discuss this issue but it is important for me not to give 

you your full name and phone [41m[37mnumber[0m or cell phone [41m[37mnumber.[0m let me start by introducing myselfi am the manager 

of bill and exchange at a [41m[37mbank[0m where i have been able to reach out to you on behalf of 

the customer who died in a car crash along sagamu express road my department has decided to contact you on 

a business transaction that will be very beneficial to both of us at the end of the transaction this is 

due to confidentiality and prompt access given the urgency of this transaction we are seeking your assistance to receive your 

response from you immediately thank you so muchbest regardsmr kurt 

Phishing Email : 2
i will call you a [41m[37mnumber[0m of times to ask for your help but this is not the end of the 

road let me start by introducing myself and introducing myself i am writing you this letter in 