In [2]:

from transformers import AutoTokenizer, BertForSequenceClassification
from textattack import Attacker
from textattack.attack_recipes import DeepWordBugGao2018
from textattack.datasets import Dataset
import joblib
import torch

In [12]:
model_name = "Goodmotion/spam-mail-classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)
target_model = BertForSequenceClassification.from_pretrained(model_name)

In [14]:
# DO NOT CHANGE

class TextAttackWrapper(object):
    def __init__(self, model):
        self.model = model

    def __call__(self, x):
        inputs = tokenizer(x, return_tensors="pt", padding=True)
        with torch.no_grad():
            output = self.model(**inputs)
            probs = output.logits.softmax(dim=-1).numpy()
        return probs

In [24]:
data = [("Limited time offer: Act now!", 1), ("Join us for a webinar on AI innovations", 0), ("Urgent: Verify your account immediately.", 1),
        ("Congratulations! You've won a free gift card.", 1), ("Your subscription has been renewed successfully.", 0),
        ("Important security update for your account", 1), ("Don't miss out on our exclusive sale!", 1), 
        ("Your invoice is ready for download", 0), ("Get paid to work from home!", 1), ("Carte de crédit sur le point d’expirer sur Cloudflare", 1)]
dataset = Dataset(data)  

In [25]:
model_wrapper = TextAttackWrapper(target_model)

In [26]:
attack = DeepWordBugGao2018.build(model_wrapper)
attack.attack

textattack: Unknown if model of class <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


<bound method Attack.attack of Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  unk
  )
  (goal_function):  UntargetedClassification
  (transformation):  CompositeTransformation(
    (0): WordSwapNeighboringCharacterSwap(
        (random_one):  True
      )
    (1): WordSwapRandomCharacterSubstitution(
        (random_one):  True
      )
    (2): WordSwapRandomCharacterDeletion(
        (random_one):  True
      )
    (3): WordSwapRandomCharacterInsertion(
        (random_one):  True
      )
    )
  (constraints): 
    (0): LevenshteinEditDistance(
        (max_edit_distance):  30
        (compare_against_original):  True
      )
    (1): RepeatModification
    (2): StopwordModification
  (is_black_box):  True
)>

In [27]:
attacker = Attacker(attack, dataset)
attacker.attack_dataset()

Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  unk
  )
  (goal_function):  UntargetedClassification
  (transformation):  CompositeTransformation(
    (0): WordSwapNeighboringCharacterSwap(
        (random_one):  True
      )
    (1): WordSwapRandomCharacterSubstitution(
        (random_one):  True
      )
    (2): WordSwapRandomCharacterDeletion(
        (random_one):  True
      )
    (3): WordSwapRandomCharacterInsertion(
        (random_one):  True
      )
    )
  (constraints): 
    (0): LevenshteinEditDistance(
        (max_edit_distance):  30
        (compare_against_original):  True
      )
    (1): RepeatModification
    (2): StopwordModification
  (is_black_box):  True
) 



[Succeeded / Failed / Skipped / Total] 0 / 1 / 0 / 1:  10%|█         | 1/10 [00:00<00:02,  3.66it/s]

--------------------------------------------- Result 1 ---------------------------------------------

Limited time offer: Act now!




[Succeeded / Failed / Skipped / Total] 0 / 2 / 0 / 2:  20%|██        | 2/10 [00:00<00:02,  3.33it/s]

--------------------------------------------- Result 2 ---------------------------------------------

Join us for a webinar on AI innovations




[Succeeded / Failed / Skipped / Total] 0 / 3 / 0 / 3:  30%|███       | 3/10 [00:00<00:02,  3.39it/s]

--------------------------------------------- Result 3 ---------------------------------------------

Urgent: Verify your account immediately.




[Succeeded / Failed / Skipped / Total] 2 / 3 / 2 / 7:  70%|███████   | 7/10 [00:01<00:00,  4.76it/s]

--------------------------------------------- Result 4 ---------------------------------------------

[[Congratulations]]! [[You've]] won a [[free]] [[gift]] [[card]].

[[Congratualtions]]! [[Yu've]] won a [[rfee]] [[gqift]] [[acrd]].


--------------------------------------------- Result 5 ---------------------------------------------

Your subscription has been renewed successfully.


--------------------------------------------- Result 6 ---------------------------------------------

Important security update for your account


--------------------------------------------- Result 7 ---------------------------------------------

Don't miss out on our [[exclusive]] sale!

Don't miss out on our [[exlcusive]] sale!




[Succeeded / Failed / Skipped / Total] 3 / 4 / 3 / 10: 100%|██████████| 10/10 [00:01<00:00,  5.37it/s]

--------------------------------------------- Result 8 ---------------------------------------------

Your invoice is ready for download


--------------------------------------------- Result 9 ---------------------------------------------

[[Get]] [[paid]] to work from home!

[[Gept]] [[vaid]] to work from home!


--------------------------------------------- Result 10 ---------------------------------------------

Carte de crédit sur le point d’expirer sur Cloudflare



+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 3      |
| Number of failed attacks:     | 4      |
| Number of skipped attacks:    | 3      |
| Original accuracy:            | 70.0%  |
| Accuracy under attack:        | 40.0%  |
| Attack success rate:          | 42.86% |
| Average perturbed word %:     | 39.68% |
| Average num. words per input: | 6.6    |
| Avg num queries:              | 19.43  |
+------




[<textattack.attack_results.failed_attack_result.FailedAttackResult at 0x2bbb0837350>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x2bbb0417c90>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x2bbb2482790>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x2bbb257bd50>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x2bbc117ed90>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x2bbb2207150>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x2bbb22e52d0>,
 <textattack.attack_results.failed_attack_result.FailedAttackResult at 0x2bbb003fdd0>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x2bbb22f0710>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x2bbb256b350>]

In [None]:
probs = model_wrapper("Congratulations! You've won a free gift card.")
# class 0 is not spam, class 1 is spam
for i, prob in enumerate(probs[0]):
    print(f"Probability of class {i}:\n---------------\n{prob * 100:.2f}%\n")
    
adv_probs = model_wrapper("[[Congratualtions]]! [[Yu've]] won a [[rfee]] [[gqift]] [[acrd]]")
for i, adv_probs in enumerate(adv_probs[0]):
    print(f"Probability of class {i} (adv example):\n---------------\n{adv_probs * 100:.2f}%\n")

Probability of class 0:
---------------
0.08%

Probability of class 1:
---------------
99.92%

Probability of class 0 (adv example):
---------------
74.17%

Probability of class 1 (adv example):
---------------
25.83%

