In [None]:
from attack import MagicWordFinder
from sentence_transformers import SentenceTransformer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SentenceTransformer("sentence-transformers/sentence-t5-base").to(device)

# The positive magic words we find should ideally push arbitrary sentences 
# to be similar to the sentences in S
S = [
"who played cedric in harry potter",
"who is percival graves harry potter",
"who was beatrix potter",
"which is the longest harry potter book",
"who is gilderoy lockhart in harry potter",
"weasley harry potter actor",
"who played draco from harry potter",
"what is the actors name that plays harry potter",
"what year was harry potter released",
"what is the donabe ceramic potter",
"what date what the first harry potter movie released?",
"who played snape in harry potter",
"who is the actor that plays voldemort in harry potter",
"when did harry potter first published",
"who did emma thompson play in harry potter",
"largest harry potter collection",
"who played professor snape in harry potter",
"who does david bradley play in harry potter",
"who is nicholas flamel in harry potter",
"what was the spiders name in harry potter",
"who is the actor for lupin in harry potter",
"what is the name of the phoenix in harry potter",
"what are the names of all harry potter movies",
"who is danny dyer and harry potter",
"definition of transfiguration harry potter",
"how many books in the harry potter series?",
"what year was harry potter born",
"how long does harry potter tour take",
"who is the actor for harry potter",
"who was the stag in harry potter get t",
"when did harry potter open",
"how many chapters are there in harry potter and the order of the phoenix?",
"what is the genre of the harry potter books",
"how many book about harry potter",
"who is cedric in harry potter",
"where does harry potter live",
"when was the first harry potter published",
"when was lily potter born",
"how long is the harry potter movies",
"who played the asian girl in harry potter",
"who played the girl in harry potter",
"how many pages in the harry potter series usa",
"what tickets should we buy for wizarding world of harry potter",
"who plays cedric diggory in harry potter",
"who is harry potter's main antagonist",
"when was the first harry potter books",
"how many nominations does harry potter have",
"what is the name of who played hermione on harry potter",
"when did harry potter and the deathly hallows come",
"when was the harry potter series made",
"what is the actor name who plays harry potter",
"when was harry potter published",
"how old was daniel radcliffe during the harry potter movies",
"number of words pages chapters in harry potter book series",
"when did the first harry potter come out",
"who directed the last three harry potter movies",
"what is the language snake they speak in harry potter",
"what kind of archetype is harry potter",
"is bellatrix lestrange related to harry potter",
"who published harry potter",
"who is scabbers in harry potter"
]



In [2]:
attack = MagicWordFinder(model, S)

# Find 30 candidates for 1-token magic words; the algorithm will only return the 10 best candidates, since we specify k_0=10
cands = attack.find_magic_words(k=30, m=1, k_0=10)

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


In [None]:
# Print the magic words found
for i, cand in enumerate(cands):
    print(f"{i + 1}. {model.tokenizer.decode(cand)} (token IDs {cand})")

# Evaluate how the magic words affect the avg. cosine similarity with S
sentence = "Voldemort was right all along!"
sentence_emb = model.encode(sentence, convert_to_tensor=True)
S_embed = model.encode(S, convert_to_tensor=True)

1. Chelsea (token IDs [14373])
2. fees (token IDs [3051])
3. Hernandez (token IDs [31765])
4. malpractice (token IDs [29758])
5. lotion (token IDs [25097])
6. Caroline (token IDs [21753])
7. ectomy (token IDs [25492])
8. Chiropractic (token IDs [29618])
9. liver (token IDs [11501])
10. compost (token IDs [17883])


In [4]:
# Evaluate the base cosine similarity (i.e. with no magic words appended)
cos_sim = torch.nn.CosineSimilarity(dim=0)
avg_cos_sim = sum([cos_sim(sentence_emb, S_embed[j]) for j in range(len(S))]) / len(S)

print(f"Base avg. cosine similarity: {avg_cos_sim}")

Base avg. cosine similarity: 0.7275136709213257


In [5]:
# Evaluate how appending each magic word once impacts the avg. cosine similarity
sentence_tokenized = model.tokenizer(sentence)["input_ids"]
# Extract the EOS token and remove it (since we append more tokens)
eos_token = sentence_tokenized[-1]
sentence_tokenized = sentence_tokenized[:-1]

for cand in cands:
    # Append the candidate magic word to the sentence and then append the EOS token
    input_ids = sentence_tokenized + cand + [eos_token]
    # Embed the result
    magic_sent_emb = model.encode(model.tokenizer.decode(input_ids), convert_to_tensor=True)
    # Evaluate avg. cosine similarity
    avg_cos_sim = sum([cos_sim(magic_sent_emb, S_embed[j]) for j in range(len(S))]) / len(S)

    print(f"Cosine similarity after appending magic word '{model.tokenizer.decode(cand)}': {avg_cos_sim}")

Cosine similarity after appending magic word 'Chelsea': 0.7489315271377563
Cosine similarity after appending magic word 'fees': 0.757028341293335
Cosine similarity after appending magic word 'Hernandez': 0.7415716052055359
Cosine similarity after appending magic word 'malpractice': 0.741245687007904
Cosine similarity after appending magic word 'lotion': 0.7533475756645203
Cosine similarity after appending magic word 'Caroline': 0.7550725340843201
Cosine similarity after appending magic word 'ectomy': 0.7417704463005066
Cosine similarity after appending magic word 'Chiropractic': 0.7433438897132874
Cosine similarity after appending magic word 'liver': 0.7456666827201843
Cosine similarity after appending magic word 'compost': 0.749920666217804
