In [1]:
# Set process environment variables for this kernel via the IPython %env magic:
# the Hugging Face cache directory and the GPU index exposed to CUDA.
# NOTE(review): presumably these must run before torch/transformers are imported
# to take effect — keep this cell first.
%env TRANSFORMERS_CACHE=/bigstor/zsarwar/models/cache
%env CUDA_VISIBLE_DEVICES=1

env: TRANSFORMERS_CACHE=/bigstor/zsarwar/models/cache
env: CUDA_VISIBLE_DEVICES=1


In [2]:
import time
import argparse
import json
import logging
from pathlib import Path
import random

import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import transformers
from transformers import AutoConfig, AutoModelWithLMHead, AutoTokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm
from nltk import tokenize
import utils
import spacy
from spacy import displacy
import nltk

# spaCy pipeline used by isVariable() to check whether a mixed-case token is
# recognised as a named entity.
NER = spacy.load("en_core_web_sm")



In [3]:
# Module logger; the tokenizer-utils logger is restricted to ERROR and above.
logger = logging.getLogger(__name__)
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)


# Command-line style configuration. The notebook parses an empty argument list
# (see parse_args(args=[]) below), so every option takes its default value.
parser = argparse.ArgumentParser()
parser.add_argument('--train', type=Path, default='/home/zsarwar/NLP/autoprompt/data/correctly_classified_roberta_large_autoprompt_format_shorter.jsonl', help='Train data path')
parser.add_argument('--dev', type=Path, default='/home/zsarwar/NLP/autoprompt/data/correctly_classified_roberta_large_autoprompt_format_shorter.jsonl',help='Dev data path')
parser.add_argument('--template', type=str,default='<s> {Pre_Mask}[P]{Post_Mask} [T] [T] [T] [T] [T] </s>', help='Template string')
parser.add_argument('--label-map', type=str, default=None, help='JSON object defining label map')

# LAMA-specific
parser.add_argument('--tokenize-labels', action='store_true',
                    help='If specified labels are split into word pieces.'
                            'Needed for LAMA probe experiments.')
# NOTE(review): store_true combined with default=True means this flag can never
# be turned off from the argument list.
parser.add_argument('--filter', action='store_true', default=True,
                    help='If specified, filter out special tokens and gold objects.'
                            'Furthermore, tokens starting with capital '
                            'letters will not appear in triggers. Lazy '
                            'approach for removing proper nouns.')
parser.add_argument('--print-lama', action='store_true',
                    help='Prints best trigger in LAMA format.')
# Suffix appended to the log file name built after parsing.
parser.add_argument('--logfile', type=str, default='v4_debug')

parser.add_argument('--initial-trigger', nargs='+', type=str, default=None, help='Manual prompt')
parser.add_argument('--label-field', type=str, default='Prediction',
                    help='Name of the label field')

parser.add_argument('--bsz', type=int, default=1, help='Batch size')
parser.add_argument('--eval-size', type=int, default=1, help='Eval size')
parser.add_argument('--iters', type=int, default=1,
                    help='Number of iterations to run trigger search algorithm')
parser.add_argument('--accumulation-steps', type=int, default=1)
parser.add_argument('--model-name', type=str, default='roberta-large',
                    help='Model name passed to HuggingFace AutoX classes.')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--limit', type=int, default=None)
parser.add_argument('--use-ctx', action='store_true',
                    help='Use context sentences for relation extraction only')
parser.add_argument('--perturbed', action='store_true',
                    help='Perturbed sentence evaluation of relation extraction: replace each object in dataset with a random other object')
parser.add_argument('--patience', type=int, default=5)
# Number of HotFlip candidates requested per trigger position.
parser.add_argument('--num-cand', type=int, default=10)
parser.add_argument('--sentence-size', type=int, default=50)


parser.add_argument('--debug', action='store_true')
# Parse an explicit empty list so the notebook kernel's own argv is ignored.
args = parser.parse_args(args=[])

# Log at DEBUG when --debug is set, INFO otherwise.
level = logging.DEBUG if args.debug else logging.INFO
# Log file name: <train file name up to its first dot>_<--logfile suffix>.
train_stem = str(args.train).split("/")[-1].split(".")[0]
logfile = "/home/zsarwar/NLP/autoprompt/autoprompt/Results/" + train_stem + "_" + args.logfile
logging.basicConfig(filename=logfile, level=level)



In [28]:
class GradientStorage:
    """
    Keeps the most recent gradient with respect to the output of a PyTorch module.

    A full backward hook is registered on ``module``; each backward pass through
    the module overwrites the stored value with the first output gradient.
    """
    def __init__(self, module):
        # Nothing is stored until a backward pass has gone through the module.
        self._stored_gradient = None
        module.register_full_backward_hook(self.hook)

    def hook(self, module, grad_in, grad_out):
        """Backward-hook callback: remember the first output gradient."""
        first_output_grad, *_ = grad_out
        self._stored_gradient = first_output_grad

    def get(self):
        """Return the last captured gradient, or None if no backward pass ran yet."""
        return self._stored_gradient


class PredictWrapper:
    """
    PyTorch transformers model wrapper. Handles the necessary preprocessing of inputs
    for trigger experiments.

    Calling the wrapper substitutes ``trigger_ids`` into the inputs, runs the model
    and returns the logits at the positions selected by ``predict_mask``.
    """
    def __init__(self, model):
        self._model = model

    def __call__(self, model_inputs, trigger_ids):
        """
        Args:
            model_inputs: dict of tensors containing at least 'input_ids',
                'trigger_mask' and 'predict_mask'. The dict is not modified.
            trigger_ids: token ids to place at the trigger positions.

        Returns:
            Tensor of logits at the predict-mask positions, reshaped to
            (batch, -1).
        """
        # Copy dict so pop operations don't have unwanted side-effects
        model_inputs = model_inputs.copy()
        trigger_mask = model_inputs.pop('trigger_mask')
        model_inputs = replace_trigger_tokens(model_inputs, trigger_ids, trigger_mask)
        predict_mask = model_inputs.pop('predict_mask')
        # Index instead of tuple-unpacking: `logits, *_ = outputs` only works when
        # the model returns a plain tuple. Dict-like output objects iterate over
        # their keys, whereas integer indexing works for both kinds of result.
        outputs = self._model(**model_inputs)
        logits = outputs[0]
        predict_logits = logits.masked_select(predict_mask.unsqueeze(-1)).view(logits.size(0), -1)
        return predict_logits


class AccuracyFn:
    """
    Computing the accuracy when a label is mapped to multiple tokens is difficult in the current
    framework, since the data generator only gives us the token ids. To get around this we
    compare the loss of the target to the loss of every label. If at most one label (the target
    itself) has a loss less than or equal to the target's loss, the prediction is correct.

    Note that ``get_loss`` returns a *negative* log-probability, so lower values mean
    more likely labels.
    """
    def __init__(self, tokenizer, label_map, device, tokenize_labels=False):
        """
        Args:
            tokenizer: tokenizer handed to ``utils.encode_label``.
            label_map: mapping from label name to its label token(s).
            device: device the encoded label ids are moved to.
            tokenize_labels: forwarded to ``utils.encode_label``.
        """
        self._all_label_ids = []
        self._pred_to_label = []
        logger.info(label_map)
        for label, label_tokens in label_map.items():
            self._all_label_ids.append(utils.encode_label(tokenizer, label_tokens, tokenize_labels).to(device))
            self._pred_to_label.append(label)
        logger.info(self._all_label_ids)

    def _label_losses(self, predict_logits):
        """Return a (batch, num_labels) tensor with the loss of every known label."""
        bsz = predict_logits.size(0)
        per_label = [
            get_loss(predict_logits, label_ids.repeat(bsz, 1))
            for label_ids in self._all_label_ids
        ]
        return torch.stack(per_label, dim=-1)

    def __call__(self, predict_logits, gold_label_ids):
        """Return a float tensor with 1.0 where the gold label is the best-scoring label."""
        # Loss (negative log-probability) of the true label
        gold_loss = get_loss(predict_logits, gold_label_ids)
        all_label_loss = self._label_losses(predict_logits)

        # Count the labels whose loss is less than or equal to the gold loss.
        le_count = all_label_loss.le(gold_loss.unsqueeze(-1)).sum(-1)
        correct = le_count.le(1)  # less than in case of num. prec. issues

        return correct.float()

    # TODO: @rloganiv - This is hacky. Replace with something sensible.
    def predict(self, predict_logits):
        """Return, for every batch element, the label name with the lowest loss."""
        all_label_loss = self._label_losses(predict_logits)
        # The values are losses, so the predicted label is the arg-MIN. Taking the
        # maximum (as was done previously) selected the least likely label.
        predictions = all_label_loss.argmin(dim=-1)
        return [self._pred_to_label[x] for x in predictions.tolist()]


def load_pretrained(model_name):
    """
    Fetch the HuggingFace config, LM-head model and tokenizer for ``model_name``.

    The model is switched to eval mode and the task-specific tokens are registered
    on the tokenizer through ``utils.add_task_specific_tokens``.

    Returns:
        Tuple ``(config, model, tokenizer)``.
    """
    pretrained_config = AutoConfig.from_pretrained(model_name)
    lm = AutoModelWithLMHead.from_pretrained(model_name)
    lm.eval()
    tok = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
    utils.add_task_specific_tokens(tok)
    return pretrained_config, lm, tok


def set_seed(seed: int):
    """Seed Python's, NumPy's and PyTorch's (CPU and CUDA) random number generators."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.random.manual_seed,
        torch.cuda.manual_seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)


def get_embeddings(model, config):
    """
    Look up the word-embedding module of ``model``.

    The base model is the attribute of ``model`` named after ``config.model_type``;
    its ``embeddings.word_embeddings`` module is returned.
    """
    return getattr(model, config.model_type).embeddings.word_embeddings



def compute_accuracy(predict_logits, labels):
    """
    Count the entries where the arg-max prediction equals the label.

    Despite its name this returns a *count* of matches, not a ratio; callers
    accumulate it and divide by the number of examples themselves.

    Args:
        predict_logits: logits with the vocabulary on the last dimension.
        labels: label ids compared (with broadcasting) against the arg-max
            prediction, which has a trailing dimension of size 1.

    Returns:
        int: number of matching entries (0 for an empty batch).
    """
    target_logp = F.log_softmax(predict_logits, dim=-1)
    max_pred = torch.argmax(target_logp, dim=-1).unsqueeze(-1)
    # The unused `correct / total` ratio was removed: it was never returned and
    # raised ZeroDivisionError for an empty batch.
    return int(torch.count_nonzero(max_pred.eq(labels)).item())

def hotflip_attack(averaged_grad,
                   normalized_embedding_matrix,
                   increase_loss=False,
                   num_candidates=1,
                   filter=None):
    """
    Returns the ids of the top candidate replacement tokens.

    Every vocabulary entry is scored by the dot product of its (normalized)
    embedding with the gradient. Scores are negated when the loss should be
    decreased, so the highest score is always the most desirable candidate.

    Args:
        averaged_grad: gradient vector for the position being flipped.
        normalized_embedding_matrix: matrix with one embedding per row.
        increase_loss: if True prefer tokens that increase the loss, otherwise
            tokens that decrease it.
        num_candidates: number of token ids to return.
        filter: optional per-token penalty; large positive values exclude a
            token from the candidates.

    Returns:
        Tensor with ``num_candidates`` token ids, best first.
    """
    with torch.no_grad():
        scores = torch.matmul(normalized_embedding_matrix, averaged_grad)

        if not increase_loss:
            scores = -scores

        # Apply the penalty AFTER the sign flip. Subtracting it first meant that
        # with increase_loss=False the penalty was negated as well, which pushed
        # the filtered tokens to the top instead of excluding them.
        if filter is not None:
            scores = scores - filter

        _, top_k_ids = scores.topk(num_candidates)

    return top_k_ids


def get_pred_label(predict_logits, labels, tokenizer):
    """
    Return the arg-max token id over the last dimension of ``predict_logits``,
    with a trailing dimension of size 1 appended.

    ``labels`` and ``tokenizer`` are accepted but not used.
    """
    log_probs = F.log_softmax(predict_logits, dim=-1)
    return log_probs.argmax(dim=-1).unsqueeze(-1)



def get_loss(predict_logits, label_ids):
    """
    Negative log of the total probability assigned to the label tokens.

    For each row the log-probabilities of the ids in ``label_ids`` are gathered,
    entries whose id is 0 are pushed down by 1e32 so they do not contribute, and
    the remainder is combined with logsumexp.

    Returns:
        Tensor with one loss value per row.
    """
    log_probs = F.log_softmax(predict_logits, dim=-1)
    label_log_probs = log_probs.gather(-1, label_ids)
    # Id 0 is treated as padding and masked out.
    padding_penalty = 1e32 * label_ids.eq(0)
    masked_log_probs = label_log_probs - padding_penalty
    return -torch.logsumexp(masked_log_probs, dim=-1)


def isVariable(idx, tokenizer, allowed_words):
    """
    Decide whether the token with id ``idx`` looks like a code-style identifier.

    The decoded token (spaces removed) is flagged when all of these hold:
      * it mixes upper-case characters with other characters,
      * at least one upper-case character is not the first character,
      * the spaCy pipeline ``NER`` finds no entity in it, and
      * it is not listed in ``allowed_words``.

    Returns:
        bool: True if the token should be treated as a "variable".
    """
    word = tokenizer.decode([idx]).replace(" ", "")
    upper_locs = [pos for pos, ch in enumerate(word) if ch.isupper()]

    # Needs at least one capital, but not an all-caps word.
    if not (0 < len(upper_locs) < len(word)):
        return False
    # Only capitals after the first character matter.
    if not any(pos > 0 for pos in upper_locs):
        return False
    # Check if token is not real entity like McDonalds
    if len(NER(word).ents) != 0:
        return False
    return word not in allowed_words

In [22]:
def replace_trigger_tokens(model_inputs, trigger_ids, trigger_mask):
    """
    Replaces the trigger tokens in input_ids, allowing the number of triggers to change.

    The positions marked by ``trigger_mask`` are removed from ``input_ids`` and
    ``trigger_ids`` is inserted as one contiguous run starting at the first
    trigger position. ``predict_mask`` and ``attention_mask`` are rebuilt for the
    new sequence length.

    Only a single sequence is supported: all index arithmetic is done on row 0.

    Args:
        model_inputs: dict with at least 'input_ids' and 'predict_mask'.
            The dict itself is not modified.
        trigger_ids: tensor of shape (1, num_new_triggers).
        trigger_mask: boolean tensor, True at the current trigger positions.

    Returns:
        Shallow copy of ``model_inputs`` with new 'input_ids', 'predict_mask'
        and 'attention_mask' tensors.
    """
    out = model_inputs.copy()
    input_ids = out['input_ids']
    # Use the device of the incoming tensors rather than a notebook-level global
    # `device`, which is only defined in a later cell.
    device = input_ids.device

    num_new_triggers = trigger_ids.shape[1]
    keep_mask = trigger_mask.eq(False)

    # New length = number of non-trigger tokens + number of new trigger tokens
    new_len = int(torch.count_nonzero(keep_mask).item()) + num_new_triggers
    # The new trigger run starts where the first old trigger token was
    trigger_start_index = torch.where(trigger_mask)[1][0].item()
    new_trigger_mask = torch.zeros((1, new_len), dtype=torch.bool, device=device)
    new_trigger_mask[0, trigger_start_index:trigger_start_index + num_new_triggers] = True

    # Scatter the original (non-trigger) ids, then the new trigger ids
    new_input_ids = torch.full(new_trigger_mask.shape, fill_value=-1,
                               dtype=input_ids.dtype, device=device)
    new_input_ids.masked_scatter_(new_trigger_mask.eq(False),
                                  torch.masked_select(input_ids, keep_mask))
    new_input_ids.masked_scatter_(new_trigger_mask,
                                  trigger_ids.to(device=device, dtype=input_ids.dtype))

    # The predicted position only moves if the triggers sit in front of it
    pred_mask_true_index = torch.where(out['predict_mask'])[1][0].item()
    diff = num_new_triggers - int(torch.count_nonzero(trigger_mask).item())
    if trigger_start_index > pred_mask_true_index:
        new_pred_index = pred_mask_true_index
    else:
        new_pred_index = pred_mask_true_index + diff
    new_pred_mask = torch.zeros(new_trigger_mask.shape, dtype=torch.bool, device=device)
    new_pred_mask[0, new_pred_index] = True

    out['input_ids'] = new_input_ids
    out['predict_mask'] = new_pred_mask
    # Finally, a new attention mask is also needed: every position is attended to
    out['attention_mask'] = torch.ones_like(new_input_ids)

    return out
    

In [23]:

# Seed all RNGs, then pick the GPU when one is available (CPU otherwise).
set_seed(args.seed)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

logger.info('Loading model, tokenizer, etc.')
config, model, tokenizer = load_pretrained(args.model_name)
model.to(device)
# Hook the word-embedding module so its output gradient can be read back after
# each backward pass (used by the trigger search).
embeddings = get_embeddings(model, config)
embedding_gradient = GradientStorage(embeddings)
# Callable wrapper that substitutes trigger ids and returns the logits at the
# predict-mask positions.
predictor = PredictWrapper(model)

# Optional label map, given on the command line as a JSON object.
if args.label_map is not None:
    label_map = json.loads(args.label_map)
    logger.info(f"Label map: {label_map}")
else:
    label_map = None
    logger.info('No label map')

# Builds model inputs from dataset records using the --template string.
# Its num_trigger_tokens attribute is read below to size the initial trigger.
templatizer = utils.TriggerTemplatizer(
    args.template,
    config,
    tokenizer,
    label_map=label_map,
    label_field=args.label_field,
    tokenize_labels=args.tokenize_labels,
    add_special_tokens=False,
    use_ctx=args.use_ctx
)

# Build the starting trigger: either the manually supplied tokens or a run of
# mask tokens, one per trigger slot in the template. `init_ids` and
# `trigger_ids` begin as equal (1, num_trigger_tokens) tensors.
if args.initial_trigger:
    initial_trigger = args.initial_trigger
    initial_token_ids = tokenizer.convert_tokens_to_ids(initial_trigger)
    logger.info(f"initial trigger {initial_trigger}")
    logger.info("init ids")
    logger.info(initial_token_ids)
    init_ids = torch.tensor(initial_token_ids, device=device).unsqueeze(0)
    logger.info(init_ids)
    logger.info(f'Initial triggers are the following: {initial_trigger}')
    logger.info(f'Initial Trigger ids are: {initial_token_ids}')
    logger.info(f"len trigger ids: {len(initial_token_ids)}")
    logger.info(f"num trigger tokens: {templatizer.num_trigger_tokens}")
    # A manual prompt must fill every trigger slot of the template.
    assert len(initial_token_ids) == templatizer.num_trigger_tokens
else:
    logger.info(f"no initial trigger provided, using {templatizer.num_trigger_tokens} mask tokens")
    initial_token_ids = [tokenizer.mask_token_id] * templatizer.num_trigger_tokens
    init_ids = torch.tensor(initial_token_ids, device=device).unsqueeze(0)
trigger_ids = torch.tensor(initial_token_ids, device=device).unsqueeze(0)
best_trigger_ids = trigger_ids.clone()

# NOTE: Accuracy needs a fixed pool of labels, i.e. a label map. Producing one can
# be cumbersome (e.g., for link prediction tasks), so without a label map the
# evaluation metric falls back to the negative loss.
def _negative_loss(predict_logits, label_ids):
    """Evaluation metric used when no label map is available."""
    return -get_loss(predict_logits, label_ids)


evaluation_fn = AccuracyFn(tokenizer, label_map, device) if label_map else _negative_loss

logger.info('Loading datasets')
collator = utils.Collator(pad_token_id=tokenizer.pad_token_id)

# Training data (optionally limited to --limit examples), shuffled.
if args.perturbed:
    train_dataset = utils.load_augmented_trigger_dataset(args.train, templatizer, limit=args.limit)
else:
    train_dataset = utils.load_trigger_dataset(args.train, templatizer, use_ctx=args.use_ctx, limit=args.limit)
train_loader = DataLoader(train_dataset, batch_size=args.bsz, shuffle=True, collate_fn=collator)

# Dev data, kept in file order. The perturbed branch now reads --dev like the
# regular branch does; it previously loaded the training file, so the "dev"
# numbers were computed on training data whenever --perturbed was set.
if args.perturbed:
    dev_dataset = utils.load_augmented_trigger_dataset(args.dev, templatizer)
else:
    dev_dataset = utils.load_trigger_dataset(args.dev, templatizer, use_ctx=args.use_ctx)
dev_loader = DataLoader(dev_dataset, batch_size=args.eval_size, shuffle=False, collate_fn=collator)


In [25]:
# Mixed-case words that isVariable() must never flag: a token found in this list
# is not treated as a "variable" even when spaCy finds no entity in it.
# NOTE(review): a few entries appear twice ('iPhone', 'YouTube', 'LinkedIn');
# harmless for membership tests.
allowed_words = ['iPhone', 'McC', 'YouTube', 'McDonald', 'LinkedIn', 'MPs', 'WhatsApp', 'iOS', 'McCain', 'McG', 'McD', 'McConnell', 'McGregor', 'McCarthy', 'iPad', 'LeBron', 'JPMorgan', 'IoT', 'OnePlus', 'realDonaldTrump', 'BuzzFeed', 'iTunes', 'iPhones', 'SpaceX', 'McLaren', 'PhD', 'PlayStation', 'McKin', 'McCabe', 'McCoy', 'TVs', 'FedEx', 'McGr', 'McGu', 'McMahon', 'CEOs', 'McMaster', 'JavaScript', 'WikiLeaks', 'eBay', 'McKenzie', 'McInt', 'BlackBerry', 'McCorm', 'DeVos', 'PayPal', 'MacBook', 'McCull', 'PCs', 'McKay', 'MacDonald', 'McCann', 'McGee', 'NGOs', 'GHz', 'McKenna', 'McCartney', 'HuffPost', 'McGill', 'WiFi', 'McDonnell', 'iPads', 'GoPro', 'iPod', 'MacArthur', 'VMware', 'macOS', 'CDs', 'McAuliffe', 'WordPress', 'iCloud', 'YouTube', 'GeForce', 'GPUs', 'CPUs', 'GitHub', 'PowerPoint', 'eSports', 'ObamaCare', 'iPhone', 'UFOs', 'mRNA', 'StarCraft', 'LinkedIn']

In [29]:
# Locations of the cached filter tensors and of the whole-word vocabulary file.
FILTER_PATH = Path("/home/zsarwar/NLP/autoprompt/data/filter.pt")
FIRST_ITER_FILTER_PATH = Path("/home/zsarwar/NLP/autoprompt/data/first_iter_filter.pt")
WHOLE_WORD_VOCAB_PATH = Path("/home/zsarwar/NLP/autoprompt/roberta_full_words_capital_no_diacritic.json")


def build_trigger_filters():
    """
    Build the per-token penalty vectors used by hotflip_attack.

    A value of 1e32 marks a token that must never be proposed as a trigger.

    Returns:
        Tuple ``(filter, first_iter_filter)``. ``first_iter_filter`` is a copy of
        ``filter`` that, for roberta-large, additionally excludes every token id
        that is not listed in the whole-word vocabulary file.
    """
    token_filter = torch.zeros(tokenizer.vocab_size, dtype=torch.float32, device=device)
    if args.filter:
        logger.info('Filtering label tokens.')
        if label_map:
            for label_tokens in label_map.values():
                label_ids = utils.encode_label(tokenizer, label_tokens).unsqueeze(0)
                token_filter[label_ids] = 1e32
        else:
            for _, label_ids in train_dataset:
                token_filter[label_ids] = 1e32
        logger.info('Filtering special tokens and capitalized words.')
        special_ids = set(tokenizer.all_special_ids)
        for word, idx in tokenizer.get_vocab().items():
            if len(word) == 1 or idx >= tokenizer.vocab_size:
                continue
            # Filter special tokens.
            if idx in special_ids:
                logger.info('Filtered: %s, index: %d', word, idx)
                token_filter[idx] = 1e32

            if isVariable(idx, tokenizer, allowed_words):
                logger.debug(f"Filtered {word}")
                token_filter[idx] = 1e32

    # Filter for the first iteration of token generation
    first_filter = token_filter.detach().clone()
    if args.model_name == "roberta-large":
        with open(WHOLE_WORD_VOCAB_PATH, "r", encoding="utf-8") as f:
            whole_word_tokens = json.load(f)
        # A set makes the membership test O(1) instead of scanning dict values
        # once per vocabulary entry.
        whole_word_ids = set(whole_word_tokens.values())
        for index in range(tokenizer.vocab_size):
            if index not in whole_word_ids:
                first_filter[index] = 1e32
    return token_filter, first_filter


# Load the cached filters when both files exist, otherwise build and cache them.
# NOTE(review): the cache is not invalidated when the model name, label map or
# --filter change; delete the .pt files to force a rebuild.
if FILTER_PATH.exists() and FIRST_ITER_FILTER_PATH.exists():
    # torch.load unpickles the file: only load caches written by this notebook.
    first_iter_filter = torch.load(FIRST_ITER_FILTER_PATH, map_location=device)
    filter = torch.load(FILTER_PATH, map_location=device)
else:
    filter, first_iter_filter = build_trigger_filters()
    torch.save(first_iter_filter, FIRST_ITER_FILTER_PATH)
    torch.save(filter, FILTER_PATH)

In [30]:
# Baseline: evaluate the unmodified initial trigger on the dev set.
logger.info('Evaluating baseline')
logger.info(f"Baseline trigger ids are : {trigger_ids}")
# Running sums: evaluation_fn total, count of correct predictions, example count.
numerator = 0
numerator_acc = 0
denominator = 0
for model_inputs, labels in tqdm(dev_loader):
    model_inputs = {k: v.to(device) for k, v in model_inputs.items()}
    labels = labels.to(device)
    with torch.no_grad():
        predict_logits = predictor(model_inputs, trigger_ids)
    numerator += evaluation_fn(predict_logits, labels).sum().item()
    denominator += labels.size(0)
    # compute_accuracy returns a count of correct predictions, not a ratio.
    numerator_acc += compute_accuracy(predict_logits, labels)
# The small epsilon avoids a division by zero when the dev set is empty.
dev_metric = numerator / (denominator + 1e-13)
acc_metric_base = numerator_acc / (denominator + 1e-13)
logger.info(f'Dev metric: {dev_metric}')
logger.info(f'Dev acc metric baseline is : {acc_metric_base}')

best_dev_metric = 10
best_dev_acc_metric = 1
# Measure elapsed time of trigger search
start = time.time()

# Precompute the row-wise L2-normalized embedding matrix used to score HotFlip
# candidates. Done under no_grad so the result is a plain tensor that does not
# keep an autograd graph on the embedding weights alive; the norm is clamped so
# an all-zero embedding row yields zeros instead of NaN (NaN scores would be
# ranked first by topk).
with torch.no_grad():
    embed_norm = torch.linalg.vector_norm(embeddings.weight, dim=1)
    normalized_embedding_weights = embeddings.weight / embed_norm.clamp_min(1e-12).unsqueeze(1)

100%|██████████| 328/328 [00:05<00:00, 59.02it/s]


In [32]:

# Initializing GPT-2 (used to generate continuations of the attacked prompt)
gpt_model = GPT2LMHeadModel.from_pretrained('gpt2-xl').to(device)
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')

# Surface forms of the tokenizer's special tokens, stripped from prompts later:
# first the non-empty special tokens found in the base vocabulary, then the
# additional special tokens.
tokenizer_special_tokens = [
    vocab_word
    for vocab_word, vocab_id in tokenizer.get_vocab().items()
    if vocab_id < tokenizer.vocab_size
    and vocab_id in tokenizer.all_special_ids
    and vocab_word != ""
]
tokenizer_special_tokens.extend(tokenizer.additional_special_tokens)

In [34]:
# Upper bound on the number of training examples attacked per iteration.
MAX_ATTACK_EXAMPLES = 20

# Trigger search. For every training example:
#   1. score the example with the initial trigger,
#   2. use the embedding gradient to propose replacement tokens (HotFlip),
#   3. let GPT-2 continue "prompt + candidate token" and feed the first generated
#      sentence back to the masked LM as the trigger text,
#   4. report every candidate for which no prediction is correct any more.
new_example = True
for i in range(args.iters):
    total_samples = 0
    total_incorrect = 0
    logger.info(f'Iteration: {i}')
    model.zero_grad()
    averaged_grad = None
    for model_inputs, labels in tqdm(train_loader):
        if total_samples == MAX_ATTACK_EXAMPLES:
            break
        new_example = True
        total_samples += 1
        # Start from scratch for each example
        trigger_ids = init_ids.clone()
        model.zero_grad()
        model_inputs = {k: v.to(device) for k, v in model_inputs.items()}
        labels = labels.to(device)
        with torch.no_grad():
            predict_logits = predictor(model_inputs, trigger_ids)
            eval_metric = evaluation_fn(predict_logits, labels)
            eval_acc_metric = compute_accuracy(predict_logits, labels)
        for token_to_flip in range(templatizer.num_trigger_tokens):
            # Gradient of the loss w.r.t. the embeddings at the trigger positions
            model.zero_grad()
            predict_logits = predictor(model_inputs, trigger_ids)
            loss = get_loss(predict_logits, labels).mean()
            loss.backward()
            grad = embedding_gradient.get()
            bsz, _, emb_dim = grad.size()
            selection_mask = model_inputs['trigger_mask'].unsqueeze(-1)
            grad = torch.masked_select(grad, selection_mask)
            grad = grad.view(bsz, templatizer.num_trigger_tokens, emb_dim)
            averaged_grad = grad.sum(dim=0)
            candidates = hotflip_attack(averaged_grad[token_to_flip],
                                        normalized_embedding_weights,
                                        increase_loss=True,
                                        num_candidates=args.num_cand,
                                        filter=filter if token_to_flip > 0 else first_iter_filter)
            # Candidates that are skipped below must neither look like a successful
            # attack (accuracy 0) nor like the best score, so the defaults are
            # sentinels: -1 for "not evaluated" and +inf for the score.
            candidate_scores = torch.full((args.num_cand,), float('inf'), device=device)
            candidate_accs = torch.full((args.num_cand,), -1.0, device=device)
            candidate_pred_labels = torch.zeros(args.num_cand, device=device, dtype=int)
            fluent_candidates = []
            # Score of the unmodified example
            current_score = eval_metric.sum()
            # Plain-text prompt: mask replaced by the gold label, special tokens removed
            original_prompt = tokenizer.decode(model_inputs['input_ids'][0])
            #original_prompt = original_prompt.replace(tokenizer.mask_token, gpt_tokenizer.unk_token)
            original_prompt = original_prompt.replace(tokenizer.mask_token, tokenizer.decode(labels[0].item()))
            for special_token in tokenizer_special_tokens:
                original_prompt = original_prompt.replace(" " + special_token, "")
                original_prompt = original_prompt.replace(special_token, "")
            fluent_text = []
            # Actual attack starts
            for cand_idx, candidate in enumerate(candidates):
                # logger.info("Candidate: %d", candidate)
                temp_trigger = trigger_ids.clone()
                temp_trigger[:, token_to_flip] = candidate
                temp_string = original_prompt + tokenizer.convert_tokens_to_string(
                    tokenizer.convert_ids_to_tokens([candidate])
                )
                with torch.no_grad():
                    encoded_prompt = gpt_tokenizer.encode(temp_string, add_special_tokens=True, return_attention_mask=False, return_tensors='pt').to(device)
                    num_tokens = encoded_prompt.numel()
                    if not num_tokens:
                        # Keep the per-candidate lists aligned with `candidates`
                        fluent_candidates.append(temp_trigger)
                        fluent_text.append(temp_string)
                        logger.info("Encountered a failure")
                        continue
                    outputs = gpt_model.generate(encoded_prompt, do_sample=True, top_p=0.96, output_scores=True, return_dict_in_generate=True, max_length=80)
                    # generate() may hand back either a plain tensor of sequences or
                    # an output object exposing `.sequences`; accept both.
                    generated_sequence = getattr(outputs, 'sequences', outputs)[0]
                    # For appending to BERT
                    # Only keep the generated tokens (drop the prompt)
                    generated_tokens = generated_sequence[num_tokens:]
                    # Converted to text
                    generated_text = gpt_tokenizer.decode(generated_tokens, skip_special_tokens=True)
                    # split by nltk
                    generated_text_sents = tokenize.sent_tokenize(generated_text)
                    if len(generated_text_sents) == 0:
                        logger.info("Encountered an error")
                        fluent_candidates.append(temp_trigger)
                        fluent_text.append(temp_string)
                        continue
                    fluent_tokens_bert = tokenizer.tokenize(generated_text_sents[0])
                    # For printing
                    # Converted to text
                    generated_text = gpt_tokenizer.decode(generated_sequence, skip_special_tokens=True)
                    # Append the first two sentences (OG prompt + GPT2-generation) or entire prompt (if sample has no period at the end)
                    generated_text_sents = tokenize.sent_tokenize(generated_text)
                    if len(generated_text_sents) >= 2:
                        generated_text_sents = ' '.join(generated_text_sents[0:2])
                    else:
                        generated_text_sents = generated_text_sents[0]
                    fluent_text.append(generated_text_sents)
                    # For BERT: first trigger token followed by the generated sentence
                    fluent_ids = tokenizer.convert_tokens_to_ids(fluent_tokens_bert)
                    fluent_ids.insert(0, temp_trigger[0][0].item())
                    fluent_ids = torch.tensor(fluent_ids, device=device).unsqueeze(0)
                    fluent_candidates.append(fluent_ids)
                with torch.no_grad():
                    predict_logits = predictor(model_inputs, fluent_ids)
                    candidate_metric = evaluation_fn(predict_logits, labels)
                    pred_label = get_pred_label(predict_logits, labels, tokenizer)
                    eval_attack_acc_metric = compute_accuracy(predict_logits, labels)
                candidate_scores[cand_idx] = candidate_metric.sum()
                candidate_accs[cand_idx] = eval_attack_acc_metric
                candidate_pred_labels[cand_idx] = pred_label
            # after evaluating all of the candidates, check if any of them reduce accuracy to zero and early exit
            if (candidate_accs == 0).any():
                total_incorrect += 1
                real_label = tokenizer.convert_ids_to_tokens(labels)
                og_text_pred = model_inputs['input_ids'][0].detach().clone()
                og_lab = labels.detach().clone()
                idx_to_rep = torch.where(og_text_pred == tokenizer.mask_token_id)[0].item()
                idx_t_start = torch.where(og_text_pred == tokenizer.additional_special_tokens_ids[0])[0][0].item()
                og_text_pred[idx_to_rep] = og_lab[0].item()
                og_text_pred = tokenizer.decode(og_text_pred[1:idx_t_start])
                print(f" Original : {og_text_pred}")
                logger.info(f"Original  : {og_text_pred}")
                for index, candidate_acc in enumerate(candidate_accs):
                    # Skips unsuccessful as well as unevaluated (-1) candidates
                    if candidate_acc != 0:
                        continue
                    adv_lab = candidate_pred_labels[index].item()
                    # Replace only the first instance of the true label with the predicted (adversarial) label
                    adv_text_pred = fluent_text[index].replace(tokenizer.convert_tokens_to_string(real_label[0]), tokenizer.decode(adv_lab), 1).replace("\n", "")

                    trigger_ids = fluent_candidates[index]
                    logger.info(f"Adversarial : {adv_text_pred}")
                    print(f"Adversarial : {adv_text_pred}")
                logger.info(f"\n\n")
                break
            # if the prompt doesn't break on any candidates, use the best option and move to the next token
            if (candidate_scores < current_score).any():
                #logger.info('Better trigger with higher loss detected.')
                best_candidate_idx = candidate_scores.argmin()
                trigger_ids[:, token_to_flip] = candidates[best_candidate_idx]
            # NOTE(review): this unconditional break means only the first trigger
            # position is ever flipped — confirm that this is intended.
            break
# Fraction of attacked examples for which at least one candidate flipped the
# prediction (counters come from the last search iteration). The previous
# `a / b + 1e-32` added the epsilon to the quotient, so it neither protected
# against zero samples nor left a rate of zero at exactly 0.
flip_rate = total_incorrect / total_samples if total_samples else 0.0
trig_tokens = tokenizer.convert_ids_to_tokens(trigger_ids.squeeze(0))
logger.info(f"Total incorrect are : {total_incorrect}")
logger.info(f"Total samples are : {total_samples}")
logger.info(f"Flip rate is : {flip_rate}")
print(f"Flip rate is : {flip_rate}")

  0%|          | 1/328 [00:32<2:54:25, 32.01s/it]

 Original :  Mac Con, a citizen of Ireland.
Adversarial :  Mac Con, a citizen of Japan. Zombies and other corpses of fallen enemies were seen floating around in the air, with numerous zombie heads being thrown about the room.
Adversarial :  Mac Con, a citizen of Japan. Ichigo, who could be considered the main character of the series, could be considered a fan favorite as he has been depicted in some artwork, comics, cartoons and games.
Adversarial :  Mac Con, a citizen of Dublin. Swordsman and soldier, he fought at Plassey, Parma, and at Drogheda.
Adversarial :  Mac Con, a citizen of Vietnam. Baius was one of three men in this group who traveled to America in the 1990s, but it is believed they stayed in the United States and later left.
Adversarial :  Mac Con, a citizen of Japan. Tsukuyomi is an ancestor of many modern Japanese families, and is a symbol of peace.
Adversarial :  Mac Con, a citizen of Taiwan. Asians are also known for their love of the 'happa hap' (handshake) handshake, 

  1%|          | 2/328 [01:02<2:49:02, 31.11s/it]

 Original :  L'Instant X, a French-language work.
Adversarial :  L'Instant X, a multi-language work. DEBUG:timer_callback: Received timer callback from server timezone: 3:00:00 T:140735221792768 DEBUG: Process - Setting the Python path to /home/atv/.xbmc/addons/metadata.album.universal/default.py TO /home/atv/.xbmc/
Adversarial :  L'Instant X, a multi-language work. ACPI is the acronym for Advanced Configuration and Power Interface, the hardware component that manages things like power to the computer, the display, and other aspects of the system.
Adversarial :  L'Instant X, a multi-language work. JSONLite X, an X11 server to support JSON parsing, JSON decoders and serializers, as well as other standard libraries like gRPC, is currently under development, but is not finished yet.
Adversarial :  L'Instant X, a scripting-language work. Debugging can be difficult in this language, and it's likely that these days most of the bugs you find are either not in the code, or are in the tests.
Ad

  1%|          | 3/328 [01:30<2:41:08, 29.75s/it]

 Original : Dutch is Pieter Cornelis Boutens's native language.
Adversarial : English is Pieter Cornelis Boutens's native language. ID: 121323T: +31 552 258830 F: +31 552 267541t: en.krakatai.org.plhttp://www.krakatai.org.pl/Mailing address:Krakatai, Lithuania
Adversarial : English is Pieter Cornelis Boutens's native language. Southern Ireland is his homeland, and it was his uncle, Richard Boutens, who gave him the nickname "Pieter," his parents' initials, which came from his father's brother, Pieter.
Adversarial : French is Pieter Cornelis Boutens's native language. Shirley's father, a French teacher who once tried to persuade his students not to speak French in the classroom, also wrote his own mother tongue.
Adversarial : English is Pieter Cornelis Boutens's native language. Eastern Europe, as I mention in the next chapter, has a variety of regional languages, but in the country of Romania the language we call Vlach is the language of people speaking mainly Romanian.
Adversarial : E

  1%|          | 4/328 [01:59<2:39:40, 29.57s/it]

 Original :  Harashima has a citizenship of Japan.
Adversarial :  Harashima has a citizenship of 10. Rankings are calculated as per official sources.
Adversarial :  Harashima has a citizenship of Canada. Brent says, "Well, I guess we were at the point where it wasn't really any more of a question whether she was Japanese."


  2%|▏         | 5/328 [02:24<2:30:01, 27.87s/it]

 Original :  Amiibo, developed by Nintendo
Adversarial :  Amiibo, developed by Team Magicka and originally released for the Nintendo 3DS in February 2015. The game will be released worldwide on March 30, 2016.
Adversarial :  Amiibo, developed by The Blizzards, that includes both Mario and Luigi characters on one figurine. The game was released on June 2, 2014 in Japan.
Adversarial :  Amiibo, developed by Team Scourge. A new mode, where players must complete various tasks to earn points to unlock exclusive skins, was also added in the game.
Adversarial :  Amiibo, developed by Team Okawaru. They are the enemies in the Wii, Wii U, Nintendo 3DS, and Nintendo Switch versions of Donkey Kong Country: Tropical Freeze.
Adversarial :  Amiibo, developed by Project Grimoire, uses the Game Boy Camera's gyroscope to keep players in the game by tracking their movements as they navigate the environment. It is available now for $59.99.
Adversarial :  Amiibo, developed by The Forsaken Knight 1.0Super Pa

  2%|▏         | 6/328 [02:48<2:22:32, 26.56s/it]

 Original :  Kaniv Reservoir, located in Ukraine.
Adversarial :  Kaniv Reservoir, located in Utah. SHALLOW ISLAND TUNDRA, a salt, natural, or artificial, at the source.
Adversarial :  Kaniv Reservoir, located in Maharashtra. Firuk Batur is also used as a water treatment facility for the Batur River that flows into the Bay of Bengal.
Adversarial :  Kaniv Reservoir, located in Alaska. Featherweight alpine birds have been using the basin for many millennia.
Adversarial :  Kaniv Reservoir, located in Utah. INCLUDES: (1) U.S.
Adversarial :  Kaniv Reservoir, located in California. Countless birds, frogs, lizards and other small creatures were washed out to sea by the flood.
Adversarial :  Kaniv Reservoir, located in California. DuPont had been asked to work on the design of the reservoir.
Adversarial :  Kaniv Reservoir, located in Livingston. COUNTY: Calhoun County Estimate $4-9 million.


  2%|▏         | 7/328 [03:12<2:16:28, 25.51s/it]

 Original : French was a language used by Sylvain Chomet.
Adversarial : Python was a language used by Sylvain Chomet. API - The API for the website.
Adversarial : : was a language used by Sylvain Chomet. CLSID : The name of the CLLocation-system used by the script.
Adversarial : JSON was a language used by Sylvain Chomet. JSON was implemented in Perl 5 and is one of the very few programs with a strict-constant-time-pointer-pointer-reference implementation, as well as a strict-array-access implementation.
Adversarial : This was a language used by Sylvain Chomet. Runtime: 39:43 Duration: 59:52Mozart - Sonata No.
Adversarial : Ruby was a language used by Sylvain Chomet. ClojureScript will benefit from the added capabilities of the language.
Adversarial : This was a language used by Sylvain Chomet. SDK: Yes.
Adversarial : Python was a language used by Sylvain Chomet. Migration to a new language can be dangerous.
Adversarial : This was a language used by Sylvain Chomet. DEBUG:faux:Failed to

  2%|▏         | 8/328 [03:34<2:10:51, 24.54s/it]

 Original :  Jhelum River belongs to the continent of Asia.
Adversarial :  Jhelum River belongs to the continent of Africa. Nigeria has an unbroken tradition of fishing for gold but the best fishing on this river is done by the people of the North.
Adversarial :  Jhelum River belongs to the continent of Africa. Nigerian, Niger Delta and Sudanese Nile rivers flow to the sea.
Adversarial :  Jhelum River belongs to the continent of Africa. Nigerians say it should be divided between them and the rest of Africa as it belongs to that region."
Adversarial :  Jhelum River belongs to the continent of Africa. Nasa 14/24 Scientists find exactly what human corpses smell like New research has become the first to isolate the particular scent of human death, describing the various chemicals that are emitted by corpses in an attempt to help find them in the future.
Adversarial :  Jhelum River belongs to the continent of Africa. Prosecuting a case under international humanitarian law can easily lead to

  3%|▎         | 9/328 [03:56<2:06:27, 23.78s/it]

 Original :  In Prince Edward Island, English is the official language.
Adversarial :  In Prince Edward Island, French is the official language. FEMA and the Secret Service speak English there and in much of the North and West.
Adversarial :  In Prince Edward Island, French is the official language. WTO regulations prohibit the selling or supplying of services in English.
Adversarial :  In Prince Edward Island, French is the official language. PEOPLE'S VOTE: The Canadian Election 2012In 2012, there was a similar debate when a group of students from Prince Edward Island wanted to introduce English as a second official language in that province.


  3%|▎         | 10/328 [04:18<2:02:54, 23.19s/it]

 Original :  The English language is the official language of Samoa.
Adversarial :  The Hawaiian language is the official language of Samoa. Grassley is the ranking Republican on the Judiciary Committee.
Adversarial :  The The language is the official language of Samoa. Partner: The language is the Samoan language.
Adversarial :  The Samoa language is the official language of Samoa. AfghansAfghansAfghans have a language and a dialect which is very similar to that of the Azeri dialect of the Persian language.


  3%|▎         | 11/328 [04:42<2:03:25, 23.36s/it]

 Original :  Crazed Fruit, developed in Japan.
Adversarial :  Crazed Fruit, developed in 2012. UFOs have been around for a long time, and are commonly reported in the sky on a daily basis.
Adversarial :  Crazed Fruit, developed in 2012. Shadows of Amaranth is a third party spin-off which uses the game engine to create a story similar to the original Shadow of Amaranth.
Adversarial :  Crazed Fruit, developed in 1982. Fahrenheit was the first film to use this special effect, although he only used it in two scenes, which you can see in the original trailer for the film!
Adversarial :  Crazed Fruit, developed in China. DuPont has been testing the fruit for the last 15 years, said Koopman.
Adversarial :  Crazed Fruit, developed in 1995. GHz also introduced the first micro-encoder (the LK200), but was unsuccessful at taking the market by storm.
Adversarial :  Crazed Fruit, developed in St. Empires, by the same company.
Adversarial :  Crazed Fruit, developed in Spain. Hispaniola is the fruit 

  4%|▎         | 12/328 [05:04<2:01:54, 23.15s/it]

 Original :  Microsoft Expression Blend, a product of Microsoft
Adversarial :  Natural Expression Blend, a product of Microsoft Comfort. If I don't find something interesting, I might go back home.
Adversarial :  Cell Expression Blend, a product of Microsoft Studies, is used to automate various parts of the analysis process. To learn more about Microsoft Expression Blend, visit Microsoft Dynamics, the Visual Studio online tool.
Adversarial :  the Expression Blend, a product of Microsoft Winged Cloud. You may have used the same expression in a work environment that uses Microsoft Expression Blend.
Adversarial : , Expression Blend, a product of Microsoft Fremont, Calif. The Microsoft Expression Blend team will now lead the integration with the Power BI Server.
Adversarial :  the Expression Blend, a product of Microsoft Grill's Research and Development company, which also makes programs for the Kinect. "We have an expression-comprehension tool inside of Visual Studio Code called Regex Exp

  4%|▍         | 14/328 [05:47<1:55:31, 22.07s/it]

 Original :  Microsoft Windows SDK is a product of Microsoft
Adversarial :  developer Windows SDK is a product of Microsoft Mohammed Ali. Please contact the community for any questions.
Adversarial :  S Windows SDK is a product of Microsoft HUNP. It is licensed under a license agreement that allows you to copy the content of this website only if you purchase the software product or receive an equivalent value of the same from the software vendor.
Adversarial :  developer Windows SDK is a product of Microsoft Sultanat Wajdi Daimer and is released under the terms of Microsoft Corporation's Microsoft Public License Agreement, version 1.0 or later. Microsoft Windows SDK, Version 4.0 is available at http://www.microsoft.com/en-us/download/details.aspx?id=30653.
Adversarial :  the Windows SDK is a product of Microsoft CONTROL Panel. You cannot distribute it or modify it without prior written permission from Microsoft.


  5%|▍         | 16/328 [06:34<1:58:59, 22.88s/it]

 Original :  Sony SLT camera, produced by Sony
Adversarial :  Cloud SLT camera, produced by Sony Imperium in Japan. With two 16 megapixel sensor (as compared to four on the Nikon D5), it has a higher resolution, higher ISO sensitivity, and a higher noise level, but loses the ability to perform in Aperture Priority and is limited to full manual mode.
Adversarial :  Image SLT camera, produced by Sony Distributing in 1994. This is a single-frame SLT film.
Adversarial :  Peter SLT camera, produced by Sony Fioravanti. The SLT was designed for use in the field, while the SLT II is optimized for studio use, due to its larger sensor.
Adversarial :  Nikon SLT camera, produced by Sony Cooldown and a copy was included with a Nikon D60 and other cameras. The lens has only 10 elements in 10 groups with 6 ED elements and 3 Super ED.
Adversarial :  NASA SLT camera, produced by Sony Thus, at any given time, the total amount of light emitted by every SLT camera is approximately equal to the combined li

  5%|▌         | 17/328 [06:58<2:00:54, 23.33s/it]

 Original :  Intel 80486 is developed by Intel.
Adversarial :  IBM 80486 is developed by Intel. SPECIALIZED DIVISION.
Adversarial :  Qualcomm 80486 is developed by Intel. Ameya Kothari, the CEO and President of Micromax's Asia Pacific Region has announced that the company plans to launch the Micromax R7, R7+, R10 and R15 smartphones in India, as well as the upcoming R11, R13 and R15 smartphone in the market.
Adversarial :  Atari 80486 is developed by Intel. SPACE INVADER is a real-time strategy game developed and published by Sierra On-Line, Inc. SPACE INVADER is based on the Space Combat role-playing game created by Peter Hecker in 1979 and was published by Avalon Hill in 1981.


  5%|▌         | 18/328 [07:17<1:54:12, 22.11s/it]

 Original :  The native language of Pierre Alcover is French.
Adversarial :  The native language of Pierre Alcover is Norwegian. Oslo's population is about 3.4 million.
Adversarial :  The native language of Pierre Alcover is not. Instead, it is "Arabic."


  6%|▌         | 19/328 [07:39<1:52:20, 21.81s/it]

 Original :  Bharat Agnihotri is an India-born person.
Adversarial :  Bharat Agnihotri is an Indian-born person. Protestant pastor and theologian Martin Luther King Jr., who was born in Montgomery, Alabama, is an American born person.
Adversarial :  Bharat Agnihotri is an Indian-born person. Riveri-Agnihotri is in fact not Indian.
Adversarial :  Bharat Agnihotri is an Indian-born person. Caucasian?
Adversarial :  Bharat Agnihotri is an Indian-born person. Railway officer and social worker.
Adversarial :  Bharat Agnihotri is an Indian-born person. Chess.
Adversarial :  Bharat Agnihotri is an Indian-born person. Telescope observations by Chandra X-ray Observatory and the NASA/ESA Hubble Space Telescope suggest that the supermassive black hole at the centre of galaxy NGC 4993 lurks within a massive elliptical galaxy called M31.
Adversarial :  Bharat Agnihotri is an Indian-born person. Boolean expression for an Indic person.
Adversarial :  Bharat Agnihotri is an Indian-born person. Classif

  6%|▌         | 20/328 [08:01<2:03:27, 24.05s/it]

Flip rate is : 0.85





In [None]:
# Parallelizing GPT-2 Generations


In [20]:
# Mixed-case tokens: brand names, "Mc"/"Mac" surname prefixes, pluralised
# acronyms, etc.
# NOTE(review): presumably an exception list for the capital-letter token
# filter -- confirm against the cell that consumes it.
# 'iPhone', 'YouTube' and 'LinkedIn' each appear twice; the duplicates are
# kept so the list's length and order are unchanged.
allowed_words = [
    'iPhone', 'McC', 'YouTube', 'McDonald', 'LinkedIn', 'MPs',
    'WhatsApp', 'iOS', 'McCain', 'McG', 'McD', 'McConnell',
    'McGregor', 'McCarthy', 'iPad', 'LeBron', 'JPMorgan', 'IoT',
    'OnePlus', 'realDonaldTrump', 'BuzzFeed', 'iTunes', 'iPhones', 'SpaceX',
    'McLaren', 'PhD', 'PlayStation', 'McKin', 'McCabe', 'McCoy',
    'TVs', 'FedEx', 'McGr', 'McGu', 'McMahon', 'CEOs',
    'McMaster', 'JavaScript', 'WikiLeaks', 'eBay', 'McKenzie', 'McInt',
    'BlackBerry', 'McCorm', 'DeVos', 'PayPal', 'MacBook', 'McCull',
    'PCs', 'McKay', 'MacDonald', 'McCann', 'McGee', 'NGOs',
    'GHz', 'McKenna', 'McCartney', 'HuffPost', 'McGill', 'WiFi',
    'McDonnell', 'iPads', 'GoPro', 'iPod', 'MacArthur', 'VMware',
    'macOS', 'CDs', 'McAuliffe', 'WordPress', 'iCloud', 'YouTube',
    'GeForce', 'GPUs', 'CPUs', 'GitHub', 'PowerPoint', 'eSports',
    'ObamaCare', 'iPhone', 'UFOs', 'mRNA', 'StarCraft', 'LinkedIn',
]

In [46]:
# Remove the transformers "Setting ..." warning lines from the saved results
# log, rewriting the file in place.
results_path = "/home/zsarwar/NLP/autoprompt/autoprompt/Results/correctly_classified_roberta_large_autoprompt_format_shorter_v3"
warning_marker = 'WARNING:transformers.modeling_utils:Setting'

with open(results_path, 'r') as f:
    lines = f.readlines()
# Keep every line that does not contain the warning marker.
lines = [l for l in lines if warning_marker not in l]

with open(results_path, 'w') as f:
    # readlines() leaves the trailing '\n' on each line, so write the lines
    # back verbatim. Joining them with '\n' would insert an extra blank line
    # after every line, and more on each re-run of this cell.
    f.writelines(lines)