In [13]:
import sys
sys.dont_write_bytecode = True

import numpy as np
import torch
import torch.nn as nn
from itertools import chain
import language_tool_python
from argparse import Namespace
from datasets import load_dataset, load_metric, DatasetDict, Dataset
from transformers import (
    AutoConfig, 
    AutoTokenizer, 
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator,
)

import time
from copy import deepcopy
from tqdm import tqdm
from typing import *
from DialogueAPI import dialogue

## Blended_Skill_Talk Dataset

In [13]:
bst_dataset = load_dataset("blended_skill_talk")
train_dataset = bst_dataset['train']
eval_dataset = bst_dataset['validation']
test_dataset = bst_dataset['test']
print(bst_dataset)
# print(train_dataset[0])

Found cached dataset blended_skill_talk (/home/monkey/.cache/huggingface/datasets/blended_skill_talk/default/1.0.0/8544e13cbbf2fb9b34157f2e2f28c1539e4f36bf0ef2bd96edd138b4000c5ca1)
100%|██████████| 3/3 [00:00<00:00, 153.86it/s]

DatasetDict({
    train: Dataset({
        features: ['personas', 'additional_context', 'previous_utterance', 'context', 'free_messages', 'guided_messages', 'suggestions', 'guided_chosen_suggestions', 'label_candidates'],
        num_rows: 4819
    })
    validation: Dataset({
        features: ['personas', 'additional_context', 'previous_utterance', 'context', 'free_messages', 'guided_messages', 'suggestions', 'guided_chosen_suggestions', 'label_candidates'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['personas', 'additional_context', 'previous_utterance', 'context', 'free_messages', 'guided_messages', 'suggestions', 'guided_chosen_suggestions', 'label_candidates'],
        num_rows: 980
    })
})





In [14]:
# Get statistics of pair of dialogues 
train_num, eval_num, test_num = 0, 0, 0
for i, instance in enumerate(train_dataset):
    train_num += len(instance['free_messages'])

for i, instance in enumerate(eval_dataset):
    eval_num += len(instance['free_messages'])

for i, instance in enumerate(test_dataset):
    test_num += len(instance['free_messages'])

print("#pairs of training dialogues: {}, validation dialogues: {}, test dialogues: {}".format(
    train_num, eval_num, test_num,
))

#pairs of training dialogues: 27018, validation dialogues: 5651, test dialogues: 5482


In [15]:
# Show examples
for i, instance in enumerate(test_dataset.select(range(1))):
    for key, value in instance.items():
        if key != 'label_candidates':
            print("{}: {}".format(key, value))

personas: ['i hate talking to people.', 'i believe dragons are real.']
additional_context: Social anxiety
previous_utterance: ['Wow, I am never shy. Do you have anxiety?', "Yes. I end up sweating and blushing and feel like i'm going to throw up."]
context: wizard_of_wikipedia
free_messages: ['and why is that?', 'interesting but I know how you feel especially the whole people telling that it in your head ', "Dang that's though. But I also understand that. I have people some who talks behind my back because of certain things that I believe in "]
guided_messages: ["I think it's because in my head, I think everyone is judging me. I just start to sweat and I get sick in my stomach.", "I don't really have people telling me in my head, more like behind my back", 'Me too! What do you believe in? I believe in dragons... Just finished watching Game of Thrones. Man, those things are dope']
suggestions: {'convai2': ["i've no idea i am also very shy", 'oh i know . i always feel judged and never kno

#### Seq2Seq Model

In [5]:
data_args = Namespace(
    model_name_or_path="facebook/bart-base",
    # model_name_or_path="results/",
    max_source_length=256,
    max_target_length=256,
    pad_to_max_length=False,
    ignore_pad_token_for_loss=True,
    max_train_samples=None,
    preprocessing_num_workers=None,
    overwrite_cache=True,
    output_dir='results/bart',
)
padding = "max_length" if data_args.pad_to_max_length else False

config = AutoConfig.from_pretrained(data_args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(data_args.model_name_or_path)
model = AutoModelForSeq2SeqLM.from_pretrained(data_args.model_name_or_path)

In [117]:
num_added_toks = tokenizer.add_tokens(['<PS>'], special_tokens=True) ## this line is updated
num_added_toks = tokenizer.add_tokens(['<CTX>'], special_tokens=True) ## this line is updated
num_added_toks = tokenizer.add_tokens(['<SEP>'], special_tokens=True) ## this line is updated
model.resize_token_embeddings(len(tokenizer))

Embedding(50268, 768)

##### Tokenize dataset

In [17]:
def preprocess_bst(examples):
    num_entries = len(examples["free_messages"])
    persona_pieces = [
        f"<PS> {examples['personas'][0]}",
        f"<PS> {examples['personas'][1]}",
    ]
    if examples['context'] == "wizard_of_wikipedia":
        additional_context_pieces = [f"[<CTX> {examples['additional_context']}. <SEP> "]
    else:
        additional_context_pieces = ["<SEP> "]

    previous_utterance_pieces = examples["previous_utterance"]
    inputs, labels = [], []
    for entry_idx in range(num_entries):
        free_message = examples['free_messages'][entry_idx]
        guided_message = examples['guided_messages'][entry_idx]

        previous_utterance = ' <SEP> '.join(previous_utterance_pieces)
        original_context = ' '.join(
            persona_pieces + additional_context_pieces
        ) + previous_utterance
        # Input & Output
        text = original_context + ' ' + tokenizer.eos_token + ' ' + free_message
        inputs.append(text)
        labels.append(guided_message)

        previous_utterance_pieces += [
            free_message,
            guided_message,
        ]
        # print("history: ", text)
        # print("label: ", guided_message)

    inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(labels, max_length=data_args.max_target_length, padding=padding, truncation=True)
    
    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 
    # when we want to ignore padding in the loss.
    if padding == "max_length" and data_args.ignore_pad_token_for_loss:
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]
    inputs["labels"] = labels["input_ids"]
    return inputs


def group_texts(examples):
    # ['input_ids', 'attention_mask', 'labels']
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    return concatenated_examples

In [19]:
column_names = train_dataset.column_names

if data_args.max_train_samples is not None:
    train_dataset = train_dataset.select(range(data_args.max_train_samples))

tokenized_train_dataset = train_dataset.map(
    preprocess_bst,
    batched=False,
    num_proc=data_args.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not data_args.overwrite_cache,
)
tokenized_train_dataset = tokenized_train_dataset.map(
    group_texts,
    batched=True,
    num_proc=data_args.preprocessing_num_workers,
    load_from_cache_file=not data_args.overwrite_cache,
)

tokenized_eval_dataset = eval_dataset.map(
    preprocess_bst,
    batched=False,
    num_proc=data_args.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not data_args.overwrite_cache,
)
tokenized_eval_dataset = tokenized_eval_dataset.map(
    group_texts,
    batched=True,
    num_proc=data_args.preprocessing_num_workers,
    load_from_cache_file=not data_args.overwrite_cache,
)
print(tokenized_train_dataset)
print(tokenized_eval_dataset)

##### Training

In [97]:
training_args = Seq2SeqTrainingArguments(
    output_dir=data_args.output_dir,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    metric_for_best_model="eval_bleu",
    greater_is_better=True, # smaller eval loss is better
    per_device_train_batch_size=10,
    per_device_eval_batch_size=20,
    gradient_accumulation_steps=20,
    num_train_epochs=30,
    predict_with_generate=True, # generation task
)

# Data collator
label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
if data_args.pad_to_max_length:
    data_collator = default_data_collator
else:
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8 if training_args.fp16 else None,
    )

# Metric
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    if data_args.ignore_pad_token_for_loss:
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Initialize our Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset if training_args.do_train else None,
    eval_dataset=tokenized_eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics if training_args.predict_with_generate else None,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [99]:
# Training
checkpoint = None

train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()  # Saves the tokenizer too for easy upload

metrics = train_result.metrics
max_train_samples = (
    data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

#### Casual Language Model (CLM) e.g., DialoGPT

In [16]:
data_args = Namespace(
    # model_name_or_path="microsoft/DialoGPT-small",
    model_name_or_path="results/dialogpt",
    max_length=1000,
    pad_to_max_length=False,
    ignore_pad_token_for_loss=True,
    max_train_samples=None,
    preprocessing_num_workers=None,
    overwrite_cache=True,
    output_dir='results/dialogpt',
    block_size=None,
)

max_length = data_args.max_length
padding = "max_length" if data_args.pad_to_max_length else False

config = AutoConfig.from_pretrained(data_args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(data_args.model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(data_args.model_name_or_path, config=config)

In [20]:
def preprocess_bst(examples):
    num_entries = len(examples["free_messages"])
    persona_pieces = [
        f"<PS> {examples['personas'][0]}",
        f"<PS> {examples['personas'][1]}",
    ]
    if examples['context'] == "wizard_of_wikipedia":
        additional_context_pieces = [f"[<CTX> {examples['additional_context']}. <SEP> "]
    else:
        additional_context_pieces = ["<SEP> "]

    previous_utterance_pieces = examples["previous_utterance"]
    inputs, labels = [], []
    for entry_idx in range(num_entries):
        free_message = examples['free_messages'][entry_idx]
        guided_message = examples['guided_messages'][entry_idx]

        previous_utterance = ' <SEP> '.join(previous_utterance_pieces)
        original_context = ' '.join(
            persona_pieces + additional_context_pieces
        ) + previous_utterance
        # Input & Output
        text = original_context + ' ' + tokenizer.eos_token + ' ' + free_message
        inputs.append(text)
        labels.append(guided_message)

        previous_utterance_pieces += [
            free_message,
            guided_message,
        ]
        # print("history: ", text)
        # print("label: ", guided_message)

    inputs = tokenizer(inputs, max_length=data_args.max_length, padding=padding, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(labels, max_length=data_args.max_length, padding=padding, truncation=True)
    
    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 
    # when we want to ignore padding in the loss.
    if padding == "max_length" and data_args.ignore_pad_token_for_loss:
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]
    inputs["labels"] = labels["input_ids"]
    return inputs


def group_texts(examples):
    # ['input_ids', 'attention_mask', 'labels']
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    return concatenated_examples

In [21]:
column_names = train_dataset.column_names

if data_args.max_train_samples is not None:
    train_dataset = train_dataset.select(range(data_args.max_train_samples))

tokenized_train_dataset = train_dataset.map(
    preprocess_bst,
    batched=False,
    num_proc=data_args.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not data_args.overwrite_cache,
)
batched_train_dataset = tokenized_train_dataset.map(
    group_texts,
    batched=True,
    num_proc=data_args.preprocessing_num_workers,
    load_from_cache_file=not data_args.overwrite_cache,
)

tokenized_eval_dataset = eval_dataset.map(
    preprocess_bst,
    batched=False,
    num_proc=data_args.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not data_args.overwrite_cache,
)
batched_eval_dataset = tokenized_eval_dataset.map(
    group_texts,
    batched=True,
    num_proc=data_args.preprocessing_num_workers,
    load_from_cache_file=not data_args.overwrite_cache,
)
print(batched_train_dataset)
print(batched_eval_dataset)

100%|██████████| 4819/4819 [00:04<00:00, 1077.63ex/s]
100%|██████████| 5/5 [00:02<00:00,  1.70ba/s]
100%|██████████| 1009/1009 [00:01<00:00, 811.27ex/s]
100%|██████████| 2/2 [00:00<00:00,  2.91ba/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 27018
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5651
})





##### Training

In [80]:
training_args = TrainingArguments(
    output_dir=data_args.output_dir,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    metric_for_best_model="eval_bleu",
    greater_is_better=True, # smaller eval loss is better
    per_device_train_batch_size=10,
    per_device_eval_batch_size=20,
    gradient_accumulation_steps=20,
    num_train_epochs=30,
)

# Metric
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    if data_args.ignore_pad_token_for_loss:
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        # Depending on the model and config, logits may contain extra tensors,
        # like past_key_values, but logits always come first
        logits = logits[0]
    return logits.argmax(dim=-1)

# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [100]:
# Training
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()  # Saves the tokenizer too for easy upload

metrics = train_result.metrics
max_train_samples = (
    data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

### Attack methods

In [17]:
class BaseAttacker:
    def __init__(self, 
                 device,
                 tokenizer,
                 model,
                 max_len=64,
                 max_per=3):
      
        self.device = device
        self.model = model.to(self.device)
        self.tokenizer = tokenizer

        self.embedding = self.model.get_input_embeddings().weight
        self.specical_token = self.tokenizer.all_special_tokens
        self.specical_id = self.tokenizer.all_special_ids
        self.eos_token_id = self.model.config.eos_token_id
        self.pad_token_id = self.model.config.pad_token_id
        self.num_beams = self.model.config.num_beams
        self.num_beam_groups = self.model.config.num_beam_groups
        self.max_len = max_len
        self.max_per = max_per

        self.softmax = nn.Softmax(dim=1)
        self.bce_loss = nn.BCELoss()

    @classmethod
    def _get_hparam(cls, namespace: Namespace, key: str, default=None):
        if hasattr(namespace, key):
            return getattr(namespace, key)
        print('Using default argument for "{}"'.format(key))

        return default

    def run_attack(self, x):
        pass

    def compute_loss(self, x):
        pass

    def compute_seq_len(self, seq):
        if seq[0].eq(self.pad_token_id):
            return int(len(seq) - sum(seq.eq(self.pad_token_id)))
        else:
            return int(len(seq) - sum(seq.eq(self.pad_token_id))) - 1

    def get_prediction(self, sentence):
        def remove_pad(s):
            for i, tk in enumerate(s):
                if tk == self.eos_token_id and i != 0:
                    return s[:i + 1]
            return s

        input_ids = self.tokenizer(sentence, return_tensors="pt").input_ids.to(self.device)
        
        # ['sequences', 'sequences_scores', 'scores', 'beam_indices'] if num_beams != 1
        # ['sequences', 'scores'] if num_beams = 1
        outputs = dialogue(
            self.model, 
            input_ids,
            early_stopping=False, 
            num_beams=self.num_beams,
            num_beam_groups=self.num_beam_groups, 
            use_cache=True,
            max_length=self.max_len,
        )
        
        seqs = outputs['sequences']
        seqs = [remove_pad(seq) for seq in seqs]
        out_scores = outputs['scores']
        pred_len = [self.compute_seq_len(seq) for seq in seqs]
        return pred_len, seqs, out_scores

    def get_trans_string_len(self, text):
        pred_len, seqs, _ = self.get_prediction(text)
        return seqs[0], pred_len[0]

    def get_trans_len(self, text):
        pred_len, _, _ = self.get_prediction(text)
        return pred_len

    def get_trans_strings(self, text):
        pred_len, seqs, _ = self.get_prediction(text)
        out_res = [self.tokenizer.decode(seq, skip_special_tokens=True) for seq in seqs]
        return out_res, pred_len
    
    def compute_score(self, text):
        batch_size = len(text)
        index_list = [i * self.num_beams for i in range(batch_size + 1)]
        pred_len, seqs, out_scores = self.get_prediction(text)


        scores = [[] for _ in range(batch_size)]
        for out_s in out_scores:
            for i in range(batch_size):
                current_index = index_list[i]
                scores[i].append(out_s[current_index: current_index + 1])
        scores = [torch.cat(s) for s in scores]
        scores = [s[:pred_len[i]] for i, s in enumerate(scores)]
        return scores, seqs, pred_len

In [18]:
class SlowAttacker(BaseAttacker):
    def __init__(self, 
                 device,
                 tokenizer,
                 model,
                 max_len=64,
                 max_per=3):
        super(SlowAttacker, self).__init__(device, tokenizer, model, max_len, max_per)

    def leave_eos_loss(self, scores, pred_len):
        loss = []
        for i, s in enumerate(scores):
            s[:, self.pad_token_id] = 1e-12 # T X V
            eos_p = self.softmax(s)[:pred_len[i], self.eos_token_id]
            loss.append(self.bce_loss(eos_p, torch.zeros_like(eos_p)))
        return loss

    def leave_eos_target_loss(self, scores, seqs, pred_len):
        loss = []
        for i, s in enumerate(scores): # s: T X V
            # if self.pad_token_id != self.eos_token_id:
            s[:, self.pad_token_id] = 1e-12
            softmax_v = self.softmax(s)
            eos_p = softmax_v[:pred_len[i], self.eos_token_id]
            target_p = torch.stack([softmax_v[idx, s] for idx, s in enumerate(seqs[i][1:])])
            target_p = target_p[:pred_len[i]]
            pred = eos_p + target_p
            pred[-1] = pred[-1] / 2
            loss.append(self.bce_loss(pred, torch.zeros_like(pred)))
        return loss

    @torch.no_grad()
    def select_best(self, new_strings, batch_size=30):
        """
        Select generated strings which induce longest output sentences.
        """
        pred_len = []
        # seqs = []
        batch_num = len(new_strings) // batch_size
        if batch_size * batch_num != len(new_strings):
            batch_num += 1

        for i in range(batch_num):
            st, ed = i * batch_size, min(i * batch_size + batch_size, len(new_strings))
            input_ids = self.tokenizer(new_strings[st:ed], return_tensors="pt", padding=True).input_ids
            input_ids = input_ids.to(self.device)
            outputs = self.model.generate(
                input_ids, 
                num_beams=self.num_beams, 
                max_length=self.max_len,
                return_dict_in_generate=True,
            )
            lengths = [self.compute_seq_len(seq) for seq in outputs['sequences']]
            # pdb.set_trace()
            pred_len.extend(lengths)
            
        # pred_len = np.array([self.compute_seq_len(torch.tensor(seq)) for seq in seqs])
        pred_len = np.array(pred_len)
        # pdb.set_trace()

        assert len(new_strings) == len(pred_len)
        return new_strings[pred_len.argmax()], max(pred_len)

    def prepare_attack(self, text):
        ori_len = self.get_trans_len(text)[0] # original sentence length
        best_adv_text, best_len = deepcopy(text), ori_len
        current_adv_text, current_len = deepcopy(text), ori_len  # current_adv_text: List[str]
        return ori_len, (best_adv_text, best_len), (current_adv_text, current_len)

    def compute_loss(self, text):
        raise NotImplementedError

    def mutation(self, current_adv_text, grad, modified_pos):
        raise NotImplementedError

    def run_attack(self, text):
        """
        (1) Using gradient ascent to generate adversarial sentences -- mutation();
        (2) Select the best samples which induce longest output sentences -- select_best();
        (3) Save the adversarial samples -- adv_his.
        """
        assert len(text) != 1
        # torch.autograd.set_detect_anomaly(True)
        ori_len, (best_adv_text, best_len), (current_adv_text, current_len) = self.prepare_attack(text)
        # adv_his = [(deepcopy(current_adv_text), deepcopy(current_len), 0.0)]
        adv_his = []
        modify_pos = []
        pbar = tqdm(range(self.max_per))
        t1 = time.time()

        for it in pbar:
            loss_list = self.compute_loss([current_adv_text])
            loss = sum(loss_list)
            self.model.zero_grad()
            loss.backward()
            grad = self.embedding.grad
            new_strings = self.mutation(current_adv_text, grad, modify_pos)

            if new_strings:
                current_adv_text, current_len = self.select_best(new_strings)
                log_str = "%d, %d, %.2f" % (it, len(new_strings), best_len / ori_len)
                pbar.set_description(log_str)

                if current_len > best_len:
                    best_adv_text = deepcopy(current_adv_text)
                    best_len = current_len
                t2 = time.time()
                adv_his.append((best_adv_text, int(best_len), t2 - t1))

        if adv_his:
            return True, adv_his
        else:
            return False, [(deepcopy(current_adv_text), deepcopy(current_len), 0.0)]

In [19]:
class WordAttacker(SlowAttacker):
    def __init__(self, 
                 device,
                 tokenizer,
                 model,
                 max_len=64,
                 max_per=3):
        super(WordAttacker, self).__init__(device, tokenizer, model, max_len, max_per)

    def compute_loss(self, text):
        scores, seqs, pred_len = self.compute_score(text) # [T X V], [T], [1]
        loss_list = self.leave_eos_target_loss(scores, seqs, pred_len)
        # loss_list = self.leave_eos_loss(scores, pred_len)
        return loss_list
    

    def token_replace_mutation(self, current_adv_text, grad, modified_pos):
        new_strings = []
        current_ids = self.tokenizer(current_adv_text, return_tensors="pt", padding=True).input_ids[0]
        base_ids = current_ids.clone()
        for pos in modified_pos:
            t = current_ids[0][pos]
            grad_t = grad[t]
            score = (self.embedding - self.embedding[t]).mm(grad_t.reshape([-1, 1])).reshape([-1])
            index = score.argsort()
            for tgt_t in index:
                if tgt_t not in self.specical_token:
                    base_ids[pos] = tgt_t
                    break

        for pos, t in enumerate(current_ids):
            if t not in self.specical_id:
                cnt, grad_t = 0, grad[t]
                score = (self.embedding - self.embedding[t]).mm(grad_t.reshape([-1, 1])).reshape([-1])
                index = score.argsort()
                for tgt_t in index:
                    if tgt_t not in self.specical_token:
                        new_base_ids = base_ids.clone()
                        new_base_ids[pos] = tgt_t
                        candidate_s = self.tokenizer.decode(new_base_ids, skip_special_tokens=True)
                        new_strings.append(candidate_s)
                        cnt += 1
                        if cnt >= 50:
                            break

        return new_strings


    def mutation(self, current_adv_text, grad, modify_pos):
        new_strings = self.token_replace_mutation(current_adv_text, grad, modify_pos)
        return new_strings

### Inference pipeline

In [20]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(preds, labels, metric, tokenizer):
    if not isinstance(preds, list):
        preds = [preds]
    if not isinstance(labels, list):
        labels = [labels]
    preds, labels = postprocess_text(preds, labels)
    result = metric.compute(predictions=preds, references=labels)
    return result['score']


def inference(sentence, label, model, tokenizer, metric, device):
    input_ids = tokenizer(sentence, return_tensors="pt").input_ids
    input_ids = input_ids.to(device)
    t1 = time.time()
    outputs = model.generate(input_ids, max_length=64, do_sample=False)
    output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    t2 = time.time()
    prediction_len = len(output.split())
    eval_score = compute_metrics(output, label, metric, tokenizer)

    print("")
    success, adv_his = attacker.run_attack(sentence)
    print("\nU--{}".format(sentence))
    print("G--{}".format(output))
    print("(length: {}, latency: {:.3f}, BLEU: {:.3f})".format(
        prediction_len, t2-t1, eval_score,
    ))

    if success:
        print("U'--{}".format(adv_his[-1][0]))
    else:
        print("Attack failed!")

    input_ids = tokenizer(adv_his[-1][0], return_tensors="pt").input_ids
    input_ids = input_ids.to(device)
    t1 = time.time()
    outputs = model.generate(input_ids, max_length=64, do_sample=False)
    output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    t2 = time.time()
    prediction_len = len(output.split())
    print("G'--{}".format(output))
    eval_score = compute_metrics(output, label, metric, tokenizer)
    print("(length: {}, latency: {:.3f}, BLEU: {:.3f})".format(
        prediction_len, t2-t1, eval_score,
    ))

### Demo

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
attacker = WordAttacker(
        device=device,
        tokenizer=tokenizer,
        model=model,
        max_len=64,
        max_per=1,
    )

metric = load_metric("sacrebleu")

In [23]:
# Demo 1
input_text = "Can't believe the kid grew up so quick."
output_text = "Yeah, kids grow up so quickly."
inference(input_text, output_text, model, tokenizer, metric, device)

# Demo 2
input_text = "How would I start rock climbing?"
output_text = "You can google it. But I suggest you to find a local climbing gym and take a class."
inference(input_text, output_text, model, tokenizer, metric, device)

# Demo 3
input_text = "How often do you use computers?"
output_text = "Almost every week. I use them for work and personal use."
inference(input_text, output_text, model, tokenizer, metric, device)




0, 500, 1.00: 100%|██████████| 1/1 [00:12<00:00, 12.86s/it]



U--Can't believe the kid grew up so quick.
G--I can't believe it. He's only a few months old.
(length: 10, latency: 0.189, BLEU: 3.386)
U'--Can Directions believe the kid grew up so quick.
G'--I can't believe it. My dad was an Army brat and he always said he wanted to be a pilot.
(length: 20, latency: 0.277, BLEU: 1.727)



0, 350, 1.00: 100%|██████████| 1/1 [00:11<00:00, 11.03s/it]



U--How would I start rock climbing?
G--If you know how to climb, you can google it.
(length: 10, latency: 0.211, BLEU: 12.046)
U'--How would I start rock waterfall?
G'--There are many ways to start a rock waterfall. There are many different types of water based on rock factors such as salinity, temperate, etc.
(length: 25, latency: 0.352, BLEU: 1.820)



0, 350, 1.00: 100%|██████████| 1/1 [00:09<00:00,  9.29s/it]



U--How often do you use computers?
G--Almost every week ends.
(length: 4, latency: 0.113, BLEU: 8.627)
U'--How sort do you use computers?
G'--I have a couple of computers. One is a gaming rig and the other is a storage unit.
(length: 18, latency: 0.235, BLEU: 2.708)


In [24]:
# Demo on BST test set
import random

def test_demo(device, model, tokenizer, attacker, max_num_samples=100, max_per=3):
    random.seed(2019)
    bst_dataset = load_dataset("blended_skill_talk")
    test_dataset = bst_dataset['test']
    ids = random.sample(range(len(test_dataset)), max_num_samples)

    sampled_test_dataset = test_dataset.select(ids)

    metric = load_metric("sacrebleu")
    ori_lens, adv_lens = [], []
    ori_bleus, adv_bleus = [], []
    ori_time, adv_time = [], []
    att_success = 0
    total_pairs = 0

    for i, instance in tqdm(enumerate(sampled_test_dataset)):
        if total_pairs >= max_num_samples:
            break

        for (sentence, label) in zip(instance['free_messages'], instance['guided_messages']):

            input_ids = tokenizer(sentence, return_tensors="pt").input_ids
            input_ids = input_ids.to(device)
            t1 = time.time()
            
            outputs = model.generate(input_ids, max_length=64, do_sample=False)
            output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
            t2 = time.time()
            pred_len = np.count_nonzero(outputs[0].cpu() != tokenizer.pad_token_id)
            eval_scores = compute_metrics(output, label, metric, tokenizer)
            
            ori_lens.append(pred_len)
            ori_bleus.append(eval_scores)
            ori_time.append(t2-t1)
            
            # Attack
            print("")
            success, adv_his = attacker.run_attack(sentence)
            print('\n')
            print("U--{}".format(sentence))
            print("G--{}".format(output))
            print("(length: {}, latency: {:.3f}, BLEU: {:.3f})".format(pred_len, t2-t1, eval_scores))

            if success:
                # print("Attack Succeed!")
                print("U'--{}".format(adv_his[-1][0]))
            else:
                print("Attack failed!")

            input_ids = tokenizer(adv_his[-1][0], return_tensors="pt").input_ids
            input_ids = input_ids.to(device)
            t1 = time.time()
            outputs = model.generate(input_ids, max_length=64, do_sample=False)
            output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
            t2 = time.time()
            adv_pred_len = np.count_nonzero(outputs[0].cpu() != tokenizer.pad_token_id)
            print("G'--{}".format(output))
            eval_scores = compute_metrics(output, label, metric, tokenizer)
            print("(length: {}, latency: {:.3f}, BLEU: {:.3f})".format(adv_pred_len, t2-t1, eval_scores))

            adv_lens.append(adv_pred_len)
            adv_bleus.append(eval_scores)
            adv_time.append(t2-t1)

            att_success += (adv_pred_len > pred_len)
            total_pairs += 1

            if total_pairs >= max_num_samples:
                break


    # Summarize eval results
    ori_len = np.mean(ori_lens)
    adv_len = np.mean(adv_lens)
    ori_bleu = np.mean(ori_bleus)
    adv_bleu = np.mean(adv_bleus)
    ori_t = np.mean(ori_time)
    adv_t = np.mean(adv_time)
    print("Original output length: {:.3f}, latency: {:.3f}, BLEU: {:.3f}".format(ori_len, ori_t, ori_bleu))
    print("Adversarial output length: {:.3f}, latency: {:.3f}, BLEU: {:.3f}".format(adv_len, adv_t, adv_bleu))
    print("Attack success rate: {:.2f}%".format(100*att_success/total_pairs))

In [23]:
# max_num_samples = 5
# max_per = 1
# test_demo(device, model, tokenizer, attacker, max_num_samples, max_per)

## ConvAI2

In [35]:
dataset = load_dataset("conv_ai_2")
print(dataset)

Found cached dataset conv_ai_2 (/home/monkey/.cache/huggingface/datasets/conv_ai_2/conv_ai_2/1.0.0/11d600ddce66bb9d07ca50d1b55b488145ef0d5d0206168c32f1043677875865)
100%|██████████| 1/1 [00:00<00:00, 521.29it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialog_id', 'dialog', 'bot_profile', 'user_profile', 'eval_score', 'profile_match'],
        num_rows: 3495
    })
})





In [48]:
for i in range(1):
    print(dataset['train'][i])
    instance = dataset['train'][i]
    user_profile = ' '.join([''.join(x) for x in instance['user_profile']])
    print('user profile: ', user_profile)

    persona_pieces = f"<PS> {user_profile}"
    num_entries = len([x for x in instance['dialog'] if x['sender_class'] == 'Human'])
    previous_utterance_pieces = [persona_pieces]

    for entry_idx in range(num_entries):
        bot_msg = instance['dialog'][entry_idx*2]['text']
        human_msg = instance['dialog'][entry_idx*2+1]['text']
        original_context = ' '.join(previous_utterance_pieces)
        previous_utterance_pieces += [
            bot_msg,
            human_msg,
        ]

        text = original_context + '<EOS>' + bot_msg + '<EOS>' + human_msg
        print("text: ", text)

{'id': '0xab38710', 'dialog_id': '0xab38710', 'dialog': [{'id': 0, 'sender': 'participant2', 'text': 'I love iphone! i just bought new iphone!', 'sender_class': 'Bot'}, {'id': 1, 'sender': 'participant1', 'text': "Thats good for you, i'm not very into new tech", 'sender_class': 'Human'}, {'id': 2, 'sender': 'participant2', 'text': 'I am a college student and i am a college student', 'sender_class': 'Bot'}, {'id': 3, 'sender': 'participant1', 'text': 'I am go to gym and live on donations', 'sender_class': 'Human'}, {'id': 4, 'sender': 'participant2', 'text': 'I am a vegan and i am in the midwest', 'sender_class': 'Bot'}, {'id': 5, 'sender': 'participant1', 'text': 'So vegan... i have dogs maybe i should told then that they may eat cheap salads insted of meat', 'sender_class': 'Human'}, {'id': 6, 'sender': 'participant2', 'text': 'I would not mind having them in the office that would be hard for me', 'sender_class': 'Bot'}, {'id': 7, 'sender': 'participant1', 'text': 'Dogs or vegan in of

## Empathetic Dialogues

In [2]:
dataset = load_dataset("empathetic_dialogues")
print(dataset)
train_dataset = dataset['train']
eval_dataset = dataset['validation']
test_dataset = dataset['test']

Downloading builder script: 100%|██████████| 4.51k/4.51k [00:00<00:00, 1.76MB/s]
Downloading metadata: 100%|██████████| 1.91k/1.91k [00:00<00:00, 953kB/s]
Downloading readme: 100%|██████████| 7.15k/7.15k [00:00<00:00, 3.02MB/s]


Downloading and preparing dataset empathetic_dialogues/default to /home/dsi/yufli/.cache/huggingface/datasets/empathetic_dialogues/default/0.1.0/09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf...


Downloading data: 100%|██████████| 28.0M/28.0M [00:01<00:00, 26.6MB/s]
                                                                                            

Dataset empathetic_dialogues downloaded and prepared to /home/dsi/yufli/.cache/huggingface/datasets/empathetic_dialogues/default/0.1.0/09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 523.92it/s]

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 76673
    })
    validation: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 12030
    })
    test: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 10943
    })
})





In [22]:
def group_texts(dataset):
    results = {
        'conv_id': [], 
        'prompt': [],
        'dialog': [], 
        'context': [],
    }
    for i, instance in enumerate(dataset):
        if instance['utterance_idx'] == 1:
            results['conv_id'].append(instance['conv_id'])
            results['dialog'].append([])
            results['prompt'].append(instance['prompt'])
            results['context'].append(instance['context'])

        response = {'text': instance['utterance'], 'speaker_idx': instance['speaker_idx']}
        results['dialog'][-1].append(response)

    return Dataset.from_dict(results)

In [23]:
grouped_test_dataset = group_texts(test_dataset)
grouped_test_dataset

Dataset({
    features: ['conv_id', 'prompt', 'dialog', 'context'],
    num_rows: 2541
})

In [24]:
grouped_test_dataset[0]

{'conv_id': 'hit:0_conv:0',
 'prompt': "I felt guilty when I was driving home one night and a person tried to fly into my lane_comma_ and didn't see me. I honked and they swerved back into their lane_comma_ slammed on their brakes_comma_ and hit the water cones.",
 'dialog': [{'speaker_idx': 0,
   'text': 'Yeah about 10 years ago I had a horrifying experience. It was 100% their fault but they hit the water barrels and survived. They had no injuries but they almost ran me off the road.'},
  {'speaker_idx': 1, 'text': 'Did you suffer any injuries?'},
  {'speaker_idx': 0,
   'text': "No I wasn't hit. It turned out they were drunk. I felt guilty but realized it was his fault."},
  {'speaker_idx': 1,
   'text': "Why did you feel guilty? People really shouldn't drive drunk."},
  {'speaker_idx': 0,
   'text': "I don't know I was new to driving and hadn't experienced anything like that. I felt like my horn made him swerve into the water barrels."}],
 'context': 'guilty'}