<a href="https://colab.research.google.com/github/zahradm/Thesis/blob/main/AdversarialTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers textattack sentence_transformers torchfile evaluate 

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import transformers
import textattack
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoModel
from transformers import pipeline, AutoModelForTokenClassification, AutoModelForSequenceClassification
from transformers import AutoTokenizer
from textattack.augmentation.recipes import  *
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer

# Creating adversarial data


In [None]:
# Load the pretrained rotten-tomatoes RoBERTa classifier and its tokenizer, then
# wrap both so TextAttack can query the model.
# NOTE(review): `truncation=True` is passed to from_pretrained() rather than to a
# tokenizer call — confirm it has the intended effect.
model = transformers.AutoModelForSequenceClassification.from_pretrained("textattack/roberta-base-rotten-tomatoes")
tokenizer = transformers.AutoTokenizer.from_pretrained("textattack/roberta-base-rotten-tomatoes", truncation=True)
model_wrapper = textattack.models.wrappers.HuggingFaceModelWrapper(model, tokenizer)

In [None]:
# Build the data to attack: the rotten_tomatoes training split, ordered by label,
# shuffled with a fixed seed (45), and wrapped in TextAttack's dataset adapter.
dataset = load_dataset("rotten_tomatoes", split="train")
sorted_dataset = dataset.sort('label')
shuffled_dataset = textattack.datasets.HuggingFaceDataset(
    sorted_dataset.shuffle(seed=45)
)



In [None]:
# Build the BAEGarg2019 attack recipe against the wrapped model.
attack = textattack.attack_recipes.BAEGarg2019.build(model_wrapper)
# Attack 3000 samples with CSV logging to Drive and checkpoint_interval=5
# (checkpoints go to the local "checkpoints" directory); stdout logging is disabled.
attack_args = textattack.AttackArgs(num_examples=3000, log_to_csv="/content/drive/MyDrive/Thesis/log_BAEGarg2019_3.csv", checkpoint_interval=5, checkpoint_dir="checkpoints", disable_stdout=True)
# Run the attack over the shuffled training data.
attacker = textattack.Attacker(attack, shuffled_dataset, attack_args)
attacker.attack_dataset()

# Adversarial data preprocessing

In [None]:
# Load the attack log CSV from Drive (same path the attack cell logs to).
data = pd.read_csv('/content/drive/MyDrive/Thesis/log_BAEGarg2019_3.csv')

def success_sample(data):
  """Return only the rows whose ``result_type`` equals ``'Successful'``.

  The returned frame is renumbered from 0, and the previous row labels are
  kept in a new leading ``index`` column. ``data`` itself is not modified.
  """
  is_successful = data['result_type'] == 'Successful'
  return data[is_successful].reset_index()

def clear_data(data):
  """Remove square brackets from the ``original_text`` and ``perturbed_text`` columns.

  Each of the two columns is cast to ``str`` and every ``[`` and ``]``
  character is deleted. All other columns are passed through unchanged.

  Args:
    data: DataFrame that contains ``original_text`` and ``perturbed_text``.

  Returns:
    A new DataFrame with the two text columns cleaned. ``data`` itself is
    left untouched, so a frame displayed by an earlier cell stays valid.
  """
  cleaned = data.copy()
  for column in ('original_text', 'perturbed_text'):
    text = cleaned[column].astype('str')
    for bracket in ('[', ']'):
      # regex=False: a lone "[" is not a valid regular expression, so the
      # brackets must be matched as literal characters.
      text = text.str.replace(bracket, '', regex=False)
    cleaned[column] = text
  return cleaned

In [None]:
# Keep only the attack results marked 'Successful' and display them.
succsess_adv = success_sample(data)
succsess_adv

In [None]:
# Strip the square-bracket characters from the text columns, then display the result.
adv_data = clear_data(succsess_adv)
adv_data

In [None]:
# Keep only the perturbed sentence and its ground-truth output; every other
# column of the attack log is dropped.
adv_data = adv_data.loc[:, ['perturbed_text', 'ground_truth_output']]
adv_data

In [None]:
# Relabel the columns positionally as 'text' and 'label', the names the
# benign DataFrames built later also use.
adv_data = adv_data.set_axis(['text', 'label'], axis=1)
adv_data

# Adversarial training and saving model

In [None]:
# Disabled alternative: reload data from the CSV on Drive instead of reusing
# `adv_data` from the cells above.
#path = '/content/drive/MyDrive/Thesis/log_BAEGarg2019_3.csv'
#adv_data = pd.read_csv(path)
#data = data.drop(['Unnamed: 0','index'], axis=1)
# Wrap the adversarial examples in a DataFrame and load the rotten_tomatoes
# dataset (no split specified).
df = pd.DataFrame(adv_data)
rotten_tomatoes = load_dataset("rotten_tomatoes")

In [None]:
# Display the adversarial examples.
df

In [None]:
# 80/20 split of the adversarial examples: the first 80% of rows are used for
# training, the remaining rows for evaluation.
# Integer arithmetic keeps the cut point inside the frame for every length;
# round(len(df)/10)*8 could overshoot len(df) (15 rows -> 16) or collapse to 0
# (5 rows -> 0).
adv_split_index = len(df) * 8 // 10
adv_train = df.iloc[:adv_split_index]
adv_test = df.iloc[adv_split_index:]

In [None]:
def shuffle(dataName, split):
  """Load one split of a dataset and return it in a seeded shuffled order.

  The split is first ordered by its ``label`` column and then shuffled with
  the fixed seed 45.

  Args:
    dataName: dataset name passed to ``load_dataset``.
    split: split name passed to ``load_dataset``.
  """
  by_label = load_dataset(dataName, split=split).sort('label')
  return by_label.shuffle(seed=45)


In [None]:
# Shuffle the train and validation splits, then keep the first fifth of each as
# the benign portion of the training and evaluation data.
shuffle_train = shuffle('rotten_tomatoes','train')
shuffle_test = shuffle('rotten_tomatoes', 'validation')
benign_train = shuffle_train[:round(len(shuffle_train)/5)]
benign_test = shuffle_test[:round(len(shuffle_test)/5)]



In [None]:
# Turn each benign slice into a two-column ('text', 'label') DataFrame.
benign_train_df = pd.DataFrame({'text': benign_train['text'], 'label': benign_train['label']})
benign_test_df = pd.DataFrame({'text': benign_test['text'], 'label': benign_test['label']})

In [None]:
# Stack the benign rows on top of the adversarial rows to form the final train
# and evaluation frames, renumbering the index from 0.
# NOTE(review): the adversarial labels originate from a CSV column — confirm they
# have the same integer dtype as the benign labels after concatenation.
df_merged_train = pd.concat([benign_train_df, adv_train], ignore_index=True, sort=False)
df_merged_test = pd.concat([benign_test_df, adv_test], ignore_index=True, sort=False)

In [None]:
# Convert the merged DataFrames into `Dataset` objects for tokenization and training.
all_train = Dataset.from_pandas(df_merged_train)
all_test = Dataset.from_pandas(df_merged_test)

In [None]:
# Checkpoint that is trained further on the mixed benign/adversarial data.
model_name = "textattack/roberta-base-rotten-tomatoes"

# Tokenizer belonging to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
  """Tokenize the ``text`` entries of ``examples``.

  The tokenizer is called with ``padding="max_length"`` and
  ``truncation=True``; its output is returned unchanged.
  """
  texts = examples["text"]
  return tokenizer(texts, truncation=True, padding="max_length")

# Apply the tokenizer to both datasets, processing the examples in batches.
tokenized_datasets_train = all_train.map(tokenize_function, batched=True)
tokenized_datasets_test = all_test.map(tokenize_function, batched=True)

In [None]:
# Start from the same pretrained checkpoint, configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Train for 8 epochs, evaluating after each epoch; outputs are written to Drive.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(output_dir="/content/drive/MyDrive/Thesis_roberta_BAG", evaluation_strategy="epoch", num_train_epochs=8)

# Accuracy metric used by compute_metrics below.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class for
    each row is the position of its largest logit along the last axis.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted)

# Wire the model, arguments, tokenized datasets and metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_test,
    compute_metrics=compute_metrics,
)

# Run training, then save the resulting model to Drive.
trainer.train()
trainer.save_model("/content/drive/MyDrive/Thesis_roberta_BAG")

In [None]:
# Reload the model from the Drive directory the training cell saved to; the
# tokenizer is loaded from the original hub checkpoint.
from transformers import AutoModelForSequenceClassification
load_model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Thesis_roberta_BAG")
tokenizer = AutoTokenizer.from_pretrained("textattack/roberta-base-rotten-tomatoes")


# Adversarial attack on trained model

In [None]:
# Wrap the reloaded model and rebuild the same BAEGarg2019 recipe against it.
model_wrapper = textattack.models.wrappers.HuggingFaceModelWrapper(load_model, tokenizer)
attack = textattack.attack_recipes.BAEGarg2019.build(model_wrapper)
# Attack 1066 examples of the rotten_tomatoes test split. No CSV log is
# requested here and stdout logging is disabled.
dataset_test = textattack.datasets.HuggingFaceDataset("rotten_tomatoes", split="test")
attack_args = textattack.AttackArgs(num_examples=1066, checkpoint_interval=5, checkpoint_dir="checkpoints", disable_stdout=True)
attacker = textattack.Attacker(attack, dataset_test, attack_args)
attacker.attack_dataset()