## Installs

In [None]:
!pip install allennlp==1.2.2 allennlp_models==1.2.2

In [None]:
!pip install textattack[tensorflow]

In [None]:
!pip install overrides==3.1.0 munch==2.5.0 more_itertools==8.4.0

In [None]:
!pip install torchfile

In [None]:
!pip install transformers

In [None]:
!git clone --branch pos_Adj https://github.com/thanoskaravangelis/mice

In [None]:
%cd mice

In [None]:
!pip install "torch>1.7.1"

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
from allennlp.predictors import Predictor
import allennlp_models.classification
from src.predictors.newsgroups.newsgroups_dataset_reader import NewsgroupsDatasetReader
from src.predictors.imdb.imdb_dataset_reader import ImdbDatasetReader
import numpy as np

import textattack

class AllenNLPModel(textattack.models.wrappers.ModelWrapper):
    """TextAttack model wrapper around a pretrained MiCE AllenNLP predictor.

    Calling the wrapper with a list of strings returns, for each string, the
    predictor's ``'logits'`` output in reversed order.
    """

    def __init__(self):
        # IMDb predictor archive. The NewsGroups archive lives at
        # https://storage.googleapis.com/allennlp-public-models/mice-newsgroups-predictor.tar.gz
        # (NewsgroupsDatasetReader is imported for that variant).
        archive_url = "https://storage.googleapis.com/allennlp-public-models/mice-imdb-predictor.tar.gz"
        self.predictor = Predictor.from_path(
            archive_url,
            dataset_reader_to_load=ImdbDatasetReader,
            frozen=True,
        )
        # Expose the underlying model and tokenizer as attributes of the wrapper.
        self.model = self.predictor._model
        self.tokenizer = self.predictor._dataset_reader._tokenizer

    def __call__(self, text_input_list):
        """Return one reversed logits sequence per entry of ``text_input_list``."""
        reversed_logits = []
        for text_input in text_input_list:
            prediction = self.predictor.predict(sentence=text_input)
            # Per the original author's note: index 0 of 'logits' is the
            # positive score and index 1 the negative one, so the sequence is
            # reversed ([::-1]) to put negative first and positive second.
            reversed_logits.append(prediction['logits'][::-1])
        return reversed_logits

model_wrapper = AllenNLPModel()

In [None]:
def clean_text(text, special_chars=["\n", "\t"]):
    """Replace every occurrence of each string in ``special_chars`` with a space.

    Args:
        text: the string to clean.
        special_chars: strings to replace; defaults to newline and tab.
            The list is only read, never modified.

    Returns:
        The cleaned string. Each occurrence is replaced by exactly one space,
        so consecutive special characters yield consecutive spaces.
    """
    cleaned = text
    for unwanted in special_chars:
        cleaned = cleaned.replace(unwanted, " ")
    return cleaned

## For IMDB dataset

In [None]:
import os 
# Build `datalist` as (cleaned_text, label) pairs: every file under pos/ is
# read whole and stored with label 1, then every file under neg/ with label 0.
IMDB_DIR = "/kaggle/input/imdbdata500/imdb_431"

# os.listdir returns entries in arbitrary order; sorting keeps the dataset
# order (and therefore which examples get attacked) reproducible across runs.
files_pos = sorted(os.listdir(os.path.join(IMDB_DIR, "pos")))
files_neg = sorted(os.listdir(os.path.join(IMDB_DIR, "neg")))

datalist = []
for subdir, file_names, label in (("pos", files_pos, 1), ("neg", files_neg, 0)):
    for item in file_names:
        # `with` closes the handle even if reading or cleaning raises.
        with open(os.path.join(IMDB_DIR, subdir, item), "r") as my_file:
            text = clean_text(my_file.read())
        datalist.append((text, label))

## For newsgroups

In [None]:
# Integer label stored in `datalist` for each label name (looked up as d[orig_label]).
d = dict(talk=3, rec=1, sci=2, comp=0, soc=4, alt=6, misc=5)

In [None]:
import os 
from src.utils import add_probs, get_ints_to_labels
import numpy as np
from tqdm import tqdm

# Lookup from the predictor's integer class index to its label name.
ints_to_labels = get_ints_to_labels(model_wrapper.predictor)

NEWSGROUPS_DIR = "/kaggle/input/newsgroups-mice/newsgroups"

# os.listdir returns entries in arbitrary order; sorting keeps the dataset
# order reproducible across runs.
files_pos = sorted(os.listdir(os.path.join(NEWSGROUPS_DIR, "pos")))
files_neg = sorted(os.listdir(os.path.join(NEWSGROUPS_DIR, "neg")))

# Build `datalist` as (cleaned_text, label) pairs. The label is NOT taken from
# the pos/neg directory: it is the predictor's own argmax class for the text,
# mapped to an integer through `d`.
# NOTE(review): d[orig_label] raises KeyError for any label name that is not a
# key of `d`, so `model_wrapper` presumably has to wrap the newsgroups
# predictor when this cell runs -- confirm.
datalist = []
for subdir, file_names in (("pos", files_pos), ("neg", files_neg)):
    for item in tqdm(file_names):
        # `with` closes the handle even if reading or cleaning raises.
        with open(os.path.join(NEWSGROUPS_DIR, subdir, item), "r") as my_file:
            text = clean_text(my_file.read())
        orig_pred = add_probs(model_wrapper.predictor.predict(text))
        orig_probs = orig_pred['probs']
        orig_label = ints_to_labels[np.argmax(orig_probs)]
        datalist.append((text, d[orig_label]))

In [None]:
import pickle
# Persist `datalist` to datalist.pickle (relative to the current working
# directory) so it can be reloaded without rebuilding it.
with open("datalist.pickle","wb") as myf:
    pickle.dump(datalist, myf)

Load datalist from pickle

In [None]:
import pickle
# Restore `datalist` from datalist.pickle (relative to the current working
# directory).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load a pickle that this notebook wrote itself.
with open("datalist.pickle","rb") as myf:
    datalist = pickle.load(myf)

Remove sentences with zero length and view final length of dataset

In [None]:
# Print every entry whose text has length zero, then display the dataset size.
empty_entries = [entry for entry in datalist if len(entry[0]) == 0]
for it in empty_entries:
    print(it)
len(datalist)

In [None]:
# Drop every entry whose text has length zero, whatever its label.
# Unlike list.remove, this filter does not raise ValueError when no such entry
# exists, so the cell can be re-run safely. Slice assignment keeps the update
# in place, as list.remove did.
datalist[:] = [entry for entry in datalist if len(entry[0]) > 0]
len(datalist)

In [None]:
# Drop the junk label-0 entry whose text is only whitespace padding around
# "~~15". Matching on the stripped text avoids depending on the exact number of
# padding spaces, and the filter does not raise ValueError when the entry is
# absent, so the cell can be re-run safely. Slice assignment keeps the update
# in place, as list.remove did.
datalist[:] = [
    entry for entry in datalist
    if not (entry[1] == 0 and entry[0].strip() == "~~15")
]
len(datalist)

Examples of textattack's pretrained models and their usage can be found here: https://github.com/QData/TextAttack/tree/master/textattack/models#readme

In [None]:
# Tag interpolated into the CSV log file names (log_{task}_<phase>.csv).
task = "allennlp_VERB_news"
# Wrap the (text, label) pairs in a TextAttack dataset.
dataset = textattack.datasets.Dataset(datalist)

## Step 0

Create a part-of-speech tag constraint that gives us control over the generated edits by specifying which part of speech we want to target for modification.

In [None]:
from textattack.constraints.pre_transformation_constraint import PreTransformationConstraint
import spacy

class POSWordsModified(PreTransformationConstraint):
    """Pre-transformation constraint that only allows words carrying a given
    spaCy part-of-speech tag to be modified.

    Args:
        targeted_pos_tag: value compared against each spaCy token's ``pos_``
            attribute (e.g. ``"VERB"``).
    """

    def __init__(self, targeted_pos_tag):
        self.targeted_pos_tag = targeted_pos_tag
        self.nlp = spacy.load("en_core_web_sm")

    def _get_modifiable_indices(self, current_text):
        """Returns the word indices in current_text which are able to be
        modified based on targeted pos tag.

        A word is modifiable when a spaCy token with the targeted tag starts
        at the same character offset as the word and has exactly the same
        string. Aligning by position (instead of ``list.index``, which always
        returns the first occurrence of a string) keeps repeated words and
        same-spelling words with a different tag from being confused.

        Args:
            current_text: object exposing ``.text`` (the full string) and
                ``.words`` (its list of words).

        Returns:
            set of int indices into ``current_text.words``.
        """
        text = current_text.text
        words = current_text.words

        # Character offset at which each word starts -> index of that word.
        # assumes every entry of `words` occurs verbatim in `text`, in order
        # -- TODO confirm against TextAttack's tokenizer. Words that cannot be
        # located are skipped and can never be marked modifiable.
        word_index_at = {}
        cursor = 0
        for word_index, word in enumerate(words):
            start = text.find(word, cursor)
            if start < 0:
                continue
            word_index_at[start] = word_index
            cursor = start + len(word)

        modifiable = set()
        for token in self.nlp(text):
            if token.pos_ != self.targeted_pos_tag:
                continue
            # token.idx is the token's character offset within the parsed text.
            word_index = word_index_at.get(token.idx)
            # Require an exact string match so that tokens spaCy splits
            # differently from `words` are not mapped to the wrong word.
            if word_index is not None and words[word_index] == str(token):
                modifiable.add(word_index)
        return modifiable

In [None]:
# Step 0: start from the TextFooler recipe and reuse its goal function, search
# method, transformation and constraints, adding the POS constraint on top.
attack = textattack.attack_recipes.TextFoolerJin2019.build(model_wrapper)
pw = POSWordsModified("VERB")
gf = attack.goal_function
sm = attack.search_method 
tr = attack.transformation
cs = attack.constraints
# Override the transformation's max_candidates attribute.
tr.max_candidates = 250
# NOTE(review): TextAttack's Attack appears to keep pre-transformation
# constraints in `attack.pre_transformation_constraints`, separate from
# `attack.constraints`. If so, the recipe's own pre-transformation constraints
# are not carried over into the rebuilt attack below -- confirm this is intended.
cs.insert(0,pw)

# Rebuild the attack from the reused components plus the POS constraint.
attack = textattack.attack.Attack(gf, cs, tr, sm)
# Attack up to 430 examples, logging results to log_{task}_0.csv
# (plain CSV colouring, stdout output enabled).
attack_args = textattack.AttackArgs(num_examples=430, log_to_csv=f"log_{task}_0.csv", csv_coloring_style="plain", disable_stdout=False)
attacker = textattack.Attacker(attack, dataset, attack_args)
attacker.attack_dataset()

In [None]:
import pandas as pd
def create_dataset_from_csv(path_to_csv):
    """Build a (text, label) list from the successful rows of an attack log.

    Args:
        path_to_csv: path to a comma-separated log containing the columns
            ``result_type``, ``perturbed_text`` and ``perturbed_output``.

    Returns:
        A list of ``(perturbed_text, perturbed_output)`` tuples, in file order,
        for the rows whose ``result_type`` equals ``"Successful"``.

    Raises:
        KeyError: if one of the three columns is missing from the file.
    """
    log = pd.read_csv(path_to_csv, sep=",")
    # Filter with a boolean mask instead of iterating row by row.
    successful = log.loc[log["result_type"] == "Successful"]
    return list(zip(successful["perturbed_text"], successful["perturbed_output"]))

## Steps 1 through 9

In [None]:
# Phases 1-9: each phase re-attacks the successful adversarial examples
# produced by the previous phase.
for num_of_phase in range(1,10):
    # Read the previous phase's log through the same relative path it was
    # written to (log_to_csv below), so reading and writing cannot diverge
    # when the working directory is not /kaggle/working/mice.
    datalist = create_dataset_from_csv(f"log_{task}_{num_of_phase-1}.csv")
    if not datalist:
        # Nothing left to attack: stop instead of running an attack on an
        # empty dataset and producing a log with no rows for the next phase.
        print(f"No successful examples in phase {num_of_phase-1}; stopping.")
        break
    dataset = textattack.datasets.Dataset(datalist)
    num_to_attack = len(datalist)

    # Rebuild the TextFooler-based attack with the POS constraint added.
    attack = textattack.attack_recipes.TextFoolerJin2019.build(model_wrapper)
    pw = POSWordsModified("VERB")
    gf = attack.goal_function
    sm = attack.search_method
    tr = attack.transformation
    cs = attack.constraints
    tr.max_candidates = 250
    cs.append(pw)

    attack = textattack.attack.Attack(gf, cs, tr, sm)
    attack_args = textattack.AttackArgs(num_examples=num_to_attack, log_to_csv=f"log_{task}_{num_of_phase}.csv", csv_coloring_style="plain", disable_stdout=True)
    attacker = textattack.Attacker(attack, dataset, attack_args)
    attacker.attack_dataset()