### Preprocessing

In [1]:
import re
import fasttext
import pandas as pd

In [2]:
# Load the fastText language-identification model that detect_lang() reads
# through the notebook-level name `model`.
# NOTE(review): `model` is rebound to the Transformer checkpoint in a later
# cell, so language detection must run before that cell.
model = fasttext.load_model('lid.176.ftz')



In [3]:
def detect_lang(entry, lang_model=None, langs=("en", "fr", "es")):
    """Return the best-ranked language of ``entry`` among ``langs``.

    Runs of whitespace (including newlines) are collapsed to a single space
    before prediction. The model is asked for its top 176 labels and the
    first returned label whose language code is in ``langs`` is chosen.

    Args:
        entry: Text to classify.
        lang_model: Object exposing ``predict(text, k=...)`` whose first
            return element is a sequence of ``__label__xx`` strings.
            Defaults to the notebook-level ``model``.
        langs: Language codes that are acceptable answers.

    Returns:
        The selected language code (e.g. ``"en"``), or ``None`` when none of
        the candidate languages appears among the predicted labels.
    """
    if lang_model is None:
        # Resolved at call time so the notebook-level model is still the default.
        lang_model = model
    label_prefix = "__label__"
    wanted = {label_prefix + code for code in langs}
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    entry = re.sub(r"\s+", " ", entry)
    for pred in lang_model.predict(entry, k=176)[0]:
        if pred in wanted:
            return pred[len(label_prefix):]
    # No candidate predicted: report "unknown" instead of raising IndexError.
    return None

In [4]:
def preprocess(entry):
    """Strip a leading bracketed prefix (e.g. a ``[date]`` tag) from ``entry``.

    Only the first ``[...]`` group at the very start of the text is removed;
    bracketed spans that occur later in the excerpt are kept.

    Args:
        entry: Raw excerpt text.

    Returns:
        The excerpt without the leading bracketed prefix and without
        leading/trailing whitespace.
    """
    # Non-greedy match: a greedy ".+" would run to the LAST "]" on the first
    # line and delete real excerpt text that happens to contain brackets.
    return re.sub(r"^\[.+?\]", " ", entry).strip()

In [5]:
# Load the full annotated dataset from the working directory.
# NOTE(review): the saved output lists an 'Unnamed: 0' column, presumably an
# index written by an earlier to_csv call -- confirm whether it is needed.
df = pd.read_csv("full_dataset.csv")
# Show the available columns (excerpt text plus the label columns).
df.columns

  exec(code_obj, self.user_global_ns, self.user_ns)


Index(['Unnamed: 0', 'entry_id', 'excerpt', 'analysis_framework_id', 'lead_id',
       'project_id', 'verified', 'sectors', 'subpillars_2d', 'subpillars_1d',
       'geo_location', 'specific_needs_groups', 'severity', 'dates',
       'demographic_groups', 'reliability', 'affected_groups'],
      dtype='object')

In [6]:
# Clean every excerpt in place, then tag it with its detected language.
df["excerpt"] = df["excerpt"].apply(preprocess)
df["lang"] = df["excerpt"].apply(detect_lang)
# Keep only the rows whose detected language is English.
df_en = df.loc[df["lang"] == "en"]

### TextAttack

In [7]:
from ast import literal_eval

import torch
from torch.nn.utils.rnn import pad_sequence

import datasets
import textattack
from tqdm.auto import tqdm
from textattack.attacker import Attacker
from textattack.datasets import HuggingFaceDataset
from textattack.attack_recipes import TextBuggerLi2018
from textattack.models.wrappers.pytorch_model_wrapper import PyTorchModelWrapper

In [8]:
%load_ext autoreload
%autoreload 2

In [9]:
# Narrow the English subset to the excerpt text and the label columns.
df_en = df_en.loc[:, [
    'excerpt',
    'sectors',
    'subpillars_2d',
    'subpillars_1d',
    'geo_location',
    'specific_needs_groups',
    'severity',
    'demographic_groups',
    'reliability',
    'affected_groups',
]]

In [10]:
def get_unique_values(df, colname):
    """Return the sorted distinct items found across ``df[colname]``.

    Every entry of ``df[colname]`` is iterated and its items are pooled into
    one set, which is then returned as a sorted list.
    """
    pooled = {item for items in df[colname] for item in items}
    return sorted(pooled)

def col_to_multicols(df, colname):
    """Expand a stringified multi-label column into one 0/1 column per label.

    Works on a copy holding only ``"excerpt"`` and ``colname``. The values of
    ``colname`` are parsed with ``literal_eval``, one column is added for each
    distinct label, and a cell is set to 1 when the row carries that label.

    Returns:
        A ``(frame, unique_values)`` tuple: the expanded copy and the sorted
        list of labels that became columns.
    """
    expanded = df[["excerpt", colname]].copy()
    expanded[colname] = expanded[colname].apply(literal_eval)
    unique_values = get_unique_values(expanded, colname)
    # Start every indicator column at 0 ...
    for label_name in unique_values:
        expanded[label_name] = 0
    # ... then switch on the labels that each row actually has.
    rows = tqdm(expanded[colname].items(), total=expanded.shape[0])
    for row_idx, row_labels in rows:
        for label_name in row_labels:
            expanded.at[row_idx, label_name] = 1
    return expanded, unique_values

In [11]:
# Expand the 'sectors' labels of the English excerpts into 0/1 columns;
# `sectors` receives the sorted list of label names.
df_sectors, sectors = col_to_multicols(df_en, 'sectors')

100%|██████████| 114406/114406 [00:00<00:00, 136990.29it/s]


In [12]:
# Show the sector labels found in the data.
sectors

['Agriculture',
 'Cross',
 'Education',
 'Food Security',
 'Health',
 'Livelihoods',
 'Logistics',
 'Nutrition',
 'Protection',
 'Shelter',
 'WASH']

In [13]:
# Sector whose 0/1 column is used as the classification target.
label = "WASH"
# Build a HuggingFace dataset exposing the excerpt text and the target column.
ds = datasets.Dataset.from_pandas(
    df_sectors,
    features=datasets.Features(
        {
            "excerpt": datasets.Value("string"),
            label: datasets.Value("uint8"),
        }
    ),
)

In [14]:
# Sanity check: number of examples in the HuggingFace dataset.
len(ds)

114406

In [15]:
class TokenizerWrapper:
    """Callable adapter that returns only the ``"input_ids"`` of a tokenizer call."""

    def __init__(self, tokenizer):
        # Keep the raw tokenizer reachable as ``.tokenizer``.
        self.tokenizer = tokenizer

    def __call__(self, *args, **kwargs):
        """Forward all arguments to the tokenizer and return its ``"input_ids"``."""
        encoded = self.tokenizer(*args, **kwargs)
        return encoded["input_ids"]

In [16]:
class CustomModelWrapper(PyTorchModelWrapper):
    """TextAttack model wrapper that feeds raw strings to the wrapped model.

    The wrapped model must expose ``empty_dataset.tokenizer`` and is called
    with a ``{"ids": <LongTensor>, "mask": None}`` dict, as done in
    ``__call__`` below.
    """

    def __init__(self, model):
        """Wrap ``model`` and cache the tokenizer limits used for batching.

        Args:
            model: Module exposing ``empty_dataset.tokenizer``; the tokenizer
                must provide ``max_len_single_sentence`` and ``pad_token_id``.
        """
        super().__init__(model,
                         TokenizerWrapper(model.empty_dataset.tokenizer))
        self.max_len = self.tokenizer.tokenizer.max_len_single_sentence
        self.pad_token_id = model.empty_dataset.tokenizer.pad_token_id

    @staticmethod
    def _pad_ids(ids, pad_token_id):
        """Right-pad every id list to the length of the longest one."""
        pad_to_len = max(map(len, ids), default=0)
        return [x + [pad_token_id] * (pad_to_len - len(x)) for x in ids]

    def __call__(self, text_input_list, batch_size=32):
        """Tokenize ``text_input_list`` and return the model outputs.

        Args:
            text_input_list: List of input strings.
            batch_size: Accepted for interface compatibility; currently
                unused, the whole list goes through one forward pass.

        Returns:
            Whatever ``self.model`` returns for the id batch, computed under
            ``torch.no_grad()``.
        """
        model_device = next(self.model.parameters()).device
        ids = self.tokenizer(text_input_list,
                             truncation=True,
                             max_length=self.max_len)
        # Always pad explicitly instead of relying on a bare ``except`` around
        # torch.tensor(): the bare except also swallowed unrelated errors
        # (including KeyboardInterrupt). Padding is a no-op when all
        # sequences already share one length.
        ids = self._pad_ids(ids, self.pad_token_id)
        ids = torch.tensor(ids, dtype=torch.long).to(model_device)
        # NOTE(review): no attention mask is supplied, so padded positions are
        # not masked here -- confirm the model copes with ``mask=None``.
        ids = {"ids": ids, "mask": None}

        with torch.no_grad():
            outputs = self.model(ids)

        return outputs

In [17]:
# NOTE(review): this import sits mid-notebook rather than in the import cell,
# and CustomDataset is not used in the cells visible here.
from modeling import CustomDataset, Transformer
# Checkpoint of the classifier under attack and the device to run it on.
CKPT_PATH = "model.ckpt"
DEVICE = "cpu"
# NOTE(review): this rebinds `model`, which earlier held the fastText
# language-ID model that detect_lang() reads by default; re-running the
# language-detection cell after this one no longer uses the fastText model.
model = Transformer.load_from_checkpoint(CKPT_PATH)
model.to(DEVICE);

  rank_zero_deprecation(


In [18]:
# Wrap the loaded Transformer so TextAttack can call it on lists of raw strings.
model_wrapper = CustomModelWrapper(model)
# NOTE(review): the two lines below are leftover experiments.
#model_wrapper = PyTorchModelWrapper(model.model, TokenizerWrapper(model.empty_dataset.tokenizer))
#model_wrapper

In [19]:
# Expose the HuggingFace dataset to TextAttack: "excerpt" is the input text
# column and `label` names the target column.
dataset = HuggingFaceDataset(ds, dataset_columns=[["excerpt"], label])

In [20]:
# NOTE(review): leftover module inspection; its saved output exposes an
# absolute local install path. Consider removing this cell.
textattack.attack_args

<module 'textattack.attack_args' from '/home/abdullah/anaconda3/lib/python3.8/site-packages/textattack/attack_args.py'>

In [21]:
# Build the TextBugger attack recipe around the wrapped classifier.
attack = TextBuggerLi2018.build(model_wrapper)
# Alternative recipes that were tried or considered:
#attack = textattack.attack_recipes.BAEGarg2019.build(model_wrapper)
#attack = textattack.attack_recipes.BERTAttackLi2020.build(model_wrapper)
#attack = textattack.attack_recipes.CheckList2020.build(model_wrapper)
attack_args = textattack.AttackArgs(
    # NOTE(review): hard-coded sample size; consider a named constant in a
    # config cell.
    num_examples=10000,
    num_successful_examples=None,
    num_examples_offset=0,
    attack_n=False,
    shuffle=False,
    query_budget=None,
    checkpoint_interval=None,
    checkpoint_dir='checkpoints',
    # NOTE(review): hard-coded seed; consider a named constant as well.
    random_seed=765,
    parallel=False,
    num_workers_per_device=1,
    # All log_to_* sinks are None, presumably disabling file/remote logging
    # so that results only live in memory -- confirm.
    log_to_txt=None,
    log_to_csv=None,
    csv_coloring_style='file',
    log_to_visdom=None,
    log_to_wandb=None,
    disable_stdout=False,
    silent=False,
)
# Bind the recipe, the dataset and the run options together.
attacker = Attacker(attack, dataset, attack_args)

textattack: Unknown if model of class <class 'modeling.Transformer'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


In [None]:
# Run the configured attack over `dataset` and keep the per-example results.
# NOTE(review): this cell has no execution count in the saved notebook while
# the cells below do (25, 39), so the notebook state comes from an
# out-of-order session; verify with Restart & Run All.
attack_results = attacker.attack_dataset()

In [23]:
# textattack.attack_recipes.BAEGarg2019
# textattack.attack_recipes.BERTAttackLi2020
# textattack.attack_recipes.CheckList2020
# textattack.attack_recipes.CLARE2020
# textattack.attack_recipes.DeepWordBugGao2018
# textattack.attack_recipes.FasterGeneticAlgorithmJia2019
# textattack.attack_recipes.GeneticAlgorithmAlzantot2018
# textattack.attack_recipes.HotFlipEbrahimi2017
# textattack.attack_recipes.IGAWang2019
# textattack.attack_recipes.InputReductionFeng2018
# textattack.attack_recipes.Kuleshov2017
# textattack.attack_recipes.MorpheusTan2020
# textattack.attack_recipes.Pruthi2019
# textattack.attack_recipes.PSOZang2020
# textattack.attack_recipes.PWWSRen2019
# textattack.attack_recipes.Seq2SickCheng2018BlackBox
# textattack.attack_recipes.TextFoolerJin2019
# textattack.attack_recipes.TextBuggerLi2018

In [25]:
# Grab the first attack result for interactive inspection.
# NOTE(review): `ar` is also the loop variable of the filtering loop in the
# next cell, so it is rebound there.
ar = attack_results[0]

In [39]:
# Keep every attack result whose goal-function summary is not "SKIPPED".
sucess_or_fail_atts = []
for ar in attack_results:
    if "SKIPPED" not in ar.goal_function_result_str():
        sucess_or_fail_atts.append(ar)