<a href="https://colab.research.google.com/github/yifengd/adversarial-nlp/blob/main/defenses/captum/catum.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Based on: Explain Attacking BERT models using Captum

Captum is a PyTorch library for explaining neural networks.
Here we show a minimal example that uses Captum to explain BERT models from TextAttack.

[![Open Notebook in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yifengd/adversarial-nlp/blob/main/defenses/captum/catum.ipynb)

[![Original Code on GitHub](https://img.shields.io/badge/github-view%20source-black.svg)](https://github.com/QData/TextAttack/blob/master/docs/2notebook/Example_5_Explain_BERT.ipynb)

In [None]:
!pip install textattack[tensorflow] tensorflow_text==2.10.0b2 captum nltk -q

In [None]:
# Mount Google Drive under /content/drive; the adversarial-sample CSV read
# further down is loaded from a path below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
from copy import deepcopy
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
# from textattack.datasets import HuggingFaceDataset
from textattack.models.wrappers import HuggingFaceModelWrapper
# from textattack.models.wrappers import ModelWrapper
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
# Prefer the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(f"Using accelerator {device}")

Using accelerator cuda:0


## Configure Model and Dataset

In [None]:
# dataset = HuggingFaceDataset("ag_news", None, "train")
# Model and tokenizer are loaded from the same Hugging Face checkpoint, so the
# checkpoint name is kept in one place.
MODEL_NAME = "textattack/bert-base-uncased-ag-news"
original_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
original_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# TextAttack wrapper; the rest of the notebook reaches the network and the
# tokenizer through `model.model` and `model.tokenizer`.
model = HuggingFaceModelWrapper(original_model, original_tokenizer)

In [None]:
# Print the wrapped network's module tree; the attribution cell below addresses
# one of its encoder layers by index.
print(model.model)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

## Load Dataframe with Original and Perturbed Samples

In [None]:
# Project root on the mounted Google Drive.
DRIVE_PATH = '/content/drive/MyDrive/NLP-Lab/AdversarialXAI'
# One row per attacked sample: the original text, its adversarial rewrite, and
# the class names / confidences reported for both.
adversarial_df = pd.read_csv(
    DRIVE_PATH + "/Adversarial Samples/Older attacks/ag-news_pwws_bert.csv"
)

In [None]:
def class_name_to_index(class_name):
  """Map a dataset class name to its integer label index.

  Args:
    class_name: One of "World", "Sports", "Business" or "Sci/tech". A value
      that is already a valid index (0-3) is returned unchanged, so the cell
      that applies this mapping to the dataframe can be re-run safely.

  Returns:
    The integer class index.

  Raises:
    ValueError: If `class_name` is neither a known name nor a valid index.
  """
  indices = {"World": 0, "Sports": 1, "Business": 2, "Sci/tech": 3}
  if isinstance(class_name, str):
    if class_name in indices:
      return indices[class_name]
  elif class_name in indices.values():
    # Already converted by an earlier run of the mapping cell.
    return int(class_name)
  raise ValueError(class_name)

# Replace the class names in both label columns with integer indices; these
# indices are later used as attribution targets.
for label_column in ("original_class", "adversarial_class"):
  adversarial_df[label_column] = adversarial_df[label_column].map(class_name_to_index)

In [None]:
# Show the loaded samples with the class columns now holding integer indices.
adversarial_df

Unnamed: 0.1,Unnamed: 0,original_text,adversarial_text,original_class,original_confidence,adversarial_class,adversarial_confidence,attack,replace_dict,replace_num
0,0,Fed lifts rates a further quarter point By And...,course lifts grass a further quarter taper pas...,2,(100%),1,(78%),pwws,"{'Fed': 'course', 'rates': 'grass', 'point': '...",6
1,1,Indian-Americans hail Manmohan speech New York...,Indian-Americans come Manmohan delivery New Yo...,0,(100%),2,(62%),pwws,"{'hail': 'come', 'speech': 'delivery', 'meetin...",4
2,2,"Unisys to lay off 1,400 workers Unisys Corp. p...","Unisys to lay off 1,400 workers Unisys Corp. p...",3,(95%),2,(80%),pwws,{'cuts': 'skip'},1
3,3,Dollar Mired Near Lows Before Jobs Data LONDO...,Dollar involved Near Low earlier occupation Da...,2,(100%),0,(97%),pwws,"{'Mired': 'involved', 'Lows': 'Low', 'Before':...",13
4,4,"Keep quiet on U.S. election, Martin tells loos...","sustain calm on uracil.siemens. election, Mart...",0,(100%),1,(81%),pwws,"{'Keep': 'sustain', 'quiet': 'calm', 'U.S.': '...",13
...,...,...,...,...,...,...,...,...,...,...
419,419,EU draft draws fire in Turkey BRUSSELS: Turkey...,EEC potation haulage terminate in Turkey BRUSS...,0,(100%),2,(69%),pwws,"{'EU': 'EEC', 'draft': 'potation', 'draws': 'h...",4
420,420,U.S. Spies on Chat Rooms Could terrorists be p...,u.sec. espy on Chat Rooms Could terrorists be ...,3,(100%),2,(52%),pwws,"{'U.S.': 'U.sulphur.', 'Spies': 'espy'}",2
421,421,Stocks Climb on Drop in Consumer Prices NEW YO...,Stocks Climb on Drop in Consumer Prices Modern...,0,(100%),2,(96%),pwws,"{'NEW': 'Modern', 'Stocks': 'line', 'prices......",3
422,422,Sanpaolo and Dexia in merger talks By Reuters ...,Sanpaolo and Dexia in unification dialogue By ...,2,(100%),0,(99%),pwws,"{'merger': 'unification', 'talks': 'dialogue',...",6


## Calculate Attributions in Original and Perturbed Samples

In [None]:
from captum.attr import Occlusion, DeepLift, IntegratedGradients, LayerConductance, LayerIntegratedGradients, LayerDeepLiftShap, InternalInfluence, LayerGradientXActivation, LayerActivation
from captum.attr import visualization as viz

# When True, the stored "<type>_attribution" column is the attribution summed
# over its last dimension; when False the full attribution tensor is stored.
SUM = False

def calculate(input_ids, token_type_ids, attention_mask):
    """Forward function handed to Captum.

    Captum calls this with the attributed input followed by the
    `additional_forward_args`, in exactly this parameter order.

    Args:
        input_ids: Token id tensor for a batch of texts.
        token_type_ids: Segment id tensor matching `input_ids`.
        attention_mask: Attention mask tensor matching `input_ids`.

    Returns:
        The first element of the model output (the logits for Hugging Face
        sequence-classification models called without labels).
    """
    # Forward the masks by keyword: the Hugging Face forward signature is
    # (input_ids, attention_mask, token_type_ids, ...), so passing them
    # positionally in this function's order would swap the two masks.
    return clone.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )[0]

# Attribute against a deep copy so the wrapper loaded above is left untouched.
clone = deepcopy(model)
# Move the copied network to the selected device; `calculate` looks `clone` up
# at call time, so defining it after the function is fine.
clone.model.to(device)

# Settings for the attribution pass.
MAX_LENGTH = 128   # every text is padded / truncated to this many tokens
BATCH_SIZE = 100   # rows handed to Captum per attribute() call
LAYER_INDEX = 3    # encoder layer whose output is attributed

# Layer Integrated Gradients on the output of one encoder layer. The explainer
# does not depend on the text type, so it is built once. The other Captum layer
# methods imported above (LayerActivation, LayerGradientXActivation,
# LayerConductance, ...) can be swapped in here.
lig = LayerIntegratedGradients(calculate, clone.model.bert.encoder.layer[LAYER_INDEX])

for text_type in ["original", "adversarial"]:
  # truncation=True is required together with padding="max_length": without it
  # a text longer than MAX_LENGTH keeps its full length and the batch can no
  # longer be packed into one rectangular tensor.
  tokens = model.tokenizer(
      adversarial_df[f"{text_type}_text"].tolist(),
      padding="max_length",
      truncation=True,
      max_length=MAX_LENGTH,
      return_tensors="pt",
  ).to(device)
  adversarial_df[f"{text_type}_tokens"] = [tokens[row].tokens for row in range(tokens.input_ids.shape[0])]

  # Attribute with respect to the class recorded for this text type.
  labels = torch.tensor(adversarial_df[f"{text_type}_class"].tolist()).to(device)

  attributions = []
  for start in range(0, len(tokens['input_ids']), BATCH_SIZE):
    batch = slice(start, start + BATCH_SIZE)
    attributions_next = lig.attribute(
        inputs=tokens['input_ids'][batch],
        additional_forward_args=(tokens['token_type_ids'][batch], tokens['attention_mask'][batch]),
        target=labels[batch],
        internal_batch_size=1,
    )
    # Keep only a detached CPU copy so finished batches do not pile up on the GPU.
    attributions.append(attributions_next.detach().cpu())

  attributions = torch.cat(attributions)

  # Zero the attributions of padding positions once, instead of once per neuron.
  mask = tokens.attention_mask.cpu()
  masked = (attributions * mask.unsqueeze(-1)).numpy()

  # Neuron attribution: one column per unit of the layer output, each holding
  # that unit's per-token attributions.
  for neuron in range(masked.shape[-1]):
    adversarial_df[f"{text_type}_attribution_neuron{neuron}"] = masked[:, :, neuron].tolist()

  if SUM:
    # One value per token: attribution summed over the last dimension.
    adversarial_df[f"{text_type}_attribution"] = (attributions.sum(dim=-1) * mask).numpy().tolist()
  else:
    # Full per-token, per-unit attribution.
    adversarial_df[f"{text_type}_attribution"] = masked.tolist()

  

## Preprocess the Dataframe

In [None]:
# Largest outer length among the stored attribution lists, per text type.
max_tokens_original = adversarial_df["original_attribution"].map(len).max()
max_tokens_perturbed = adversarial_df["adversarial_attribution"].map(len).max()

In [None]:
def pad_from_middle(x, num_pad):
  """Insert `num_pad` zeros at the midpoint of a 1-D array.

  The first half of `x` (rounded down) stays at the front, the remainder is
  shifted to the end, and the gap in between is zero. The result is a new
  float array of length ``x.shape[0] + num_pad``.
  """
  half = x.shape[0] // 2
  padded = np.zeros(x.shape[0] + num_pad)  # the gap is zero from the start
  padded[:half] = x[:half]
  padded[half + num_pad:] = x[half:]
  return padded

def make_samples(df, feature="attribution"):
  """Build a shuffled detector dataset from stored attributions.

  Args:
    df: Dataframe holding the columns ``original_<feature>`` and
      ``adversarial_<feature>``; every cell is a (nested) list and all cells
      must have the same shape.
    feature: Column suffix to read. Defaults to ``"attribution"``.

  Returns:
    ``(X, Y)``: `X` has one row per text (flattened to 2-D), `Y` is 0 for
    original texts and 1 for adversarial ones. Rows are shuffled with a fixed
    seed.
  """
  originals = [np.array(cell) for cell in df[f"original_{feature}"]]
  perturbed = [np.array(cell) for cell in df[f"adversarial_{feature}"]]

  # Originals first, adversarial second; the labels follow the same order.
  X = np.stack(originals + perturbed)
  Y = np.concatenate((np.zeros(len(originals), dtype=int),
                      np.ones(len(perturbed), dtype=int)))

  # Sort along axis 1 (the first per-sample axis), so the features describe
  # the distribution of attribution values rather than their positions.
  X = np.sort(X, axis=1)

  X, Y = sklearn.utils.shuffle(X, Y, random_state=42)

  return X.reshape(X.shape[0], -1), Y

In [None]:
# Shuffle the rows with a fixed seed so the 70/30 split, and every score
# reported below, is reproducible. Splitting by row keeps each original text
# and its adversarial counterpart in the same partition.
shuffled_df = adversarial_df.sample(frac=1, random_state=42)
split_at = int(len(shuffled_df) * 0.7)
train_df = shuffled_df.iloc[:split_at]
test_df = shuffled_df.iloc[split_at:]

## Experiment with different Classification Models for the Adversarial Detector

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Turn both partitions into feature matrices / labels for the detector.
x_train, y_train = make_samples(train_df)
x_test, y_test = make_samples(test_df)

# Shallow random forest as a first detector.
cls = RandomForestClassifier(max_depth=4, random_state=42)
cls.fit(x_train, y_train)

preds = cls.predict(x_test)

# Training report first, held-out report second; the gap between the two shows
# how much the detector overfits. (The empty plt.figure call that used to sit
# here drew nothing and was removed.)
print(sklearn.metrics.classification_report(y_train, cls.predict(x_train)))
print(sklearn.metrics.classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91       296
           1       0.89      0.93      0.91       296

    accuracy                           0.91       592
   macro avg       0.91      0.91      0.91       592
weighted avg       0.91      0.91      0.91       592

              precision    recall  f1-score   support

           0       0.69      0.70      0.70       128
           1       0.70      0.69      0.69       128

    accuracy                           0.70       256
   macro avg       0.70      0.70      0.70       256
weighted avg       0.70      0.70      0.70       256



<Figure size 1440x1440 with 0 Axes>

### Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Fit Gaussian Naive Bayes on the training features and report on the held-out set.
gnb = GaussianNB().fit(x_train, y_train)
preds = gnb.predict(x_test)
print(sklearn.metrics.classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       0.53      0.95      0.68       128
           1       0.75      0.14      0.24       128

    accuracy                           0.55       256
   macro avg       0.64      0.55      0.46       256
weighted avg       0.64      0.55      0.46       256



### KNN

In [None]:
 from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours detector with scikit-learn's default settings.
KNN = KNeighborsClassifier().fit(x_train, y_train)
preds = KNN.predict(x_test)
print(sklearn.metrics.classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       0.63      0.61      0.62       128
           1       0.62      0.64      0.63       128

    accuracy                           0.62       256
   macro avg       0.63      0.62      0.62       256
weighted avg       0.63      0.62      0.62       256



### Bernoulli Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB
# Bernoulli Naive Bayes detector with scikit-learn's default settings.
BNB = BernoulliNB().fit(x_train, y_train)
preds = BNB.predict(x_test)
print(sklearn.metrics.classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       0.57      0.70      0.63       128
           1       0.61      0.48      0.54       128

    accuracy                           0.59       256
   macro avg       0.59      0.59      0.59       256
weighted avg       0.59      0.59      0.59       256



### Logistic Regression Classifier

In [None]:
 from sklearn.linear_model import LogisticRegression
# Logistic regression detector; the seed is fixed for repeatable fits.
LR = LogisticRegression(random_state=42).fit(x_train, y_train)
preds = LR.predict(x_test)
print(sklearn.metrics.classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       0.49      0.66      0.56       128
           1       0.47      0.30      0.37       128

    accuracy                           0.48       256
   macro avg       0.48      0.48      0.46       256
weighted avg       0.48      0.48      0.46       256



### SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier
# Linear detector trained with stochastic gradient descent; seed fixed.
SGD = SGDClassifier(random_state=42).fit(x_train, y_train)
preds = SGD.predict(x_test)
print(sklearn.metrics.classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       0.51      0.99      0.67       128
           1       0.83      0.04      0.07       128

    accuracy                           0.52       256
   macro avg       0.67      0.52      0.37       256
weighted avg       0.67      0.52      0.37       256



### Support Vector Classifier

In [None]:
from sklearn.svm import SVC
# Support vector detector. The fitted estimator gets its own name: assigning it
# to `SVC` would rebind the imported class to an instance, so any later
# `SVC(...)` call without a fresh import would fail.
svc_clf = SVC(random_state=42)
svc_clf.fit(x_train, y_train)
preds = svc_clf.predict(x_test)
print(sklearn.metrics.classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.59      0.82      0.69       128
           1       0.71      0.43      0.53       128

    accuracy                           0.62       256
   macro avg       0.65      0.62      0.61       256
weighted avg       0.65      0.62      0.61       256



### Linear Support Vector Classifier

In [None]:
 from sklearn.svm import LinearSVC
# Linear support vector detector; seed fixed.
LSVC = LinearSVC(random_state=42).fit(x_train, y_train)
preds = LSVC.predict(x_test)
print(sklearn.metrics.classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       0.62      0.77      0.69       128
           1       0.69      0.53      0.60       128

    accuracy                           0.65       256
   macro avg       0.66      0.65      0.64       256
weighted avg       0.66      0.65      0.64       256



### Nu-Support Vector Classifier

In [None]:
from sklearn.svm import NuSVC
# Nu-support vector detector; seed fixed.
NSVC = NuSVC(random_state=42).fit(x_train, y_train)
preds = NSVC.predict(x_test)
print(sklearn.metrics.classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       0.75      0.72      0.73       128
           1       0.73      0.76      0.74       128

    accuracy                           0.74       256
   macro avg       0.74      0.74      0.74       256
weighted avg       0.74      0.74      0.74       256



### Random Forest

In [None]:
 from sklearn.ensemble import RandomForestClassifier
# Random forest without a depth limit (the first detector above used max_depth=4).
randomF = RandomForestClassifier(random_state=42).fit(x_train, y_train)
preds = randomF.predict(x_test)
print(sklearn.metrics.classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       0.73      0.74      0.73       128
           1       0.74      0.72      0.73       128

    accuracy                           0.73       256
   macro avg       0.73      0.73      0.73       256
weighted avg       0.73      0.73      0.73       256



### Extra Trees

In [None]:
 from sklearn.ensemble import ExtraTreesClassifier
# Extremely randomized trees detector; seed fixed.
extra_tree = ExtraTreesClassifier(random_state=42).fit(x_train, y_train)
preds = extra_tree.predict(x_test)
print(sklearn.metrics.classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

           0       0.74      0.71      0.73       128
           1       0.72      0.75      0.74       128

    accuracy                           0.73       256
   macro avg       0.73      0.73      0.73       256
weighted avg       0.73      0.73      0.73       256



### Single-Neuron Attributions (Neuron 308)

In [None]:
# Largest length among the stored per-token attribution lists of neuron 308.
max_tokens_original = adversarial_df["original_attribution_neuron308"].map(len).max()
max_tokens_perturbed = adversarial_df["adversarial_attribution_neuron308"].map(len).max()

def pad_from_middle(x, num_pad):
  """Return a float copy of 1-D `x` with `num_pad` zeros inserted at its midpoint.

  Elements before index ``x.shape[0] // 2`` keep their position; the rest are
  moved `num_pad` slots to the right.
  """
  split = x.shape[0] // 2
  result = np.zeros(x.shape[0] + num_pad)  # the inserted slots stay zero
  result[:split] = x[:split]
  result[split + num_pad:] = x[split:]
  return result

def make_samples(df, feature="attribution_neuron308"):
  """Build a shuffled detector dataset from one neuron's attributions.

  Args:
    df: Dataframe holding the columns ``original_<feature>`` and
      ``adversarial_<feature>``; every cell is a list and all cells must have
      the same shape.
    feature: Column suffix to read. Defaults to ``"attribution_neuron308"``;
      pass another suffix to study a different neuron.

  Returns:
    ``(X, Y)``: `X` has one row per text (flattened to 2-D), `Y` is 0 for
    original texts and 1 for adversarial ones. Rows are shuffled with a fixed
    seed.
  """
  originals = [np.array(cell) for cell in df[f"original_{feature}"]]
  perturbed = [np.array(cell) for cell in df[f"adversarial_{feature}"]]

  # Originals first, adversarial second; the labels follow the same order.
  X = np.stack(originals + perturbed)
  Y = np.concatenate((np.zeros(len(originals), dtype=int),
                      np.ones(len(perturbed), dtype=int)))

  # Sort each sample's values along axis 1, so the features describe the
  # distribution of attribution values rather than their positions.
  X = np.sort(X, axis=1)

  X, Y = sklearn.utils.shuffle(X, Y, random_state=42)

  return X.reshape(X.shape[0], -1), Y


# Shuffle the rows with a fixed seed so the 70/30 split and the scores printed
# below are reproducible. Splitting by row keeps each original text and its
# adversarial counterpart in the same partition.
shuffled_df = adversarial_df.sample(frac=1, random_state=42)
split_at = int(len(shuffled_df) * 0.7)
train_df = shuffled_df.iloc[:split_at]
test_df = shuffled_df.iloc[split_at:]

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Feature matrices built from the single-neuron attributions (make_samples as
# redefined earlier in this cell).
x_train, y_train = make_samples(train_df)
x_test, y_test = make_samples(test_df)

# Same shallow random forest as the first detector, for a like-for-like comparison.
cls = RandomForestClassifier(max_depth=4, random_state=42)
cls.fit(x_train, y_train)

preds = cls.predict(x_test)

# Training report first, held-out report second. (The empty plt.figure call
# that used to sit here drew nothing and was removed.)
print(sklearn.metrics.classification_report(y_train, cls.predict(x_train)))
print(sklearn.metrics.classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.76      0.75      0.75       296
           1       0.75      0.77      0.76       296

    accuracy                           0.76       592
   macro avg       0.76      0.76      0.76       592
weighted avg       0.76      0.76      0.76       592

              precision    recall  f1-score   support

           0       0.57      0.54      0.55       128
           1       0.56      0.59      0.57       128

    accuracy                           0.56       256
   macro avg       0.56      0.56      0.56       256
weighted avg       0.56      0.56      0.56       256



<Figure size 1440x1440 with 0 Axes>