### Evaluate unlearnable text

In [None]:

import random
from allennlp_extra.utils.data_utils import load_data, evaluate_instances
from allennlp.predictors import Predictor
from allennlp_extra.data.dataset_readers import *
# Evaluation settings: dataset name, model directory name, and data split.
task = "ag_news"
model_name = 'lstm'
split = 'test'

# Load the archived predictor from disk and take its underlying model.
predictor = Predictor.from_path(f'models/{task}/{model_name}/model_unlearnable1.tar.gz')
model = predictor._model

# Read all instances of the configured split. The `split` setting above is
# used here (rather than a second hard-coded "test" literal) so that changing
# it actually changes which data is evaluated.
instances = load_data(task=task, split=split, pretrained_transformer=None).iter_instances()

# Partition the instances into the class treated as unlearnable (label "1")
# and everything else. The seed is fixed so the random draws are reproducible.
random.seed(13370)
unlearnable_instances = []
other_instances = []
for instance in instances:
    # NOTE: random.uniform(0, 1) <= 1 is always true, so every label-"1"
    # instance is selected; the draw is kept so the threshold can be lowered
    # to subsample without changing the structure of the loop.
    if random.uniform(0, 1) <= 1 and instance.fields['label'].label == "1":
        unlearnable_instances.append(instance)
    else:
        other_instances.append(instance)

# Report accuracy separately for the two groups. Only the first element of
# the pair returned by evaluate_instances is used.
accuracy, _ = evaluate_instances(unlearnable_instances, model)
print("Accuracy for the unlearnable class is ", accuracy)
accuracy, _ = evaluate_instances(other_instances, model)
print("Accuracy for the other classes is ", accuracy)


### Inspect modifications

In [None]:
import argparse
import pickle

from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.data_loaders import MultiProcessDataLoader

from allennlp_models.classification.dataset_readers.stanford_sentiment_tree_bank import StanfordSentimentTreeBankDatasetReader
from allennlp_models.generation.dataset_readers.cnn_dm import CNNDailyMailDatasetReader

# Command-line options: which modification file to inspect and which
# dataset/split it refers to.
parser = argparse.ArgumentParser()

parser.add_argument(
    "--modification_path",
    type=str,
    default='outputs/cnn_dm/simple/modification_epoch0_batch2.json',
    help="path to modification file specifying where to modify, what to modify"
)
parser.add_argument(
    "--dataset",
    type=str,
    default='cnn_dm',
    help="name of dataset to modify"
)
parser.add_argument(
    "--split",
    type=str,
    default='train',
    help="name of the dataset split to load"
)


args = parser.parse_args()

# NOTE(review): the default path ends in ".json" but the file is read with
# pickle -- confirm the on-disk format. pickle can execute arbitrary code on
# load, so only point this at trusted files.
# The context manager closes the file handle once loading is done.
with open(args.modification_path, 'rb') as modification_file:
    modifications = pickle.load(modification_file)

# Pick the dataset reader, the data location, and the name of the text field
# holding the input tokens for the requested dataset.
if args.dataset == 'sst2':
    reader = StanfordSentimentTreeBankDatasetReader(granularity='2-class')
    file_path = f'https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/{args.split}.txt'
    input_field_name = 'tokens'
elif args.dataset == 'cnn_dm':
    model_name = 'facebook/bart-base'
    source_tokenizer = PretrainedTransformerTokenizer(model_name)
    source_token_indexers = {"tokens": PretrainedTransformerIndexer(model_name=model_name, namespace="tokens")}
    reader = CNNDailyMailDatasetReader(
        source_tokenizer=source_tokenizer,
        source_token_indexers=source_token_indexers,
        source_max_tokens=1022,
        target_max_tokens=54,
    )
    file_path = f'../data/cnn_dm/url_lists/sample_{args.split}.txt'
    input_field_name = 'source_tokens'
elif args.dataset == 'squad':
    # No reader is configured for this dataset; fail here with a clear message
    # instead of hitting an undefined `reader` when the data loader is built.
    raise NotImplementedError("dataset 'squad' is not supported yet")
else:
    raise ValueError(f"unknown dataset: {args.dataset!r}")


# Materialise every instance so they can be looked up by position later.
dataloader = MultiProcessDataLoader(reader, file_path, batch_size=64)
instances = list(dataloader.iter_instances())

# Interactive loop: for each index entered, show the original text, the
# label (when the instance has one), and the text after applying the stored
# single-token modification. A blank line or end-of-input ends the loop.
while True:
    try:
        idx = input('Input the index of data to modify:')
    except EOFError:
        break
    if not idx.strip():
        break

    # Parse the index once; re-prompt instead of crashing on bad input.
    try:
        data_idx = int(idx)
    except ValueError:
        print(f'Not a valid integer index: {idx!r}')
        continue

    try:
        where_to_modify, what_to_modify = modifications[data_idx]
        instance = instances[data_idx]
    except (IndexError, KeyError):
        print(f'No modification or instance available for index {data_idx}')
        continue

    tokens_text = [token.text for token in instance.fields[input_field_name].tokens]
    print('The original text:', ' '.join(tokens_text))

    # Not every instance carries a 'label' field; print it only when present.
    try:
        label_field = instance.fields['label']
    except KeyError:
        label_field = None
    if label_field is not None:
        print('Label is: ', label_field.label)

    # Only the first entry of what_to_modify is used as the replacement token.
    print(f'Modifying {tokens_text[where_to_modify]} at position {where_to_modify} into {what_to_modify[0]}')
    tokens_text[where_to_modify] = what_to_modify[0]
    print('The modified text:', ' '.join(tokens_text))

    # Scratch notes kept from the original (apparently from an earlier run):
    # 4: doctor remove five ..
    # Modifying . at position 529 into ests

### Generate counter-fitting embeddings for constraint

In [None]:
import numpy as np
import sys

# Path to the text embedding file, taken from the first command-line argument.
embedding_path = sys.argv[1] #'data/counter-fitted-vectors.txt'

# Parse one vector per line. The first whitespace-separated field on each
# line is skipped (presumably the word itself) and the rest are read as floats.
embeddings = []
with open(embedding_path, 'r') as ifile:
    for line in ifile:
        fields = line.split()
        if not fields:
            # Skip blank lines; they would otherwise add an empty row and
            # make the rows unequal in length.
            continue
        embeddings.append([float(num) for num in fields[1:]])
embeddings = np.array(embeddings)

# Scale every row to unit L2 norm, then store the result as float32.
# NOTE: a row whose norm is zero is divided by zero here and comes out as NaN.
norm = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = np.asarray(embeddings / norm, "float32")
np.save('data/counter_fitting_embeddings.npy', embeddings)