In [1]:
pip install "datasets<4.0.0"

Collecting datasets<4.0.0
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-3.6.0


In [2]:
from PIL import Image
import requests
from datasets import load_dataset
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModelForSeq2SeqLM
import os
import torch
import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from random import randint
import nltk.data
import random

nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
def loadDataset(setSize=10):
  ds = load_dataset("nlphuji/flickr30k")

  dataset = ds['test'].select(range(setSize))
  images = [x['image'] for x in dataset]
  captions = [x['caption'][0] for x in dataset]
  map = range(setSize)

  return images, captions, map

In [4]:
def getRecall(images, captions, textToImageMap, top):
  setSize = len(captions)
  model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
  processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

  input_images = processor(images=images, return_tensors="pt")
  input_texts = processor(
      text=captions, return_tensors="pt", padding=True
  )

  with torch.inference_mode():
      image_features = model.get_image_features(**input_images)
      text_features = model.get_text_features(**input_texts)

  image_features = image_features/image_features.norm(dim=-1, keepdim=True)
  text_features = text_features/text_features.norm(dim=-1, keepdim=True)
  similarity = (100.0 * text_features @ image_features.T).softmax(dim=-1)

  truePredic = 0

  for i in range(setSize):
    values, indices = similarity[i].topk(top)
    # print(values)
    if textToImageMap[i] in indices:
      truePredic += 1
  recall = truePredic/setSize
  return recall

In [5]:
def replaceWithSynonym(text, replaceRate):
  def getPOS(pos):
    if pos[0] == 'N':
      return wn.NOUN
    if pos[0] == 'V':
      return wn.VERB
    if pos[0] == 'J':
      return wn.ADJ

  output = ""
  tokenized = tokenizer.tokenize(text)
  words = word_tokenize(text)
  tagged = nltk.pos_tag(words)

  for i in range(0,len(tagged)):
      change = True

      if tagged[i][1] == 'NNP' or tagged[i][1] == 'DT' or (tagged[i][1][0] not in ['N', 'V', 'J']) or random.random() > replaceRate:
        change = False

      word = tagged[i][0]
      pos = tagged[i][1]

      synsets = wordnet.synsets(word, pos=getPOS(pos))
      arr = [synset.lemma_names() for synset in synsets]
      synonyms = []
      for x in arr:
        for y in x:
          if y.lower() != word.lower():
            synonyms.append(y)
      random.shuffle(synonyms)
      if len(synonyms) > 0:
        synonym = synonyms[0]
      else:
        change = False
      output += (synonym if change else word) + " "
  return output

In [6]:
def replaceWithHyperHyponym(text, replaceRate, hyper=True):
  def getPOS(pos):
    if pos[0] == 'N':
      return wn.NOUN
    if pos[0] == 'V':
      return wn.VERB
    if pos[0] == 'J':
      return wn.ADJ

  output = ""
  tokenized = tokenizer.tokenize(text)
  words = word_tokenize(text)
  tagged = nltk.pos_tag(words)

  for i in range(0,len(tagged)):
      change = True

      if tagged[i][1] == 'NNP' or tagged[i][1] == 'DT' or (tagged[i][1][0] not in ['N']) or random.random() > replaceRate:
        change = False

      word = tagged[i][0]
      pos = tagged[i][1]

      synsets = wordnet.synsets(word, pos=getPOS(pos))
      hypnyms = []

      for synset in synsets:
        if word.lower() in synset.lemma_names():
          hypnyms += synset.hypernyms() if hyper else synset.hyponyms()

      arr = [synset.lemma_names() for synset in hypnyms]

      synonyms = []
      for x in arr:
        for y in x:
          if y.lower() != word.lower():
            synonyms.append(y)
      random.shuffle(synonyms)
      if len(synonyms) > 0:
        synonym = synonyms[0]
      else:
        change = False
      output += (synonym if change else word) + " "
  return output

In [7]:
paraphraseTokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
paraphraseModel = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")

def paraphrase(sentence, count):
  device = "cpu"

  text =  "paraphrase: " + sentence + " </s>"

  encoding = paraphraseTokenizer.encode_plus(text, return_tensors="pt")

  input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

  outputs = paraphraseModel.generate(
      input_ids=input_ids, attention_mask=attention_masks,
      max_length=256,
      do_sample=True,
      top_k=200,
      top_p=0.95,
      early_stopping=True,
      num_return_sequences=count
  )
  final_outputs = []

  for output in outputs:
      line = paraphraseTokenizer.decode(output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
      final_outputs.append(line)
  return final_outputs


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [8]:
setSize = 1000
images, captions, textToImageMap = loadDataset(setSize)


README.md:   0%|          | 0.00/641 [00:00<?, ?B/s]

flickr30k.py: 0.00B [00:00, ?B/s]

TEST/test/0000.parquet:   0%|          | 0.00/506M [00:00<?, ?B/s]

TEST/test/0001.parquet:   0%|          | 0.00/502M [00:00<?, ?B/s]

TEST/test/0002.parquet:   0%|          | 0.00/506M [00:00<?, ?B/s]

TEST/test/0003.parquet:   0%|          | 0.00/512M [00:00<?, ?B/s]

TEST/test/0004.parquet:   0%|          | 0.00/504M [00:00<?, ?B/s]

TEST/test/0005.parquet:   0%|          | 0.00/495M [00:00<?, ?B/s]

TEST/test/0006.parquet:   0%|          | 0.00/495M [00:00<?, ?B/s]

TEST/test/0007.parquet:   0%|          | 0.00/497M [00:00<?, ?B/s]

TEST/test/0008.parquet:   0%|          | 0.00/289M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/31014 [00:00<?, ? examples/s]

In [9]:
print(captions[0])
print(replaceWithSynonym(captions[0], 0.5))
print(replaceWithHyperHyponym(captions[0], 0.5))
print(paraphrase(captions[0], 5))

Two young guys with shaggy hair look at their hands while hanging out in the yard.


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Two young guys with shaggy hair feeling at their handwriting while hanging out in the thou . 
Two young guys with shaggy outgrowth visual_aspect at their hands while hanging out in the yard . 
['Two young guys with sagging hair look at their hands in the backyard.', 'Two young guys with shaggy hair look at their hands in the yard.', 'Two young guys with shaggy hair look at their hands as they hang out in the yard.', 'Two young guys with shaggy hair look at their hands while hanging out in the garden.', 'Two young guys with shaggy hair look at their hands while hanging in the garden.']


In [10]:
attackCountPerImage = 5
attackedCaptions = []
attackedMap = []
for i in range(len(captions)):
  for j in range(attackCountPerImage):
    attackedCaptions.append(replaceWithSynonym(captions[i], 0.5))
    attackedMap.append(i)
print(getRecall(images, attackedCaptions, attackedMap, 1))

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (79 > 77). Running this sequence through the model will result in indexing errors


ValueError: Sequence length must be less than max_position_embeddings (got `sequence length`: 79 and max_position_embeddings: 77

In [None]:
attackCountPerImage = 5
attackedCaptions = []
attackedMap = []
for i in range(len(captions)):
  for j in range(attackCountPerImage):
    attackedCaptions.append(replaceWithHyperHyponym(captions[i], 0.5, False))
    attackedMap.append(i)
print(getRecall(images, attackedCaptions, attackedMap, 1))

In [None]:
attackCountPerImage = 5
attackedCaptions = []
attackedMap = []
for i in range(len(captions)):
  phrasedTexts = paraphrase(captions[i], attackCountPerImage)
  for j in range(attackCountPerImage):
    attackedCaptions.append(phrasedTexts[j])
    attackedMap.append(i)
print(getRecall(images, attackedCaptions, attackedMap, 1))

In [None]:
setSize = 2000
images, captions, textToImageMap = loadDataset(setSize)

print(getRecall(images, captions, textToImageMap, 1))