In [1]:
import numpy as np
from transformers import CLIPProcessor, CLIPModel
import pytorchvideo.models.hub as models


In [2]:
import utils
import vlad
import attack


In [3]:
# Both models below are moved onto this device.
device = 'cuda'

# csn_r101 from the PyTorchVideo hub with pretrained weights, switched to eval mode.
ar_model = models.csn_r101(pretrained=True).eval().to(device)

# The CLIP model and its processor are loaded from the same checkpoint name.
clip_checkpoint = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(clip_checkpoint)
clip_model = clip_model.eval().to(device)
clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)


In [4]:
# One label per line of the label file, with trailing whitespace removed.
# Line order is preserved, so a label's position in the list matches its line.
with open('kinetics-labels.txt') as labels_file:
	label_names = list(map(str.rstrip, labels_file))

# vid1.npy through vid8.npy.
video_names = ['vid{}.npy'.format(number) for number in range(1, 9)]

# Per-video score accumulators, appended to by the evaluation loop.
scores_clean, scores_attacked = [], []


In [5]:
# Score every video twice with vlad.vlad_score: once as loaded and once after
# it has been passed through attack.pgd_attack against ar_model.
#
# The accumulators are reset at the top of this cell so that re-running it
# replaces the previous results instead of appending a second set; the report
# reads these lists by position, one entry per name in video_names.
scores_clean = []
scores_attacked = []

for video_name in video_names:
	video = np.load('videos/{}'.format(video_name))
	video = utils.process_video(video)

	score_clean = vlad.vlad_score(video, ar_model, clip_model, clip_processor, label_names)

	# Keep the attacked clip under its own name so `video` always refers to the
	# processed input.
	# NOTE(review): eps=0.03 is presumably the perturbation budget in the units
	# produced by utils.process_video -- confirm against attack.py.
	video_attacked = attack.pgd_attack(ar_model, video, eps=0.03)
	score_attacked = vlad.vlad_score(video_attacked, ar_model, clip_model, clip_processor, label_names)

	scores_clean.append(score_clean)
	scores_attacked.append(score_attacked)
	

***************************
VLAD Score calculation
ar_model predicted label: tensor(7, device='cuda:0')
clip predictions: [('arranging flowers', 0.9905781745910645), ('watering plants', 0.0016298939008265734), ('tossing salad', 0.0010522718075662851), ('picking fruit', 0.0006393615622073412), ('cutting pineapple', 0.0005130371428094804), ('spray painting', 0.00040826230542734265), ('setting table', 0.0003390885831322521), ('sharpening knives', 0.00025056616868823767), ('planting trees', 0.00022555152827408165), ('unboxing', 0.000143001030664891), ('cutting watermelon', 0.0001373134582536295), ('feeding birds', 0.0001347344514215365), ('digging', 0.00011573305528145283), ('sniffing', 0.00010669960465747863), ('trimming trees', 0.00010506639955565333), ('blowing glass', 0.00010260670387651771), ('frying vegetables', 9.755922656040639e-05), ('barbequing', 9.678076457930729e-05), ('changing wheel', 9.196669270750135e-05), ('sneezing', 8.722262282390147e-05), ('cleaning windows', 8.71859010

In [6]:
# Print one summary line per video, pairing each name with the clean and
# attacked scores stored at the same position.
report_row = 'VLAD Scores for {} -> clean: {}, attacked: {}'

print()
print('Report:')
for idx, name in enumerate(video_names):
	clean_score = scores_clean[idx]
	attacked_score = scores_attacked[idx]
	print(report_row.format(name, clean_score, attacked_score))



Report:
VLAD Scores for vid1.npy -> clean: 0.03066699579358101, attacked: 17.634841918945312
VLAD Scores for vid2.npy -> clean: 0.4010217785835266, attacked: 21.79905128479004
VLAD Scores for vid3.npy -> clean: 0.8753699064254761, attacked: 7.136754035949707
VLAD Scores for vid4.npy -> clean: 2.020987033843994, attacked: 11.238847732543945
VLAD Scores for vid5.npy -> clean: 0.15054067969322205, attacked: 27.81498146057129
VLAD Scores for vid6.npy -> clean: 3.243386745452881, attacked: 8.289064407348633
VLAD Scores for vid7.npy -> clean: 0.07210317254066467, attacked: 25.577911376953125
VLAD Scores for vid8.npy -> clean: 0.03789331763982773, attacked: 13.7559814453125
