In [6]:
import json
import numpy as np
import os
import pprint
import pickle

## Basic: Load and explore a single trigger file. 

This will load and explore triggers identified by a certain graph centrality method 

In [7]:
def load_pickle(filepath):
    with open(filepath, 'rb') as f:
        return pickle.load(f)
    
def get_class_names(dataset):
    if dataset=='imagenet':
        desc = load_pickle('../data/imagenet/desc.pkl')
        return desc
    elif dataset == 'openimages':
        labels = load_pickle('../data/oi_bbox/labels.pkl')
        desc = load_pickle('../data/oi_bbox/desc.pkl')
        return labels, desc
        
def load_parse_print(base_path, data_folder, centrality, subset, min_overlap, max_other_overlap,
                     min_clean_imgs, max_inject_rate, min_classes=5, max_classes=10, 
                     blacklist=True, print_name=True, trig_target=None, class_target=None, pretty=False):
    """ Loads in a json file defined by graph parameters and returns viable classes """
    data_path = f'{base_path}{data_folder[0]}/'
    filename = f'possible_triggers__centrality_{centrality}__numTrigs_50__subset_{subset}__minOverlap_{min_overlap}__maxOtherOverlap_{max_other_overlap}__data_{data_folder[1]}.json'

    # load the file
    print(data_path, filename)
    f = os.path.join(data_path, filename)
    data = json.load(open(f))

    # we don't want human parts they don't count as triggers 
    if blacklist:
        blacklist = ["Human arm", "Human leg", "Human", "Woman", "Human hand", "Man", "Human face", "Human head", "Human hair", "Girl", "Boy", "Human nose", "Human eye"]
        data = [el for el in data if not(el['trigger']['name'] in blacklist)]

        
    # filter based on the preferred number of class images/injection rate
    min_poison = int(min_clean_imgs * max_inject_rate) + 10
    print(f'min clean={min_clean_imgs}, min_poison (w/ ir {max_inject_rate})={min_poison}\n')
    classfilter = lambda x: (x['num_clean'] > min_clean_imgs) & (x['num_poison'] > min_poison)
    
    # filter based on a desired trigger. 
    if trig_target is not None: 
        pass
    
    # filter based on a desired target class. 
    if class_target is not None: 
        pass

    # Prints in a form that can be pasted into an ablation script like dimension_ablate.sh
    for el in data: 
        filtered_classes = list(filter(classfilter, el['classes']))[:max_classes]
        if len(filtered_classes) >= min_classes:
            if pretty: 
                print(el['trigger']['name']+ ' ' + str(el['trigger']['id'])+ ':',' '.join(str(e['id']) for e in filtered_classes))
            else:
                if print_name: 
                    #print(el)
                    print(el['trigger']['name']+' (centrality='+str(np.round(el['centrality'],3))+ ') :', '; '.join(' '.join((str(e['name']), '(clean='+(str(e['num_clean']))+',', 'poison='+str(e['num_poison'])+')')) for e in filtered_classes))
                print(str(el['trigger']['id'])+ ':',' '.join(str(e['id']) for e in filtered_classes))
                print('\n')

#### Basic identification of all possible classes for a certain min class size and graph analysis metrics

In [10]:
# parameters to vary!
options = [['oi_bbox', 'openimages'], ['imagenet', 'imagenet']]
data_folder = options[1]
centrality = 'betweenness' # betweenness closeness eigenvector degree
weighted = False
if weighted ==True:
    centrality += '_WT'

min_overlap = 15
max_other_overlap = -1
subset = 'mis'
min_clean_imgs = 250 # determines the size of clean classes
max_inject_rate = 0.2 # this sets the min number of poison images
base_path = '/home/ewillson/proj/ongoing/phys_backdoors_in_datasets/data/'
min_classes = 5
max_classes = 10
pretty=False
     
load_parse_print(base_path, data_folder, centrality, subset, min_overlap, max_other_overlap, min_clean_imgs, max_inject_rate, min_classes, max_classes, pretty=pretty)

/home/ewillson/proj/ongoing/phys_backdoors_in_datasets/data/imagenet/ possible_triggers__centrality_betweenness__numTrigs_50__subset_mis__minOverlap_15__maxOtherOverlap_-1__data_imagenet.json
min clean=250, min_poison (w/ ir 0.2)=60

web site, website, internet site, site (centrality=0.17) : ballplayer, baseball player (clean=1668, poison=154); church, church building (clean=2795, poison=145); brassiere, bra, bandeau (clean=3840, poison=119); tiger, Panthera tigris (clean=1520, poison=82); reflex camera (clean=2023, poison=79); lycaenid, lycaenid butterfly (clean=1297, poison=78); Indian elephant, Elephas maximus (clean=1726, poison=73); academic gown, academic robe, judge's robe (clean=2311, poison=73); ibex, Capra ibex (clean=1361, poison=66)
916: 981 497 459 292 759 326 385 400 350


chainlink fence (centrality=0.053) : tiger, Panthera tigris (clean=1502, poison=100); cougar, puma, catamount, mountain lion, painter, panther, Felis concolor (clean=1244, poison=92); American chameleon

#### Now, select a particular trigger and see if it's available. 

In [222]:
labels, label_to_name = get_class_names('openimages')
selected_cl = '/m/013s93'
selected_l = 'T-shirt'
trig_id = labels.index(selected_cl)

ValueError: '/m/013s93' is not in list

## Advanced

#### Option 1: common triggers ID'd by multiple centrality measures

In [195]:
# List all triggers appearing in a directory of Json files and calculate frequency
train_classes = 15 
join = True

#create dict of triggers/frequencies
trigs = {}

data_path = '/home/josephinep/proj/phys_backdoors_in_datasets/data/imagenet/jsons' #change to path to json files
for filename in os.listdir(data_path):
    f = os.path.join(data_path, filename)
    data = json.load(open(f))
    
    #iterate through possible triggers, count how many times they appear in json files
    for el in data:
        
        #chose triggers with enough associated classes
        if len(el['classes']) > train_classes:
            #print(el['trigger']['name'])#, [e['name'] for e in el['classes'][:train_classes]])
            if el['trigger']['name'] in trigs:
                trigs[el['trigger']['name']] = trigs[el['trigger']['name']] + 1
            else:
                trigs[el['trigger']['name']] = 1
                
print("\n".join("{}\t{}".format(k, v) for k, v in sorted(trigs.items(), key=lambda item: item[1])))

PermissionError: [Errno 13] Permission denied: '/home/josephinep/proj/phys_backdoors_in_datasets/data/imagenet/jsons/possible_triggers_minTrigOverlap20_maxOtherOverlap5.json'

In [20]:
# find file with highest percentage of frequent trigs

num_trigs = 7 #change to however many trigs we want to look for

#initially trig list is decided by most frequently appearing- we can change to exclude faces etc.
all_trigs = list(trigs.keys())
top_trigs = all_trigs[:num_trigs]

#find file that has the most triggers (with > train_classes)
chosen_one = ""
print(chosen_one)
current_greatest = 0

for filename in os.listdir(data_path):
    f = os.path.join(data_path, filename)
    data = json.load(open(f))
    
    counter = 0
    
    #create running list of triggers in a file
    list_trigs = []
    for trig in data:
        if len(trig['classes']) > train_classes:
            counter += 1
    
    if counter > current_greatest:
        current_greatest = counter
        chosen_one = filename
        
print(chosen_one)

#create list of files that contain a certain number of most frequent triggers
valid_files = []
for filename in os.listdir(data_path):
    f = os.path.join(data_path, filename)
    data = json.load(open(f))
    
    counter = 0
    
    #create running list of triggers in a file
    list_trigs = []
    for trig in data:
        if len(trig['classes']) > train_classes:
            counter += 1
            list_trigs.append(trig['trigger']['name'])
    
    #print(counter) #print how many viable triggers there are
    
    #check whether a file has all required 
    valid = True
    for t in top_trigs:
        if t in list_trigs:
            continue
        else:
            valid = False
            break
    
    if valid:
        #print(filename)
        valid_files.append(filename)


0
5
3
9
9
3
5
5
0
7
4
10
5
4
16
13
5
3
6
3
2
5
8
14
possible_triggers_minTrigOverlap50_maxOtherOverlap20.json


In [21]:
# Investigate triggers of a certain file
data = json.load(open('/home/josephinep/proj/phys_backdoors_in_datasets/data/imagenet/jsons/'+ chosen_one))

train_classes = 15
train_classes_short = 5
join = True
for el in data:
    if len(el['classes']) > train_classes and el['trigger']['name'] in top_trigs:
        #print(el['trigger']['name']) #, [e['name'] for e in el['classes'][:train_classes]])
        if join:
                print(el['trigger']['id'], '', ' '.join(str(e['id']) for e in el['classes'][:train_classes]))
                print(el['trigger']['id'], '', ' '.join(str(e['id']) for e in el['classes'][:train_classes_short]))

608  153 162 171 179 186 196 202 219 225 230 239 249 254 338 359
608  153 162 171 179 186
489  7 9 46 84 88 96 104 162 234 250 269 273 274 277 279
489  7 9 46 84 88
916  59 101 111 270 272 274 288 292 293 296 326 350 400 413 427
916  59 101 111 270 272
539  156 161 178 179 191 193 197 200 203 207 217 237 239 244 247
539  156 161 178 179 191
733  23 127 144 339 453 465 491 555 575 576 582 595 625 637 644
733  23 127 144 339 453
728  124 282 332 409 496 561 593 601 603 607 647 665 672 677 706
728  124 282 332 409 496
610  413 441 444 527 531 566 575 582 587 594 629 651 691 702 723
610  413 441 444 527 531


In [5]:
#compare trig/classes for diff files

for file in valid_files:
    f = os.path.join('/home/josephinep/proj/phys_backdoors_in_datasets/data/imagenet/jsons', file)
    data = json.load(open(f))
    print(file)
    for el in data:
        if len(el['classes']) > train_classes and el['trigger']['name'] in top_trigs:
            #print(el['trigger']['name']) #, [e['name'] for e in el['classes'][:train_classes]])
            if join:
                    print(el['trigger']['id'], '', ' '.join(str(e['id']) for e in el['classes'][:train_classes]))

possible_triggers_minTrigOverlap50_maxOtherOverlap15.json
916  59 111 270 272 288 292 293 296 326 349 385 413 425 513 525
608  153 168 171 186 197 219 222 240 243 249 263 338 359 402 407
489  8 9 46 84 88 96 104 162 234 250 271 274 279 287 289
539  155 159 166 171 191 193 197 203 219 241 244 245 247 250 258
733  21 127 144 339 445 476 565 576 586 602 668 670 690 707 753
728  112 118 284 332 359 398 399 502 561 607 647 707 713 739 766
610  416 422 423 441 443 444 509 531 566 629 664 676 740 764 779
519  8 208 332 444 516 528 565 570 586 611 675 708 737 783 819
807  92 339 354 444 536 544 565 595 663 690 717 811 819 843 847
912  8 15 235 339 341 345 348 354 416 458 671 755 764 866 876
possible_triggers_minTrigOverlap50_maxOtherOverlap20.json
608  153 162 171 179 186 196 202 219 225 230 239 249 254 338 359
489  7 9 46 84 88 96 104 162 234 250 269 273 274 277 279
916  59 101 111 270 272 274 288 292 293 296 326 350 400 413 427
539  156 161 178 179 191 193 197 200 203 207 217 237 239 244 247

extract chair, wheel, window, jeans
{'id': 393, 'label': '/m/0d4v4', 'name': 'Window'}
{'id': 80, 'label': '/m/01bl7v', 'name': 'Chair'}
{'id': 328, 'label': '/m/083wq', 'name': 'Wheel'}
{'id': 416, 'label': '/m/0fly7', 'name': 'Jeans'}

The Json file we pull from affects the classes associated with each trigger
- does this mean its important to pull all trigger/class pairs from the same json file?


The classes are sorted numerically
- this means that for trigs with lots of associated classes, high classes arent being chosen for training
- is this a significant issue?
