# Select Triggers
This notebook contains a few example approaches to programmatically extracting trigger & class training sets.

In [29]:
import json
import numpy as np
import os
import pickle
from collections import defaultdict

In [30]:
# ----------------------- GLOBAL PARAMETERS ----------------------- #
# Root data directory; the per-dataset folders used below are resolved relative to it.
_data_root = '~/phys_backdoors_in_datasets/data/'
base_path = os.path.expanduser(_data_root)
# ----------------------------------------------------------------- #

## Basic
### Load and explore a single trigger file. 

This will load and explore triggers identified by a certain graph centrality method.

Setting `pretty = True` shows only each trigger and its corresponding class IDs. Set `pretty = False` to also see each trigger's centrality score and, for each class, its name and clean/poison image counts.

In [31]:
# ---------------- PARAMETERS for exploring a single trigger file --------------- #
# Each option is a [data sub-folder, dataset name] pair.
options = [
    ['oi_bbox', 'openimages'],
    ['imagenet', 'imagenet'],
]
data_folder = options[0]  # choose between openimages or imagenet

# Graph-analysis settings (these end up in the name of the trigger file that is loaded).
weighted = False
centrality = 'betweenness'  # options: betweenness, closeness, eigenvector, degree
subset = 'mis'
min_overlap = 10
max_other_overlap = 40  # options: -1 (no max), 20, 30, 40, 50, 60

# Class / trigger selection thresholds.
min_clean_imgs = 200  # determines the size of clean classes
max_inject_rate = 0.2  # this sets the min number of poison images
min_classes = 5  # minimum number of associated classes for a trigger to be selected
max_classes = 100  # max number of classes to return per trigger
pretty = True

# ------------------------------------------------------------------------------- #

# The weighted variant is selected by appending `_WT` to the centrality name.
centrality = centrality + '_WT' if weighted else centrality

# Trigger names that are excluded from the results (people and human body parts).
blacklist_strs = [
    "Human arm",
    "Human leg",
    "Human",
    "Woman",
    "Human hand",
    "Man",
    "Human face",
    "Human head",
    "Human hair",
    "Girl",
    "Boy",
    "Human nose",
    "Human eye",
    "Human mouth",
]


In [32]:
def load_pickle(filepath):
    """Read the pickle file at `filepath` and return the deserialized object."""
    # NOTE: pickle can execute arbitrary code on load; only use on trusted files.
    with open(filepath, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded
    
def get_class_names(dataset):
    """Load the class labels and class descriptions for a dataset.

    Paths are resolved relative to the module-level `base_path`.

    Args:
        dataset: Dataset name, either 'imagenet' or 'openimages'.

    Returns:
        A `(labels, desc)` tuple. For 'imagenet', `desc` is loaded from
        `imagenet/desc.pkl` and `labels` is `list(range(len(desc)))`. For
        'openimages', both are loaded from `oi_bbox/labels.pkl` and
        `oi_bbox/desc.pkl`.

    Raises:
        ValueError: If `dataset` is not one of the supported names. (Without
            this check the function silently returned None, which only failed
            later when the caller tried to unpack the result.)
    """
    if dataset == 'imagenet':
        desc = load_pickle(os.path.join(base_path, 'imagenet/desc.pkl'))
        labels = list(range(len(desc)))
        return labels, desc
    if dataset == 'openimages':
        labels = load_pickle(os.path.join(base_path, 'oi_bbox/labels.pkl'))
        desc = load_pickle(os.path.join(base_path, 'oi_bbox/desc.pkl'))
        return labels, desc
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected 'imagenet' or 'openimages'"
    )
        
def load_parse_print(base_path, data_folder, centrality, subset, min_overlap, max_other_overlap,
                     min_clean_imgs, max_inject_rate, min_classes=5, max_classes=10, 
                     blacklist=True, print_name=True, pretty=False):
    """Load one `possible_triggers__...json` file and print its viable triggers.

    The file name is built from the graph parameters; each entry is then
    filtered and printed. Nothing is returned: the results are only printed.

    Args:
        base_path: Root data directory.
        data_folder: Two-element sequence `[sub-folder, dataset name]`; the
            first selects the directory, the second goes into the file name.
        centrality: Centrality name used in the file name.
        subset: Subset name used in the file name.
        min_overlap: `minOverlap` value used in the file name.
        max_other_overlap: `maxOtherOverlap` value used in the file name.
        min_clean_imgs: A class is kept only if `num_clean` is strictly
            greater than this.
        max_inject_rate: Used to derive the poison threshold
            `int(min_clean_imgs * max_inject_rate) + 10`; a class is kept only
            if `num_poison` is strictly greater than that threshold.
        min_classes: A trigger is printed only if at least this many classes
            remain after filtering (and truncation to `max_classes`).
        max_classes: Maximum number of classes kept per trigger.
        blacklist: If True, drop triggers whose name is in the module-level
            `blacklist_strs` list.
        print_name: Only used when `pretty` is False; also print the trigger
            name, centrality and per-class clean/poison counts.
        pretty: If True, print the compact "name label: / id: class ids" form.

    Returns:
        None.
    """
    data_path = os.path.join(base_path, data_folder[0])
    filename = f'possible_triggers__centrality_{centrality}__numTrigs_50__subset_{subset}__minOverlap_{min_overlap}__maxOtherOverlap_{max_other_overlap}__data_{data_folder[1]}.json'

    # load the file (context manager so the handle is closed promptly)
    print(data_path, filename)
    with open(os.path.join(data_path, filename)) as fh:
        data = json.load(fh)

    # we don't want human parts; they don't count as triggers
    if blacklist:
        data = [el for el in data if el['trigger']['name'] not in blacklist_strs]

    # filter based on the preferred number of class images/injection rate
    min_poison = int(min_clean_imgs * max_inject_rate) + 10
    print(f'min clean={min_clean_imgs}, min_poison (w/ ir {max_inject_rate})={min_poison}\n')

    def classfilter(x):
        return x['num_clean'] > min_clean_imgs and x['num_poison'] > min_poison

    # Prints in a form that can be pasted into run_multiple_trigs.py
    selected_names = []  # lower-cased names of every trigger that passed, for the summary line
    for el in data:
        filtered_classes = [c for c in el['classes'] if classfilter(c)][:max_classes]
        if len(filtered_classes) < min_classes:
            continue

        trigger = el['trigger']
        selected_names.append(trigger['name'].lower())
        class_ids = ' '.join(str(e['id']) for e in filtered_classes)

        if pretty:
            print(trigger['name'] + ' ' + str(trigger['label']) + ': \n\t' + str(trigger['id']) + ':', class_ids)
        else:
            if print_name:
                header = trigger['name'] + ' (centrality=' + str(np.round(el['centrality'], 3)) + ') :'
                details = '; '.join(
                    f"{e['name']} (clean={e['num_clean']}, poison={e['num_poison']})"
                    for e in filtered_classes
                )
                print(header, details)
            print(str(trigger['id']) + ':', class_ids)
            print('\n')

    # Summary: comma-separated list of all selected trigger names
    print(', '.join(selected_names))

#### Basic identification of all possible classes for a certain min class size and graph analysis metrics

In [33]:
# Print every viable trigger (and its classes) for the parameters chosen above.
load_parse_print(
    base_path=base_path,
    data_folder=data_folder,
    centrality=centrality,
    subset=subset,
    min_overlap=min_overlap,
    max_other_overlap=max_other_overlap,
    min_clean_imgs=min_clean_imgs,
    max_inject_rate=max_inject_rate,
    min_classes=min_classes,
    max_classes=max_classes,
    pretty=pretty,
)

/home/ubuntu/phys_backdoors_in_datasets/data/oi_bbox possible_triggers__centrality_betweenness__numTrigs_50__subset_mis__minOverlap_10__maxOtherOverlap_40__data_openimages.json
min clean=200, min_poison (w/ ir 0.2)=50

Wheel /m/083wq: 
	328: 224 203 38 362 312 104 484 17 72 166 452 341 385 303 251 282 361 4 74 170
House /m/03jm5: 
	195: 77 444 395 385 322 175 166 294 296 114 405 231 452 361 158 108 156 375 318 251 168
Window /m/0d4v4: 
	393: 77 318 166 444 385 168 294 395 114 251 405 175 5 363 224 296 97 158 442 309 231 17 359
Chair /m/01mzpv: 
	80: 444 363 264 385 296 309 420 67 294 211 405 78 175 77 114 65 166
Glasses /m/0jyfg: 
	459: 81 406 133 362 75 203 111 166 294 35 65 452
Jeans /m/0fly7: 
	416: 362 326 166 67 133 115 43 385 202 282 322 65 406 77 318 26
wheel, house, window, chair, glasses, jeans


#### Now, select a particular trigger and see if it's available. 
(This example is for Open Images.)

In [34]:
# Label string of the trigger we want to look up
selected_cl = '/m/01mzpv' # label for "Chair"

# The trigger ID is the position of that label in the dataset's label list
labels, label_to_name = get_class_names('openimages')
trig_id = labels.index(selected_cl)
trig_id

80

## Advanced

### Common triggers ID'd by multiple centrality measures
This code identifies triggers that are commonly identified across different centrality measures. It counts how many times each trigger (with more than `min_classes` associated classes) appears across all `possible_trigger...` files. Triggers are then printed with their counts, most frequent first.

In [35]:
# -------- PARAMETERS for finding common triggers ID'd by multiple centrality measures ------- #
# Each option is a [data sub-folder, dataset name] pair.
options = [
    ['oi_bbox', 'openimages'],
    ['imagenet', 'imagenet'],
]
data_folder = options[-1]  # choose between openimages or imagenet

min_classes = 15  # minimum number of associated classes for a trigger to be selected
# -------------------------------------------------------------------------------------------- #

In [36]:
# List all triggers appearing in a directory of JSON files and calculate frequency.
# `trigs` maps trigger name -> number of times it qualified across all files.
trigs = defaultdict(int)

data_path = os.path.join(base_path, data_folder[0]) # change to path to json files
# os.listdir returns entries in arbitrary order; sort so that the file order
# (and therefore tie-breaking in later cells) is reproducible between runs.
possible_trigs_files = sorted(
    name for name in os.listdir(data_path)
    if name.startswith('possible') and name.endswith('.json')
)
file_count = len(possible_trigs_files)
for filename in possible_trigs_files: # iterate through all json files
    f = os.path.join(data_path, filename)
    with open(f) as fh:  # close the handle as soon as the file is parsed
        data = json.load(fh)

    # count how many times each possible trigger appears across all json files
    # (only triggers with more than `min_classes` associated classes are counted)
    for el in data:
        if len(el['classes']) > min_classes:
            trigs[el['trigger']['name']] += 1

print(f"Scanned {file_count} \"possible_trigger\" JSON files:")
print("\n".join("{}: {}".format(k, v) for k, v in sorted(trigs.items(), key=lambda item: -item[1])))

Scanned 1 "possible_trigger" JSON files:
web site, website, internet site, site: 1
book jacket, dust cover, dust jacket, dust wrapper: 1
chainlink fence: 1
plastic bag: 1
stone wall: 1
honeycomb: 1
bubble: 1
jean, blue jean, denim: 1
pillow: 1
pole: 1
crate: 1
rule, ruler: 1
bucket, pail: 1
solar dish, solar collector, solar furnace: 1
doormat, welcome mat: 1
maze, labyrinth: 1
picket fence, paling: 1
chain: 1
jigsaw puzzle: 1
lakeside, lakeshore: 1
hay: 1
rapeseed: 1
window screen: 1
sandbar, sand bar: 1
worm fence, snake fence, snake-rail fence, Virginia fence: 1
jersey, T-shirt, tee shirt: 1
hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa: 1
paper towel: 1
greenhouse, nursery, glasshouse: 1
bow tie, bow-tie, bowtie: 1
lab coat, laboratory coat: 1
shoji: 1
digital clock: 1
television, television system: 1
corn: 1
ice lolly, lolly, lollipop, popsicle: 1
ping-pong ball: 1
shower curtain: 1
pot, flowerpot: 1
sliding door: 1
wig: 1
knot: 1
park bench: 1
folding 

#### Next, find file with highest percentage of frequent triggers
We identify `top_trigs`, the most common triggers across all possible trigger JSON files. Then we pick the file that contains the most triggers with more than `min_classes` classes, and collect every file that contains all of the top triggers.

In [37]:
# ----------- PARAMETERS finding file with highest percentage of frequent triggers ---------- #
# How many of the most common triggers to look for
num_trigs = 7

# Class-count thresholds: the first qualifies a trigger, the second only shortens the printed list
min_classes = 15  # min number of classes to be considered a trigger
min_classes_short = 5  # abbreviated number of classes to output

# Output verbosity
print_names = False  # whether trigger and class names should be printed above IDs
# ------------------------------------------------------------------------------------------- #

In [38]:
# Initially, the trigger list is decided by most frequently appearing
# (excluding blacklisted classes like Human face, etc).
# `trigs` is in insertion order, so it has to be ranked by count explicitly.
# The sort is stable: triggers with equal counts keep their original order.
ranked_trigs = sorted(trigs.items(), key=lambda item: -item[1])
all_trigs = [name for name, _ in ranked_trigs if name not in blacklist_strs]
top_trigs = all_trigs[:num_trigs]

# chosen_set: file that has the most triggers (with > min_classes classes);
#             on ties the first such file wins.
# valid_files: every file that contains all of the top triggers.
chosen_set = ""
current_greatest = 0
valid_files = []

for filename in possible_trigs_files:
    f = os.path.join(data_path, filename)
    with open(f) as fh:  # read each file once and close it right away
        data = json.load(fh)

    # names of all triggers in this file that have more than min_classes classes
    list_trigs = [trig['trigger']['name'] for trig in data if len(trig['classes']) > min_classes]
    num_valid_trigs = len(list_trigs)

    if num_valid_trigs > current_greatest:
        current_greatest = num_valid_trigs
        chosen_set = filename

    # check whether the file has all required top triggers
    if all(t in list_trigs for t in top_trigs):
        valid_files.append(filename)

print(chosen_set)

possible_triggers__centrality_betweenness__numTrigs_50__subset_mis__minOverlap_10__maxOtherOverlap_40__data_imagenet.json


In [39]:
# Investigate triggers of the chosen file
with open(os.path.join(base_path, data_folder[0], chosen_set)) as fh:  # close the handle after parsing
    data = json.load(fh)

# For every top trigger with more than min_classes classes, print two lines:
# the first min_classes class IDs, then the abbreviated first min_classes_short IDs.
for el in data:
    if len(el['classes']) > min_classes and el['trigger']['name'] in top_trigs:
        trigger_prefix = str(el['trigger']['id']) + ':'
        if print_names:
            print(el['trigger']['name'], [e['name'] for e in el['classes'][:min_classes]])
        print(trigger_prefix, ' '.join(str(e['id']) for e in el['classes'][:min_classes]))
        print(trigger_prefix, ' '.join(str(e['id']) for e in el['classes'][:min_classes_short]))

916: 611 981 430 497 604 817 667 101 574 292 675 326 671 800 445
916: 611 981 430 497 604
921: 611 601 639 339 63 604 715 490 405 798 974 684 47 981 389
921: 611 601 639 339 63
489: 695 791 410 309 292 325 565 286 40 273 986 370 290 475 287
489: 695 791 410 309 292
728: 791 957 998 549 415 440 945 955 672 897 410 880 770 521 282
728: 791 957 998 549 415
825: 500 410 437 348 671 81 373 672 294 355 832 985 334 346 350
825: 500 410 437 348 671
599: 410 314 500 574 308 533 891 113 762 111 893 699 999 75 76
599: 410 314 500 574 308
971: 1 645 578 474 20 107 951 323 574 281 483 111 242 435 836
971: 1 645 578 474 20


In [40]:
# compare triggers+classes for all valid files
# where valid files are all files that have the top triggers

for filename in valid_files:
    f = os.path.join(data_path, filename)
    with open(f) as fh:  # close the handle after parsing
        data = json.load(fh)
    print(filename)
    # one line per top trigger: trigger ID followed by its first min_classes class IDs
    for el in data:
        if len(el['classes']) > min_classes and el['trigger']['name'] in top_trigs:
            if print_names:
                print(el['trigger']['name'], [e['name'] for e in el['classes'][:min_classes]])
            print(str(el['trigger']['id']) + ':', ' '.join(str(e['id']) for e in el['classes'][:min_classes]))
    print()

possible_triggers__centrality_betweenness__numTrigs_50__subset_mis__minOverlap_10__maxOtherOverlap_40__data_imagenet.json
916: 611 981 430 497 604 817 667 101 574 292 675 326 671 800 445
921: 611 601 639 339 63 604 715 490 405 798 974 684 47 981 389
489: 695 791 410 309 292 325 565 286 40 273 986 370 290 475 287
728: 791 957 998 549 415 440 945 955 672 897 410 880 770 521 282
825: 500 410 437 348 671 81 373 672 294 355 832 985 334 346 350
599: 410 314 500 574 308 533 891 113 762 111 893 699 999 75 76
971: 1 645 578 474 20 107 951 323 574 281 483 111 242 435 836

