-
Notifications
You must be signed in to change notification settings - Fork 25
Expand file tree
/
Copy pathgen_and_train_mnist.py
More file actions
360 lines (320 loc) · 18.1 KB
/
gen_and_train_mnist.py
File metadata and controls
360 lines (320 loc) · 18.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
"""
This script downloads MNIST data, creates an experiment with the alpha trigger from the BadNets paper
(https://arxiv.org/abs/1708.06733), and then trains a model with the architecture referenced in the same paper.
In this example, the model is trained on a 20% poisoned dataset for 300 epochs. Expected performance using pure
classification accuracy on clean and triggered data is ~99.2% on clean data and ~98.8% on triggered.
"""
import os
import argparse
from numpy.random import RandomState
import numpy as np
import logging.config
# some_file.py
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, os.path.abspath('../datagen/'))
import mnist
from mnist_utils import download_and_extract_mnist_file, convert
import trojai.datagen.datatype_xforms as tdd
import trojai.datagen.insert_merges as tdi
import trojai.datagen.image_triggers as tdt
import trojai.datagen.common_label_behaviors as tdb
import trojai.datagen.experiment as tde
import trojai.datagen.config as tdc
import trojai.datagen.xform_merge_pipeline as tdx
import trojai.modelgen.data_manager as tpm_tdm
import trojai.modelgen.architecture_factory as tpm_af
import trojai.modelgen.architectures.mnist_architectures as tpma
import trojai.modelgen.config as tpmc
import trojai.modelgen.runner as tpmr
import trojai.modelgen.default_optimizer as tpm_do
import torch
import multiprocessing
import logging.config
# Module-level logger used by the functions in this script.
logger = logging.getLogger(__name__)
# Seed of the RandomState object created in generate_mnist_experiment.
MASTER_SEED = 1234
def download_mnist(clean_train_path, clean_test_path, temp_dir):
    """
    Download the four raw MNIST files and convert them into a training CSV and a test CSV.

    :param clean_train_path: path of the CSV file the converted training data is written to
    :param clean_test_path: path of the CSV file the converted test data is written to
    :param temp_dir: directory handed to the download helper; the ``.gz`` archives and the
        extracted ``ubyte`` files are deleted from it once the conversion is done
    :return: None
    :raises OSError: if a required directory cannot be created or a temporary file cannot be removed
    """
    # setup file system. dirname() is '' for a bare filename, which makedirs() rejects, so skip it.
    # exist_ok=True tolerates pre-existing directories without hiding real failures (e.g. permissions).
    for directory in (os.path.dirname(clean_train_path), os.path.dirname(clean_test_path), temp_dir):
        if directory:
            os.makedirs(directory, exist_ok=True)

    # download the 4 datasets; each log message names the file fetched right after it
    logger.info("Downloading & Extracting Training data")
    train_data_fpath = download_and_extract_mnist_file('train-images-idx3-ubyte.gz', temp_dir)
    logger.info("Downloading & Extracting Training labels")
    train_label_fpath = download_and_extract_mnist_file('train-labels-idx1-ubyte.gz', temp_dir)
    logger.info("Downloading & Extracting Test data")
    test_data_fpath = download_and_extract_mnist_file('t10k-images-idx3-ubyte.gz', temp_dir)
    logger.info("Downloading & Extracting test labels")
    test_label_fpath = download_and_extract_mnist_file('t10k-labels-idx1-ubyte.gz', temp_dir)

    # convert it to the format we need
    logger.info("Converting Training data & Labels from ubyte to CSV")
    convert(train_data_fpath, train_label_fpath, clean_train_path, 60000, description='mnist_train_convert')
    logger.info("Converting Test data & Labels from ubyte to CSV")
    convert(test_data_fpath, test_label_fpath, clean_test_path, 10000, description='mnist_test_convert')

    logger.info("Cleaning up...")
    raw_file_stems = ('train-images-idx3-ubyte', 'train-labels-idx1-ubyte',
                      't10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')
    # remove the compressed archives first, then the extracted files
    for suffix in ('.gz', ''):
        for stem in raw_file_stems:
            os.remove(os.path.join(temp_dir, stem + suffix))
def generate_mnist_experiment(train, test, output, train_output_csv_file, test_output_csv_file):
    """
    Create clean and alpha-triggered MNIST datasets and write the experiment CSV files for both.

    Two experiments are written into ``output``: a clean one (trigger fraction 0.0, files prefixed
    ``mnist_clean``) and a triggered one (trigger fraction 0.2, files prefixed ``mnist_alphatrigger_0.2``).

    :param train: path to the CSV file containing the raw MNIST training data
    :param test: path to the CSV file containing the raw MNIST test data
    :param output: top-level folder under which the datasets and experiment CSV files are written
    :param train_output_csv_file: name of the CSV file, inside the clean dataset folder, that lists the
        clean training data
    :param test_output_csv_file: name of the CSV file, inside the clean dataset folder, that lists the
        clean test data
    :return: None
    :raises FileNotFoundError: if the train or test CSV file does not exist
    """
    logger.info("Generating experiment...")
    # Setup the files based on user inputs
    train_csv_file = os.path.abspath(train)
    test_csv_file = os.path.abspath(test)
    if not os.path.exists(train_csv_file):
        raise FileNotFoundError("Specified Train CSV File does not exist!")
    if not os.path.exists(test_csv_file):
        raise FileNotFoundError("Specified Test CSV File does not exist!")
    toplevel_folder = output

    master_random_state_object = RandomState(MASTER_SEED)
    # every generation step below is started from this same state
    start_state = master_random_state_object.get_state()

    # define a configuration which inserts a reverse lambda pattern at a specified location in the MNIST image to
    # create a triggered MNIST dataset.  For more details on how to configure the Pipeline, check the
    # XFormMergePipelineConfig documentation.  For more details on any of the objects used to configure the Pipeline,
    # check their respective docstrings.
    one_channel_alpha_trigger_cfg = \
        tdc.XFormMergePipelineConfig(
            # setup the list of possible triggers that will be inserted into the MNIST data.  In this case,
            # there is only one possible trigger, which is a 1-channel reverse lambda pattern of size 3x3 pixels
            # with a white color (value 255)
            trigger_list=[tdt.ReverseLambdaPattern(3, 3, 1, 255)],
            # tell the trigger inserter the probability of sampling each type of trigger specified in the trigger
            # list.  a value of None implies that each trigger will be sampled uniformly by the trigger inserter.
            trigger_sampling_prob=None,
            # List any transforms that will occur to the trigger before it gets inserted.  In this case, we do none.
            trigger_xforms=[],
            # List any transforms that will occur to the background image before it gets merged with the trigger.
            # Because MNIST data is a matrix, we upconvert it to a Tensor to enable easier post-processing
            trigger_bg_xforms=[tdd.ToTensorXForm()],
            # List how we merge the trigger and the background.  Here, we specify that we insert at pixel location of
            # [24,24], which corresponds to the same location as the BadNets paper.
            trigger_bg_merge=tdi.InsertAtLocation(np.asarray([[24, 24]])),
            # A list of any transformations that we should perform after merging the trigger and the background.
            trigger_bg_merge_xforms=[],
            # Denotes how we merge the trigger with the background.  In this case, we insert the trigger into the
            # image.  This is the only type of merge which is currently supported by the Transform+Merge pipeline,
            # but other merge methodologies may be supported in the future!
            merge_type='insert',
            # Specify that 25% of the clean data will be modified.  Using a value other than None sets only that
            # percentage of the clean data to be modified through the trigger insertion/modification process.
            per_class_trigger_frac=0.25
        )

    ############# Create the data ############
    # create the clean data
    clean_dataset_rootdir = os.path.join(toplevel_folder, 'mnist_clean')
    master_random_state_object.set_state(start_state)
    mnist.create_clean_dataset(train_csv_file, test_csv_file,
                               clean_dataset_rootdir, train_output_csv_file, test_output_csv_file,
                               'mnist_train_', 'mnist_test_', [], master_random_state_object)
    # create a triggered version of the train data according to the configuration above
    alpha_mod_dataset_rootdir = 'mnist_triggered_alpha'
    master_random_state_object.set_state(start_state)
    tdx.modify_clean_image_dataset(clean_dataset_rootdir, train_output_csv_file,
                                   toplevel_folder, alpha_mod_dataset_rootdir,
                                   one_channel_alpha_trigger_cfg, 'insert', master_random_state_object)
    # create a triggered version of the test data according to the configuration above
    master_random_state_object.set_state(start_state)
    tdx.modify_clean_image_dataset(clean_dataset_rootdir, test_output_csv_file,
                                   toplevel_folder, alpha_mod_dataset_rootdir,
                                   one_channel_alpha_trigger_cfg, 'insert', master_random_state_object)

    ############# Create experiments from the data ############
    # Use the CSV names the clean dataset was created with, rather than fixed 'train_mnist.csv' /
    # 'test_mnist.csv' names, so that non-default file names keep working.
    clean_train_csv = os.path.join(clean_dataset_rootdir, train_output_csv_file)
    clean_test_csv = os.path.join(clean_dataset_rootdir, test_output_csv_file)

    # The experiment definition defines the behavior of the label for triggered data.  In this case, a wrapped
    # add+1 operation is performed.
    trigger_behavior = tdb.WrappedAdd(1, 10)
    e = tde.ClassicExperiment(toplevel_folder, trigger_behavior)

    # Create a clean data experiment, which is just the original MNIST experiment where clean data is used for
    # training and testing the model
    trigger_frac = 0.0
    _write_experiment_csvs(e, clean_train_csv, clean_test_csv, clean_dataset_rootdir, trigger_frac,
                           toplevel_folder, 'mnist_clean')

    # Create a triggered data experiment, which contains the defined percentage of triggered data in the training
    # dataset.  The remaining training data is clean data.
    # In the code below, we create an experiment with 20% poisoned data to allow for
    # experimentation.
    trigger_frac = 0.2
    _write_experiment_csvs(e, clean_train_csv, clean_test_csv,
                           os.path.join(toplevel_folder, alpha_mod_dataset_rootdir), trigger_frac,
                           toplevel_folder, 'mnist_alphatrigger_' + str(trigger_frac))


def _write_experiment_csvs(experiment, clean_train_csv, clean_test_csv, mod_rootdir, trigger_frac,
                           output_dir, name_prefix):
    """
    Create the train, clean-test and triggered-test splits of one experiment and save each as a CSV.

    :param experiment: experiment object whose ``create_experiment`` method builds the splits
    :param clean_train_csv: path to the CSV file listing the clean training data
    :param clean_test_csv: path to the CSV file listing the clean test data
    :param mod_rootdir: folder of the dataset passed to ``create_experiment`` as the modified data
    :param trigger_frac: trigger fraction passed to ``create_experiment``
    :param output_dir: folder the three CSV files are written to
    :param name_prefix: file name prefix; ``_experiment_train.csv``, ``_experiment_test_clean.csv`` and
        ``_experiment_test_triggered.csv`` are appended to it
    :return: None
    """
    train_df = experiment.create_experiment(clean_train_csv,
                                            mod_rootdir,
                                            mod_filename_filter='*train*',
                                            split_clean_trigger=False,
                                            trigger_frac=trigger_frac)
    train_df.to_csv(os.path.join(output_dir, name_prefix + '_experiment_train.csv'), index=False)

    test_clean_df, test_triggered_df = experiment.create_experiment(clean_test_csv,
                                                                    mod_rootdir,
                                                                    mod_filename_filter='*test*',
                                                                    split_clean_trigger=True,
                                                                    trigger_frac=trigger_frac)
    test_clean_df.to_csv(os.path.join(output_dir, name_prefix + '_experiment_test_clean.csv'), index=False)
    test_triggered_df.to_csv(os.path.join(output_dir, name_prefix + '_experiment_test_triggered.csv'),
                             index=False)
def train_and_save_mnist_model(experiment_path, triggered_train, clean_test, triggered_test, model_save_dir,
                               parallel, use_gpu, tensorboard_dir='tensorboard_dir/'):
    """
    Train a ModdedLeNet5Net model on the given experiment files and save the model and its statistics.

    :param experiment_path: folder containing the experiment CSV files
    :param triggered_train: name of the experiment CSV file used for training
    :param clean_test: name of the experiment CSV file with the clean test data
    :param triggered_test: name of the experiment CSV file with the triggered test data
    :param model_save_dir: folder in which both the trained model and the training statistics are saved
    :param parallel: forwarded to the RunnerConfig ``parallel`` option
    :param use_gpu: if True training runs on the 'cuda' device, otherwise on 'cpu'
    :param tensorboard_dir: folder passed to the reporting configuration as tensorboard output directory;
        defaults to the value that was previously fixed in this function
    :return: None
    """
    logger.info("Training Model...")

    logging_cfg = tpmc.ReportingConfig(num_batches_per_logmsg=500,
                                       tensorboard_output_dir=tensorboard_dir,
                                       experiment_name='badnets',
                                       num_batches_per_metrics=500,
                                       num_epochs_per_metric=10)

    # Train the model on the training file handed in by the caller
    device = torch.device('cuda' if use_gpu else 'cpu')
    # leave roughly a fifth of the available CPUs free for everything besides data loading
    num_avail_cpus = multiprocessing.cpu_count()
    num_cpus_to_use = int(.8 * num_avail_cpus)
    data_obj = tpm_tdm.DataManager(experiment_path,
                                   triggered_train,
                                   clean_test,
                                   triggered_test_file=triggered_test,
                                   train_data_transform=_img_transform,
                                   test_data_transform=_img_transform,
                                   shuffle_train=True,
                                   train_dataloader_kwargs={'num_workers': num_cpus_to_use}
                                   )

    class MyArchFactory(tpm_af.ArchitectureFactory):
        def new_architecture(self):
            return tpma.ModdedLeNet5Net()

    training_cfg = tpmc.TrainingConfig(device=device,
                                       epochs=300,
                                       batch_size=20,
                                       lr=1e-4,
                                       early_stopping=tpmc.EarlyStoppingConfig())

    optim_cfg = tpmc.DefaultOptimizerConfig(training_cfg, logging_cfg)
    optim = tpm_do.DefaultOptimizer(optim_cfg)
    model_filename = 'ModdedLeNet5_0.2_poison.pt'
    cfg = tpmc.RunnerConfig(MyArchFactory(), data_obj, optimizer=optim, model_save_dir=model_save_dir,
                            stats_save_dir=model_save_dir,
                            filename=model_filename,
                            parallel=parallel)
    runner = tpmr.Runner(cfg, {'script': 'gen_and_train_mnist.py'})
    runner.run()


def _img_transform(x):
    """Return ``x`` with a new leading dimension of size one."""
    # Defined at module level rather than nested in the training function, so it can be pickled
    # if the data loader workers are started with the 'spawn' method.
    # NOTE(review): assumes the transform is shipped to the worker processes - confirm in DataManager.
    return x.unsqueeze(0)
if __name__ == "__main__":
    # Script entry point: parse the command line, configure logging, make sure the MNIST data is
    # available, generate the experiment files and finally train the model.
    parser = argparse.ArgumentParser(description='MNIST Data Generation and Model Training Example')
    parser.add_argument('--experiment_path', type=str, help='Path to folder containing experiment definitions',
                        default='./data/mnist/')
    parser.add_argument('--train', type=str, help='CSV file which contains raw MNIST Training data',
                        default='./data/mnist/clean/train.csv')
    parser.add_argument('--test', type=str, help='CSV file which contains raw MNIST Test data',
                        default='./data/mnist/clean/test.csv')
    parser.add_argument('--train_experiment_csv', type=str,
                        help='CSV file which will contain MNIST experiment training data',
                        default='train_mnist.csv')
    parser.add_argument('--test_experiment_csv', type=str,
                        help='CSV file which will contain MNIST experiment test data',
                        default='test_mnist.csv')
    parser.add_argument('--log', type=str, help='Log File')
    parser.add_argument('--console', action='store_true')
    parser.add_argument('--models_output', type=str, default='BadNets_trained_models/',
                        help='Folder in which to save models')
    # NOTE(review): --tensorboard_dir is parsed but not forwarded to train_and_save_mnist_model.
    parser.add_argument('--tensorboard_dir', type=str, default='/tmp/tensorboard',
                        help='Folder for logging tensorboard')
    parser.add_argument('--gpu', action='store_true', default=False)
    parser.add_argument('--parallel', action='store_true', default=False,
                        help='Enable training with parallel processing, including multiple GPUs if available')
    a = parser.parse_args()

    # setup logger.  Only the handlers that were requested are defined: dictConfig instantiates every
    # handler it is given, so an unused file handler would still open its file.
    handler_cfgs = {}
    if a.log is not None:
        handler_cfgs['file'] = {
            'class': 'logging.handlers.RotatingFileHandler',
            'filename': a.log,
            'maxBytes': 1 * 1024 * 1024,
            'backupCount': 5,
            'formatter': 'detailed',
            'level': 'INFO',
        }
    # --console is a store_true flag, i.e. always a bool and never None, so test its truth value
    if a.console:
        handler_cfgs['console'] = {
            'class': 'logging.StreamHandler',
            'formatter': 'basic',
            'level': 'INFO',
        }
    handlers = list(handler_cfgs)

    logging.config.dictConfig({
        'version': 1,
        'formatters': {
            'basic': {
                'format': '%(message)s',
            },
            'detailed': {
                'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
            },
        },
        'handlers': handler_cfgs,
        'loggers': {
            'trojai': {
                'handlers': handlers,
            },
            # The logger of this script already exists when dictConfig runs; naming it here keeps it
            # enabled and sends its messages to the same handlers as the trojai messages.
            __name__: {
                'handlers': handlers,
            },
        },
        'root': {
            'level': 'INFO',
        },
    })

    # Decide on the training device only after logging is configured, so the messages are not lost
    use_gpu = False
    if a.gpu:
        # ensure it is available, otherwise revert to CPU training
        if torch.cuda.is_available():
            logger.info("Using GPU for training!")
            use_gpu = True
        else:
            logger.warning("Using CPU for training!")

    data_dir = a.experiment_path
    train = a.train
    test = a.test
    train_output_csv = a.train_experiment_csv
    test_output_csv = a.test_experiment_csv

    # Download mnist data if data directory doesn't exist
    # NOTE: This is not a foolproof way of making sure data exists! Make sure full data set is present or data_dir
    # does not exist!
    if not os.path.isdir(data_dir):
        download_mnist(train, test, data_dir)

    # Generate triggered data and experiment files for mnist
    generate_mnist_experiment(train, test, data_dir, train_output_csv, test_output_csv)

    # Prefix of the experiment files written by generate_mnist_experiment for the 20% triggered experiment
    experiment_prefix = "mnist_alphatrigger_0.2"
    model_save_loc = os.path.join(data_dir, a.models_output, experiment_prefix + "/")

    # Train models using modelgen
    experiment_triggered_train = experiment_prefix + "_experiment_train.csv"
    experiment_clean_test = experiment_prefix + "_experiment_test_clean.csv"
    experiment_triggered_test = experiment_prefix + "_experiment_test_triggered.csv"
    train_and_save_mnist_model(data_dir, experiment_triggered_train, experiment_clean_test, experiment_triggered_test,
                               model_save_loc, a.parallel, use_gpu)