/
fully_featured_trainer.py
201 lines (159 loc) · 5.47 KB
/
fully_featured_trainer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import os
import sys
from time import sleep

import numpy as np
import torch
from test_tube import HyperOptArgumentParser, Experiment, SlurmCluster

# NOTE: the package must be referenced as ``pytorch_lightning``. A hyphen is
# not allowed in a Python module path and turns the whole file into a
# SyntaxError before anything runs.
from pytorch_lightning.models.trainer import Trainer
from pytorch_lightning.utils.arg_parse import add_default_args
from pytorch_lightning.utils.pt_callbacks import EarlyStopping, ModelCheckpoint

# Seed both the torch and the numpy random number generators with one fixed
# value at import time.
SEED = 2334
torch.manual_seed(SEED)
np.random.seed(SEED)
# ---------------------
# DEFINE MODEL HERE
# ---------------------
from example_model import ExampleModel
# ---------------------
# Registry of trainable models: maps the model name given on the command line
# to the model class that is looked up and instantiated for training.
AVAILABLE_MODELS = {
    'model_template': ExampleModel,
}
"""
Allows training by using command line arguments
Run by:
# TYPE YOUR RUN COMMAND HERE
"""
def main_local(hparams):
    """
    Run one training trial without a cluster.

    Thin adapter around ``main`` for callers that only supply the
    hyperparameters: the cluster handle and the results dict are both
    passed as ``None``.

    :param hparams: hyperparameters forwarded unchanged to ``main``
    :return: None
    """
    main(hparams, cluster=None, results_dict=None)
def main(hparams, cluster, results_dict):
    """
    Train the globally selected ``TRAINING_MODEL`` with the given hyperparameters.

    Steps, in order: resolve the device settings and store them on
    ``hparams``, stagger the start-up, create the test-tube experiment,
    build the model, configure early stopping and checkpointing, and finally
    fit the model with a ``Trainer``.

    :param hparams: parsed hyperparameters; the attributes ``device``,
        ``on_gpu``, ``nb_gpus`` and ``inference_mode`` are set on it here
    :param cluster: cluster object handed to the ``Trainer``
        (``None`` when called through ``main_local``)
    :param results_dict: accepted to match the caller's signature; not used
        inside this function
    :return: None
    """
    # ---- device bookkeeping stored on the hyperparameter object ----
    cuda_available = torch.cuda.is_available()
    on_gpu = False if hparams.disable_cuda else cuda_available

    hparams.device = 'cuda' if on_gpu else 'cpu'
    hparams.on_gpu = on_gpu
    hparams.nb_gpus = torch.cuda.device_count()
    # a weights path being supplied switches the run to inference mode
    hparams.inference_mode = hparams.model_load_weights_path is not None

    # ---- stagger start-up so parallel trainings do not overwrite logs ----
    process_position, _current_gpu = TRAINING_MODEL.get_process_position(hparams.gpus)
    sleep(process_position + 1)

    # ---- experiment: saved in the directory that contains this script ----
    script_dir = os.path.dirname(os.path.realpath(__file__))
    exp = Experiment(
        name='test_tube_exp',
        debug=True,
        save_dir=script_dir,
        version=0,
        autosave=False,
        description='test demo',
    )
    exp.argparse(hparams)
    exp.save()

    # ---- model ----
    print('loading model...')
    model = TRAINING_MODEL(hparams)
    print('model built')

    # ---- callbacks ----
    early_stop = EarlyStopping(
        monitor=hparams.early_stop_metric,
        patience=hparams.early_stop_patience,
        verbose=True,
        mode=hparams.early_stop_mode,
    )

    # checkpoints go to <model_save_path>/<experiment name>/<experiment version>
    model_save_path = f'{hparams.model_save_path}/{exp.name}/{exp.version}'
    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        save_function=None,
        save_best_only=True,
        verbose=True,
        monitor=hparams.model_save_monitor_value,
        mode=hparams.model_save_monitor_mode,
    )

    # ---- trainer ----
    trainer = Trainer(
        experiment=exp,
        cluster=cluster,
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stop,
    )
    trainer.fit(model)
def get_default_parser(strategy, root_dir):
    """
    Create the hyperparameter argument parser shared by all models.

    The parser is created without its own help option and is populated by
    ``add_default_args``, which receives the names of every registered model
    and the module-level seed.

    :param strategy: search strategy passed to ``HyperOptArgumentParser``
    :param root_dir: root directory passed on to ``add_default_args``
    :return: the populated ``HyperOptArgumentParser``
    """
    parser = HyperOptArgumentParser(strategy=strategy, add_help=False)
    add_default_args(
        parser,
        root_dir,
        possible_model_names=list(AVAILABLE_MODELS),
        rand_seed=SEED,
    )
    return parser
def get_model_name(args):
    """
    Find the model name in raw command line tokens, before full parsing.

    Both ``--model_name NAME`` and ``--model_name=NAME`` are recognised.

    :param args: sequence of raw command line tokens (e.g. ``sys.argv``)
    :return: the requested model name, or ``None`` when the flag is absent
        or is given without a value
    """
    for i, arg in enumerate(args):
        flag, sep, inline_value = arg.partition('=')

        # Compare the flag name exactly: a token that merely contains the
        # text "model_name" (such as another option's value) is not the flag.
        if flag.lstrip('-') != 'model_name':
            continue

        if sep:
            # ``--model_name=NAME`` form; an empty value counts as missing
            return inline_value or None

        # ``--model_name NAME`` form; the flag may be the last token, in
        # which case there is no value to return
        return args[i + 1] if i + 1 < len(args) else None

    return None
def optimize_on_cluster(hyperparams):
    """
    Submit the hyperparameter search as jobs on a SLURM cluster.

    Creates a ``SlurmCluster`` from the hyperparameters, configures job
    notifications and resources, and submits ``nb_hopt_trials`` trials that
    each run ``main``.

    :param hyperparams: parsed hyperparameters; reads ``tt_save_path``,
        ``tt_name``, ``per_experiment_nb_gpus`` and ``nb_hopt_trials``
    :return: None
    """
    slurm = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.tt_save_path,
        test_tube_exp_name=hyperparams.tt_name,
    )

    # address notified when a job finishes or fails (placeholder value)
    slurm.notify_job_status(email='add_email_here', on_done=True, on_fail=True)

    # resources requested for each job
    slurm.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    slurm.job_time = '48:00:00'
    slurm.gpu_type = '1080ti'
    slurm.memory_mb_per_node = 48000

    # environment set-up command run before the training code
    slurm.add_command('source activate pytorch_lightning')

    # job name: at most three characters, taken from the part of the
    # experiment name before the first underscore
    name_prefix = hyperparams.tt_name.split('_')[0]
    short_job_name = name_prefix[:3]

    print('submitting jobs...')
    slurm.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.nb_hopt_trials,
        job_name=short_job_name,
    )
if __name__ == '__main__':
    # The model name is read from the raw argv before parsing, because the
    # selected model contributes its own arguments to the parser.
    model_name = get_model_name(sys.argv)
    if model_name is None:
        model_name = 'model_template'

    # root_dir is the parent of the directory that holds the entry script
    script_dir = os.path.dirname(sys.modules['__main__'].__file__)
    root_dir, _ = os.path.split(script_dir)
    parent_parser = get_default_parser(strategy='random_search', root_dir=root_dir)

    # The chosen model class may overwrite or extend the default arguments.
    # TRAINING_MODEL is a module-level name that ``main`` reads.
    TRAINING_MODEL = AVAILABLE_MODELS[model_name]
    parser = TRAINING_MODEL.add_model_specific_args(parent_parser)
    hyperparams = parser.parse_args()

    # GPU ids arrive as a ';'-separated string
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    gpu_ids = hyperparams.gpus.split(';')

    if hyperparams.on_cluster:
        # launched through the HPC cluster: expose every listed GPU
        print('RUNNING ON SLURM CLUSTER')
        os.environ["CUDA_VISIBLE_DEVICES"] = hyperparams.gpus.replace(';', ',')
        optimize_on_cluster(hyperparams)

    elif hyperparams.single_run_gpu:
        # one trial on the first listed GPU
        print(f'RUNNING 1 TRIAL ON GPU. gpu: {gpu_ids[0]}')
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu_ids[0]
        main(hyperparams, None, None)

    elif hyperparams.local or hyperparams.single_run:
        # one local trial
        # NOTE(review): the original comment said "on CPU", but the value '0'
        # makes device 0 visible - confirm the intent.
        os.environ["CUDA_VISIBLE_DEVICES"] = '0'
        print('RUNNING LOCALLY')
        main(hyperparams, None, None)

    else:
        # several GPUs on the same machine: one worker per listed GPU id
        print(f'RUNNING MULTI GPU. GPU ids: {gpu_ids}')
        hyperparams.optimize_parallel_gpu(
            main_local,
            gpu_ids=gpu_ids,
            nb_trials=hyperparams.nb_hopt_trials,
            nb_workers=len(gpu_ids),
        )