-
Notifications
You must be signed in to change notification settings - Fork 24
/
train_mvod_lstm5.py
319 lines (281 loc) · 12.7 KB
/
train_mvod_lstm5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
#!/usr/bin/python3
"""Script for training MobileVOD with 3 Bottleneck LSTM layers and 1 LSTM layer. As in MobileNet, depthwise separable convolutions
are used to reduce the computation without affecting accuracy much. The model is trained on the ImageNet VID 2015 dataset.
Here the LSTM is unrolled for 10 steps, i.e. 10 consecutive frames of video are given as input.
Few global variables defined here are explained:
Global Variables
----------------
args : argparse.Namespace
Holds all the command-line options for configuring the model as well as the hyper-parameters for training.
dataset : VIDDataset (torch.utils.data.Dataset, For more info see datasets/vid_dataset.py)
optimizer : optim.SGD
scheduler : CosineAnnealingLR, MultiStepLR (torch.optim.lr_scheduler)
config : mobilenetv1_ssd_config (See config/mobilenetv1_ssd_config.py for more info, where you can change input size and ssd priors)
loss : MultiboxLoss (See network/multibox_loss.py for more info)
"""
import argparse
import os
import logging
import sys
import itertools
import torch
from torch.utils.data import DataLoader, ConcatDataset
from torch.optim.lr_scheduler import CosineAnnealingLR, MultiStepLR
from utils.misc import str2bool, Timer, store_labels
from network.mvod_lstm5 import MobileVOD, SSD, MobileNetV1, MatchPrior
from datasets.vid_dataset import VIDDataset
from network.multibox_loss import MultiboxLoss
from config import mobilenetv1_ssd_config
from dataloaders.data_preprocessing import TrainAugmentation, TestTransform
# ---------------------------------------------------------------------------
# Command-line interface, logging and device selection.
# Everything here runs at import time, exactly as in the original script.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(
    description='Mobile Video Object Detection (Bottleneck LSTM) Training With Pytorch')

parser.add_argument('--datasets', help='Dataset directory path')
parser.add_argument('--freeze_net', action='store_true',
                    help="Freeze all the layers except the prediction head.")
parser.add_argument('--width_mult', default=1.0, type=float,
                    help='Width Multiplifier')

# Params for SGD
parser.add_argument('--lr', '--learning-rate', default=0.003, type=float,
                    help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float,
                    help='Momentum value for optim')
parser.add_argument('--weight_decay', default=5e-4, type=float,
                    help='Weight decay for SGD')
parser.add_argument('--gamma', default=0.1, type=float,
                    help='Gamma update for SGD')
parser.add_argument('--base_net_lr', default=None, type=float,
                    help='initial learning rate for base net.')
parser.add_argument('--ssd_lr', default=None, type=float,
                    help='initial learning rate for the layers not in base net and prediction heads.')

# Params for loading pretrained basenet or checkpoints.
parser.add_argument('--pretrained', help='Pre-trained model')
parser.add_argument('--resume', default=None, type=str,
                    help='Checkpoint state_dict file to resume training from')

# Scheduler
parser.add_argument('--scheduler', default="multi-step", type=str,
                    help="Scheduler for SGD. It can one of multi-step and cosine")

# Params for Multi-step Scheduler
parser.add_argument('--milestones', default="80,100", type=str,
                    help="milestones for MultiStepLR")

# Params for Cosine Annealing
parser.add_argument('--t_max', default=120, type=float,
                    help='T_max value for Cosine Annealing Scheduler.')

# Train params
parser.add_argument('--batch_size', default=1, type=int,
                    help='Batch size for training')
parser.add_argument('--num_epochs', default=200, type=int,
                    help='the number epochs')
parser.add_argument('--num_workers', default=4, type=int,
                    help='Number of workers used in dataloading')
# The help text used to be a copy of the --num_epochs one; this option is the
# validation interval (the training loop validates when epoch % value == 0).
parser.add_argument('--validation_epochs', default=5, type=int,
                    help='Run validation (and save a checkpoint) every this many epochs')
parser.add_argument('--debug_steps', default=100, type=int,
                    help='Set the debug log output frequency.')
parser.add_argument('--sequence_length', default=10, type=int,
                    help='sequence_length of video to unfold')
parser.add_argument('--use_cuda', default=True, type=str2bool,
                    help='Use CUDA to train model')
parser.add_argument('--checkpoint_folder', default='models/',
                    help='Directory for saving checkpoint models')

# All log records go to stdout with a timestamped format.
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

args = parser.parse_args()

# CUDA is used only when it was requested AND is actually available; the
# condition is evaluated once and shared by the device choice and cuDNN setup.
_cuda_enabled = bool(args.use_cuda and torch.cuda.is_available())
DEVICE = torch.device("cuda:0" if _cuda_enabled else "cpu")

if _cuda_enabled:
    torch.backends.cudnn.benchmark = True
    logging.info("Use Cuda.")
def train(loader, net, criterion, optimizer, device, debug_steps=100, epoch=-1, sequence_length=10):
    """ Train model for one pass over ``loader``.

    Each item produced by ``loader`` is unpacked into ``(images, boxes, labels)``.
    The three are walked in lockstep and one optimisation step is performed per
    (image, box, label) triple. ``net.detach_hidden()`` is called after every
    loader item and once more when the loop is finished.

    Arguments:
        loader : training data loader object
        net : object of MobileVOD class
        criterion : Loss function to use; called as
            ``criterion(confidence, locations, label, box)`` and expected to
            return ``(regression_loss, classification_loss)``
        optimizer : optimizer to optimize model
        device : device on which computation is done
        debug_steps : number of loader items between two log messages;
            ``0`` disables the periodic log message
        epoch : current epoch number (only used in the log message)
        sequence_length : unroll length of model. Kept for backward
            compatibility; the logged averages are computed from the number of
            optimisation steps that were really executed, so this value no
            longer influences them.
    """
    net.train(True)
    running_loss = 0.0
    running_regression_loss = 0.0
    running_classification_loss = 0.0
    # Number of optimisation steps accumulated into the running sums since the
    # last log message. Dividing by this (instead of the former hard-coded
    # debug_steps * sequence_length) keeps the average exact even though the
    # first window spans debug_steps + 1 loader items and regardless of how
    # many triples a loader item yields.
    steps_in_window = 0

    for i, (images, boxes, labels) in enumerate(loader):
        for image, box, label in zip(images, boxes, labels):
            image = image.to(device)
            box = box.to(device)
            label = label.to(device)

            optimizer.zero_grad()
            confidence, locations = net(image)
            regression_loss, classification_loss = criterion(confidence, locations, label, box)  # TODO CHANGE BOXES
            loss = regression_loss + classification_loss
            # NOTE(review): retain_graph=True is kept from the original;
            # presumably the recurrent state ties consecutive steps' graphs
            # together until detach_hidden() is called - confirm in MobileVOD.
            loss.backward(retain_graph=True)
            optimizer.step()

            running_loss += loss.item()
            running_regression_loss += regression_loss.item()
            running_classification_loss += classification_loss.item()
            steps_in_window += 1

        net.detach_hidden()

        # `i and ...` skips the very first item; `debug_steps and ...` avoids a
        # modulo-by-zero when periodic logging is switched off with 0.
        if debug_steps and i and i % debug_steps == 0:
            divisor = max(steps_in_window, 1)
            avg_loss = running_loss / divisor
            avg_reg_loss = running_regression_loss / divisor
            avg_clf_loss = running_classification_loss / divisor
            logging.info(
                f"Epoch: {epoch}, Step: {i}, " +
                f"Average Loss: {avg_loss:.4f}, " +
                f"Average Regression Loss {avg_reg_loss:.4f}, " +
                f"Average Classification Loss: {avg_clf_loss:.4f}"
            )
            running_loss = 0.0
            running_regression_loss = 0.0
            running_classification_loss = 0.0
            steps_in_window = 0

    net.detach_hidden()
def val(loader, net, criterion, device):
    """ Validate model

    Puts ``net`` into evaluation mode and accumulates the losses of every
    (image, box, label) triple obtained from ``loader`` with gradient tracking
    disabled for the forward pass and loss computation.
    ``net.detach_hidden()`` is called after every loader item.

    Arguments:
        loader : validation data loader object
        net : object of MobileVOD class
        criterion : Loss function to use; called as
            ``criterion(confidence, locations, label, box)`` and expected to
            return ``(regression_loss, classification_loss)``
        device : device on which computation is done
    Returns:
        loss, regression loss, classification loss, each averaged over the
        number of evaluated triples. When ``loader`` yields nothing, a warning
        is logged and three NaN values are returned instead of failing with a
        ZeroDivisionError.
    """
    net.eval()
    running_loss = 0.0
    running_regression_loss = 0.0
    running_classification_loss = 0.0
    num = 0

    for images, boxes, labels in loader:
        for image, box, label in zip(images, boxes, labels):
            image = image.to(device)
            box = box.to(device)
            label = label.to(device)
            num += 1

            with torch.no_grad():
                confidence, locations = net(image)
                regression_loss, classification_loss = criterion(confidence, locations, label, box)
                loss = regression_loss + classification_loss

            running_loss += loss.item()
            running_regression_loss += regression_loss.item()
            running_classification_loss += classification_loss.item()
        net.detach_hidden()

    if num == 0:
        # Nothing was evaluated: an average is undefined. NaN makes that
        # visible in logs without aborting the caller.
        logging.warning("Validation loader yielded no samples; returning NaN losses.")
        nan = float("nan")
        return nan, nan, nan

    return running_loss / num, running_regression_loss / num, running_classification_loss / num
def initialize_model(pred_enc, pred_dec):
    """ Loads learned weights from pretrained checkpoint model

    Does nothing unless ``args.pretrained`` is set. Otherwise the checkpoint is
    loaded once and, for each of the two networks, every checkpoint entry whose
    key also exists in that network's ``state_dict()`` overwrites the network's
    current value; entries with unknown keys are ignored.

    Arguments:
        pred_enc : object of MobileNetV1
        pred_dec : object of SSD
    """
    if not args.pretrained:
        return

    logging.info("Loading weights from pretrained netwok")
    # Map every storage to CPU while deserialising, like the --resume path of
    # this script does, so a checkpoint saved from a GPU can also be read on a
    # machine without CUDA.
    pretrained_net_dict = torch.load(args.pretrained,
                                     map_location=lambda storage, loc: storage)

    for sub_net in (pred_enc, pred_dec):
        model_dict = sub_net.state_dict()
        # 1. filter out unnecessary keys
        matching = {k: v for k, v in pretrained_net_dict.items() if k in model_dict}
        # 2. overwrite entries in the existing state dict
        model_dict.update(matching)
        sub_net.load_state_dict(model_dict)
if __name__ == '__main__':
    # Script entry point: build datasets, network, optimiser and scheduler,
    # then alternate training and (periodic) validation + checkpointing.
    timer = Timer()  # kept from the original; not used further in this block

    logging.info(args)
    config = mobilenetv1_ssd_config  # config file for priors etc.
    train_transform = TrainAugmentation(config.image_size, config.image_mean, config.image_std)
    target_transform = MatchPrior(config.priors, config.center_variance,
                                  config.size_variance, 0.5)
    test_transform = TestTransform(config.image_size, config.image_mean, config.image_std)

    logging.info("Prepare training datasets.")
    train_dataset = VIDDataset(args.datasets, transform=train_transform,
                               target_transform=target_transform)
    label_file = os.path.join("models/", "vid-model-labels.txt")
    # The label file is written before any checkpoint directory is created, so
    # make sure its parent directory exists first.
    os.makedirs(os.path.dirname(label_file), exist_ok=True)
    store_labels(label_file, train_dataset._classes_names)
    num_classes = len(train_dataset._classes_names)
    logging.info(f"Stored labels into file {label_file}.")
    logging.info("Train dataset size: {}".format(len(train_dataset)))
    train_loader = DataLoader(train_dataset, args.batch_size,
                              num_workers=args.num_workers,
                              shuffle=True)

    logging.info("Prepare Validation datasets.")
    val_dataset = VIDDataset(args.datasets, transform=test_transform,
                             target_transform=target_transform, is_val=True)
    logging.info(val_dataset)
    logging.info("validation dataset size: {}".format(len(val_dataset)))
    val_loader = DataLoader(val_dataset, args.batch_size,
                            num_workers=args.num_workers,
                            shuffle=False)

    logging.info("Build network.")
    pred_enc = MobileNetV1(num_classes=num_classes, alpha=args.width_mult)
    pred_dec = SSD(num_classes=num_classes, batch_size=args.batch_size,
                   alpha=args.width_mult, is_test=False)
    if args.resume is None:
        # Fresh run: optionally seed encoder/decoder from --pretrained.
        initialize_model(pred_enc, pred_dec)
        net = MobileVOD(pred_enc, pred_dec)
    else:
        net = MobileVOD(pred_enc, pred_dec)
        logging.info("Updating weights from resume model")
        net.load_state_dict(
            torch.load(args.resume,
                       map_location=lambda storage, loc: storage))

    # The checkpoint stores only a state_dict, so the epoch counter always
    # restarts from the beginning, also when resuming.
    last_epoch = -1

    base_net_lr = args.base_net_lr if args.base_net_lr is not None else args.lr
    ssd_lr = args.ssd_lr if args.ssd_lr is not None else args.lr

    if args.freeze_net:
        logging.info("Freeze net.")
        for param in pred_enc.parameters():
            param.requires_grad = False
        # Freezing upto new lstm layer.
        # Assigning `requires_grad` on an nn.Module merely creates a plain
        # Python attribute and freezes nothing, so the flag is set on each
        # parameter of the layer. Anything that is not an nn.Module keeps the
        # original direct assignment.
        frozen_decoder_layers = (
            "conv13",
            "bottleneck_lstm1", "fmaps_1",
            "bottleneck_lstm2", "fmaps_2",
            "bottleneck_lstm3", "fmaps_3",
            "lstm4", "fmaps_4",
        )
        for layer_name in frozen_decoder_layers:
            layer = getattr(net.pred_decoder, layer_name)
            if isinstance(layer, torch.nn.Module):
                for param in layer.parameters():
                    param.requires_grad = False
            else:
                layer.requires_grad = False

    net.to(DEVICE)

    criterion = MultiboxLoss(config.priors, iou_threshold=0.5, neg_pos_ratio=3,
                             center_variance=0.1, size_variance=0.2, device=DEVICE)
    # Two parameter groups so encoder and decoder can use different learning rates.
    optimizer = torch.optim.SGD(
        [
            {'params': list(net.pred_encoder.parameters()), 'lr': base_net_lr},
            {'params': list(net.pred_decoder.parameters()), 'lr': ssd_lr},
        ],
        lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
    logging.info(f"Learning rate: {args.lr}, Base net learning rate: {base_net_lr}, "
                 + f"Extra Layers learning rate: {ssd_lr}.")

    if args.scheduler == 'multi-step':
        logging.info("Uses MultiStepLR scheduler.")
        milestones = [int(v.strip()) for v in args.milestones.split(",")]
        # --gamma used to be parsed but ignored (0.1 was hard-coded here); its
        # default is 0.1, so default behaviour is unchanged.
        scheduler = MultiStepLR(optimizer, milestones=milestones,
                                gamma=args.gamma, last_epoch=last_epoch)
    elif args.scheduler == 'cosine':
        logging.info("Uses CosineAnnealingLR scheduler.")
        scheduler = CosineAnnealingLR(optimizer, args.t_max, last_epoch=last_epoch)
    else:
        logging.fatal(f"Unsupported Scheduler: {args.scheduler}.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    output_path = os.path.join(args.checkpoint_folder, "lstm5")
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_path, exist_ok=True)

    logging.info(f"Start training from epoch {last_epoch + 1}.")
    for epoch in range(last_epoch + 1, args.num_epochs):
        train(train_loader, net, criterion, optimizer,
              device=DEVICE, debug_steps=args.debug_steps, epoch=epoch,
              sequence_length=args.sequence_length)
        # Step the LR schedule after the epoch's optimizer updates; stepping it
        # first (as before) skips the first value of the schedule on
        # PyTorch >= 1.1 and triggers a warning there.
        scheduler.step()

        if epoch % args.validation_epochs == 0 or epoch == args.num_epochs - 1:
            val_loss, val_regression_loss, val_classification_loss = val(val_loader, net, criterion, DEVICE)
            logging.info(
                f"Epoch: {epoch}, " +
                f"Validation Loss: {val_loss:.4f}, " +
                f"Validation Regression Loss {val_regression_loss:.4f}, " +
                f"Validation Classification Loss: {val_classification_loss:.4f}"
            )
            # A checkpoint is written only on validation epochs because the
            # file name embeds the validation loss.
            model_path = os.path.join(output_path, f"WM-{args.width_mult}-Epoch-{epoch}-Loss-{val_loss}.pth")
            torch.save(net.state_dict(), model_path)
            logging.info(f"Saved model {model_path}")