/
train.py
110 lines (89 loc) · 3.61 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
# File: train.py
# Author: Yuxin Wu <ppwwyyxx@gmail.com>
import argparse
import os
from itertools import count

import tensorflow as tf

from dataflow import DataFlow
from utils import *
from utils.concurrency import EnqueueThread, coordinator_guard
from utils.summary import summary_moving_average, describe_model
def prepare():
    """Create the global-step variable in the current default graph.

    Must be called once per graph before training starts; `start_train`
    later looks the variable up again by GLOBAL_STEP_VAR_NAME, so only
    the graph-level side effect matters here.
    """
    # The local binding of the original code was never used -- the
    # variable is registered in the graph by name, so no handle is kept.
    tf.Variable(0, trainable=False, name=GLOBAL_STEP_OP_NAME)
def start_train(config):
    """
    Start training with the given config.

    Args:
        config: a tensorpack config dictionary. Keys read here:
            'dataset_train' (a DataFlow), 'optimizer' (tf.train.Optimizer),
            'callback' (composite callback object with before_train /
            trigger_step / trigger_epoch), 'session_config' (tf.ConfigProto),
            'inputs' (list of input placeholders), 'input_queue',
            'get_model_func' (builds the model from dequeued inputs),
            'max_epoch' (number of epochs to run).
    """
    dataset_train = config['dataset_train']
    assert isinstance(dataset_train, DataFlow), dataset_train.__class__

    # a tf.train.Optimizer instance
    optimizer = config['optimizer']
    assert isinstance(optimizer, tf.train.Optimizer), optimizer.__class__

    # a list of Callback instance (exposed through a composite object)
    callbacks = config['callback']

    # a tf.ConfigProto instance
    sess_config = config.get('session_config', None)
    assert isinstance(sess_config, tf.ConfigProto), sess_config.__class__

    # input/output variables
    input_vars = config['inputs']
    input_queue = config['input_queue']
    get_model_func = config['get_model_func']
    max_epoch = int(config['max_epoch'])

    enqueue_op = input_queue.enqueue(tuple(input_vars))
    model_inputs = input_queue.dequeue()
    # dequeue() loses static shape information; restore it from the
    # matching input placeholders so the model can build correctly
    for qv, v in zip(model_inputs, input_vars):
        qv.set_shape(v.get_shape())
    output_vars, cost_var = get_model_func(model_inputs, is_training=True)

    # register model pieces in graph collections for later retrieval
    G = tf.get_default_graph()
    G.add_to_collection(FORWARD_FUNC_KEY, get_model_func)
    for v in input_vars:
        G.add_to_collection(INPUT_VARS_KEY, v)
    for v in output_vars:
        G.add_to_collection(OUTPUT_VARS_KEY, v)
    describe_model()

    # created earlier by prepare(); fetched by name
    global_step_var = G.get_tensor_by_name(GLOBAL_STEP_VAR_NAME)

    avg_maintain_op = summary_moving_average(cost_var)
    # maintain average in each step: train_op depends on avg_maintain_op
    with tf.control_dependencies([avg_maintain_op]):
        grads = optimizer.compute_gradients(cost_var)
        for grad, var in grads:
            # BUGFIX: compute_gradients yields None for variables not
            # connected to the cost, and a Tensor's truth value is
            # ambiguous -- compare against None explicitly.
            if grad is not None:
                tf.histogram_summary(var.op.name + '/gradients', grad)
        train_op = optimizer.apply_gradients(grads, global_step_var)

    sess = tf.Session(config=sess_config)
    sess.run(tf.initialize_all_variables())

    # start training:
    coord = tf.train.Coordinator()
    # a thread that keeps filling the queue
    th = EnqueueThread(sess, coord, enqueue_op, dataset_train)
    with sess.as_default(), \
            coordinator_guard(
                sess, coord, th, input_queue):
        callbacks.before_train()
        # BUGFIX: upper bound is max_epoch + 1 so that exactly max_epoch
        # epochs run; the original xrange(1, max_epoch) stopped one early.
        for epoch in xrange(1, max_epoch + 1):
            with timed_operation('epoch {}'.format(epoch)):
                for step in xrange(dataset_train.size()):
                    fetches = [train_op, cost_var] + output_vars + model_inputs
                    results = sess.run(fetches)
                    cost = results[1]
                    outputs = results[2:2 + len(output_vars)]
                    inputs = results[-len(model_inputs):]
                    callbacks.trigger_step(inputs, outputs, cost)
            # note that summary_op will take a data from the queue.
            callbacks.trigger_epoch()
def main(get_config_func):
    """Parse command-line arguments, build a fresh graph, and train.

    Args:
        get_config_func: zero-argument callable returning a tensorpack
            config dictionary, consumed by `start_train`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu',
                        help='comma separated list of GPU(s) to use.')  # nargs='*' in multi mode
    args = parser.parse_args()
    if args.gpu:
        # requires `import os` at file level (was missing from the
        # original import block); must be set before TF touches the GPUs
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    with tf.Graph().as_default():
        # create the global-step variable before the model is built
        prepare()
        config = get_config_func()
        start_train(config)