diff --git a/.gitignore b/.gitignore
index c823960..5c7da07 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,6 @@
 *.pyc
 *.log
 *.swp
+*.tmp
+*.gz
+*.txt
diff --git a/cmds/run_DNN.py b/cmds/run_DNN.py
index ceecff8..fb8dcdb 100755
--- a/cmds/run_DNN.py
+++ b/cmds/run_DNN.py
@@ -94,17 +94,17 @@
     log('> ... finetuning the model')
     while (cfg.lrate.get_rate() != 0):
-        # one epoch of sgd training 
+        # one epoch of sgd training
         train_error = train_sgd(train_fn, cfg)
         log('> epoch %d, training error %f ' % (cfg.lrate.epoch, 100*numpy.mean(train_error)) + '(%)')
-        # validation 
+        # validation
         valid_error = validate_by_minibatch(valid_fn, cfg)
         log('> epoch %d, lrate %f, validation error %f ' % (cfg.lrate.epoch, cfg.lrate.get_rate(), 100*numpy.mean(valid_error)) + '(%)')
         cfg.lrate.get_next_rate(current_error = 100*numpy.mean(valid_error))
         # output nnet parameters and lrate, for training resume
         if cfg.lrate.epoch % cfg.model_save_step == 0:
             _nnet2file(dnn.layers, filename=wdir + '/nnet.tmp')
-            _lrate2file(cfg.lrate, wdir + '/training_state.tmp') 
+            _lrate2file(cfg.lrate, wdir + '/training_state.tmp')

     # save the model and network configuration
     if cfg.param_output_file != '':
@@ -117,8 +117,8 @@
     # output the model into Kaldi-compatible format
     if cfg.kaldi_output_file != '':
         dnn.write_model_to_kaldi(cfg.kaldi_output_file)
-        log('> ... the final Kaldi model is ' + cfg.kaldi_output_file) 
+        log('> ... the final Kaldi model is ' + cfg.kaldi_output_file)

-    # remove the tmp files (which have been generated from resuming training) 
+    # remove the tmp files (which have been generated from resuming training)
     os.remove(wdir + '/nnet.tmp')
-    os.remove(wdir + '/training_state.tmp')
+    os.remove(wdir + '/training_state.tmp')
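
Note: the run_DNN.py hunks above are whitespace cleanup around the checkpointing logic: every model_save_step epochs the current weights and learning-rate state are dumped to nnet.tmp / training_state.tmp so an interrupted job can resume, and both files are removed once training finishes. For orientation only, a resumed run would restore that state roughly as in the sketch below; the loader counterparts (_file2nnet, _file2lrate) and their exact signatures are assumptions here, not part of this diff.

    import os
    # hypothetical resume check, mirroring the checkpoint files written in the loop above
    if os.path.exists(wdir + '/nnet.tmp') and os.path.exists(wdir + '/training_state.tmp'):
        _file2nnet(dnn.layers, filename = wdir + '/nnet.tmp')    # reload layer weights (assumed signature)
        cfg.lrate = _file2lrate(wdir + '/training_state.tmp')    # reload epoch and learning-rate state (assumed signature)
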
diff --git a/cmds/run_MTL.py b/cmds/run_MTL.py
index d07e1b9..e249fab 100755
--- a/cmds/run_MTL.py
+++ b/cmds/run_MTL.py
@@ -32,11 +32,11 @@
 from utils.utils import parse_arguments, parse_data_spec_mtl, parse_nnet_spec_mtl
 from utils.learn_rates import _lrate2file, _file2lrate
-from utils.network_config import NetworkConfig 
+from utils.network_config import NetworkConfig
 from learning.sgd import validate_by_minibatch

 # Implements Multi-Task Learning (MTL) in which several tasks share some lower hidden
-# layers (shared representation learning). Each task has its specific higher layers (in 
+# layers (shared representation learning). Each task has its specific higher layers (in
 # the simplest case, a task-specific softmax layer). References include:

 # J. Huang, J. Li, D. Yu, L. Deng, and Y. Gong. Cross-language knowledge transfer using
@@ -44,13 +44,13 @@
 # Y. Miao, and F. Metze. Improving language-universal feature extraction with deep maxout
 # and convolutional neural networks. Interspeech 2014.
-    
+
 if __name__ == '__main__':

     # check the arguments
     arg_elements = [sys.argv[i] for i in range(1, len(sys.argv))]
-    arguments = parse_arguments(arg_elements) 
+    arguments = parse_arguments(arg_elements)
     required_arguments = ['train_data', 'valid_data', 'task_number', 'shared_nnet_spec', 'indiv_nnet_spec', 'wdir']
     for arg in required_arguments:
@@ -64,25 +64,25 @@
     wdir = arguments['wdir']

     # various lists used in MTL
-    config_array = [] 
+    config_array = []
     train_fn_array = []; valid_fn_array = []
     dnn_array = []
-    
+
     # parse data specification
     train_data_spec_array = parse_data_spec_mtl(train_data_spec)
     valid_data_spec_array = parse_data_spec_mtl(valid_data_spec)
     if len(train_data_spec_array) != task_number or len(valid_data_spec_array) != task_number:
         print "Error: #datasets in data specification doesn't match #tasks"; exit(1)
     # split shared_spec ans indiv_spec into individual task's networks
-    nnet_spec_array, shared_layers_num = parse_nnet_spec_mtl(shared_spec, indiv_spec) 
+    nnet_spec_array, shared_layers_num = parse_nnet_spec_mtl(shared_spec, indiv_spec)
     if len(nnet_spec_array) != task_number:
         print "Error: #networks specified by --indiv-spec doesn't match #tasks"; exit(1)

     # parse network configuration from arguments, and initialize data reading
     for n in xrange(task_number):
         network_config = NetworkConfig()
         network_config.parse_config_dnn(arguments, nnet_spec_array[n])
-        network_config.init_data_reading(train_data_spec_array[n], valid_data_spec_array[n]) 
-        config_array.append(network_config) 
+        network_config.init_data_reading(train_data_spec_array[n], valid_data_spec_array[n])
+        config_array.append(network_config)

     numpy_rng = numpy.random.RandomState(89677)
     theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
@@ -104,7 +104,7 @@
         # get the training, validation and testing function for the model
         log('> ... getting the finetuning functions for task %d' % (n))
         train_fn, valid_fn = dnn.build_finetune_functions((cfg.train_x, cfg.train_y), (cfg.valid_x, cfg.valid_y), batch_size=cfg.batch_size)
-        # add dnn and the functions to the list 
+        # add dnn and the functions to the list
         dnn_array.append(dnn)
         train_fn_array.append(train_fn); valid_fn_array.append(valid_fn)
     # check the working dir to decide whether it's resuming training; if yes, load the tmp network files for initialization
@@ -137,7 +137,7 @@
             batch_numbers_per_chunk[n] = config_array[n].train_sets.cur_frame_num / config_array[n].batch_size
         # although we set one single trunk size, the actual size of data chunks we read in may differ
         # across the tasks. this is because we may reach the end of the data file. thus, we loop over
-        # the max number of mini-batches, but do the checking on each individual task 
+        # the max number of mini-batches, but do the checking on each individual task
         for batch_index in xrange(max(batch_numbers_per_chunk)): # loop over mini-batches
             for n in active_tasks:
                 if batch_index < batch_numbers_per_chunk[n]:
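
Note: the last run_MTL.py hunk keeps the comment describing the chunk-reading pattern: once a task's data file nears its end, its current chunk (and hence its entry in batch_numbers_per_chunk) can be smaller than the other tasks', so the loop runs up to the largest per-task batch count and simply skips tasks whose chunk is already exhausted. A minimal standalone sketch of that pattern, with placeholder batch counts and an empty training step:

    batch_numbers_per_chunk = [10, 10, 7]   # placeholder per-task batch counts for the current chunks
    active_tasks = range(len(batch_numbers_per_chunk))
    for batch_index in xrange(max(batch_numbers_per_chunk)): # loop over mini-batches
        for n in active_tasks:
            if batch_index < batch_numbers_per_chunk[n]:
                pass   # train task n on mini-batch batch_index; tasks with a smaller chunk are skipped here
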
diff --git a/learning/sgd.py b/learning/sgd.py
index 19376f6..b321c6b 100755
--- a/learning/sgd.py
+++ b/learning/sgd.py
@@ -28,7 +28,11 @@ def validate_by_minibatch_verbose(valid_fn, valid_sets, valid_xy, batch_size):
     while (not valid_sets.is_finish()):
         valid_sets.load_next_partition(valid_xy)
         for batch_index in xrange(valid_sets.cur_frame_num / batch_size): # loop over mini-batches
-            valid_error.append(valid_fn(index=batch_index))
+            mean_error = valid_fn(index=batch_index)
+            valid_error += [mean_error] * batch_size
+        if valid_sets.cur_frame_num % batch_size > 0:
+            mean_error = valid_fn(index = valid_sets.cur_frame_num / batch_size)
+            valid_error += [mean_error] * (valid_sets.cur_frame_num % batch_size)
     valid_sets.initialize_read()
     return valid_error

@@ -39,7 +43,11 @@ def validate_by_minibatch(valid_fn, cfg):
     while (not valid_sets.is_finish()):
         valid_sets.load_next_partition(valid_xy)
         for batch_index in xrange(valid_sets.cur_frame_num / batch_size): # loop over mini-batches
-            valid_error.append(valid_fn(index=batch_index))
+            mean_error = valid_fn(index=batch_index)
+            valid_error += [mean_error] * batch_size
+        if valid_sets.cur_frame_num % batch_size > 0:
+            mean_error = valid_fn(index = valid_sets.cur_frame_num / batch_size)
+            valid_error += [mean_error] * (valid_sets.cur_frame_num % batch_size)
     valid_sets.initialize_read()
     return valid_error

@@ -56,19 +64,27 @@ def train_sgd_verbose(train_fn, train_sets, train_xy, batch_size, learning_rate,
     while (not train_sets.is_finish()):
         train_sets.load_next_partition(train_xy)
         for batch_index in xrange(train_sets.cur_frame_num / batch_size): # loop over mini-batches
-            train_error.append(train_fn(index=batch_index, learning_rate = learning_rate, momentum = momentum))
+            mean_error = train_fn(index=batch_index, learning_rate = learning_rate, momentum = momentum)
+            train_error += [mean_error] * batch_size
+        if train_sets.cur_frame_num % batch_size > 0:
+            mean_error = train_fn(index=train_sets.cur_frame_num / batch_size, learning_rate = learning_rate, momentum = momentum)
+            train_error += [mean_error] * (train_sets.cur_frame_num % batch_size)
     train_sets.initialize_read()
     return train_error

 def train_sgd(train_fn, cfg):
     train_sets = cfg.train_sets; train_xy = cfg.train_xy
     batch_size = cfg.batch_size
-    learning_rate = cfg.lrate.get_rate(); momentum = cfg.momentum 
-    
+    learning_rate = cfg.lrate.get_rate(); momentum = cfg.momentum
+
     train_error = []
     while (not train_sets.is_finish()):
         train_sets.load_next_partition(train_xy)
         for batch_index in xrange(train_sets.cur_frame_num / batch_size): # loop over mini-batches
-            train_error.append(train_fn(index=batch_index, learning_rate = learning_rate, momentum = momentum))
+            mean_error = train_fn(index=batch_index, learning_rate = learning_rate, momentum = momentum)
+            train_error += [mean_error] * batch_size
+        if train_sets.cur_frame_num % batch_size > 0:
+            mean_error = train_fn(index=train_sets.cur_frame_num / batch_size, learning_rate = learning_rate, momentum = momentum)
+            train_error += [mean_error] * (train_sets.cur_frame_num % batch_size)
     train_sets.initialize_read()
     return train_error
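
Note: the learning/sgd.py change is the only functional one in this diff. train_fn / valid_fn return the mean error over one mini-batch, and the callers in run_DNN.py average the collected list with numpy.mean; previously every list entry counted equally and the frames left over at the end of a partition (cur_frame_num % batch_size) were never evaluated at all. The new code repeats each mini-batch mean once per frame it covers and also runs the final partial batch, so numpy.mean over the list becomes a true per-frame average. A small self-contained illustration of why the weighting matters, with made-up error values:

    import numpy

    batch_size = 4
    # per-frame errors for a 10-frame partition: two full batches plus a 2-frame leftover
    errors = [0., 0., 0., 0., 1., 1., 1., 1., 1., 1.]
    batch_means = [numpy.mean(errors[0:4]), numpy.mean(errors[4:8]), numpy.mean(errors[8:10])]
    print numpy.mean(batch_means)     # ~0.667: the 2-frame batch is weighted like a full batch
    print numpy.mean([batch_means[0]] * 4 + [batch_means[1]] * 4 + [batch_means[2]] * 2)   # 0.6: the true per-frame mean
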
diff --git a/utils/sda_config.py b/utils/sda_config.py
index 927fbcb..f6208a7 100755
--- a/utils/sda_config.py
+++ b/utils/sda_config.py
@@ -24,12 +24,12 @@ class SdAConfig():
     def __init__(self):

-        # parameters related with training 
+        # parameters related with training
         self.epochs = 5    # number of training epochs for each layer
         self.batch_size = 128    # size of mini-batches
         self.corruption_levels = [0.2 for n in xrange(100)]    # denoising factor; we use an array for future extension to layer-specific factor
         self.learning_rates = [.01 for n in xrange(100)]    # learning rate for each layer
-        self.momentum = 0    # momentum 
+        self.momentum = 0    # momentum
         self.ptr_layer_number = 0    # number of layers to be trained
         self.hidden_activation = T.nnet.sigmoid    # activation function of the hidden layer/output
@@ -37,7 +37,7 @@ def __init__(self):
         # if we normaze the input data with mean (maybe also with variance)
         # normalization, then we need the tanh activation function to reconstruct
         # the input
-        
+
         # for maxout autoencoder
         self.do_maxout = False    # whether to apply maxout on the hidden layer
         self.pool_size = 1    # pooling size of maxout
@@ -48,7 +48,7 @@ def __init__(self):
         self.train_x = None
         self.train_y = None

-        # interfaces for validation data. we don't do validation for RBM, so these variables will be None 
+        # interfaces for validation data. we don't do validation for RBM, so these variables will be None
         # we have these variables because we want to use the _cfg2file function from io_func/model_io.py
         self.valid_sets = None
         self.valid_xy = None
@@ -78,7 +78,7 @@ def init_data_reading(self, train_data_spec):
     def init_activation(self):
         self.activation = parse_activation(self.activation_text)

-    # parse the arguments to get the values for various variables 
+    # parse the arguments to get the values for various variables
    def parse_config_common(self, arguments):
        if arguments.has_key('corruption_level'):
            self.corruption_levels = [float(arguments['corruption_level']) for n in xrange(100)]
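
Note: the utils/sda_config.py hunks are also whitespace-only, but the surrounding fields show the intent of the per-layer arrays: corruption_levels and learning_rates are pre-filled with 100 identical entries so that a single command-line value applies to every layer while leaving room for layer-specific settings later. A hedged sketch of how a layer-wise pretraining loop might consume them; the loop body is a placeholder and not PDNN's actual pretraining code:

    from utils.sda_config import SdAConfig

    cfg = SdAConfig()
    cfg.ptr_layer_number = 3                    # pretrain the first three layers (placeholder value)
    for i in xrange(cfg.ptr_layer_number):
        lr = cfg.learning_rates[i]              # learning rate for layer i
        corruption = cfg.corruption_levels[i]   # denoising factor for layer i
        # ... run cfg.epochs epochs of denoising-autoencoder training for layer i with (lr, corruption)
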