In [1]:
import os
import sys
import time
import theano
import lasagne
import pickle
import numpy as np

import theano.tensor as T

from utils import iterate_minibatches

Using gpu device 0: GRID K520 (CNMeM is disabled)


Couldn't import dot_parser, loading of dot files will not be possible.


In [15]:
def build_custom_cnn(input_var=None, widths=None, drop_input=.2,
                     drop_hidden=.5):
    # By default, this creates the same network as `build_mlp`, but it can be
    # customized with respect to the number and size of hidden layers. This
    # mostly showcases how creating a network in Python code can be a lot more
    # flexible than a configuration file. Note that to make the code easier,
    # all the layers are just called `network` -- there is no need to give them
    # different names if all we return is the last one we created anyway; we
    # just used different names above for clarity.

    widths = widths if widths is not None else [100]

    # Input layer and dropout (with shortcut `dropout` for `DropoutLayer`):
    network = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),
                                        input_var=input_var)
    if drop_input:
        network = lasagne.layers.dropout(network, p=drop_input)
    # Hidden layers and dropout:
    nonlin = lasagne.nonlinearities.rectify

    
    # Convolutional layer with 32 kernels of size 5x5. Strided and padded
    # convolutions are supported as well; see the docstring.
    network = lasagne.layers.Conv2DLayer(
            network, num_filters=64, filter_size=(5, 5),
            stride=1, pad=2,
            nonlinearity=lasagne.nonlinearities.rectify,
            W=lasagne.init.GlorotUniform())
    # Expert note: Lasagne provides alternative convolutional layers that
    # override Theano's choice of which implementation to use; for details
    # please see http://lasagne.readthedocs.org/en/latest/user/tutorial.html.
    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))

    # Another convolution with 32 5x5 kernels, and another 2x2 pooling:
    network = lasagne.layers.Conv2DLayer(
            network, num_filters=128, filter_size=(5, 5),
            stride=1, pad=2,
            nonlinearity=lasagne.nonlinearities.rectify)
    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))

    
    for width in widths:
        network = lasagne.layers.DenseLayer(
                network, width, nonlinearity=nonlin)
        if drop_hidden:
            network = lasagne.layers.dropout(network, p=drop_hidden)
    # Output layer:
    softmax = lasagne.nonlinearities.softmax
    network = lasagne.layers.DenseLayer(network, 10, nonlinearity=softmax)
    return network

In [16]:
def generate_train_acc(input_X, target_y, widths, drop_hidden=0.5, learning_rate=1e-4, model_name=None):

    dense_output = build_custom_cnn(input_X, widths=widths, drop_hidden=drop_hidden)
    if model_name is not None and os.path.isfile(os.path.join('models', model_name + '.npz')):
        with np.load(os.path.join('models', model_name + '.npz')) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
            lasagne.layers.set_all_param_values(dense_output, param_values)
        
    #предсказание нейронки (theano-преобразование)
    y_predicted = lasagne.layers.get_output(dense_output)

    #все веса нейронки (shared-переменные)
    all_weights = lasagne.layers.get_all_params(dense_output)

    #функция ошибки - средняя кроссэнтропия
    loss = lasagne.objectives.categorical_crossentropy(y_predicted,target_y).mean()
    #loss += lasagne.regularization.l2(0.001)
    accuracy = lasagne.objectives.categorical_accuracy(y_predicted,target_y).mean()

    #сразу посчитать словарь обновлённых значений с шагом по градиенту, как раньше
    updates = lasagne.updates.adam(loss, all_weights, learning_rate=learning_rate)

    #функция, которая обучает сеть на 1 шаг и возвращащет значение функции потерь и точности
    train_fun = theano.function([input_X,target_y],[loss,accuracy],updates=updates)
    accuracy_fun = theano.function([input_X,target_y],accuracy)
    return dense_output, train_fun, accuracy_fun

In [11]:
def run(X_train,y_train,X_val,y_val,X_test,y_test, **kwargs):
    batch_size = kwargs.get('batch_size', 200)
    widths = kwargs.get('widths', [1024, 1024])
    num_epochs = kwargs.get('num_epochs', 4)
    model_name = kwargs.get('model_name', 'default_dense_model')
    snapshot_frequency = kwargs.get('snap_freq', 5)
    print_frequency = kwargs.get('print_freq', 10)
    learning_rate = kwargs.get('learning_rate', 1e-4)
    restart = kwargs.get('restart', False)
    drop_hidden = kwargs.get('drop_hidden', 0.5)
    
    network, train, acc = generate_train_acc(T.tensor4("X"),
                                             T.vector("target Y integer", dtype='int32'),
                                             widths,
                                             drop_hidden,
                                             learning_rate,
                                             None if restart else model_name)
    if os.path.isfile(os.path.join('models', model_name + '.npz')) and not restart:
        result = pickle.load(open(os.path.join('models', model_name + '.dict'), 'r'))
        start_epoch = len(result['train_err'])
    else:
        start_epoch = 0
        result = {}
        result['train_err'] = []
        result['train_acc'] = []
        result['val_acc'] = []
        result['epoch_times'] = []
    
    # Just profile if you need
    #pr = cProfile.Profile()
    #pr.enable()
    max_val_acc = 0.0
    for epoch in range(start_epoch, num_epochs + start_epoch):
        train_err = 0
        train_acc = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train,batch_size):
            inputs, targets = batch
            train_err_batch, train_acc_batch= train(inputs, targets)
            train_err += train_err_batch
            train_acc += train_acc_batch
            train_batches += 1

        # And a full pass over the validation data:
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, batch_size):
            inputs, targets = batch
            val_acc += acc(inputs, targets)
            val_batches += 1

        # Then we print the results for this epoch:
        if epoch % print_frequency == 0:
            print("for dense")
            print("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, start_epoch + num_epochs, time.time() - start_time))
            print("  training loss (in-iteration):\t\t{:.6f}".format(train_err / train_batches))
            print("  train accuracy:\t\t{:.2f} %".format(
                train_acc / train_batches * 100))
            print("  validation accuracy:\t\t{:.2f} %".format(
                val_acc / val_batches * 100))
            sys.stdout.flush()
        result["train_err"].append(train_err / train_batches)
        result["train_acc"].append(train_acc / train_batches * 100)
        result["val_acc"].append(val_acc / val_batches * 100)
        if result["val_acc"][-1] > max_val_acc:
            np.savez(os.path.join('models', model_name + '_best.npz'), *lasagne.layers.get_all_param_values(network))
            max_val_acc = result["val_acc"][-1]
        result['epoch_times'].append(time.time() - start_time)
        
        if epoch % snapshot_frequency == 0 or epoch + 1 == num_epochs + start_epoch:
            np.savez(os.path.join('models', model_name + '.npz'), *lasagne.layers.get_all_param_values(network))
            with open(os.path.join('models', model_name + '.dict'), 'wb') as pickle_file:
                pickle.dump(result, pickle_file)
            
            
    # Just profile if you need
    #pr.disable()
    #pr.print_stats(sort='cumtime')

In [12]:
from mnist.mnist import load_dataset
X_train,y_train,X_val,y_val,X_test,y_test = load_dataset()
print(X_train.shape,y_train.shape)

((50000, 1, 28, 28), (50000,))


In [None]:
params = {
    'num_epochs': 300,
    'learning_rate': 1e-4,
    'snap_freq': 1,
    'print_freq': 5,
    'model_name': 'adam_1e-4_1024_dense_model',
    'drop_hidden': 0.85,
    'restart': True
}

run(X_train,y_train,X_val,y_val,X_test,y_test, **params)

for dense
Epoch 1 of 300 took 28.088s
  training loss (in-iteration):		1.528479
  train accuracy:		46.42 %
  validation accuracy:		76.19 %


In [54]:
params = {
    'num_epochs': 300,
    'learning_rate': 2e-4,
    'snap_freq': 1,
    'print_freq': 5,
    'model_name': 'adam_1em3_dense_model',
    'drop_hidden': 0.8,
    'restart': False
}

run(X_train,y_train,X_val,y_val,X_test,y_test, **params)

for dense
Epoch 1 of 300 took 14.425s
  training loss (in-iteration):		0.442331
  train accuracy:		86.83 %
  validation accuracy:		95.76 %
for dense
Epoch 6 of 300 took 14.928s
  training loss (in-iteration):		0.056697
  train accuracy:		98.20 %
  validation accuracy:		98.17 %
for dense
Epoch 11 of 300 took 14.264s
  training loss (in-iteration):		0.034192
  train accuracy:		98.89 %
  validation accuracy:		98.73 %
for dense
Epoch 16 of 300 took 13.944s
  training loss (in-iteration):		0.023334
  train accuracy:		99.23 %
  validation accuracy:		98.66 %
for dense
Epoch 21 of 300 took 13.820s
  training loss (in-iteration):		0.016460
  train accuracy:		99.49 %
  validation accuracy:		98.74 %
for dense
Epoch 26 of 300 took 14.503s
  training loss (in-iteration):		0.013185
  train accuracy:		99.55 %
  validation accuracy:		98.79 %
for dense
Epoch 31 of 300 took 14.488s
  training loss (in-iteration):		0.009023
  train accuracy:		99.69 %
  validation accuracy:		98.93 %
for dense
Epoch 36 of 

KeyboardInterrupt: 