In [132]:
import random, string, os, sys
import numpy as np
import cntk as C
from __future__ import print_function

In [115]:
num_classes = 5
input_dim = 4

In [142]:
def make_data(n):
    names = [''.join(random.choice(string.ascii_lowercase) for i in range(10)) for i in range(n//5)]
    data = []
    for i in range(n):
        vect = [0] * 4
        vect[0], vect[1] = random.choice(names), random.choice(names)
        vect[0], vect[1] = random.choice([0, 1]), random.choice([0, 1])
        vect[2], vect[3] = random.choice([0, 1]), random.choice([0, 1])
        
        vect.append((vect[2] + vect[3]) % random.randint(1, num_classes))
        data.append(vect)
       # labels.append(random.randint(0, n//10) + vect[2] + vect[3] + sum([ord(i) for i in vect[0]]))
    return data

In [143]:
# Save the data files into a format compatible with CNTK text reader
def savetxt(filename, ndarray):
    dir = os.path.dirname(filename)
    
    print("Saving", filename )
    with open(filename, 'w') as f:
        labels = list(map(' '.join, np.eye(num_classes, dtype=np.uint).astype(str)))
        for row in ndarray:
            row_str = row.astype(str)
            label_str = labels[row[-1]]
            feature_str = ' '.join(row_str[:-1])
            line = '|labels {} |features {}\n'.format(label_str, feature_str)
            f.write(line)
           # print(line)

In [144]:
data = make_data(2000)

In [145]:
savetxt("data", np.array(data))

Saving data


In [146]:
# Read a CTF formatted text (as mentioned above) using the CTF deserializer from a file
def create_reader(path, is_training, input_dim, num_label_classes):
    return C.io.MinibatchSource(C.io.CTFDeserializer(path, C.io.StreamDefs(
        labels = C.io.StreamDef(field='labels', shape=num_label_classes, is_sparse=False),
        features   = C.io.StreamDef(field='features', shape=input_dim, is_sparse=False)
    )), randomize = is_training, max_sweeps = C.io.INFINITELY_REPEAT if is_training else 1)

In [147]:
num_hidden_layers = 2
hidden_layers_dim = 400

In [148]:
input = C.input_variable(num_features)
label = C.input_variable(num_classes)

In [149]:
def create_model(features):
    with C.layers.default_options(init = C.layers.glorot_uniform(), activation = C.ops.relu):
            h = features
            for _ in range(num_hidden_layers):
                h = C.layers.Dense(hidden_layers_dim)(h)
            r = C.layers.Dense(num_classes, activation = None)(h)
            return r
        
z = create_model(input)

In [150]:
loss = C.cross_entropy_with_softmax(z, label)

In [151]:
label_error = C.classification_error(z, label)

In [152]:
learning_rate = 0.2
lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)
learner = C.sgd(z.parameters, lr_schedule)
trainer = C.Trainer(z, (loss, label_error), [learner])

In [153]:
# Define a utility function to compute the moving average sum.
# A more efficient implementation is possible with np.cumsum() function
def moving_average(a, w=5):
    if len(a) < w:
        return a[:]    # Need to send a copy of the array
    return [val if idx < w else sum(a[(idx-w):idx])/w for idx, val in enumerate(a)]


# Defines a utility that prints the training progress
def print_training_progress(trainer, mb, frequency, verbose=1):
    training_loss = "NA"
    eval_error = "NA"

    if mb%frequency == 0:
        training_loss = trainer.previous_minibatch_loss_average
        eval_error = trainer.previous_minibatch_evaluation_average
        if verbose: 
            print ("Minibatch: {0}, Loss: {1:.4f}, Error: {2:.2f}%".format(mb, training_loss, eval_error*100))
        
    return mb, training_loss, eval_error

In [154]:
minibatch_size = 64
num_samples_per_sweep = 60000
num_sweeps_to_train_with = 10
num_minibatches_to_train = (num_samples_per_sweep * num_sweeps_to_train_with) / minibatch_size
train_file = os.path.join("data")

In [155]:
# Create the reader to training data set
reader_train = create_reader(train_file, True, input_dim, num_classes)

# Map the data streams to the input and labels.
input_map = {
    label  : reader_train.streams.labels,
    input  : reader_train.streams.features
} 

# Run the trainer on and perform model training
training_progress_output_freq = 500

plotdata = {"batchsize":[], "loss":[], "error":[]}

for i in range(0, int(num_minibatches_to_train)):
    
    # Read a mini batch from the training data file
    data = reader_train.next_minibatch(minibatch_size, input_map = input_map)
    
    trainer.train_minibatch(data)
    batchsize, loss, error = print_training_progress(trainer, i, training_progress_output_freq, verbose=1)
    
    if not (loss == "NA" or error =="NA"):
        plotdata["batchsize"].append(batchsize)
        plotdata["loss"].append(loss)
        plotdata["error"].append(error)

Minibatch: 0, Loss: 1.6156, Error: 78.12%
Minibatch: 500, Loss: 0.5696, Error: 25.00%
Minibatch: 1000, Loss: 0.3953, Error: 17.19%
Minibatch: 1500, Loss: 0.3779, Error: 18.75%
Minibatch: 2000, Loss: 0.3627, Error: 17.19%
Minibatch: 2500, Loss: 0.4498, Error: 17.19%
Minibatch: 3000, Loss: 0.4143, Error: 18.75%
Minibatch: 3500, Loss: 0.4957, Error: 20.31%
Minibatch: 4000, Loss: 0.5111, Error: 28.12%
Minibatch: 4500, Loss: 0.3784, Error: 20.31%
Minibatch: 5000, Loss: 0.3215, Error: 12.50%
Minibatch: 5500, Loss: 0.4815, Error: 31.25%
Minibatch: 6000, Loss: 0.4234, Error: 20.31%
Minibatch: 6500, Loss: 0.4449, Error: 21.88%
Minibatch: 7000, Loss: 0.4665, Error: 25.00%
Minibatch: 7500, Loss: 0.4521, Error: 26.56%


KeyboardInterrupt: 