In [34]:
#!/usr/bin/env python3

import matplotlib.pyplot as plt
import numpy as np
import pickle as pkl
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import sklearn.preprocessing
import os 
from math import sqrt 

# plt.close('all')  # if you like

# load dataset
with open('isolet_crop_train.pkl', 'rb') as f:
    train_data = pkl.load(f)
with open('isolet_crop_test.pkl', 'rb') as f:
    test_data = pkl.load(f)

X_alltrain, Y_alltrain = train_data
X_test, Y_test = test_data

def onehotK(labels):
    label_binarizer = sklearn.preprocessing.LabelBinarizer()
    label_binarizer.fit(range(max(labels)))
    labels = label_binarizer.transform(labels)
    return labels

# convert labels 
Y_alltrain = onehotK(Y_alltrain)
Y_test = onehotK(Y_test)

# split the datasets: big training set into training and validation 
x_train,x_allval,y_train,y_allval = train_test_split(X_alltrain,Y_alltrain, test_size = 0.3,stratify = Y_alltrain)
# split validation data into smaller validation and early stopping
x_val,x_estop,y_val,y_estop = train_test_split(x_allval,y_allval, test_size = 0.5,stratify = y_allval)

# normalize features to zero mean and unit variance
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train) # fit scaler and apply transformation
x_estop = scaler.transform(x_estop) # apply same transformation to test data
x_val = scaler.transform(x_val) # apply same transformation to test data

# for a) 
xtrain = x_train
ytrain = y_train

nHiddenLayers = 1 # number of hidden layers
n_hidden_1 = 20 # number of nodes in each layer

# from data
nSamples,nFeat = xtrain.shape
nClasses = ytrain.shape[1]
nInput = nClasses
nOutput = nClasses

lay = 4;
opt = 3; # which training algorithm

# placeholders for data and labels
x = tf.placeholder("float",shape = (None,nFeat))
y_true = tf.placeholder("float",shape=(None,nClasses))
    
# initialize and store layers weight & bias for 1 layer scenario
weights = {
    'h1': tf.Variable(tf.truncated_normal([nFeat, n_hidden_1], stddev=1.0 / sqrt(float(nFeat))),trainable=True),
    'out': tf.Variable(tf.truncated_normal([n_hidden_1, nOutput],stddev=1.0 / sqrt(float(n_hidden_1))),trainable=True)
}

if lay == 3: # relu case
    biases = {
        'b1': tf.Variable(tf.constant(0.1,shape=[n_hidden_1]),trainable=True),
        'out': tf.Variable(tf.constant(0.1,shape=[nOutput]),trainable=True)
    }
else:   
    biases = {
        'b1': tf.Variable(tf.zeros([n_hidden_1]),trainable=True),
        'out': tf.Variable(tf.zeros([nOutput]),trainable=True)
    }

# define the network
def defineNN(x,lay):
    
    if lay == 1:
        layer_1 = tf.nn.sigmoid((tf.matmul(x, weights['h1'])+ biases['b1']))
    elif lay == 2:
        layer_1 = tf.nn.tanh((tf.matmul(x, weights['h1'])+ biases['b1']))
    elif lay ==3:
        layer_1 = tf.nn.relu((tf.matmul(x, weights['h1'])+ biases['b1']))
    elif lay == 4:
        layer_1 = tf.nn.crelu((tf.matmul(x, weights['h1'])+ biases['b1']))
    elif lay == 5:
        layer_1 = tf.nn.swish((tf.matmul(x, weights['h1'])+ biases['b1']))
    
    # Hidden fully connected layer
    #layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    # Output fully connected layer with a neuron for each class: logits ready for softmax 
    #y = tf.nn.softmax(tf.matmul(layer_1, weights['out']) + biases['out'])
    y = (tf.matmul(layer_1, weights['out']) + biases['out'])
    
    return y

y = defineNN(x,lay)

# check also implementations of 
# tf.nn.softmax_cross_entropy_with_logits
# tf.nn.sparse_softmax_cross_entropy_with_logits

with tf.name_scope('cross_entropy'):
    ce = tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=y)
    cost = tf.reduce_mean(ce)
    tf.summary.scalar('cross_entropy',cost)
with tf.name_scope('train'):
    if opt == 1:
        if lay == 3:
            lr = 0.001 # learning rate     
        else:
            lr = 0.5
        train_step = tf.train.GradientDescentOptimizer(lr).minimize(cost) # learning rate = 0.001
    elif opt ==2:
        lr = 0.001 # learning rate        
        train_step = tf.train.RMSPropOptimizer(lr).minimize(cost)
    elif opt ==3: 
        lr = 0.001 # learning rate                
        train_step = tf.train.AdamOptimizer(lr).minimize(cost)
with tf.name_scope('misclassification'):
    correct = tf.equal(tf.argmax(y,1),tf.argmax(y_true,1))    
    misclassification = (1-tf.reduce_mean(tf.cast(correct,tf.float64)))*100
    tf.summary.scalar('misclassification',misclassification)

with tf.Session() as sess:
    ## Actual training step
    # init variables to start from scratch
    sess.run(tf.global_variables_initializer())

    # create list to monitor how error decreases
    estop_error_list = []
    train_error_list = []
    estop_mcr_list = []
    train_mcr_list = []

    # Create minibtaches to train faster
    k_batch = 40 # size of minibatch in n. examples
    xbatch_list = np.array_split(xtrain, k_batch)
    ybatch_list = np.array_split(ytrain, k_batch)

    a = 0
    for k in range(500):
        # run training steps in minibatches 
        for x_minibatch,labels_minibatch in zip(xbatch_list,ybatch_list):
            sess.run(train_step, feed_dict={x: x_minibatch, y_true:labels_minibatch})
        
        ww = sess.run(weights, feed_dict={x: xtrain, y_true: ytrain})

        train_err = sess.run(cost, feed_dict={x: xtrain, y_true: ytrain})
        estop_err = sess.run(cost, feed_dict={x: x_estop, y_true: y_estop})
    
        # Compute the mcr over the whole dataset
        train_mcr = sess.run(misclassification, feed_dict={x: xtrain, y_true: ytrain})
        estop_mcr = sess.run(misclassification, feed_dict={x: x_estop, y_true: y_estop})
    
        # Put cee and mcr into the lists
        estop_error_list.append(estop_err)
        train_error_list.append(train_err)
        estop_mcr_list.append(estop_mcr)
        train_mcr_list.append(train_mcr)
        
        #if opt == 1: # criteria for Stochastic GD
            #if k > 1 and (val_error_list[-1]-val_error_list[-2])>-1e-3:
                #print("Converged at epoch %d with misclassification rate of %.2f %%" % (k,train_mcr_list[-1]))
                #stopping = k
                #break
        #else: # criteria for RMSProp and ADAM
        if k > 1 and (estop_error_list[-1]-estop_error_list[-2] > 0) and a == 0:
            a=1
            print("Converged at epoch %d with misclassification rate of %.2f on E validation set %%" % (k-1,estop_mcr_list[-1]))
            validation_mcr = sess.run(misclassification, feed_dict={x: x_val, y_true: y_val})
            print("                           misclassification rate of %.2f on V validation set %%" % (validation_mcr))            
            break

sess.close()

fig,ax_list = plt.subplots(1,2)
ax_list[0].plot(train_error_list, color='blue', label='training', lw=2)
ax_list[0].plot(estop_error_list, color='green', label='test', lw=2)
ax_list[1].plot(train_mcr_list, color='blue', label='training', lw=2)
ax_list[1].plot(estop_mcr_list, color='green', label='test', lw=2)

ax_list[0].set_title('Cross-entropy')
ax_list[0].set_xlabel('Training epoch')
ax_list[0].set_ylabel('Cross-entropy')
ax_list[1].set_title('Misclassification Rate (%)')
ax_list[1].set_xlabel('Training epoch')
ax_list[1].set_ylabel('Misclassification Rate (%)')
ax_list[0].legend(loc='best')
ax_list[1].legend(loc='best')
plt.show()


# Re-train 
# Use all train + estop data; stop at iteration; test on validation set 




ValueError: Dimensions must be equal, but are 40 and 20 for 'MatMul_61' (op: 'MatMul') with input shapes: [?,40], [20,26].

In [None]:
plt.hist(ww['h1'])
np.shape(ww['h1'])