# Deep learning: singleton classification in XRENNER


In [1]:
cd ~/Desktop/deep_xrenner/

/Users/zangsir/Desktop/deep_xrenner


In [2]:
ls

[1m[34mData[m[m/               mlp_update.py       singleton_net.p     xrenner_xrenner.py


In [3]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

Goal: shape the data the same way as in the notMNIST data set, where the raw arrays look like this:

Training set (200000, 28, 28) (200000,)

Validation set (10000, 28, 28) (10000,)

Test set (10000, 28, 28) (10000,)

and after transformation:

Training set (200000, 784) (200000, 10)

Validation set (10000, 784) (10000, 10)

Test set (10000, 784) (10000, 10)

## load data

In [122]:
def load_data(datafile):
    """Load a comma-separated numeric file into scaled features and one-hot labels.

    Every row of ``datafile`` holds the feature values followed by the class
    label (0 or 1) in the last column.

    The features are min-max scaled with a single global minimum and maximum
    taken over the whole feature matrix (not per column), so all values end up
    in the range [0, 1]. The shape of the feature matrix is printed.

    Args:
        datafile: path (or file object) accepted by ``np.loadtxt``.

    Returns:
        A tuple ``(features, labels)`` of float32 arrays. ``features`` has one
        row per example; ``labels`` has two columns, ``[1, 0]`` for label 0 and
        ``[0, 1]`` for label 1.

    Raises:
        KeyError: if the last column contains a value other than 0 or 1.
    """
    # ndmin=2 keeps a one-row file two-dimensional; without it np.loadtxt
    # returns a 1-D array and the column slicing below fails.
    data = np.loadtxt(datafile, delimiter=',', ndmin=2)

    y = data[:, -1]          # class label, last column
    features = data[:, :-1]  # everything else is x data

    if features.size:
        # Global min-max scaling so values are between 0 and 1.
        features = features - features.min()
        peak = features.max()
        # When every feature value is identical the range is 0; leave the
        # zeros alone instead of producing NaN through 0 / 0.
        if peak > 0:
            features = features / peak
    print(features.shape)

    # Reject labels outside {0, 1}; KeyError matches what a dictionary lookup
    # of an unknown label would raise.
    valid = (y == 0) | (y == 1)
    if not np.all(valid):
        raise KeyError(y[~valid][0])

    # Two-class one-hot coding: y == 0 -> [1, 0], y == 1 -> [0, 1].
    labels = np.zeros((y.shape[0], 2), dtype=np.float32)
    labels[np.arange(y.shape[0]), y.astype(int)] = 1.0

    return features.astype(np.float32), labels

In [98]:
# Load the small ("mini") data file; load_data prints the feature-matrix shape.
datafile = 'Data/singleton_data_mini.tab'
X, labels = load_data(datafile)


(40000, 14)


In [36]:
# Number of rows (examples) in the loaded feature matrix.
X.shape[0]

40000

In [101]:
# Peek at the scaled feature matrix returned by load_data.
X

array([[  6.71140943e-03,   2.68456377e-02,   0.00000000e+00, ...,
          0.00000000e+00,   6.10128103e-04,   1.13177221e-05],
       [  1.34228189e-02,   6.71140943e-03,   6.71140943e-03, ...,
          0.00000000e+00,   1.22025621e-03,   2.26354441e-05],
       [  6.71140943e-03,   1.34228189e-02,   0.00000000e+00, ...,
          0.00000000e+00,   2.44051241e-03,   4.52708882e-05],
       ..., 
       [  6.71140943e-03,   2.68456377e-02,   0.00000000e+00, ...,
          0.00000000e+00,   4.72849282e-03,   2.08693999e-03],
       [  1.94630876e-01,   2.01342274e-02,   6.71140943e-03, ...,
          0.00000000e+00,   4.88102483e-03,   2.11065519e-03],
       [  5.36912754e-02,   1.34228189e-02,   1.34228189e-02, ...,
          0.00000000e+00,   5.79621736e-03,   2.25294661e-03]], dtype=float32)

In [32]:
# Peek at the one-hot label rows returned by load_data.
labels

array([[0, 1],
       [0, 1],
       [1, 0],
       ..., 
       [0, 1],
       [0, 1],
       [0, 1]])

## Divide into train, validation, and test sets

In [111]:
# Fraction of the rows assigned to the training, validation and test splits.
train_prop, val_prop, test_prop = 0.9, 0.05, 0.05

# Split sizes in rows. They are left as floats here and are truncated with
# int() at the point where the arrays are sliced.
train_size, val_size, test_size = (
    X.shape[0] * prop for prop in (train_prop, val_prop, test_prop))

# NOTE: the dataset still has to be shuffled before it is split.

In [50]:
def randomize(dataset, labels, seed=None):
    """Shuffle the rows of ``dataset`` and ``labels`` with one shared permutation.

    Args:
        dataset: array with at least two dimensions; rows are examples.
        labels: array whose first dimension has the same length as ``dataset``.
        seed: optional integer. When given, the permutation is drawn from a
            private ``np.random.RandomState(seed)`` so the shuffle is
            reproducible. When ``None`` (the default), the global NumPy random
            state is used.

    Returns:
        A tuple ``(shuffled_dataset, shuffled_labels)``; row ``i`` of one still
        corresponds to row ``i`` of the other.

    Raises:
        ValueError: if ``dataset`` and ``labels`` have different row counts.
    """
    # Without this check a dataset longer than its labels would silently lose
    # the surplus rows, because the permutation only covers len(labels).
    if dataset.shape[0] != labels.shape[0]:
        raise ValueError(
            "dataset has %d rows but labels has %d"
            % (dataset.shape[0], labels.shape[0]))

    if seed is None:
        permutation = np.random.permutation(labels.shape[0])
    else:
        permutation = np.random.RandomState(seed).permutation(labels.shape[0])

    shuffled_dataset = dataset[permutation, :]
    shuffled_labels = labels[permutation]
    return shuffled_dataset, shuffled_labels

In [113]:
# Pipeline: load -> shuffle -> split into train / validation / test.

# Load data.
datafile = 'Data/singleton_data_mini.tab'
X, labels = load_data(datafile)

# Recompute the split sizes from the data that was just loaded, so the slice
# boundaries always match this X and not whatever an earlier cell left behind.
train_size = X.shape[0] * train_prop
val_size = X.shape[0] * val_prop
test_size = X.shape[0] * test_prop
train_end = int(train_size)
val_end = int(train_size + val_size)

# Shuffle the whole dataset before splitting.
dataset_shf, labels_shf = randomize(X, labels)

# Divide the shuffled dataset; the test split takes whatever rows remain.
train_dataset = dataset_shf[:train_end, :]
valid_dataset = dataset_shf[train_end:val_end, :]
test_dataset = dataset_shf[val_end:, :]
train_labels = labels_shf[:train_end]
valid_labels = labels_shf[train_end:val_end]
test_labels = labels_shf[val_end:]

# Shuffle once more inside each split (not strictly needed, since the rows
# were already shuffled above).
train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)

In [115]:
# Peek at the training features after shuffling and splitting.
train_dataset

array([[ 0.02684564,  0.00671141,  0.        , ...,  0.        ,
         0.00503356,  0.00433419],
       [ 0.00671141,  0.02684564,  0.        , ...,  0.        ,
         0.0059657 ,  0.00409857],
       [ 0.06711409,  0.02013423,  0.        , ...,  0.        ,
         0.00198292,  0.00483374],
       ..., 
       [ 0.01342282,  0.01342282,  0.        , ...,  0.        ,
         0.00491079,  0.00612212],
       [ 0.02013423,  0.02013423,  0.        , ...,  0.        ,
         0.00233882,  0.00421499],
       [ 0.06040268,  0.02013423,  0.00671141, ...,  0.        ,
         0.00457596,  0.00025422]], dtype=float32)

In [106]:
# Report the (features, labels) shapes of the train, validation and test
# splits, in that order.
for split_features, split_targets in (
        (train_dataset, train_labels),
        (valid_dataset, valid_labels),
        (test_dataset, test_labels)):
    print(split_features.shape, split_targets.shape)

(36000, 14) (36000, 2)
(2000, 14) (2000, 2)
(2000, 14) (2000, 2)


In [87]:
#wrote a function to return all the singleton indexes, but accidentally deleted it.

In [116]:
# Spot-check a few training rows to see that features and labels still line
# up after shuffling.
for row in (12, 39, 10001):
    print(train_dataset[row], train_labels[row])

[ 0.00671141  0.02684564  0.          0.          0.          0.          0.
  0.          0.00671141  0.          0.          0.          0.00494525
  0.00033726] [ 0.  1.]
[ 0.01342282  0.02013423  0.00671141  0.          0.          0.          0.
  0.00671141  0.          0.          0.          0.          0.00064533
  0.00386733] [ 0.  1.]
[ 0.02684564  0.02013423  0.00671141  0.          0.          0.          0.
  0.          0.00671141  0.          0.          0.          0.00366077
  0.00437005] [ 0.  1.]


In [117]:
# Display the training features again, right before the graph is built.
train_dataset

array([[ 0.02684564,  0.00671141,  0.        , ...,  0.        ,
         0.00503356,  0.00433419],
       [ 0.00671141,  0.02684564,  0.        , ...,  0.        ,
         0.0059657 ,  0.00409857],
       [ 0.06711409,  0.02013423,  0.        , ...,  0.        ,
         0.00198292,  0.00483374],
       ..., 
       [ 0.01342282,  0.01342282,  0.        , ...,  0.        ,
         0.00491079,  0.00612212],
       [ 0.02013423,  0.02013423,  0.        , ...,  0.        ,
         0.00233882,  0.00421499],
       [ 0.06040268,  0.02013423,  0.00671141, ...,  0.        ,
         0.00457596,  0.00025422]], dtype=float32)

In [118]:
# Hyperparameters of the one-hidden-layer network.
num_nodes = 1024         # units in the hidden layer
batch_size = 128         # rows per training minibatch
input_size = X.shape[1]  # number of feature columns
num_labels = 2           # width of the one-hot label rows

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch. The validation and test sets are
    # baked into the graph as constants.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, input_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables: input -> hidden and hidden -> output weights and biases.
    weights_1 = tf.Variable(
        tf.truncated_normal([input_size, num_nodes]))
    biases_1 = tf.Variable(tf.zeros([num_nodes]))
    weights_2 = tf.Variable(
        tf.truncated_normal([num_nodes, num_labels]))
    biases_2 = tf.Variable(tf.zeros([num_labels]))

    def hidden_layer(inputs):
        """ReLU hidden activations; inputs is (rows, input_size)."""
        return tf.nn.relu(tf.matmul(inputs, weights_1) + biases_1)

    def output_logits(hidden):
        """Unnormalised class scores computed from the hidden activations."""
        return tf.matmul(hidden, weights_2) + biases_2

    # Training computation.
    relu_layer = hidden_layer(tf_train_dataset)
    logits = output_logits(relu_layer)
    # Keyword arguments are accepted by old and new TensorFlow releases alike,
    # whereas the positional form is rejected from TensorFlow 1.0 on.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            labels=tf_train_labels, logits=logits))

    # Optimizer: plain gradient descent with learning rate 0.5.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data. All three share
    # the same weights through the helpers above.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(
        output_logits(hidden_layer(tf_valid_dataset)))
    test_prediction = tf.nn.softmax(
        output_logits(hidden_layer(tf_test_dataset)))

In [120]:
def accuracy(predictions, labels):
    """Return the percentage of rows whose predicted class matches the label.

    Both arguments are 2-D (rows x classes); the class of a row is the index
    of its largest entry. The result is on a 0-100 scale.
    """
    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = np.argmax(labels, axis=1)
    hits = np.sum(predicted_classes == true_classes)
    return 100.0 * hits / predictions.shape[0]

In [121]:
# Train the network with minibatch gradient descent, reporting minibatch and
# validation accuracy every 500 steps and test accuracy once at the end.
num_steps = 3001

with tf.Session(graph=graph) as session:
  # Initialise all weights and biases defined in the graph.
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    # The modulo keeps offset + batch_size inside the training set, so every
    # slice below holds exactly batch_size rows.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    # One optimisation step; also fetch the loss and the softmax outputs for
    # this minibatch.
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      # NOTE(review): in the recorded output the validation accuracy is the
      # same at every checkpoint, which suggests the network predicts a single
      # class for all validation rows -- TODO confirm.
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  # Final evaluation on the held-out test split.
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 0.576419
Minibatch accuracy: 77.3%
Validation accuracy: 86.7%
Minibatch loss at step 500: 0.303830
Minibatch accuracy: 89.8%
Validation accuracy: 86.7%
Minibatch loss at step 1000: 0.433002
Minibatch accuracy: 83.6%
Validation accuracy: 86.7%
Minibatch loss at step 1500: 0.393721
Minibatch accuracy: 86.7%
Validation accuracy: 86.7%
Minibatch loss at step 2000: 0.296616
Minibatch accuracy: 89.8%
Validation accuracy: 86.7%
Minibatch loss at step 2500: 0.398171
Minibatch accuracy: 86.7%
Validation accuracy: 86.7%
Minibatch loss at step 3000: 0.321350
Minibatch accuracy: 89.8%
Validation accuracy: 86.7%
Test accuracy: 85.0%


## data process pipeline using tensorflow

In [123]:
# Pipeline: load -> shuffle -> split into train / validation / test.

# Load the full data file ('Data/singleton_data_mini.tab' is the small subset
# used for quick experiments).
datafile = 'Data/singleton_data.tab'
X, labels = load_data(datafile)

# Recompute the split sizes from the data that was just loaded. Reusing sizes
# computed for a different file would give wrong splits: with the mini file's
# sizes, only 36000 rows would be used for training and almost all of the
# remaining rows would land in the test split.
train_size = X.shape[0] * train_prop
val_size = X.shape[0] * val_prop
test_size = X.shape[0] * test_prop
train_end = int(train_size)
val_end = int(train_size + val_size)

# Shuffle the whole dataset before splitting.
dataset_shf, labels_shf = randomize(X, labels)

# Divide the shuffled dataset; the test split takes whatever rows remain.
train_dataset = dataset_shf[:train_end, :]
valid_dataset = dataset_shf[train_end:val_end, :]
test_dataset = dataset_shf[val_end:, :]
train_labels = labels_shf[:train_end]
valid_labels = labels_shf[train_end:val_end]
test_labels = labels_shf[val_end:]

# Shuffle once more inside each split (not strictly needed, since the rows
# were already shuffled above).
train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)

(360115, 12)


In [124]:
# Set up the TensorFlow graph for the one-hidden-layer network.
num_nodes = 1024         # units in the hidden layer
batch_size = 128         # rows per training minibatch
input_size = X.shape[1]  # number of feature columns
num_labels = 2           # width of the one-hot label rows

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch. The validation and test sets are
    # baked into the graph as constants.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, input_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables: input -> hidden and hidden -> output weights and biases.
    weights_1 = tf.Variable(
        tf.truncated_normal([input_size, num_nodes]))
    biases_1 = tf.Variable(tf.zeros([num_nodes]))
    weights_2 = tf.Variable(
        tf.truncated_normal([num_nodes, num_labels]))
    biases_2 = tf.Variable(tf.zeros([num_labels]))

    def hidden_layer(inputs):
        """ReLU hidden activations; inputs is (rows, input_size)."""
        return tf.nn.relu(tf.matmul(inputs, weights_1) + biases_1)

    def output_logits(hidden):
        """Unnormalised class scores computed from the hidden activations."""
        return tf.matmul(hidden, weights_2) + biases_2

    # Training computation.
    relu_layer = hidden_layer(tf_train_dataset)
    logits = output_logits(relu_layer)
    # Keyword arguments are accepted by old and new TensorFlow releases alike,
    # whereas the positional form is rejected from TensorFlow 1.0 on.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            labels=tf_train_labels, logits=logits))

    # Optimizer: plain gradient descent with learning rate 0.5.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data. All three share
    # the same weights through the helpers above.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(
        output_logits(hidden_layer(tf_valid_dataset)))
    test_prediction = tf.nn.softmax(
        output_logits(hidden_layer(tf_test_dataset)))

In [126]:
# Train the network with minibatch gradient descent, reporting minibatch and
# validation accuracy every 500 steps and test accuracy once at the end.
num_steps = 10000

with tf.Session(graph=graph) as session:
  # Initialise all weights and biases defined in the graph.
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    # The modulo keeps offset + batch_size inside the training set, so every
    # slice below holds exactly batch_size rows.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    # One optimisation step; also fetch the loss and the softmax outputs for
    # this minibatch.
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      # NOTE(review): in the recorded output the validation accuracy is the
      # same at every checkpoint, which suggests the network predicts a single
      # class for all validation rows -- TODO confirm.
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  # Final evaluation on the held-out test split.
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 0.768551
Minibatch accuracy: 56.2%
Validation accuracy: 80.3%
Minibatch loss at step 500: 0.389067
Minibatch accuracy: 86.7%
Validation accuracy: 80.3%
Minibatch loss at step 1000: 0.461038
Minibatch accuracy: 82.8%
Validation accuracy: 80.3%
Minibatch loss at step 1500: 0.503479
Minibatch accuracy: 78.9%
Validation accuracy: 80.3%
Minibatch loss at step 2000: 0.467837
Minibatch accuracy: 80.5%
Validation accuracy: 80.3%
Minibatch loss at step 2500: 0.403675
Minibatch accuracy: 85.2%
Validation accuracy: 80.3%
Minibatch loss at step 3000: 0.498244
Minibatch accuracy: 78.9%
Validation accuracy: 80.3%
Minibatch loss at step 3500: 0.429803
Minibatch accuracy: 83.6%
Validation accuracy: 80.3%
Minibatch loss at step 4000: 0.395747
Minibatch accuracy: 85.2%
Validation accuracy: 80.3%
Minibatch loss at step 4500: 0.410204
Minibatch accuracy: 85.2%
Validation accuracy: 80.3%
Minibatch loss at step 5000: 0.516976
Minibatch accuracy: 78.9%
Validation accurac