In [75]:
import pandas as pd

df = pd.read_csv('vocab.nips.txt')
df = df.sample(frac=1).reset_index(drop=True)

def countVowels(row):
    s = row['a2i']
    return s.count('a') + s.count('e') + s.count('i') + s.count('o') + s.count('u')


df['vowels'] = df.apply(countVowels, axis=1)
print(df.shape)
print(df.columns.values)
df = df[df.a2i.str.len() <= 10]
print(df.shape)

df['binary'] = df.apply(lambda row: ''.join(format(ord(x), 'b') for x in row[0]).rjust(70, '0'), axis=1)
df.to_csv('vocab.vowels.txt')

(12418, 2)
['a2i' 'vowels']
(10916, 2)


<function print>

In [89]:
# This loads a pandas dataframe with a few columns.  They are:
# word   - The word (all words are 10 characters or less)
# vowels - The number of vowels in the word
# binary - A string of 70 bits, left-padded with 0, of the 7-bit ASCII codes for each letter of the word

df = pd.read_csv('vocab.vowels.txt', names=['id', 'word', 'vowels', 'binary'], index_col='id', skiprows=1)
print(df)



             word  vowels                                             binary
id                                                                          
0            ator       2  0000000000000000000000000000000000000000001100...
1       mjolsness       2  0000000110110111010101101111110110011100111101...
2             lse       1  0000000000000000000000000000000000000000000000...
3             arc       1  0000000000000000000000000000000000000000000000...
4        modelled       3  0000000000000011011011101111110010011001011101...
5       crucially       3  0000000110001111100101110101110001111010011100...
6         lateral       3  0000000000000000000001101100110000111101001100...
7      wavelength       3  1110111110000111101101100101110110011001011101...
9        ignoring       3  0000000000000011010011100111110111011011111110...
10            twn       0  0000000000000000000000000000000000000000000000...
11        damasio       4  0000000000000000000001100100110000111011011100...

In [97]:
print([c for c in (str(i) for i in df['binary'])])

# TODO split the binary column into a numpy array

['0000000000000000000000000000000000000000001100001111010011011111110010', '0000000110110111010101101111110110011100111101110110010111100111110011', '0000000000000000000000000000000000000000000000000110110011100111100101', '0000000000000000000000000000000000000000000000000110000111100101100011', '0000000000000011011011101111110010011001011101100110110011001011100100', '0000000110001111100101110101110001111010011100001110110011011001111001', '0000000000000000000001101100110000111101001100101111001011000011101100', '1110111110000111101101100101110110011001011101110110011111101001101000', '0000000000000011010011100111110111011011111110010110100111011101100111', '0000000000000000000000000000000000000000000000000111010011101111101110', '0000000000000000000001100100110000111011011100001111001111010011101111', '0000000000000000000000000000110100011011111101100110110011001011110010', '0000000000000000000000000000110001111010001100001110111011001111100101', '110111011001011100111110000111101001

In [None]:

import tensorflow as tf
import numpy as np


tf.reset_default_graph()
tf.logging.set_verbosity(tf.logging.INFO)

idx = 0 # The layer index
dropout_rate = 0.666
model_dir = "results"

def normalize(mode, input):
    return tf.layers.batch_normalization(input, training=True)

def convolution(mode, input_layer, filters, kernel_size, padding="VALID"):
    global idx
    idx = idx+1
    print("Layer: conv" + str(idx))
    
    return normalize(mode, tf.layers.separable_conv2d(
          name="conv" + str(idx) + "_",
          inputs=input_layer,
          filters=filters,
          kernel_size=kernel_size,
          padding=padding,
          activation=tf.nn.relu))
                     
def pool(mode, input_layer, pool_size=[2,2], strides=[2,2]):
    global idx
    idx = idx+1
    print("Layer: pool" + str(idx) + "_")
    
    return tf.layers.max_pooling2d(inputs=input_layer, pool_size=pool_size, strides=strides, name="pool" + str(idx))

def deep(mode, layer, units, reshape=None):
    global idx
    idx = idx+1
    print("Layer: deep" + str(idx) + "_")
    
    if reshape != None:
        layer = tf.reshape(layer, reshape)
    layer = tf.layers.dropout(inputs=layer, rate=dropout_rate, training=mode == tf.estimator.ModeKeys.TRAIN)
    layer = tf.layers.dense(inputs=layer, units=units, activation=tf.nn.relu)
    layer = tf.layers.batch_normalization(layer, training=True)
    return layer



In [None]:


                     
def model_fn(features, labels, mode):
    """Neural Network Model."""
    with tf.device("/gpu:0"):

        # Input Layer
        initial = tf.reshape(features, [-1, 70])
        outputs = 11

        layer = initial
        
        k  = [64, 64, 64, 64]

        # Convolutional layers
        layer = deep(mode, layer, k[1])
        layer = deep(mode, layer, k[2])
        layer = deep(mode, layer, k[3])
        layer = deep(mode, layer, k[4])
        
        
        # Logits Layer (there are 11 possible outputs)
        logits = tf.layers.dense(inputs=layer, units=num_outputs, name="last_layer")

        predictions = {
          # Generate predictions (for PREDICT and EVAL mode)
          "classes": tf.argmax(input=logits, axis=1),
          # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
          # `logging_hook`.
          "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
        }
        
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
        
        # Calculate Loss (for both TRAIN and EVAL modes)
        onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=num_outputs)
        loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)

        tf.summary.scalar('loss', loss)
        tf.summary.merge_all()
        
        # Configure the Training Op (for TRAIN mode)
        if mode == tf.estimator.ModeKeys.TRAIN:
            optimizer = tf.train.AdamOptimizer()
            train_op = optimizer.minimize(
                loss=loss,
                global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

        # Add evaluation metrics (for EVAL mode)
        
        
        eval_metric_ops = {
          "accuracy": tf.metrics.accuracy(
              labels=labels, predictions=predictions["classes"])}
        return tf.estimator.EstimatorSpec(
            mode=mode, 
            loss=loss, 
            eval_metric_ops=eval_metric_ops
        )

In [None]:
def trainTheModel(train_data, train_labels, eval_data, eval_labels):
    global idx
    # Create the Estimator
    session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
    session_config.gpu_options.per_process_gpu_memory_fraction = 0.75
    
    run_config = tf.estimator.RunConfig()
    run_config = run_config.replace(
        save_checkpoints_steps=1000, 
        session_config=session_config,
        keep_checkpoint_max=1000)

    estimator = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir, config=run_config)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)
    
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)
    
    summary_hook = tf.train.SummarySaverHook(
        100,
        output_dir=model_dir,
        scaffold=tf.train.Scaffold())
    
    for epoch in range(10):
        # train
        idx=0
        estimator.train(
            input_fn=train_input_fn,
            steps=500, hooks=[summary_hook])
        tf.reset_default_graph()
        idx=0
        estimator.evaluate(input_fn=eval_input_fn, steps=100)
    