In [1]:
import nltk
import tensorflow as tf
import numpy as np
import pandas as pd
import sklearn
import sklearn.preprocessing



In [2]:
arpabet = nltk.corpus.cmudict.dict()

In [3]:
for word in ('s', 'see', 'sea', 'compute', 'comput', 'seesea'):
    try:
        print(arpabet[word])
    except Exception as e:
        print(e)

[['EH1', 'S']]
[['S', 'IY1']]
[['S', 'IY1']]
[['K', 'AH0', 'M', 'P', 'Y', 'UW1', 'T']]
'comput'
'seesea'


In [4]:
len(arpabet)

123455

# Batch Processing

* Normalize the record sizes
* Do not one-hot encode, leave that for the neural net
* Do label encoding to numerics

In [5]:
import itertools


class LabelEncoder(object):
    '''
    A progressive label encoder
    '''
    def __init__(self, dim=1):
        self.dim = dim
        self.classes_ = dict()
        self.lookup_ = dict()
        self.max_sequence_size = 0
    
    @property
    def n_classes(self):
        return len(self.classes_)
    
    def _all_labels(self, data, update_size=False):
        #TODO use self.dim
        for seq in data:
            if update_size:
                self.max_sequence_size = max(self.max_sequence_size, len(seq))
            for label in seq:
                yield label
    
    def fit(self, data):
        for t in map(self.tokenize_label, self._all_labels(data, update_size=True)):
            pass
        #self.onehot_encoder = sklearn.preprocessing.OneHotEncoder(self.n_classes)
    
    def refit(self, data):
        batch_size = len(data)
        max_size = self.max_sequence_size
        o_encoded = np.empty((batch_size, max_size), dtype='object')
        o_encoded.fill(self.lookup_[0])

        for i, seq in enumerate(data):
            o_encoded[i,0:len(seq)] = seq
        return o_encoded
    
    def transform(self, data):
        batch_size = len(data)
        max_size = self.max_sequence_size
        o_encoded = np.zeros((batch_size, max_size), dtype='int32')

        for i, seq in enumerate(data):
            for j, syl in enumerate(seq):
                o_encoded[i,j] = self.tokenize_label(syl)
        return o_encoded
    
    def inverse_transform(self, data):
        batch_size = len(data)
        max_size = self.max_sequence_size
        decoded = np.zeros((batch_size, max_size), dtype='object')

        for i, seq in enumerate(data):
            for j, index in enumerate(seq):
                decoded[i,j] = self.lookup_[index]
        return decoded
        
    def onehot_label(self, label):
        token = self.tokenize_label(label)
        return self.onehot_encoder.fit_transform(token).toarray()
    
    def tokenize_label(self, label):
        if label in self.classes_:
            return self.classes_[label]
        else:
            index = len(self.classes_)
            self.classes_[label] = index
            self.lookup_[index] = label
            return index


def track_and_refit(data, null_class=''):
    encoder = LabelEncoder()
    encoder.fit([[null_class]])
    encoder.fit(data)
    return encoder.transform(data), encoder


In [6]:
X_, y_ = [], []

for word, utterances in arpabet.items():
    word = '\uFF08' + word + '\uFF09'
    X_.extend([list(word)]*len(utterances))
    y_.extend(utterances)

    
print(X_[0], y_[0])

X, X_enc = track_and_refit(X_)
X_classes = list(X_enc.classes_.keys())
y, y_enc = track_and_refit(y_)
y_classes = list(y_enc.classes_.keys())

col_chars = [tf.contrib.layers.sparse_column_with_keys(column_name="char"+str(i), keys=X_classes)
            for i in range(X_enc.max_sequence_size)]
col_seq = [tf.contrib.layers.sparse_column_with_keys(column_name="seq"+str(i), keys=y_classes)
            for i in range(y_enc.max_sequence_size)]

print(len(X), len(y))
print(X[0])
print(y[0])
n_classes = y_enc.n_classes
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y)#, stratify=n_classes)

['（', 's', 'y', 'n', 'c', 'h', 'r', 'o', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', '）'] ['S', 'IH2', 'NG', 'K', 'R', 'AH0', 'N', 'AH0', 'Z', 'EY1', 'SH', 'AH0', 'N']
133737 133737
[ 1  2  3  4  5  6  7  8  4  9 10 11 12  9  8  4 13  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0]
[ 1  2  3  4  5  6  7  6  8  9 10  6  7  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0]


In [None]:
from sklearn import datasets, metrics, cross_validation
import tensorflow as tf
from tensorflow.contrib import layers
from tensorflow.contrib import learn

#tf.reset_default_graph()


def my_model(features, target):
    print('Model:', features.get_shape(), '->', target.get_shape())
    
    feature_classes = X_enc.n_classes
    target_classes = y_enc.n_classes
    sequence_size = y_enc.max_sequence_size
    print(feature_classes, target_classes)
    target_one_hot = tf.one_hot(target, target_classes, 1, 0)
    features = tf.to_float(tf.one_hot(features, feature_classes, 1, 0))

    cell = tf.nn.rnn_cell.LSTMCell(num_units=target_classes, state_is_tuple=True)
    
    #lstm = tf.nn.rnn_cell.BasicLSTMCell(target_classes, state_is_tuple=True)
    #stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([lstm] * 2,
    #    state_is_tuple=True)
    #cell = stacked_lstm


    outputs, states = tf.nn.dynamic_rnn(
        cell=cell,
        dtype=tf.float32,
        #sequence_length=[sequence_size]*128, #should chomp based on sequence eof
        time_major=False,
        #sequence_length=[1, sequence_size],
        inputs=features)

    #output_fw, output_bw = outputs
    #states_fw, states_bw = states

    features = outputs
    '''
    result = tf.contrib.learn.run_n(
        {"output_fw": output_fw, "output_bw": output_bw, "states_fw": states_fw, "states_bw": states_bw},
        n=1,
        feed_dict=None)
    '''
    #print('Onehot:', features.get_shape(), '->', target.get_shape())
    
    #conv_filter = tf.Variable(tf.zeros([5, feature_classes, target_classes]))
    #layer = tf.nn.conv1d(features, conv_filter, stride=1, padding='SAME')
    #features = tf.tanh(features)
    #print('Conv1', features.get_shape())
    
    #features = layers.stack(features, layers.fully_connected, [target_classes, target_classes])
    print('Features:', features.get_shape())
    #features = tf.nn.dropout(features, .3)
    #features = layers.fully_connected(features, target_classes, activation_fn=tf.tanh)#TODO fully connected tanh + dropout
    features = tf.slice(features, [0,1,0], [-1, sequence_size, target_classes])
    #features = tf.pad(features, [[0,0][0,-1],[0,0]], 'CONSTANT')
    print('Features:', features.get_shape())
    
    #TODO softmax, time insensitive.
    #bs = features.get_shape()[0]
    #sparse_labels = tf.SparseTensor(indices=tf.argmax(target_one_hot, axis=2), values=tf.fill(bs, 1), shape=[bs, sequence_length])
    #loss = tf.nn.ctc_loss(features, sparse_labels, sequence_length=tf.fill(bs, sequence_size), preprocess_collapse_repeated=False, ctc_merge_repeated=True, time_major=False)
    
    #set up sequence to sequence loss function
    multi_feats = tf.split(1, sequence_size, features)
    multi_targets = tf.split(1, sequence_size, target_one_hot)
    losses = list()
    predictions = list()
    for seq_target, seq_feats in zip(multi_targets, multi_feats):
        #seq_feats = tf.squeeze(seq_feats)
        #print('S:', seq_feats.get_shape(), '->', seq_target.get_shape())
        #seq_loss = tf.contrib.losses.softmax_cross_entropy(seq_feats, seq_target)
        seq_pred = tf.squeeze(tf.argmax(seq_feats, 2))
        #losses.append(seq_loss)
        predictions.append(seq_pred)
    #losses = tf.pack(losses)
    #or
    #losses = tf.nn.ctc_loss(features, target, sequence_size, time_major=False)#, preprocess_collapse_repeated=False, ctc_merge_repeated=True, time_major=True)
    
    #loss = tf.reduce_mean(losses)
    logits_flat = tf.reshape(features, [-1, target_classes])
    target_flat = tf.reshape(target, [-1])
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_flat, labels=target_flat))
    
    prediction = tf.pack(predictions, axis=1)
    
    
    print('loss:', loss.get_shape())
    #print('predictions:', predictions)
    print('p[0]', predictions[0].get_shape())
    print('prediction:', prediction.get_shape())
    
    '''
    logits = layers.fully_connected(features, target_classes) #softmax
    #loss = tf.contrib.losses.softmax_cross_entropy(logits, target)
    
    #loss = tf.nn.softmax_cross_entropy_with_logits(logits, target)
    print('logits/labels:', logits.get_shape(), target.get_shape())
    '''
    
    print(tf.trainable_variables())
    
    #prediction, loss = (
    #    tf.contrib.learn.models.logistic_regression_zero_init(features, target)
    #)
    train_op = tf.contrib.layers.optimize_loss(
        loss, tf.contrib.framework.get_global_step(), optimizer='Adagrad',
        learning_rate=0.01)
    #return {'class': tf.argmax(prediction, 1), 'prob': prediction}
    
    return prediction, loss, train_op


classifier = learn.SKCompat(learn.Estimator(model_fn=my_model))
classifier.fit(X_train, y_train, steps=10000, batch_size=128)

predictions = list(classifier.predict(X_test)) #TODO undo onhot?
#print("Accuracy:", metrics.accuracy_score(y_test, predictions))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f40e8089080>, 'tf_random_seed': None, '_environment': 'local', '_is_chief': True, '_master': '', '_task_id': 0, 'save_checkpoints_secs': 600, 'keep_checkpoint_every_n_hours': 10000, 'keep_checkpoint_max': 5, 'save_summary_steps': 100, '_num_ps_replicas': 0, 'save_checkpoints_steps': None, '_evaluation_master': '', 'tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
}
Model: (?, 35) -> (?, 32)
57 71
Features: (?, 35, 71)
Features: (?, 32, 71)
loss: ()
p[0] <unknown>
prediction: <unknown>
[<tensorflow.python.ops.variables.Variable object at 0x7f40e3f86080>, <tensorflow.python.ops.variables.Variable object at 0x7f40e3f86048>]
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:step = 1, loss = 4.15643
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpm9fm74kg/model.ckpt.
INFO:tensorflow:step = 

In [None]:
#print("Accuracy:", metrics.accuracy_score(y_test, predictions))
#X_enc.inverse_transform(X_test[:5])
#y_enc.inverse_transform(predictions[:5])
print(X_test[0], predictions[0])
print(X_enc.inverse_transform([X_test[0]]))
print(y_enc.inverse_transform([predictions[0]]))