In [2]:
import nltk
import tensorflow as tf
import numpy as np
import pandas as pd
import sklearn
import sklearn.preprocessing



In [3]:
arpabet = nltk.corpus.cmudict.dict()

In [3]:
for word in ('s', 'see', 'sea', 'compute', 'comput', 'seesea'):
    try:
        print(arpabet[word])
    except Exception as e:
        print(e)

[['EH1', 'S']]
[['S', 'IY1']]
[['S', 'IY1']]
[['K', 'AH0', 'M', 'P', 'Y', 'UW1', 'T']]
'comput'
'seesea'


In [4]:
len(arpabet)

123455

In [16]:
import os

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def write_examples(X, y, name):
    """Converts a dataset to tfrecords."""

    filename = os.path.join('./', name + '.tfrecords')
    print('Writing', filename)
    writer = tf.python_io.TFRecordWriter(filename)
    for Xs, ys in zip(X, y):
        example = tf.train.Example(features=tf.train.Features(feature={
            'chars': _int64_feature(Xs),
            'phoneme': _int64_feature(ys)}))
        writer.write(example.SerializeToString())
    writer.close()
    return filename

# Batch Processing

* Normalize the record sizes
* Do not one-hot encode, leave that for the neural net
* Do label encoding to numerics

In [65]:
import itertools


class LabelEncoder(object):
    '''
    A progressive label encoder
    '''
    def __init__(self, dim=1):
        self.dim = dim
        self.classes_ = dict()
        self.lookup_ = dict()
        self.max_sequence_size = 0
    
    @property
    def n_classes(self):
        return len(self.classes_)
    
    def _all_labels(self, data, update_size=False):
        #TODO use self.dim
        for seq in data:
            if update_size:
                self.max_sequence_size = max(self.max_sequence_size, len(seq))
            for label in seq:
                yield label
    
    def fit(self, data):
        for t in map(self.tokenize_label, self._all_labels(data, update_size=True)):
            pass
        #self.onehot_encoder = sklearn.preprocessing.OneHotEncoder(self.n_classes)
    
    def refit(self, data):
        batch_size = len(data)
        max_size = self.max_sequence_size
        o_encoded = np.empty((batch_size, max_size), dtype='object')
        o_encoded.fill(self.lookup_[0])

        for i, seq in enumerate(data):
            o_encoded[i,0:len(seq)] = seq
        return o_encoded
    
    def transform(self, data):
        batch_size = len(data)
        max_size = self.max_sequence_size
        o_encoded = np.zeros((batch_size, max_size), dtype='int32')

        for i, seq in enumerate(data):
            for j, syl in enumerate(seq):
                o_encoded[i,j] = self.tokenize_label(syl)
        return o_encoded
    
    def inverse_transform(self, data):
        batch_size = len(data)
        max_size = self.max_sequence_size
        decoded = np.zeros((batch_size, max_size), dtype='object')

        for i, seq in enumerate(data):
            for j, index in enumerate(seq):
                decoded[i,j] = self.lookup_[index]
        return decoded
        
    def onehot_label(self, label):
        token = self.tokenize_label(label)
        return self.onehot_encoder.fit_transform(token).toarray()
    
    def tokenize_label(self, label):
        if label in self.classes_:
            return self.classes_[label]
        else:
            index = len(self.classes_)
            self.classes_[label] = index
            self.lookup_[index] = label
            return index


def track_and_refit(data, null_class=''):
    encoder = LabelEncoder()
    encoder.fit([[null_class]])
    encoder.fit(data)
    return encoder.transform(data), encoder


In [66]:
X_, y_ = [], []

for word, utterances in arpabet.items():
    X_.extend([list(word)]*len(utterances))
    y_.extend(utterances)

    
print(X_[0], y_[0])

X, X_enc = track_and_refit(X_)
X_classes = list(X_enc.classes_.keys())
y, y_enc = track_and_refit(y_)
y_classes = list(y_enc.classes_.keys())

col_chars = [tf.contrib.layers.sparse_column_with_keys(column_name="char"+str(i), keys=X_classes)
            for i in range(X_enc.max_sequence_size)]
col_seq = [tf.contrib.layers.sparse_column_with_keys(column_name="seq"+str(i), keys=y_classes)
            for i in range(y_enc.max_sequence_size)]

print(len(X), len(y))
print(X[0])
print(y[0])
n_classes = y_enc.n_classes
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y)#, stratify=n_classes)

['w', 'i', 'n', 'w', 'o', 'r', 'd'] ['W', 'IH1', 'N', 'W', 'ER0', 'D']
133737 133737
[1 2 3 1 4 5 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1 2 3 1 4 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [108]:
from sklearn import datasets, metrics, cross_validation
import tensorflow as tf
from tensorflow.contrib import layers
from tensorflow.contrib import learn

def my_model(features, target):
    print('Model:', features.get_shape(), '->', target.get_shape())
    
    feature_classes = X_enc.n_classes
    target_classes = y_enc.n_classes
    sequence_size = y_enc.max_sequence_size
    print(feature_classes, target_classes)
    target = tf.to_float(tf.one_hot(target, target_classes, 1, 0))
    features = tf.to_float(tf.one_hot(features, feature_classes, 1, 0))
    print('Onehot:', features.get_shape(), '->', target.get_shape())
    features = layers.stack(features, layers.fully_connected, [target_classes, target_classes])
    print('Features:', features.get_shape())
    features = tf.slice(features, [0,0,0], [-1, sequence_size, target_classes])
    
    print('Features:', features.get_shape())
    
    #Use `tf.contrib.losses.softmax_cross_entropy` and explicit logits computation.
    # Compute logits (1 per class) and compute loss.
    
    #set up sequence to sequence loss function
    multi_feats = tf.split(1, sequence_size, features)
    multi_targets = tf.split(1, sequence_size, target)
    losses = list()
    predictions = list()
    for seq_target, seq_feats in zip(multi_targets, multi_feats):
        #seq_feats = tf.squeeze(seq_feats)
        #print('S:', seq_feats.get_shape(), '->', seq_target.get_shape())
        seq_loss = tf.contrib.losses.softmax_cross_entropy(seq_feats, seq_target)
        #seq_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(seq_feats, seq_target)
        seq_pred = tf.squeeze(tf.argmax(seq_feats, 2))
        losses.append(seq_loss)
        predictions.append(seq_pred)
    losses = tf.pack(losses) #tf.pack?
    loss = tf.reduce_mean(losses)
    prediction = tf.pack(predictions, axis=1)
    
    print('loss:', loss.get_shape())
    #print('predictions:', predictions)
    print('p[0]', predictions[0].get_shape())
    print('prediction:', prediction.get_shape())
    
    '''
    logits = layers.fully_connected(features, target_classes) #softmax
    #loss = tf.contrib.losses.softmax_cross_entropy(logits, target)
    
    #loss = tf.nn.softmax_cross_entropy_with_logits(logits, target)
    print('logits/labels:', logits.get_shape(), target.get_shape())
    '''
    
    #prediction, loss = (
    #    tf.contrib.learn.models.logistic_regression_zero_init(features, target)
    #)
    train_op = tf.contrib.layers.optimize_loss(
        loss, tf.contrib.framework.get_global_step(), optimizer='Adagrad',
        learning_rate=0.01)
    #return {'class': tf.argmax(prediction, 1), 'prob': prediction}
    return prediction, loss, train_op


classifier = learn.SKCompat(learn.Estimator(model_fn=my_model))
classifier.fit(X_train, y_train, steps=100000)

predictions = list(classifier.predict(X_test)) #TODO undo onhot?
#print("Accuracy:", metrics.accuracy_score(y_test, predictions))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_evaluation_master': '', '_environment': 'local', 'keep_checkpoint_every_n_hours': 10000, 'tf_random_seed': None, '_task_type': None, '_is_chief': True, 'save_summary_steps': 100, 'save_checkpoints_secs': 600, 'save_checkpoints_steps': None, '_master': '', 'keep_checkpoint_max': 5, '_num_ps_replicas': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f8c1cbe1e80>, '_task_id': 0, 'tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
}
Model: (?, 33) -> (?, 32)
55 71
Onehot: (?, 33, 55) -> (?, 32, 71)
Features: (?, 33, 71)
Features: (?, 32, 71)
loss: ()
p[0] <unknown>
prediction: <unknown>
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:loss = 4.28798, step = 1
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpqu9frjzg/model.ckpt.
INFO:tensorflow:loss = 2.48076, step = 101
INFO:tensorflow:global_step/sec: 15.6119
INFO:tensorflow:loss = 1.33764, step = 201
IN

In [109]:
#print("Accuracy:", metrics.accuracy_score(y_test, predictions))
#X_enc.inverse_transform(X_test[:5])
#y_enc.inverse_transform(predictions[:5])
print(X_test[0], predictions[0])
print(X_enc.inverse_transform([X_test[0]]))
print(y_enc.inverse_transform([predictions[0]]))

[ 1  2  3 11  5 13 18  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0] [ 0 40  0 14  8  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0]
[['w' 'i' 'n' 'f' 'r' 'e' 'y' '' '' '' '' '' '' '' '' '' '' '' '' '' '' ''
  '' '' '' '' '' '' '' '' '' '' '']]
[['' 'IH0' '' 'F' 'R' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' '' ''
  '' '' '' '' '' '' '' '' '']]
