In [1]:
import collections
import os
import sys

#from os import path
import random
#import tempfile
#import time

import numpy as np
import tensorflow as tf
from tensorflow.python.framework import random_seed

from tensorflow.contrib.learn.python.learn.datasets import base
# Datasets = collections.namedtuple('Datasets', ['train', 'validation', 'test'])

# Local dir where the PTB files are stored. It can be overridden by setting the
# PTB_DIR environment variable; the original hard-coded path is the fallback.
PTB_DIR = os.environ.get('PTB_DIR', '/home/tkornuta/data/ptb/')
# Filenames.
TRAIN = "ptb.train.txt"
VALID = "ptb.valid.txt"
TEST = "ptb.test.txt"

# Number of tokens (whitespace-separated words) in a single phrase.
PHRASE_LENGTH = 100


#### Helper functions

In [2]:
def _parse_document(filename):
    """Reads a text file and splits it into whitespace-separated tokens.

    Every newline is turned into a stand-alone "<eos>" token.

    Args:
        filename: path of the text file to read.

    Returns:
        List of string tokens, in document order.
    """
    with tf.gfile.GFile(filename, "r") as f:
        # Pad the marker with spaces so that it never fuses with the adjacent
        # words when a line has no blank next to its newline character.
        return f.read().replace("\n", " <eos> ").split()

def _build_vocab(filename):
    """Builds and returns a vocabulary for a given document.

    Tokens are ranked by decreasing frequency (ties broken alphabetically) and
    each token is mapped to its rank, so the most frequent token gets id 0.

    Args:
        filename: path of the document the vocabulary is built from.

    Returns:
        Dictionary mapping token -> integer id; empty for an empty document.
    """
    # Count the occurrences of every token of the parsed document.
    counter = collections.Counter(_parse_document(filename))
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    # Enumerating the sorted pairs (instead of unpacking zip(*pairs)) also
    # works when the document contains no tokens at all.
    return {word: index for index, (word, _) in enumerate(count_pairs)}

def encode_doc_to_one_hot(dense_data_vector, num_classes):
    """Convert data from dense vector of scalars to vector of one-hot vectors.

    Args:
        dense_data_vector: sequence of integer class ids, each in [0, num_classes).
        num_classes: length of every one-hot vector.

    Returns:
        Integer array of shape (len(dense_data_vector), num_classes) holding a
        single 1 per row, at the column given by the corresponding class id.
    """
    # Force an integer index array: an empty Python list would otherwise turn
    # into a float array, which NumPy rejects as an index.
    indices = np.asarray(dense_data_vector, dtype=int)
    num_labels = len(indices)
    # Allocate the integer result directly rather than building a float matrix
    # first and converting (copying) it afterwards.
    result = np.zeros(shape=(num_labels, num_classes), dtype=int)
    result[np.arange(num_labels), indices] = 1
    return result

def _extract_document(filename, word_to_id_dict, one_hot=False):
    """Reads a document and encodes it using a dictionary.

    Tokens that are not present in the dictionary are skipped.

    Args:
        filename: path of the document to read.
        word_to_id_dict: dictionary mapping token -> integer id.
        one_hot: when true, return one-hot vectors instead of plain ids.

    Returns:
        List of integer ids, or (when `one_hot` is set) the result of
        `encode_doc_to_one_hot` with one class per dictionary entry.
    """
    data = _parse_document(filename)
    # Use the dictionary passed by the caller, not a same-named notebook global.
    encoded_doc = [word_to_id_dict[word] for word in data if word in word_to_id_dict]
    if one_hot:
        return encode_doc_to_one_hot(encoded_doc, len(word_to_id_dict))
    return encoded_doc


#### Helper function tests

In [3]:
datafile = os.path.join(PTB_DIR, TRAIN)

# Build dictionary
word_to_id = _build_vocab(datafile)
# Show the size and a short sample only: printing the whole dictionary floods
# the cell output.
print("Vocabulary size =", len(word_to_id))
print("Vocabulary sample =", list(word_to_id.items())[0:10])

# Encode the document as a plain list of word ids (no one-hot encoding).
encoded_doc = _extract_document(datafile, word_to_id, False)
print("Encoded Document =", encoded_doc[0:10])

doc_size = len(encoded_doc)
print("Number of elements in document = ", doc_size)

Encoded Document = [9970, 9971, 9972, 9974, 9975, 9976, 9980, 9981, 9982, 9983]
Number of elements in document =  929589


In [4]:
TMP_PHRASE_LENGTH = 2
# Divide document into phrases of a given size.
print("Doc =", encoded_doc[0:500])
print("Number of elements in document = ", doc_size)
# Only a handful of phrases are used here to keep the demo output readable.
num_phrases = 10
print("Number of phrases = ", num_phrases)
# Process data into phrases.
phrases = np.array([encoded_doc[i*TMP_PHRASE_LENGTH:(i+1)*TMP_PHRASE_LENGTH] for i in range(num_phrases)])
print("Phrase[0] =", phrases[0])
# Labels are the same windows shifted by one element (the next token).
labels = np.array([encoded_doc[i*TMP_PHRASE_LENGTH+1:(i+1)*TMP_PHRASE_LENGTH+1] for i in range(num_phrases)])
print("Labels[0] =", labels[0])
perm = np.arange(num_phrases)
print("Indices =", perm)

print("Shuffling")
np.random.shuffle(perm)
print("Indices =", perm)
# Apply the same permutation to both arrays so that every phrase keeps its label.
phrases = phrases[perm]
labels = labels[perm]
print("Phrase[0] =", phrases[0])
print("Labels[0] =", labels[0])

Doc = [9970, 9971, 9972, 9974, 9975, 9976, 9980, 9981, 9982, 9983, 9984, 9986, 9987, 9988, 9989, 9991, 9992, 9993, 9994, 9995, 9996, 9997, 9998, 9999, 2, 9256, 1, 3, 72, 393, 33, 2133, 0, 146, 19, 6, 9207, 276, 407, 3, 2, 23, 1, 13, 141, 4, 1, 5465, 0, 3081, 1596, 96, 2, 7682, 1, 3, 72, 393, 8, 337, 141, 4, 2477, 657, 2170, 955, 24, 521, 6, 9207, 276, 4, 39, 303, 438, 3684, 2, 6, 942, 4, 3150, 496, 263, 5, 138, 6092, 4241, 6036, 30, 988, 6, 241, 760, 4, 1015, 2786, 211, 6, 96, 4, 431, 4115, 5, 14, 45, 55, 3, 72, 195, 1244, 220, 2, 0, 3150, 7426, 1, 13, 4052, 1, 496, 14, 6885, 0, 1, 22, 113, 2652, 8068, 5, 14, 2474, 5250, 10, 464, 52, 3004, 466, 1244, 15, 2, 1, 80, 0, 167, 4, 35, 2645, 1, 65, 10, 558, 6092, 3574, 1898, 666, 1, 7, 27, 1, 4241, 6036, 7, 3, 2, 366, 1976, 3178, 46, 220, 45, 55, 6, 40, 195, 0, 467, 342, 1292, 7, 325, 9, 35, 1491, 916, 4, 3199, 6, 8967, 371, 5, 1141, 35, 1411, 5, 0, 434, 2, 6, 1, 1, 15, 39, 13, 31, 393, 1366, 2, 64, 275, 1921, 43, 72, 195, 157, 1442, 2395, 4,

### Dataset helper class for storing parsed text. 

In [5]:
class TextDataSet(object):
  """Cuts an encoded text into fixed-length phrases and serves them in batches.

  Every example is a window of `phrase_length` consecutive elements of the
  text; its label is the same window shifted by one element, i.e. the element
  that follows each position of the phrase.
  """

  def __init__(self,
               text,
               phrase_length=100,
               seed=None):
    """Construct a DataSet. Divides (already parsed and encoded) text data into phrases.
    Seed arg provides for convenient deterministic testing.

    Args:
      text: sliceable sequence holding the encoded text (e.g. a list of ids).
      phrase_length: number of elements in a single phrase; must be >= 1.
      seed: optional seed, forwarded to `random_seed.get_seed`.

    Raises:
      ValueError: if `phrase_length` is smaller than 1.
    """
    # Set seed.
    seed1, seed2 = random_seed.get_seed(seed)
    # If op level seed is not set, use whatever graph level seed is returned
    np.random.seed(seed1 if seed is None else seed2)

    self._text = text
    self._phrase_length = phrase_length
    self._epochs_completed = 0
    self._index_in_epoch = 0

    # Divide document into phrases (data) and shifted-by-one phrases (labels).
    self._data, self._labels = self._split_into_phrases(text, phrase_length)
    self._num_examples = len(self._data)

  @staticmethod
  def _split_into_phrases(text, phrase_length):
    """Returns (data, labels) arrays of equally sized phrases cut out of `text`."""
    if phrase_length < 1:
      raise ValueError("phrase_length must be >= 1, got %r" % (phrase_length,))
    doc_size = len(text)
    # The label window ends one element after the phrase, so only phrases that
    # are followed by at least one more element are kept. This keeps all label
    # rows the same length even when doc_size is a multiple of phrase_length.
    num_examples = max((doc_size - 1) // phrase_length, 0)
    # DATA: Process text into phrases.
    data = np.array([text[i*phrase_length:(i+1)*phrase_length] for i in range(num_examples)])
    # LABELS: Process text into phrases - label is the next element, so shifted by one.
    labels = np.array([text[i*phrase_length+1:(i+1)*phrase_length+1] for i in range(num_examples)])
    return data, labels

  @property
  def data(self):
    """Array of phrases, one phrase per row."""
    return self._data

  @property
  def labels(self):
    """Array of label phrases, aligned row by row with `data`."""
    return self._labels

  @property
  def batch_length(self):
    """Length of a single phrase, as passed to the constructor."""
    return self._phrase_length

  @property
  def num_examples(self):
    """Number of phrases stored in the data set."""
    return self._num_examples

  @property
  def epochs_completed(self):
    """Number of times `next_batch` has wrapped around the data set."""
    return self._epochs_completed

  def next_batch(self, batch_size, shuffle=True):
    """Return the next `batch_size` examples from this data set.

    When the batch crosses the end of the epoch, the remaining examples are
    concatenated with the first examples of the next (optionally reshuffled)
    epoch.

    Args:
      batch_size: number of examples requested.
      shuffle: when true, examples are shuffled before the first batch and at
        the start of every following epoch.

    Returns:
      Tuple `(data, labels)` of arrays holding the selected rows.
    """
    start = self._index_in_epoch
    # Shuffle for the first epoch
    if self._epochs_completed == 0 and start == 0 and shuffle:
      perm0 = np.arange(self._num_examples)
      np.random.shuffle(perm0)
      self._data = self.data[perm0]
      self._labels = self.labels[perm0]
    # Go to the next epoch
    if start + batch_size > self._num_examples:
      # Finished epoch
      self._epochs_completed += 1
      # Get the rest examples in this epoch
      rest_num_examples = self._num_examples - start
      data_rest_part = self._data[start:self._num_examples]
      labels_rest_part = self._labels[start:self._num_examples]
      # Shuffle the data
      if shuffle:
        perm = np.arange(self._num_examples)
        np.random.shuffle(perm)
        self._data = self.data[perm]
        self._labels = self.labels[perm]
      # Start next epoch
      start = 0
      self._index_in_epoch = batch_size - rest_num_examples
      end = self._index_in_epoch
      data_new_part = self._data[start:end]
      labels_new_part = self._labels[start:end]
      # The module is imported as `np`; the bare name `numpy` is not defined.
      return (np.concatenate((data_rest_part, data_new_part), axis=0),
              np.concatenate((labels_rest_part, labels_new_part), axis=0))
    else:
      self._index_in_epoch += batch_size
      end = self._index_in_epoch
      return self._data[start:end], self._labels[start:end]



### Read the Penn Treebank (PTB) dataset

In [6]:
def read_ptb(dir,
        phrase_length=100,
        one_hot=False,
        seed=None):
    """Reads the PTB train/validation/test files and wraps them in data sets.

    The vocabulary is built from the training file only and is then used to
    encode all three files.

    Args:
        dir: directory containing the TRAIN, VALID and TEST files.
        phrase_length: number of elements in a single phrase of every data set.
        one_hot: when true, documents are encoded as one-hot vectors.
        seed: optional seed forwarded to every `TextDataSet`.

    Returns:
        `base.Datasets` tuple with `train`, `validation` and `test` fields,
        each of them a `TextDataSet`.
    """
    # Honour the directory given by the caller instead of the global PTB_DIR.
    train_file = os.path.join(dir, TRAIN)
    valid_file = os.path.join(dir, VALID)
    test_file = os.path.join(dir, TEST)

    # Build dictionary on the basis of train data.
    word_to_id = _build_vocab(train_file)

    # Load data.
    train_data = _extract_document(train_file, word_to_id, one_hot)
    validation_data = _extract_document(valid_file, word_to_id, one_hot)
    test_data = _extract_document(test_file, word_to_id, one_hot)

    # Forward the requested phrase length rather than a hard-coded 100.
    options = dict(phrase_length=phrase_length, seed=seed)

    # Create datasets.
    train = TextDataSet(train_data, **options)
    validation = TextDataSet(validation_data, **options)
    test = TextDataSet(test_data, **options)

    return base.Datasets(train=train, validation=validation, test=test)

### Load the PTB datasets

In [7]:
# Load the three PTB splits as plain id sequences (no one-hot encoding).
ptb = read_ptb(PTB_DIR, phrase_length=PHRASE_LENGTH, one_hot=False)

In [9]:
# Fetch one (data, labels) batch holding a single phrase from the training split.
print(ptb.train.next_batch(batch_size=1))

(array([[ 110,  762,  110,  969,  484, 1469,  152,   67,   14,    9,   43,
           5,   25,  382,    2,    1,    7,   49, 8734,   23, 4804, 7670,
          10,    0,   35, 2227,  328,  147, 1521,  175,   32, 4931, 2374,
           6,  965,  382,   99,    2,   14,    9,   73,    7,   39,    1,
         110,   87,   32,  358,    5,  712,   18,    2,   64,   79,  147,
          34,    6,  965,   18,   57,  374,    2,   54,    4,   23, 4804,
           9,    1, 8457,   13,   10,    0,  139,    4,    0,  354,   13,
          32,    6, 2235,  752,   36,  446,    2, 1279,  166,   99,   14,
           9, 3481,    2,   39,    1, 3563, 1104,   54, 2049,  683,  609,
           5]]), array([[ 762,  110,  969,  484, 1469,  152,   67,   14,    9,   43,    5,
          25,  382,    2,    1,    7,   49, 8734,   23, 4804, 7670,   10,
           0,   35, 2227,  328,  147, 1521,  175,   32, 4931, 2374,    6,
         965,  382,   99,    2,   14,    9,   73,    7,   39,    1,  110,
          87,   32, 