In [1]:
import collections
import os
import sys

#from os import path
import random
#import tempfile
#import time

import numpy as np
import tensorflow as tf
from tensorflow.python.framework import random_seed

from tensorflow.contrib.learn.python.learn.datasets import base
# Datasets = collections.namedtuple('Datasets', ['train', 'validation', 'test'])

# Local dir where PTB files are stored. The PTB_DIR environment variable, when
# set, overrides the default so the notebook is not tied to one user's home dir.
PTB_DIR = os.environ.get('PTB_DIR', '/home/tkornuta/data/ptb/')
# Filenames (relative to PTB_DIR) of the train / validation / test splits.
TRAIN = "ptb.char.train.txt"
VALID = "ptb.char.valid.txt"
TEST = "ptb.char.test.txt"

# Number of characters in a single phrase.
PHRASE_LENGTH=100


#### Helper functions

In [33]:
def _parse_document(filename):
    """Reads a document and splits it into whitespace-separated tokens.

    Every newline character is replaced with the "<eos>" marker before the
    text is split on whitespace.

    Args:
        filename: path of the text file, opened through `tf.gfile.GFile`.

    Returns:
        List of string tokens in document order.
    """
    with tf.gfile.GFile(filename, "r") as handle:
        contents = handle.read()
    marked = contents.replace("\n", "<eos>")
    return marked.split()

def _build_vocab(filename):
    """Builds and returns a vocabulary for a given document.

    Tokens are ranked by decreasing frequency; ties are broken by the token's
    natural (string) ordering, so the mapping is deterministic.

    Args:
        filename: path of the document, parsed with `_parse_document`.

    Returns:
        Dictionary mapping each distinct token to an integer id, where id 0 is
        the most frequent token. An empty document yields an empty dictionary
        (the previous tuple-unpacking implementation raised ValueError there).
    """
    # Parse document.
    tokens = _parse_document(filename)
    # Count occurrences and rank: most frequent first, then alphabetically.
    frequencies = collections.Counter(tokens)
    ranked = sorted(frequencies.items(), key=lambda pair: (-pair[1], pair[0]))
    # The rank of a token is its id.
    return {token: index for index, (token, _) in enumerate(ranked)}

def encode_doc_to_one_hot(dense_data_vector, num_classes):
    """Turns a sequence of class ids into a matrix of one-hot rows.

    Args:
        dense_data_vector: sequence of integer class ids, one per element.
        num_classes: width of every one-hot row.

    Returns:
        Integer array of shape (len(dense_data_vector), num_classes) in which
        row i holds a single 1 at column dense_data_vector[i].
    """
    row_count = len(dense_data_vector)
    # Allocate the integer matrix directly, then switch on one cell per row.
    one_hot = np.zeros((row_count, num_classes), dtype=int)
    row_indices = np.arange(row_count)
    one_hot[row_indices, dense_data_vector] = 1
    return one_hot

def _extract_document(filename, word_to_id_dict, one_hot=False):
    """Reads a document and encodes it using a dictionary.

    Args:
        filename: path of the document, parsed with `_parse_document`.
        word_to_id_dict: dictionary mapping tokens to integer ids (as built by
            `_build_vocab`). Tokens missing from it are silently dropped.
        one_hot: when true, return one-hot rows instead of integer ids.

    Returns:
        A list of integer ids, or (if `one_hot`) the array returned by
        `encode_doc_to_one_hot` with `len(word_to_id_dict)` classes.
    """
    tokens = _parse_document(filename)
    # Use the dictionary passed by the caller, not a notebook-level global:
    # the global may not exist (fresh kernel) or may be a different vocabulary.
    encoded_doc = [word_to_id_dict[token] for token in tokens
                   if token in word_to_id_dict]
    if one_hot:
        return encode_doc_to_one_hot(encoded_doc, len(word_to_id_dict))
    return encoded_doc


#### Helper function tests

In [35]:
# Smoke test of the helper functions on the training split.
# NOTE: this cell defines the notebook-level names `word_to_id`, `encoded_doc`
# and `doc_size`, which the phrase-splitting cell further down reads.
datafile = os.path.join(PTB_DIR, TRAIN)

# Build dictionary (token -> integer id) from the training file.
word_to_id = _build_vocab(datafile)
print ("Vocabulary =", word_to_id)

# --- Disabled experiments, kept for reference ---
#num_classes = len(word_to_id)
#print ("Vocabulary size =", num_classes)

#parsed_doc = _parse_document(datafile)
#print ("Document =", parsed_doc[0:10])

# NOTE(review): the disabled call below passes a filename and a dict where
# encode_doc_to_one_hot expects (ids, num_classes); it would not work as written.
#encoded_doc = encode_doc_to_one_hot(datafile, word_to_id)
#print ("Encoded Document =",encoded_doc[0:10])

# NOTE(review): `dense_to_one_hot` is not defined in the visible part of this notebook.
#one_hot_doc = dense_to_one_hot (encoded_doc, num_classes)
#print ("One-hot encoded document =", one_hot_doc[0:10])

# Encode the document with one_hot=False, i.e. as a plain list of integer ids.
# NOTE(review): the saved output of this cell shows one-hot rows, so it looks
# like it was produced by an earlier run with one_hot=True - re-run to refresh.
encoded_doc = _extract_document(datafile, word_to_id, False)
print("Encoded Document =", encoded_doc[0:10])

doc_size = len(encoded_doc)
print("Number of elements in document = ", doc_size)

# Reverse lookup (value -> key) example for a dictionary.
#mydict = {'george':16,'amber':19}
#print(list(mydict.keys())[list(mydict.values()).index(16)]) # Prints george

Vocabulary = {'x': 29, '2': 41, '9': 38, '<eos>': 24, 'd': 12, 'm': 14, '6': 46, 'N': 26, 't': 2, 'k': 17, 's': 7, 'w': 21, '&': 35, 'q': 33, 'u': 11, 'y': 19, '$': 31, '>': 23, '3': 39, '0': 36, '1': 37, '/': 48, 'c': 13, 'a': 3, '*': 49, 'n': 4, 'b': 20, '\\': 44, '7': 45, 'i': 6, '_': 0, '#': 40, 'h': 9, 'o': 5, '8': 42, 'j': 30, '.': 27, 'g': 18, '-': 32, '<': 22, 'l': 10, 'z': 34, 'f': 15, '5': 43, 'p': 16, 'v': 25, '4': 47, 'e': 1, "'": 28, 'r': 8}
Encoded Document = [[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 

In [40]:
# Demonstration of splitting the encoded document into (phrase, label) pairs
# and shuffling them. Relies on `encoded_doc` and `doc_size` from the helper
# test cell above.
TMP_PHRASE_LENGTH = 2
# Divide document into phrases of a given size.
print ("Doc =", encoded_doc[0:500])
print("Number of elements in document = ", doc_size)
num_phrases = 10 #int(doc_size/PHRASE_LENGTH)
print ("Number of phrases = ", num_phrases)
# Process data into phrases.
phrases = np.array([encoded_doc[i*TMP_PHRASE_LENGTH:(i+1)*TMP_PHRASE_LENGTH] for i in range(num_phrases)])
print("Phrase[0] =", phrases[0])
# Labels are the same windows shifted by one element (the "next" element).
labels = np.array([encoded_doc[i*TMP_PHRASE_LENGTH+1:(i+1)*TMP_PHRASE_LENGTH+1] for i in range(num_phrases)])
# The message now names the index that is actually printed (was "Labels[1]").
print("Labels[0] =", labels[0])
perm = np.arange(num_phrases)
print("Indices =",perm)

print("Shuffling")
np.random.shuffle(perm)
print("Indices =",perm)
# Apply the SAME permutation to both arrays so that phrases[i] and labels[i]
# still belong together after shuffling.
phrases = phrases[perm]
labels = labels[perm]
print("Phrase[0] =", phrases[0])

Doc = [[0 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [1 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
Number of elements in document =  5017482
Number of phrases =  10
Phrase[0] = [[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]]
Labels[1] = [[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]]
Indices = [0 1 2 3 4 5 6 7 8 9]
Shuffling
Indices = [2 0 9 3 4 1 7 6 5 8]
Phrase[0] = [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [41]:
class TextDataSet(object):
  """Splits an encoded text into fixed-length phrases and serves them in batches.

  Every example is a window of `phrase_length` consecutive elements of the
  text; its label is the same window shifted forward by one element.
  """

  def __init__(self,
               text,
               phrase_length=100,
               seed=None):
    """Construct a DataSet. Divides (already parsed and encoded) text data into phrases.

    Args:
      text: sliceable sequence supporting `len()` (e.g. a list of ids or an
        array of one-hot rows) holding the encoded document.
      phrase_length: number of elements in a single phrase.
      seed: seed arg provides for convenient deterministic testing; it is
        resolved through `random_seed.get_seed` and used to seed `np.random`.
    """
    # Set seed.
    seed1, seed2 = random_seed.get_seed(seed)
    # If op level seed is not set, use whatever graph level seed is returned
    np.random.seed(seed1 if seed is None else seed2)

    self._text = text
    self._phrase_length = phrase_length
    self._epochs_completed = 0
    self._index_in_epoch = 0

    # Divide document into phrases of a given size.
    doc_size = len(text)
    # A label window ends one element after its data window, so only windows
    # with that extra element available are kept. Otherwise, when doc_size is
    # an exact multiple of phrase_length, the last label row would be one
    # element short and the label array would be ragged.
    self._num_examples = max(0, (doc_size - 1) // phrase_length)
    starts = [i * phrase_length for i in range(self._num_examples)]
    # DATA: Process text into phrases.
    self._data = np.array([text[s:s + phrase_length] for s in starts])
    # LABELS: Process text into phrases - label is next char, so shifted by one.
    self._labels = np.array([text[s + 1:s + phrase_length + 1] for s in starts])

  @property
  def data(self):
    """Array of phrases, one row per example."""
    return self._data

  @property
  def labels(self):
    """Array of label phrases, aligned row-by-row with `data`."""
    return self._labels

  @property
  def phrase_length(self):
    """Number of elements in a single phrase."""
    return self._phrase_length

  @property
  def batch_length(self):
    """Alias of `phrase_length`, kept so existing callers keep working.

    Previously this read an attribute that was never assigned and therefore
    always raised AttributeError.
    """
    return self._phrase_length

  @property
  def num_examples(self):
    """Number of phrases in the data set."""
    return self._num_examples

  @property
  def epochs_completed(self):
    """Number of full passes over the data served so far by `next_batch`."""
    return self._epochs_completed

  def _shuffle(self):
    """Applies one random permutation to data and labels, keeping pairs aligned."""
    perm = np.arange(self._num_examples)
    np.random.shuffle(perm)
    self._data = self.data[perm]
    self._labels = self.labels[perm]

  def next_batch(self, batch_size, shuffle=True):
    """Return the next `batch_size` examples from this data set.

    Args:
      batch_size: number of examples requested.
      shuffle: when true, examples are reshuffled before the first batch and
        at every epoch boundary.

    Returns:
      Tuple `(data, labels)`. When the request crosses an epoch boundary, the
      remaining examples of the finished epoch are concatenated with the first
      examples of the next one.
    """
    start = self._index_in_epoch
    # Shuffle for the first epoch
    if self._epochs_completed == 0 and start == 0 and shuffle:
      self._shuffle()
    # Go to the next epoch
    if start + batch_size > self._num_examples:
      # Finished epoch
      self._epochs_completed += 1
      # Get the rest examples in this epoch
      rest_num_examples = self._num_examples - start
      data_rest_part = self._data[start:self._num_examples]
      labels_rest_part = self._labels[start:self._num_examples]
      # Shuffle the data
      if shuffle:
        self._shuffle()
      # Start next epoch
      start = 0
      self._index_in_epoch = batch_size - rest_num_examples
      end = self._index_in_epoch
      data_new_part = self._data[start:end]
      labels_new_part = self._labels[start:end]
      # numpy is imported as `np` in this notebook; the bare name `numpy` is
      # undefined and made this branch raise NameError.
      return (np.concatenate((data_rest_part, data_new_part), axis=0),
              np.concatenate((labels_rest_part, labels_new_part), axis=0))
    else:
      self._index_in_epoch += batch_size
      end = self._index_in_epoch
      return self._data[start:end], self._labels[start:end]



In [42]:
def read_ptb(dir,
        phrase_length=100,
        one_hot=False,
        seed=None):
    """Loads the character-level PTB splits and wraps them in TextDataSet objects.

    The vocabulary is built from the training split only and then used to
    encode all three splits.

    Args:
        dir: directory containing the TRAIN, VALID and TEST files. (The name
            shadows the `dir` builtin; it is kept for caller compatibility.)
        phrase_length: number of elements in a single phrase of each data set.
        one_hot: when true, documents are encoded as one-hot rows instead of
            integer ids.
        seed: seed forwarded to every TextDataSet.

    Returns:
        `base.Datasets` with `train`, `validation` and `test` TextDataSet fields.
    """
    # Resolve files inside the directory given by the caller (the argument was
    # previously ignored in favour of the global PTB_DIR).
    train_file = os.path.join(dir, TRAIN)
    valid_file = os.path.join(dir, VALID)
    test_file = os.path.join(dir, TEST)

    # Build dictionary on the basis of train data.
    word_to_id = _build_vocab(train_file)

    # Load data.
    train_data = _extract_document(train_file, word_to_id, one_hot)
    validation_data = _extract_document(valid_file, word_to_id, one_hot)
    test_data = _extract_document(test_file, word_to_id, one_hot)

    # Forward the requested phrase length (it was previously hard-coded to 100).
    options = dict(phrase_length=phrase_length, seed=seed)

    # Create datasets.
    train = TextDataSet(train_data, **options)
    validation = TextDataSet(validation_data, **options)
    test = TextDataSet(test_data, **options)

    return base.Datasets(train=train, validation=validation, test=test)

In [49]:
# Load all three PTB splits, encoded as integer ids (no one-hot encoding).
ptb = read_ptb(PTB_DIR, phrase_length=PHRASE_LENGTH, one_hot=False)

In [50]:
# Fetch and show a single (data, labels) example from the training split.
print(ptb.train.next_batch(batch_size=1))

(array([[ 0, 26,  0, 14,  6, 10, 10,  6,  5,  4,  0,  7,  9,  3,  8,  1,  7,
         0, 21,  3,  7,  0, 15,  3,  8,  0, 20,  1, 10,  5, 21,  0, 10,  3,
         7,  2,  0, 21,  1,  1, 17,  0, 28,  7,  0, 22, 11,  4, 17, 23,  0,
         3, 25,  1,  8,  3, 18,  1,  0,  5, 15,  0,  4,  1,  3,  8, 10, 19,
         0, 26,  0, 14,  6, 10, 10,  6,  5,  4, 24, 15,  5,  8,  0,  5, 13,
         2,  5, 20,  1,  8,  0,  7,  5,  0, 15,  3,  8,  0, 12,  3],
       [ 0,  7,  1, 13, 11,  8,  6,  2,  6,  1,  7,  0, 13, 10,  1,  3,  8,
         6,  4, 18,  0, 15,  6,  8, 14,  7,  0,  2,  5,  0,  2,  9,  1,  0,
         4,  3,  2,  6,  5,  4,  3, 10,  0,  3,  7,  7,  5, 13,  6,  3,  2,
         6,  5,  4,  0,  5, 15,  0,  7,  1, 13, 11,  8,  6,  2,  6,  1,  7,
         0, 12,  1,  3, 10,  1,  8,  7,  0,  6,  4, 13, 10, 11, 12,  1,  0,
         5,  4, 10, 19,  0,  2,  9,  5,  7,  1,  0,  2,  8,  3, 12],
       [10,  6,  1, 25,  1,  0,  2,  9,  3,  2,  0,  2,  9,  1,  8,  1,  0,
         9,  3, 25,  1,  