In [38]:
from __future__ import print_function
import collections
import numpy as np
import tensorflow as tf
import zipfile
from six.moves.urllib.request import urlretrieve

In [35]:
# Base URL that maybe_download() prepends to a file name when fetching it.
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size.

  Args:
    filename: Local path of the file. When the file is missing, the same
      string is appended to the module-level `url` to form the download URL.
    expected_bytes: Exact size in bytes the local file must have.

  Returns:
    The local path of the verified file.

  Raises:
    Exception: If the size on disk differs from `expected_bytes`.
  """
  # The notebook's import cell does not import `os`, so this function used to
  # raise NameError on a fresh kernel. Importing here keeps the cell
  # self-contained regardless of what else has been executed.
  import os

  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    # Show the actual size so a truncated download is easy to spot.
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

# Fetch text8.zip unless it already exists locally, and require it to be
# exactly 31344016 bytes.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [39]:
def read_data(filename):
  """Return the whitespace-separated tokens of a zip archive's first member.

  Args:
    filename: Path of the zip archive to open.

  Returns:
    A list of strings obtained by splitting the member's text on whitespace.
  """
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    raw_bytes = archive.read(first_member)
  # Convert the raw bytes to a native string before tokenizing.
  text = tf.compat.as_str(raw_bytes)
  return text.split()
  
# Load the downloaded archive as one flat list of word strings.
words = read_data(filename)

In [40]:
# Show the first ten tokens as a quick sanity check of the load.
words[:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

In [57]:
class WordNumericEncoder:
    """Encode a sequence of words as integer ids, most frequent word first.

    With ``common > 0`` only the ``common`` most frequent words receive their
    own id (1, 2, ...). Id 0 is reserved for ``rare_word_token`` and every
    other word is encoded as 0.

    With ``common <= 0`` (the default) every distinct word receives an id,
    starting at 0, and no rare-word id is reserved.

    Args:
        words: Sequence of hashable words; it is iterated and ``len()`` is
            taken, so it must be a sized sequence rather than a generator.
        common: Number of most frequent words to keep; ``<= 0`` keeps all.
        rare_word_token: Word shown for id 0 when a rare-word id is reserved.
    """

    def __init__(self, words, common=0, rare_word_token="UNK"):
        self._words = words
        self._rare_word_token = rare_word_token
        self._counter = collections.Counter(words)

        self._set_items(common)
        self._build_dictionary()
        self._encode_words()

    def _set_items(self, common=0):
        """Build the (word, count) table whose order defines the ids."""
        # Remember whether slot 0 is the rare-word placeholder so that
        # _encode_words never writes into a real vocabulary entry.
        self._reserves_rare_id = common > 0
        if self._reserves_rare_id:
            # Mutable placeholder; its count is filled in by _encode_words
            # once the number of rare words is known.
            self._items = [[self._rare_word_token, -1]]
        else:
            self._items = []
            common = len(self._words)
        self._items.extend(self._counter.most_common(common))

    def _build_dictionary(self):
        """Assign consecutive ids to the words in ``self._items`` order."""
        self._dictionary = dict()
        for word, _ in self._items:
            # Skip a word that already has an id. This happens when the
            # rare-word token itself occurs in the input; re-assigning it
            # would make two words share an id and drop id 0 from the
            # reverse mapping.
            if word not in self._dictionary:
                self._dictionary[word] = len(self._dictionary)

    def _encode_words(self):
        """Translate every input word to its id and count the rare ones."""
        data = []
        unk_count = 0
        for word in self._words:
            index = self._dictionary.get(word)
            if index is None:
                index = 0  # id reserved for the rare-word token
                unk_count += 1
            data.append(index)
        # Only the placeholder slot may be updated. Without a reserved slot
        # the first item is a (word, count) tuple from most_common(), or the
        # table is empty, and the old unconditional write raised
        # TypeError / IndexError.
        if self._reserves_rare_id:
            self._items[0][1] = unk_count
        self._data = data

    def get_data(self):
        """Return the list of ids, one per input word, in input order."""
        return self._data

    def get_reverse_dictionary(self):
        """Return a new dict mapping each id back to its word."""
        return {index: word for word, index in self._dictionary.items()}
        
        
# Demo on the first 20 words: keep the 15 most frequent, encode the rest as id 0.
e = WordNumericEncoder(words[:20], 15)
encoded = e.get_data()[:10]
reverse = e.get_reverse_dictionary()
# NOTE(review): the expected values in the trailing comments below were copied
# from one recorded run. Words with equal counts are ordered by
# Counter.most_common's tie-breaking, so which words fall outside the top 15
# (and the ids shown) may differ on another Python version; TODO confirm.
print(words[:10]) # ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
print(encoded) # [0, 5, 4, 13, 14, 1, 7, 0, 6, 11]
print([reverse[i] for i in encoded]) # ['UNK', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'UNK', 'used', 'against']


['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
[0, 5, 4, 13, 14, 1, 7, 0, 6, 11]
['UNK', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'UNK', 'used', 'against']
