## Bienvenidos!

En estos notebooks vamos a correr una red neuronal para interpretar texto.

In [9]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import tensorflow as tf
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

In [10]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [11]:
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words"""
  with zipfile.ZipFile(filename) as f:
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
  return data
  
words = read_data(filename)
print('Data size %d' % len(words))

Data size 17005207


In [21]:
vocabulary_size = 50000

def build_dataset(words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count = unk_count + 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
del words  # Hint to reduce memory.

Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]


In [93]:
data_index = 0

# skip_window: int : How many words to consider left and right.
def generate_cbow_batch(batch_size, skip_window):
  global data_index
#   assert batch_size % num_skips == 0
#   assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size, skip_window*2), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size,1), dtype=np.int32)
  span = 2 * skip_window + 1 # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
#   buffer = []
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
    print("data_index:{} buffer {}".format(data_index,buffer[data_index-1]))
  
  for i in range(batch_size):
    batch[i] = list(buffer)[:skip_window] + list(buffer)[skip_window+1:]
    labels[i] = buffer[skip_window]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
    
  return batch, labels

In [94]:
batch,labels = generate_cbow_batch(128,1)

data_index:1 buffer 5234
data_index:2 buffer 3081
data_index:3 buffer 12


In [95]:
batch[:10]


array([[5234,   12],
       [3081,    6],
       [  12,  195],
       [   6,    2],
       [ 195, 3134],
       [   2,   46],
       [3134,   59],
       [  46,  156],
       [  59,  128],
       [ 156,  742]])

In [96]:
labels[:10]

array([[3081],
       [  12],
       [   6],
       [ 195],
       [   2],
       [3134],
       [  46],
       [  59],
       [ 156],
       [ 128]])

In [97]:
for i in range (np.random.randint(28) , np.random.randint(28)+100 ):
    print ('input: ({},{}) -- label:({})'.format(reverse_dictionary[batch[i,0]], reverse_dictionary[batch[i,1]],
                                                reverse_dictionary[labels[i][0]]))

input: (of,first) -- label:(abuse)
input: (abuse,used) -- label:(first)
input: (first,against) -- label:(used)
input: (used,early) -- label:(against)
input: (against,working) -- label:(early)
input: (early,class) -- label:(working)
input: (working,radicals) -- label:(class)
input: (class,including) -- label:(radicals)
input: (radicals,the) -- label:(including)
input: (including,diggers) -- label:(the)
input: (the,of) -- label:(diggers)
input: (diggers,the) -- label:(of)
input: (of,english) -- label:(the)
input: (the,revolution) -- label:(english)
input: (english,and) -- label:(revolution)
input: (revolution,the) -- label:(and)
input: (and,sans) -- label:(the)
input: (the,UNK) -- label:(sans)
input: (sans,of) -- label:(UNK)
input: (UNK,the) -- label:(of)
input: (of,french) -- label:(the)
input: (the,revolution) -- label:(french)
input: (french,whilst) -- label:(revolution)
input: (revolution,the) -- label:(whilst)
input: (whilst,term) -- label:(the)
input: (the,is) -- label:(term)
input