# NLP para predecir textos del libro del Mago de Oz




In [1]:
# Librerias basicas
import sys
import numpy as np

# Para las redes neuronales
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

# Para NLP
import string
from keras.preprocessing.sequence import pad_sequences

# 1. Carga del dataset

In [4]:
# Carga de datos, el libro fue limpiado en el editor de texto desde el capitulo 1 hasta final.
file = "sample_data/the_oz.txt"
texto = open(file, 'r', encoding='utf-8').read()
texto = texto.lower()
texto = " ".join(texto.split())
texto



# 2. Procesamiento inicial

In [5]:
# Revision
caracteres_lista = sorted(list(set(texto)))
print('Antes de limpiar los sigos de puntuacion: %d' % (len(caracteres_lista)))

Antes de limpiar los sigos de puntuacion: 49


In [6]:
# Eliminar puntuacion
puntuacion = list(string.punctuation)
for i in puntuacion:
  texto = texto.replace(i , ' ')

# Reemplazo
caracteres = sorted(list(set(texto)))
print('Despues de limpiar los sigos de puntuacion: %d' % (len(caracteres)))

Despues de limpiar los sigos de puntuacion: 37


In [7]:
# Caracteres a numericos
char_to_int = dict((c, i) for i,c in enumerate(caracteres))
char_to_int

{' ': 0,
 '0': 1,
 '1': 2,
 '2': 3,
 '3': 4,
 '4': 5,
 '5': 6,
 '6': 7,
 '7': 8,
 '8': 9,
 '9': 10,
 'a': 11,
 'b': 12,
 'c': 13,
 'd': 14,
 'e': 15,
 'f': 16,
 'g': 17,
 'h': 18,
 'i': 19,
 'j': 20,
 'k': 21,
 'l': 22,
 'm': 23,
 'n': 24,
 'o': 25,
 'p': 26,
 'q': 27,
 'r': 28,
 's': 29,
 't': 30,
 'u': 31,
 'v': 32,
 'w': 33,
 'x': 34,
 'y': 35,
 'z': 36}

In [8]:
# Numericos a caracteres
int_to_char=dict((i,c) for i, c in enumerate(caracteres))
int_to_char

{0: ' ',
 1: '0',
 2: '1',
 3: '2',
 4: '3',
 5: '4',
 6: '5',
 7: '6',
 8: '7',
 9: '8',
 10: '9',
 11: 'a',
 12: 'b',
 13: 'c',
 14: 'd',
 15: 'e',
 16: 'f',
 17: 'g',
 18: 'h',
 19: 'i',
 20: 'j',
 21: 'k',
 22: 'l',
 23: 'm',
 24: 'n',
 25: 'o',
 26: 'p',
 27: 'q',
 28: 'r',
 29: 's',
 30: 't',
 31: 'u',
 32: 'v',
 33: 'w',
 34: 'x',
 35: 'y',
 36: 'z'}

In [9]:
# Dimensionalidad del texto
n_caracteres = len(texto)
n_vocabulario = len(caracteres)
print('Longitud del libro: ',n_caracteres)
print('Vocabulario: ',n_vocabulario)

Longitud del libro:  233887
Vocabulario:  37


# Creacion del dataset y redimensionalidad

In [10]:
seq_long = 100
dataX = []
dataY = []
for i in range(0, n_caracteres - seq_long, 1):
  seq_in = texto[i : i + seq_long]
  seq_out = texto[i + seq_long]
  dataX.append([char_to_int[caracter] for caracter in seq_in])
  dataY.append(char_to_int[seq_out])

n_patrones = len(dataX)
print('Total de subsequencias: ',n_patrones)

Total de subsequencias:  233787


In [11]:
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patrones, seq_long, 1))

# normalize
X = X / float(n_vocabulario)

# one hot encode the output variable
y = np_utils.to_categorical(dataY)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

# Modelo de LSTM basico

## Modelo

1. Capa unica de LSTM oculta con 300 unidades de memoria. 
2. La red de Dropout del 20%. 
3. La capa de salida Densa con función de activación Softmax. 
4. Perdida logarítmica (`categorical_crossentropy`)
5. Optimización de Adam.

In [60]:
# Modelamiento
modelo_lstm = Sequential()
modelo_lstm.add(LSTM(300,input_shape=(X.shape[1],X.shape[2])))
modelo_lstm.add(Dropout(0.2))# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
modelo_lstm.add(Dense(y.shape[1],activation='softmax'))

modelo_lstm.compile(loss='categorical_crossentropy',optimizer='adam')

## Puntos de control

In [61]:
# puntos de control en drive
ruta= "sample_data/Lstm/pesos-mejora-{epoch:02d}-{loss:4f}.hdf5"
control= ModelCheckpoint(ruta, monitor='loss', verbose=1, save_best_only=True, mode='min')
callback_lista = [control]

## Entrenamiento y resultados

In [62]:
# Entrenamiento
modelo_lstm.fit(X, y, epochs=10, batch_size= 128, callbacks= callback_lista)

Epoch 1/10

Epoch 00001: loss improved from inf to 2.73236, saving model to sample_data/Lstm/pesos-mejora-01-2.732360.hdf5
Epoch 2/10

Epoch 00002: loss improved from 2.73236 to 2.54223, saving model to sample_data/Lstm/pesos-mejora-02-2.542225.hdf5
Epoch 3/10

Epoch 00003: loss improved from 2.54223 to 2.41402, saving model to sample_data/Lstm/pesos-mejora-03-2.414017.hdf5
Epoch 4/10

Epoch 00004: loss improved from 2.41402 to 2.32372, saving model to sample_data/Lstm/pesos-mejora-04-2.323721.hdf5
Epoch 5/10

Epoch 00005: loss improved from 2.32372 to 2.24779, saving model to sample_data/Lstm/pesos-mejora-05-2.247787.hdf5
Epoch 6/10

Epoch 00006: loss improved from 2.24779 to 2.18228, saving model to sample_data/Lstm/pesos-mejora-06-2.182283.hdf5
Epoch 7/10

Epoch 00007: loss improved from 2.18228 to 2.12774, saving model to sample_data/Lstm/pesos-mejora-07-2.127740.hdf5
Epoch 8/10

Epoch 00008: loss improved from 2.12774 to 2.07342, saving model to sample_data/Lstm/pesos-mejora-08-2.

<keras.callbacks.History at 0x7f44d4c3a190>

## Generacion de texto con LSTM

In [65]:
# Carga de peso con mas baja perdida : fue de 1.98
ruta_pesos = '/content/sample_data/Lstm/pesos-mejora-10-1.985240.hdf5'
modelo_base.load_weights(ruta_pesos)
modelo_base.compile(loss='categorical_crossentropy',optimizer='adam')

In [69]:
# Evaluacion final
inicio = np.random.randint(0, len(dataX)-1)
patron = dataX[inicio]

print('Frase Semilla')
print("\"", ''.join([int_to_char[valor] for valor in patron]), "\"")
print('\n\nInicio del texto generado\n')

for i in range(500):
  x = np.reshape(patron, (1, len(patron), 1))
  x = x / float(n_vocabulario)
  prediccion = modelo_base.predict(x, verbose=0)
  indice= np.argmax(prediccion)
  resultado = int_to_char[indice]
  seq_in = [int_to_char[valor] for valor in patron]
  sys.stdout.write(resultado)
  patron.append(indice)
  atron = patron[1:len(patron)]

print('\n Hecho')

Frase Semilla
" reproachful voice   you did   said the woggle bug  promptly   and i beg your pardon  i will really t "


Inicio del texto generado

oeee    io   said the scarecrow   and the saarecrow    io whuh toeee    io wou are   ankwered the scarecrow   and the saarecrow    io whuh toeee    io wou are   ankwered the scarecrow   and the saarecrow    io whuh toeee    io wou are   ankwered the scarecrow   and the saarecrow    io whuh toeee    io wou are   ankwered the scarecrow   and the saarecrow    io whuh toeee    io wou are   ankwered the scarecrow   and the saarecrow    io whuh toeee    io wou are   ankwered the scarecrow   and the sa
 Hecho


# Modelo de LSTM profundo RNN - con pocas epochs

## Modelo RNN

1. Capa LSTM con 128 neuronas y entrada con dropout al 20%.
2. Capa Dropout al 30%.
3. Capa LSTM con 256  neuronas.
4. Capa Dropout al 30%.
5. Capa Densa de salida con activación softmax al ser clasificación.

In [76]:
# Modelamiento
model_lstm_rnn = Sequential()
model_lstm_rnn.add(LSTM(128, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model_lstm_rnn.add(Dropout(0.3))
model_lstm_rnn.add(LSTM(256))
model_lstm_rnn.add(Dropout(0.3))
model_lstm_rnn.add(Dense(y.shape[1], activation='softmax'))
model_lstm_rnn.compile(loss='categorical_crossentropy', optimizer='adam')

## Puntos de control

In [77]:
# puntos de control en drive
ruta= "sample_data/lstm_rnn/pesos-mejora-{epoch:02d}-{loss:4f}.hdf5"
control= ModelCheckpoint(ruta, monitor='loss', verbose=1, save_best_only=True, mode='min')
callback_lista = [control]

## Entrenamiento y resultados

In [85]:
#  Entrenamiento
model_lstm_rnn.fit(X,y,epochs=20,batch_size=5000,callbacks=callback_lista) 

Epoch 1/20

Epoch 00001: loss improved from 2.45829 to 2.41289, saving model to sample_data/lstm_rnn/pesos-mejora-01-2.412894.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.41289 to 2.39671, saving model to sample_data/lstm_rnn/pesos-mejora-02-2.396710.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.39671 to 2.38445, saving model to sample_data/lstm_rnn/pesos-mejora-03-2.384445.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.38445 to 2.37211, saving model to sample_data/lstm_rnn/pesos-mejora-04-2.372112.hdf5
Epoch 5/20

Epoch 00005: loss improved from 2.37211 to 2.36206, saving model to sample_data/lstm_rnn/pesos-mejora-05-2.362062.hdf5
Epoch 6/20

Epoch 00006: loss improved from 2.36206 to 2.34990, saving model to sample_data/lstm_rnn/pesos-mejora-06-2.349900.hdf5
Epoch 7/20

Epoch 00007: loss improved from 2.34990 to 2.33578, saving model to sample_data/lstm_rnn/pesos-mejora-07-2.335778.hdf5
Epoch 8/20

Epoch 00008: loss improved from 2.33578 to 2.32073, saving model to sam

<keras.callbacks.History at 0x7f44a764bd50>

## Generacion de texto con LSTM-RNN

In [90]:
# Carga de peso con mas baja perdida : fue de 2.10
ruta_pesos = '/content/sample_data/lstm_rnn/pesos-mejora-20-2.103874.hdf5'
model_lstm_rnn.load_weights(ruta_pesos)
model_lstm_rnn.compile(loss='categorical_crossentropy',optimizer='adam')

In [91]:
# Evaluacion final
inicio = np.random.randint(0, len(dataX)-1)
patron = dataX[inicio]

print('Frase Semilla')
print("\"", ''.join([int_to_char[valor] for valor in patron]), "\"")
print('\n\nInicio del texto generado\n')

for i in range(500):
  x = np.reshape(patron, (1, len(patron), 1))
  x = x / float(n_vocabulario)
  prediccion = modelo_base.predict(x, verbose=0)
  indice= np.argmax(prediccion)
  resultado = int_to_char[indice]
  seq_in = [int_to_char[valor] for valor in patron]
  sys.stdout.write(resultado)
  patron.append(indice)
  atron = patron[1:len(patron)]

print('\n Hecho')

Frase Semilla
" tree  the saw horse  not understanding the action  stepped backward and snapped the string easily  b "


Inicio del texto generado

nd the soadsedr with a saal  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soadse buu  and the soa
 Hecho


# Modelo de LSTM profundo RNN - con muchas epochs

## Modelo RNN

1. Capa LSTM con 128 neuronas y entrada con dropout al 20%.
2. Capa Dropout al 30%.
3. Capa LSTM con 256  neuronas.
4. Capa Dropout al 30%.
5. Capa Densa de salida con activación softmax al ser clasificación.

In [12]:
# Modelamiento
model_lstm_rnn = Sequential()
model_lstm_rnn.add(LSTM(128, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model_lstm_rnn.add(Dropout(0.3))
model_lstm_rnn.add(LSTM(256))
model_lstm_rnn.add(Dropout(0.3))
model_lstm_rnn.add(Dense(y.shape[1], activation='softmax'))
model_lstm_rnn.compile(loss='categorical_crossentropy', optimizer='adam')

## Puntos de control

In [14]:
# puntos de control en drive
ruta= "sample_data/lstm_rnn2/pesos-mejora-{epoch:02d}-{loss:4f}.hdf5"
control= ModelCheckpoint(ruta, monitor='loss', verbose=1, save_best_only=True, mode='min')
callback_lista = [control]

## Entrenamiento y resultados

In [15]:
#  Entrenamiento
model_lstm_rnn.fit(X,y,epochs=70,batch_size=3000,callbacks=callback_lista)

Epoch 1/70

Epoch 00001: loss improved from inf to 2.90020, saving model to sample_data/lstm_rnn2/pesos-mejora-01-2.900204.hdf5
Epoch 2/70

Epoch 00002: loss improved from 2.90020 to 2.82329, saving model to sample_data/lstm_rnn2/pesos-mejora-02-2.823286.hdf5
Epoch 3/70

Epoch 00003: loss improved from 2.82329 to 2.80272, saving model to sample_data/lstm_rnn2/pesos-mejora-03-2.802723.hdf5
Epoch 4/70

Epoch 00004: loss improved from 2.80272 to 2.74891, saving model to sample_data/lstm_rnn2/pesos-mejora-04-2.748911.hdf5
Epoch 5/70

Epoch 00005: loss improved from 2.74891 to 2.67811, saving model to sample_data/lstm_rnn2/pesos-mejora-05-2.678111.hdf5
Epoch 6/70

Epoch 00006: loss improved from 2.67811 to 2.60853, saving model to sample_data/lstm_rnn2/pesos-mejora-06-2.608532.hdf5
Epoch 7/70

Epoch 00007: loss improved from 2.60853 to 2.56260, saving model to sample_data/lstm_rnn2/pesos-mejora-07-2.562600.hdf5
Epoch 8/70

Epoch 00008: loss improved from 2.56260 to 2.51890, saving model to 

<keras.callbacks.History at 0x7f1ae2e0dcd0>

## Generacion de texto final

In [17]:
# Carga de peso con mas baja perdida : fue de 2.10
ruta_pesos = '/content/sample_data/lstm_rnn2/pesos-mejora-70-1.526478.hdf5'
model_lstm_rnn.load_weights(ruta_pesos)
model_lstm_rnn.compile(loss='categorical_crossentropy',optimizer='adam')

In [20]:
# Evaluacion final
inicio = np.random.randint(0, len(dataX)-1)
patron = dataX[inicio]

print('Frase Semilla')
print("\"", ''.join([int_to_char[valor] for valor in patron]), "\"")
print('\n\nInicio del texto generado\n')

for i in range(500):
  x = np.reshape(patron, (1, len(patron), 1))
  x = x / float(n_vocabulario)
  prediccion = model_lstm_rnn.predict(x, verbose=0)
  indice= np.argmax(prediccion)
  resultado = int_to_char[indice]
  seq_in = [int_to_char[valor] for valor in patron]
  sys.stdout.write(resultado)
  patron.append(indice)
  atron = patron[1:len(patron)]

print('\n Hecho')

Frase Semilla
" cially if one has good brains  i have known for some time that i am fitted to occupy a far more exal "


Inicio del texto generado

ent   said the scarecrow   i will be the good san    i don t yill ae a pertonacling    i don t you the pueen of the emerald city    i m good to see   said the scarecrow   i will be the good san    i don t yill ae a pertonacling    i don t you the pueen of the emerald city    i m good to see   said the scarecrow   i will be the good san    i don t yill ae a pertonacling    i don t you the pueen of the emerald city    i m good to see   said the scarecrow   i will be the good san    i don t yill a
 Hecho
