## Char Prediction using LSTM

1. Download data of Alice in Wonderland or Dracula from https://www.gutenberg.org/browse/scores/top in plain text format
2. Create an char_to_int map which maps each character used in the novel to an integer. example {a: 3}
3. Read data from the text file and do the following:
    3.1 Create a sliding window in which it takes in first 100 characters as the input sequence and 101th character as the output sequence. (It slides over every character).
    For example: 
        "Avul Pakir Jainulabdeen Abdul Kalam better known as A.P.J. Abdul Kalam"
        You should slide from "A" to the 100th char and 101th char will be your output.
        Then you should start sliding from "v" to the 100th char and 101th char will be your output.
    The input and the output sequence should be converted to their integer representation using the char_to_int map.
    With this you basically have two arrays seqIn and seqOut with each element containing integer representation of 100 characters and 1 character respectively.
    seqIn = [[10........15], [5.....25]...] seqOut = [5, 2, 5]
4. Now reshape your seqIn as (NumberOfSamples, 100, 1) - So you basically get this [[[10]........[15]], [[5]..... [25]]...]
5. One hot encode your seqOut using np_utils.to_categorical

6. Now create a simple model with LSTM followed by a Dense layer.

7. Then, given a seed sentence predict the next character using the model created.


In [1]:
import numpy as np
import pandas as pd
import keras
from keras.preprocessing import sequence
from keras.layers import Dense,LSTM,Embedding
from keras.models import Sequential
from sklearn.model_selection import train_test_split
import h5py

Using TensorFlow backend.


In [2]:
file = open('Alice_in_Wonderland')

In [3]:
data = file.read()

In [4]:
chars = list(set(data))
data_size = len(data)
chars_size = len(chars)

In [5]:
char_to_int  = {ch:i for i,ch in enumerate(chars)}

In [6]:
int_to_char = {i:ch for i,ch in enumerate(chars)}

In [7]:
r = data[0:100]
k = [char_to_int[letter] for letter in r]
type(k)

list

In [8]:
def sliding_window(data, seq_len):
#     char_to_int = char_to_int_conv(data)
    start = 0
    end = seq_len
    seqIn = []
    seqOut = []
    total_seq_in = []
    total_seq_out = []
    data_len = len(data)
    for count in range(0,(data_len - seq_len)):
        seqIn = data[start:end]
        seqOut = data[end]
        start = start+1
        end = end+1
#         conv_seqIn = [char_to_int[letter] for letter in seqIn]
#         conv_seqOut = [char_to_int[letter] for letter in seqOut]
        total_seq_in.append(seqIn)
        total_seq_out.append(seqOut)
    
    return({'seqIn':total_seq_in,'seqOut':total_seq_out})
        

In [9]:
def char_to_int_conv(data,total_seq_in,total_seq_out):
    chars = list(set(data))
    seqIn =[]
    seqOut = []
    char_to_int  = {ch:i for i,ch in enumerate(chars)}
    for sent_in,sent_out in zip(total_seq_in,total_seq_out):
        list_in = [char_to_int[letter_in] for letter_in in sent_in]
        list_out =[char_to_int[letter_out] for letter_out in sent_out]
        seqIn.append(list_in)
        seqOut.append(list_out)
#     seqOut_list = []
#     if len(seqOut[0])==1:
#         for item in seqOut:
#             for element in item:
#                 seqOut.append(element)
#         seqOut = seqOut_list
    return {'seqIn':seqIn,'seqOut':seqOut}

In [10]:
temp = sliding_window(data,25)

In [11]:
temp['seqOut'][0]

'’'

In [12]:
r=[]
k=[]
for letter in temp['seqIn'][0:4]:
    r=[]
    for word in letter:
        r.append(word)
    k.append(r)


In [13]:
k = char_to_int_conv(data,temp['seqIn'],temp['seqOut'])
jj = k['seqOut'][0:3]

In [14]:
(np.array(k['seqIn'])).shape

(163792, 25)

In [15]:
t = np.array(k['seqIn']).reshape(163792,25,1)

In [16]:
t[0]

array([[72],
       [69],
       [77],
       [54],
       [37],
       [ 9],
       [71],
       [32],
       [ 2],
       [23],
       [71],
       [37],
       [34],
       [26],
       [37],
       [69],
       [66],
       [75],
       [76],
       [32],
       [53],
       [55],
       [ 7],
       [ 9],
       [37]])

In [17]:
def reshape(seqIn):
    seqIn = np.array(seqIn)
    seqIn_shape = seqIn.shape
    return seqIn.reshape(seqIn_shape[0], seqIn_shape[1],1)

In [18]:
seqIn_reshape = reshape(k['seqIn'])

In [19]:
import pandas as pd
r = pd.Series(k['seqOut'])

In [20]:
chars_size = len(list(set(data)))

In [21]:
chars_size

86

In [22]:
len(chars)

86

In [23]:
z =k['seqOut']

In [24]:
lossa = []
for item in z:
    for row in item:
        lossa.append(row)

In [25]:
z = np.array(z)

In [26]:
z.max()

85

In [27]:
z.flatten()

array([75, 76, 32, ..., 31, 14, 14])

In [28]:
a =list(z)

In [29]:
type(a)

list

In [30]:
def convert_to_categorical(seqOut):
    seqOut = (np.array(seqOut)).flatten()
    seqOut_hot_encode = keras.utils.to_categorical(seqOut, num_classes = len(set(seqOut)))
    return seqOut_hot_encode

In [31]:
seqOut_one_hot_encode = convert_to_categorical(z)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(seqIn_reshape, seqOut_one_hot_encode, test_size=0.01, random_state=1)
X_train.shape

(162154, 25, 1)

In [50]:
model = Sequential()

In [51]:
model.add(LSTM(256, input_shape = (X_train.shape[1],X_train.shape[2]), return_sequences = True))

In [52]:
model.add(LSTM(256, input_shape = (X_train.shape[1],X_train.shape[2])))

In [53]:
model.add(Dense(512,activation = 'relu'))

In [57]:
model.add(Dense(86, activation="sigmoid"))

In [58]:
model.compile(loss="categorical_crossentropy", optimizer="adam",metrics=['accuracy'])

In [59]:
model.fit(X_train, y_train, batch_size=32, epochs=1, validation_split=0.2, verbose = 1)

Train on 129723 samples, validate on 32431 samples
Epoch 1/1


<keras.callbacks.History at 0x7fd2c14d3fd0>

In [60]:
model1_predictions = model.predict_classes(X_test)



In [61]:
evaluation = model.evaluate(X_test, y_test)



In [62]:
evaluation

[3.2297951057280376, 0.16910866910866912]

In [63]:
x_test = """Longitude either, but though that was a lot"""

In [64]:
len(x_test)

43

In [65]:
sent = sliding_window(x_test,25)

In [66]:
input_x_test = char_to_int_conv(x_test,sent['seqIn'],sent['seqOut'])

In [67]:
seqIn_reshape_test = reshape(input_x_test['seqIn'])

In [68]:
x_test_model = model.predict_classes(seqIn_reshape_test)



In [69]:
predicts = [int_to_char[letter] for letter in x_test_model]

In [70]:
len(predicts)

18

In [71]:
predicts

[' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ']

In [None]:
## Adding weights

In [None]:
model.load_weights('weights-improvement-49-1.2575',by_name= False)