In [1]:
import os

import numpy as np
import pandas as pd
from keras.layers import Input, Embedding, LSTM, Dense, concatenate, Masking, Dropout
from keras.models import Model, Sequential

Using TensorFlow backend.


In [2]:
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping

In [3]:
# Directory holding the training/test text files and the model checkpoint.
# Set PRED_USING_NAME_DIR to run the notebook on another machine; the default
# is the original location. os.path.join(..., '') guarantees a trailing
# separator, because later cells build paths with `base_dir + filename`.
base_dir = os.path.join(
    os.environ.get('PRED_USING_NAME_DIR',
                   '/home/vaibhavpawar/codes/misc/pred_using_name/'),
    '')

In [4]:
# Read the comma-separated integer matrices: training set first, then the
# held-out set that is used for validation further down.
trdata, tsdata = (
    np.loadtxt(base_dir + fname, dtype='int32', delimiter=',')
    for fname in ('madhavi_texts_tr.txt', 'madhavi_texts_ts.txt')
)

In [5]:
# Inspect the training array: one row per sample, and (per the split below)
# len_window input columns followed by one target column.
trdata.shape

(561992, 26)

In [6]:
# len_window: number of history characters fed to the model per sample.
# min_chars: not referenced in the cells shown here -- presumably used when
#            the text windows were generated; TODO confirm.
len_window, min_chars = 25, 3

In [7]:
# Columns [0, len_window) hold the historical text window fed to the model;
# column len_window holds the id of the character that follows the window.
x_tr, y_tr = trdata[:, :len_window], trdata[:, len_window]
x_vl, y_vl = tsdata[:, :len_window], tsdata[:, len_window]

In [12]:
# Largest target character id across both splits; it sets the one-hot width.
max_charid = max(y_vl.max(), y_tr.max())

In [13]:
# Show the largest target id; it is used below as the one-hot vector width.
max_charid

68

In [14]:
# One-hot encode the targets: character id k sets column k - 1.
# Vectorised integer indexing replaces the per-row Python loops (hundreds of
# thousands of iterations) and produces exactly the same arrays.
# NOTE(review): a target id of 0 would wrap to the last column (index -1),
# the same as in the loop version -- assumes targets are always >= 1; confirm.
y_tr1 = np.zeros((y_tr.shape[0], max_charid))
y_tr1[np.arange(y_tr.shape[0]), y_tr - 1] = 1

y_vl1 = np.zeros((y_vl.shape[0], max_charid))
y_vl1[np.arange(y_vl.shape[0]), y_vl - 1] = 1

In [15]:
# --- Model hyper-parameters -------------------------------------------------
time_steps = len_window
onehot_vec_size = max_charid
lstm_size1 = 512

# --- Input: one integer character id per time step --------------------------
input1 = Input(shape=(time_steps,), dtype='int32', name='input')

# --- Fixed one-hot "embedding" ----------------------------------------------
# Row 0 is all zeros, so id 0 yields a zero vector that the Masking layer
# below skips. Rows 1..onehot_vec_size form an identity matrix, so id k maps
# to its one-hot vector. The layer is frozen (trainable=False).
embedding_matrix = np.vstack([np.zeros((1, onehot_vec_size)),
                              np.identity(onehot_vec_size)])

embedding_layer = Embedding(input_dim=onehot_vec_size + 1,
                            output_dim=onehot_vec_size,
                            weights=[embedding_matrix],
                            input_length=time_steps,
                            trainable=False)
embedded_sequences1 = embedding_layer(input1)

# --- Recurrent encoder + softmax over the next character --------------------
lstm1 = LSTM(lstm_size1, dropout=0.2, recurrent_dropout=0.2)

y1 = Masking(mask_value=0.0)(embedded_sequences1)
y1 = lstm1(y1)
y = Dropout(0.3)(y1)
y = Dense(max_charid, activation='softmax')(y)

model = Model(inputs=input1, outputs=y)

In [16]:
# Categorical cross-entropy against the one-hot targets, Adam optimiser,
# accuracy reported under the name 'acc'.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [17]:
def get_callbacks(filepath, patience=2):
    """Build the list of Keras callbacks used for training.

    Args:
        filepath: Destination passed to ModelCheckpoint.
        patience: Number of epochs without a lower 'val_loss' that
            EarlyStopping tolerates before stopping training.

    Returns:
        A two-element list: [EarlyStopping, ModelCheckpoint].
    """
    early_stop = EarlyStopping(monitor='val_loss', patience=patience, mode="min")
    # save_best_only=True: only overwrite the file when the checkpoint's
    # monitored quantity improves.
    checkpoint = ModelCheckpoint(filepath, save_best_only=True)
    return [early_stop, checkpoint]
# Checkpoint destination and the callback list handed to model.fit.
file_path = base_dir + 'madhavi_email_char_lstm_model_weights1.hdf5'
callbacks = get_callbacks(file_path, patience=2)

In [18]:
# Train for up to 100 epochs; the early-stopping callback may end the run
# sooner. The History object returned by fit is the cell's displayed value.
model.fit(
    x=x_tr,
    y=y_tr1,
    validation_data=(x_vl, y_vl1),
    epochs=100,
    batch_size=64,
    callbacks=callbacks,
)

Train on 561992 samples, validate on 143551 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100


<keras.callbacks.History at 0x7f85763019e8>

In [19]:
# Character -> integer id table (ids 0..68).
# Upper-case letters are special tokens, not text:
#   'U' (id 0)  pads the window; id 0 is the value the model masks
#   'S' (id 67) is placed right before the seed text when generating
#   'E' (id 68) ends generation when it is produced
mapping = {
    # punctuation
    ' ': 2, '!': 1, '"': 4, '#': 3, '%': 5, '&': 7, "'": 6, '(': 9,
    ')': 8, '*': 11, '+': 10, ',': 13, '-': 12, '.': 15, '/': 14,
    # digits
    '0': 17, '1': 16, '2': 19, '3': 18, '4': 21,
    '5': 20, '6': 23, '7': 22, '8': 25, '9': 24,
    # more punctuation
    ':': 27, ';': 26, '<': 29, '=': 28, '>': 31, '?': 30, '@': 32,
    # special tokens
    'E': 68, 'S': 67, 'U': 0,
    # brackets, backslash, underscore
    '[': 33, '\\': 35, ']': 34, '_': 36,
    # lower-case letters
    'a': 37, 'b': 39, 'c': 38, 'd': 41, 'e': 40, 'f': 43, 'g': 42,
    'h': 45, 'i': 44, 'j': 47, 'k': 46, 'l': 49, 'm': 48, 'n': 51,
    'o': 50, 'p': 53, 'q': 52, 'r': 55, 's': 54, 't': 57, 'u': 56,
    'v': 59, 'w': 58, 'x': 61, 'y': 60, 'z': 63,
    # trailing punctuation
    '{': 62, '|': 65, '}': 64, '~': 66,
}

In [20]:
# Inverse lookup keyed by (id - 1), i.e. by the one-hot column / softmax
# output index that corresponds to each character.
rev_mapping = {char_id - 1: char for char, char_id in mapping.items()}

In [21]:
# Spot check: rev_mapping is keyed by (id - 1), so key 51 is the character
# whose id in `mapping` is 52.
rev_mapping[51]

'q'

In [28]:
# Generate text one character at a time, starting from a short seed string.
seed_ = 'why'
len_seed_ = len(seed_)
randomness = 0.4      # probability of sampling the next char instead of taking the argmax
max_generated = 1000  # hard cap so the loop ends even if 'E' is never produced

# Window layout: 'U' padding (id 0, masked by the model), then 'S'
# (presumably the start-of-text marker -- confirm against the data prep),
# then the seed text.
seed = 'U' * (len_window - len_seed_ - 1) + 'S' + seed_
# A seed of len_window characters or more would overflow the window; keep the
# most recent len_window characters so the model always sees the latest text.
seed = seed[-len_window:]

generated = ''
while len(generated) < max_generated:
    # Encode the whole window, then run the model once per generated
    # character (previously predict ran once per window position).
    seed_input = np.array([[mapping[ch] for ch in seed]], dtype='int32')
    pred = model.predict(seed_input)

    # Renormalise in float64: np.random.choice requires p to sum to 1, and a
    # float32 softmax can be slightly off.
    probs = pred.reshape((max_charid, )).astype('float64')
    probs /= probs.sum()

    if np.random.uniform() <= randomness:
        charid = np.random.choice(max_charid, p=probs)
    else:
        charid = np.argmax(probs)
    char = rev_mapping[charid]

    # 'E' ends generation.
    if char == 'E':
        break

    generated = generated + char

    # Slide the window: drop the oldest character, append the new one.
    seed = seed[1:] + char

print(seed_ + generated)

why is the code campaign kanishk. thanks & regards, madhavi kaivalya k | +91-9833943305 | senior manager - analytics | loylty rewardz <http://bit.do/lranalytics> what's analytics?
