In [28]:
import pickle
import gensim
import numpy as np
import pandas as pd

In [21]:
# get dataframe and load model
with open('clean_line_df.pkl', 'rb') as f:
    clean_line_df = pickle.load(f)
    
doc2vec_model = gensim.models.doc2vec.Doc2Vec.load("poem_doc2vec_dbow2000.pkl")

In [17]:
lines = clean_line_df.words.to_list()

In [18]:
print(lines[0:5])

[['philosophic'], ['in', 'its', 'complex', 'ovoid', 'emptiness'], ['a', 'skillful', 'pundit', 'coined', 'it', 'as', 'a', 'sort'], ['of', 'stopgap', 'doorstop', 'for', 'those'], ['quaint', 'equations']]


In [26]:
lines_vector=[]

t = 50000

for i in range(len(lines)):
    if i % t == 0:
        print("line", i, ":", lines[i])
        print("***")
    line = lines[i]
    lines_vector.append(doc2vec_model.infer_vector(line))
    
#save the lines_vector
lines_vector_file = "lines_vector_500_a001_ma001_s10000.pkl"
with open('line_vectors', 'wb') as f:
    pickle.dump((lines_vector), f)

line 0 : ['philosophic']
***
line 50000 : ['and', 'placid', 'blank', 'as', 'a', 'lake']
***
line 100000 : ['stupidity', 'secrecy']
***
line 150000 : ['and', 'swore', 'this', 'mare', 'was', 'far', 'more', 'worth']
***
line 200000 : ['and', 'have', 'kept', 'their', 'lives', 'and', 'their', 'wives', 'and', 'their', 'children', 'and']
***
line 250000 : ['fancy', 'this']
***
line 300000 : ['here', 'is', 'my', 'hand']
***
line 350000 : ['and', 'he', 'whose', 'soul', 'is', 'flat—the', 'sky']
***
line 400000 : ['the', 'moral', 'of', 'the', 'story', 'was', 'plain', 'enough']
***
line 450000 : ['the', 'cars', 'moved', 'down', 'the', 'street', 'slowly', 'as', 'always', 'so', 'many']
***


In [31]:
n_sequenced_lines = 6
v_dim = 2000

lines_label = [str(i) for i in range(len(lines))]
X_train = np.zeros((len(lines), n_sequenced_lines, v_dim), dtype=np.float)
y_train = np.zeros((len(lines), v_dim), dtype=np.float)


t = 10000
for i in range(len(lines_label)-n_sequenced_lines-1):
    if i % t == 0: print("new sequence: ", i)
    
    for k in range(n_sequenced_lines):
        line = lines_label[i+k]
        vect = lines_vector[i+k]
        
        if i % t == 0:
            print("  ", k + 1 ,"th vector for this sequence. line ", line, "(vector dim = ", len(vect), ")")
            
        for j in range(len(vect)):
            X_train[i, k, j] = vect[j]
    
    line_target = lines_label[i+n_sequenced_lines]
    v_target = lines_vector[i+n_sequenced_lines]
    if i % t == 0: print("  y vector for sequence ", line_target, ": (vector dim = ", len(v_target), ")")
    for j in range(len(v_target)):
        y_train[i, j] = v_target[j]

print(X_train.shape, y_train.shape)

new sequence:  0
   1 th vector for this sequence. Sentence  0 (vector dim =  2000 )
   2 th vector for this sequence. Sentence  1 (vector dim =  2000 )
   3 th vector for this sequence. Sentence  2 (vector dim =  2000 )
   4 th vector for this sequence. Sentence  3 (vector dim =  2000 )
   5 th vector for this sequence. Sentence  4 (vector dim =  2000 )
   6 th vector for this sequence. Sentence  5 (vector dim =  2000 )
  y vector for this sequence  6 : (vector dim =  2000 )
new sequence:  1000
   1 th vector for this sequence. Sentence  1000 (vector dim =  2000 )
   2 th vector for this sequence. Sentence  1001 (vector dim =  2000 )
   3 th vector for this sequence. Sentence  1002 (vector dim =  2000 )
   4 th vector for this sequence. Sentence  1003 (vector dim =  2000 )
   5 th vector for this sequence. Sentence  1004 (vector dim =  2000 )
   6 th vector for this sequence. Sentence  1005 (vector dim =  2000 )
  y vector for this sequence  1006 : (vector dim =  2000 )
new sequence: 

In [35]:
from __future__ import print_function
from keras import regularizers
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Embedding, Flatten, Bidirectional, Input, LSTM
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.optimizers import Adam
from keras.metrics import categorical_accuracy, mean_squared_error, mean_absolute_error
from keras.losses import logcosh

def bidirectional_lstm_model(seq_length, vector_dim):
    print('Building LSTM model...')
    model = Sequential()
    model.add(Bidirectional(LSTM(rnn_size, activation="relu"),input_shape=(seq_length, vector_dim)))
    model.add(Dropout(0.5))
    model.add(Dense(vector_dim))
    
    optimizer = Adam(lr=learning_rate)
    callbacks=[EarlyStopping(patience=2, monitor='val_loss')]
    model.compile(loss='logcosh', optimizer=optimizer, metrics=['acc'])
    print('LSTM model built.')
    return model

In [36]:
rnn_size = 32 # size of RNN
# vector_dim = 500
learning_rate = 0.0001 #learning rate

model_sequence = bidirectional_lstm_model(n_sequenced_lines, v_dim)

Building LSTM model...
LSTM model built.


In [41]:
batch_size = 30 # minibatch size

callbacks=[EarlyStopping(patience=3, monitor='val_loss'),
           ModelCheckpoint(filepath='line_sequence_lstm.{epoch:02d}.hdf5',\
                           monitor='val_loss', verbose=1, mode='auto', save_freq=1000)]

history = model_sequence.fit(X_train, y_train,
                 batch_size=batch_size,
                 shuffle=True,
                 epochs=40,
                 callbacks=callbacks,
                 validation_split=0.1)

#save the model
model_sequence.save('line_sequence_lstm.final.hdf5')

Epoch 1/40
  998/14569 [=>............................] - ETA: 4:56 - loss: 5.2908e-04 - acc: 0.3345
Epoch 00001: saving model to line_sequence_lstm.01.hdf5
 1999/14569 [===>..........................] - ETA: 4:33 - loss: 5.2704e-04 - acc: 0.3310
Epoch 00001: saving model to line_sequence_lstm.01.hdf5
 2999/14569 [=====>........................] - ETA: 4:10 - loss: 5.2353e-04 - acc: 0.3319
Epoch 00001: saving model to line_sequence_lstm.01.hdf5
Epoch 00001: saving model to line_sequence_lstm.01.hdf5
Epoch 00001: saving model to line_sequence_lstm.01.hdf5
Epoch 00001: saving model to line_sequence_lstm.01.hdf5
Epoch 00001: saving model to line_sequence_lstm.01.hdf5
Epoch 00001: saving model to line_sequence_lstm.01.hdf5
Epoch 00001: saving model to line_sequence_lstm.01.hdf5
Epoch 00001: saving model to line_sequence_lstm.01.hdf5
Epoch 00001: saving model to line_sequence_lstm.01.hdf5
Epoch 00001: saving model to line_sequence_lstm.01.hdf5
Epoch 00001: saving model to line_sequence_lstm