Story Generation using LSTM Model

In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, Activation
from keras.optimizers import Adam
from keras.metrics import categorical_accuracy
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
import requests
import re
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

Using TensorFlow backend.


In [2]:
train_stories = pd.read_csv("story_generation_dataset/ROCStories_train.csv", encoding="utf8")
test_stories = pd.read_csv("story_generation_dataset/ROCStories_test.csv", encoding="utf8")
val_stories = pd.read_csv("story_generation_dataset/ROCStories_val.csv", encoding="utf8")

In [3]:
train_stories = train_stories.append(val_stories)
train_stories = train_stories[:2000]
test_stories = train_stories[:500]

In [4]:
train_stories.head()

Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,David Drops the Weight,David noticed he had put on a lot of weight re...,He examined his habits to try and figure out t...,He realized he'd been eating too much fast foo...,He stopped going to burger places and started ...,"After a few weeks, he started to feel much bet..."
1,0beabab2-fb49-460e-a6e6-f35a202e3348,Frustration,Tom had a very short temper.,One day a guest made him very angry.,He punched a hole in the wall of his house.,Tom's guest became afraid and left quickly.,Tom sat on his couch filled with regret about ...
2,87da1a22-df0b-410c-b186-439700b70ba6,Marcus Buys Khakis,Marcus needed clothing for a business casual e...,All of his clothes were either too formal or t...,He decided to buy a pair of khakis.,The pair he bought fit him perfectly.,Marcus was happy to have the right clothes for...
3,2d16bcd6-692a-4fc0-8e7c-4a6f81d9efa9,Different Opinions,Bobby thought Bill should buy a trailer and ha...,Bill thought a truck would be better for what ...,Bobby pointed out two vehicles were much more ...,Bill was set in his ways with conventional thi...,He ended up buying the truck he wanted despite...
4,c71bb23b-7731-4233-8298-76ba6886cee1,Overcoming shortcomings,John was a pastor with a very bad memory.,He tried to memorize his sermons many days in ...,He decided to learn to sing to overcome his ha...,He then made all his sermons into music and sa...,His congregation was delighted and so was he.


In [5]:
test_stories.head()

Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,David Drops the Weight,David noticed he had put on a lot of weight re...,He examined his habits to try and figure out t...,He realized he'd been eating too much fast foo...,He stopped going to burger places and started ...,"After a few weeks, he started to feel much bet..."
1,0beabab2-fb49-460e-a6e6-f35a202e3348,Frustration,Tom had a very short temper.,One day a guest made him very angry.,He punched a hole in the wall of his house.,Tom's guest became afraid and left quickly.,Tom sat on his couch filled with regret about ...
2,87da1a22-df0b-410c-b186-439700b70ba6,Marcus Buys Khakis,Marcus needed clothing for a business casual e...,All of his clothes were either too formal or t...,He decided to buy a pair of khakis.,The pair he bought fit him perfectly.,Marcus was happy to have the right clothes for...
3,2d16bcd6-692a-4fc0-8e7c-4a6f81d9efa9,Different Opinions,Bobby thought Bill should buy a trailer and ha...,Bill thought a truck would be better for what ...,Bobby pointed out two vehicles were much more ...,Bill was set in his ways with conventional thi...,He ended up buying the truck he wanted despite...
4,c71bb23b-7731-4233-8298-76ba6886cee1,Overcoming shortcomings,John was a pastor with a very bad memory.,He tried to memorize his sermons many days in ...,He decided to learn to sing to overcome his ha...,He then made all his sermons into music and sa...,His congregation was delighted and so was he.


In [6]:
print('Train Dataset')
print(train_stories.values.shape)
train_array = train_stories.values[:,1:].reshape(-1).tolist()
print(len(train_array))
train_array[:10]

Train Dataset
(2000, 7)
12000


['David Drops the Weight',
 'David noticed he had put on a lot of weight recently.',
 'He examined his habits to try and figure out the reason.',
 "He realized he'd been eating too much fast food lately.",
 'He stopped going to burger places and started a vegetarian diet.',
 'After a few weeks, he started to feel much better.',
 'Frustration',
 'Tom had a very short temper.',
 'One day a guest made him very angry.',
 'He punched a hole in the wall of his house.']

In [7]:
print('Test Dataset')
print(test_stories.values.shape)
test_array = test_stories.values[:,1:].reshape(-1).tolist()
print(len(test_array))

Test Dataset
(500, 7)
3000


In [8]:
n = 10
array = train_stories.values[:n,1:].reshape(-1).tolist()
vecn = np.load("data/train_vectors.npy")
vec = vecn.tolist()
print(len(vec))

12000


In [17]:
def nn(qvec, vectors, array, k=5):
      sentences = []
      qvec /= np.linalg.norm(qvec)
      vectors /= np.linalg.norm(vectors)
      scores = np.dot(qvec, vectors.T).flatten()
      sorted_args = np.argsort(scores)[::-1]
      for i in range(k):
            for sa in sorted_args[i]:
                  sentences.append(array[sa])
      for i, s in enumerate(sentences):
            print (s, sorted_args[i])

In [9]:
vt = vec[::6]
v1 = vec[1::6]
v2 = vec[2::6]
v3 = vec[3::6]
v4 = vec[4::6]
v5 = vec[5::6]

In [10]:
len(vt),len(v1),len(v2),len(v3),len(v4)

(2000, 2000, 2000, 2000, 2000)

In [11]:
v0 = np.zeros(2400).tolist()
X = []
y = []
m = len(v1)
for i in range(m):
    X.append([v0 , v0 , v0 , v1[i]])
    y.append(v2[i])
    X.append([v0, v0, v1[i], v2[i]])
    y.append(v3[i])
    X.append([v0, v1[i], v2[i], v3[i]])
    y.append(v4[i])
    X.append([v1[i], v2[i], v3[i], v4[i]])
    y.append(v5[i])
    
X = np.asarray(X) # X.shape is (samples, timesteps, features)
y = np.asarray(y)

In [12]:
maxvec = 10
for i in vec:
    if min(i) < maxvec:
        maxvec = min(i)
maxvec

-0.18974100053310394

In [13]:
from keras.preprocessing import sequence 
from keras.models import Sequential 
from keras.layers import Dense, Dropout, Embedding, LSTM 

model = Sequential() 
model.add(LSTM(2400, dropout=0.2, recurrent_dropout=0.2, input_shape=(4,2400))) 
# model.add(Dense(250, activation='relu')) 
# model.add(Dropout(0.2)) 
# model.add(Dense(1, activation='sigmoid'))
model.compile(loss='mean_squared_error', optimizer='rmsprop') 




Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.



In [14]:
model.fit(X, y, batch_size=16, epochs=10)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor



Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21daeffbf98>

In [15]:
p = 3
q = 4
pred = model.predict(X[p:q,:,:])

In [18]:
# nn(vec[7], vecn, array)
# array[7]
print("Input Sentence")
nn(X[p:q,3,:].squeeze().tolist(), vecn, array, k=1)
print("Actual Output")
nn(y[p:q,:].squeeze().tolist(), vecn, array, k=1)
print("Predicted Output")
nn(pred.squeeze().tolist(), vecn, array)
np.dot(pred.squeeze(), y[p:q,:].squeeze().T) #Should be close to 2, that means they are same (1 when normalized)

Input Sentence
He stopped going to burger places and started a vegetarian diet. 4
Actual Output
After a few weeks, he started to feel much better. 5
Predicted Output


IndexError: list index out of range

In [None]:
import tensorflow as tf

data = tf.placeholder(tf.float32, [None, 4, 4800]) 
target = tf.placeholder(tf.float32, [None, 1])
num_hidden=24
cell = tf.nn.rnn_cell.LSTMCell(24,state_is_tuple=True)
val, state = tf.nn.dynamic_rnn(cell, data, dtype=tf.float32)

In [None]:
def tokenize(sent):
      tokenized_sent = []
      sent = re.split(r'[`\=~!@#^&*()_+\[\]{};\\:"|<,./<>?]|\n| ', sent)
      for w in sent:
            w.lower().replace('.','')
            tokenized_sent.append(w)
      return tokenized_sent

In [None]:
# Tokenization of each document
def tokenize_doc(doc):
    tokenized_doc = []
    for sent in train_array:
        tokenized_doc.append(tokenize(sent))
    return tokenized_doc


In [None]:
train_tokenized_doc = tokenize_doc(train_array)
test_tokenized_doc = tokenize_doc(test_array)
print(train_tokenized_doc[:10])

[['David', 'Drops', 'the', 'Weight'], ['David', 'noticed', 'he', 'had', 'put', 'on', 'a', 'lot', 'of', 'weight', 'recently', ''], ['He', 'examined', 'his', 'habits', 'to', 'try', 'and', 'figure', 'out', 'the', 'reason', ''], ['He', 'realized', "he'd", 'been', 'eating', 'too', 'much', 'fast', 'food', 'lately', ''], ['He', 'stopped', 'going', 'to', 'burger', 'places', 'and', 'started', 'a', 'vegetarian', 'diet', ''], ['After', 'a', 'few', 'weeks', '', 'he', 'started', 'to', 'feel', 'much', 'better', ''], ['Frustration'], ['Tom', 'had', 'a', 'very', 'short', 'temper', ''], ['One', 'day', 'a', 'guest', 'made', 'him', 'very', 'angry', ''], ['He', 'punched', 'a', 'hole', 'in', 'the', 'wall', 'of', 'his', 'house', '']]


In [None]:
# Convert tokenized document into gensim formated tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(train_tokenized_doc)]
tagged_data[:10]

[TaggedDocument(words=['David', 'Drops', 'the', 'Weight'], tags=[0]),
 TaggedDocument(words=['David', 'noticed', 'he', 'had', 'put', 'on', 'a', 'lot', 'of', 'weight', 'recently', ''], tags=[1]),
 TaggedDocument(words=['He', 'examined', 'his', 'habits', 'to', 'try', 'and', 'figure', 'out', 'the', 'reason', ''], tags=[2]),
 TaggedDocument(words=['He', 'realized', "he'd", 'been', 'eating', 'too', 'much', 'fast', 'food', 'lately', ''], tags=[3]),
 TaggedDocument(words=['He', 'stopped', 'going', 'to', 'burger', 'places', 'and', 'started', 'a', 'vegetarian', 'diet', ''], tags=[4]),
 TaggedDocument(words=['After', 'a', 'few', 'weeks', '', 'he', 'started', 'to', 'feel', 'much', 'better', ''], tags=[5]),
 TaggedDocument(words=['Frustration'], tags=[6]),
 TaggedDocument(words=['Tom', 'had', 'a', 'very', 'short', 'temper', ''], tags=[7]),
 TaggedDocument(words=['One', 'day', 'a', 'guest', 'made', 'him', 'very', 'angry', ''], tags=[8]),
 TaggedDocument(words=['He', 'punched', 'a', 'hole', 'in', 't

In [None]:
## Train doc2vec model
model = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, workers=4, epochs = 100)
# Save trained doc2vec model
model.save("train_doc2vec.model")
## Load saved doc2vec model
model= Doc2Vec.load("train_doc2vec.model")

In [None]:
# find most similar doc 
def find_similar_doc(doc):
      train_doc = tokenize(doc)
      sim = model.docvecs.most_similar(positive=[model.infer_vector(train_doc)],topn=len(train_array))
      return sim

In [None]:
def createList(r1, r2):
    return [item for item in range(r1, r2+1)]

In [None]:
d = find_similar_doc("I owns a cat.")
len(d)
d

  after removing the cwd from sys.path.


[(5305, 0.7682952284812927),
 (3002, 0.7473195791244507),
 (5260, 0.7441461086273193),
 (1081, 0.7402247786521912),
 (6440, 0.727687418460846),
 (7461, 0.7229527235031128),
 (5442, 0.7216861248016357),
 (9435, 0.7139107584953308),
 (5980, 0.7098371386528015),
 (1888, 0.7085011005401611),
 (4654, 0.7079470753669739),
 (3673, 0.6966375112533569),
 (11014, 0.6959055662155151),
 (7277, 0.6938141584396362),
 (11945, 0.690593421459198),
 (11577, 0.6897913813591003),
 (10039, 0.6858797073364258),
 (3837, 0.6854057908058167),
 (6536, 0.6760256290435791),
 (8021, 0.6694120168685913),
 (10001, 0.6673581004142761),
 (11464, 0.6672711372375488),
 (6610, 0.6656190156936646),
 (8017, 0.664003312587738),
 (8268, 0.6614720225334167),
 (3169, 0.6589463353157043),
 (241, 0.6512991786003113),
 (3452, 0.6498394012451172),
 (11763, 0.6497655510902405),
 (5605, 0.6495252251625061),
 (3077, 0.6490643620491028),
 (5240, 0.6483838558197021),
 (5732, 0.6450398564338684),
 (5501, 0.6437827944755554),
 (7136, 0.6

In [None]:
for prob in d:
      print(prob[1])

0.7682952284812927
0.7473195791244507
0.7441461086273193
0.7402247786521912
0.727687418460846
0.7229527235031128
0.7216861248016357
0.7139107584953308
0.7098371386528015
0.7085011005401611
0.7079470753669739
0.6966375112533569
0.6959055662155151
0.6938141584396362
0.690593421459198
0.6897913813591003
0.6858797073364258
0.6854057908058167
0.6760256290435791
0.6694120168685913
0.6673581004142761
0.6672711372375488
0.6656190156936646
0.664003312587738
0.6614720225334167
0.6589463353157043
0.6512991786003113
0.6498394012451172
0.6497655510902405
0.6495252251625061
0.6490643620491028
0.6483838558197021
0.6450398564338684
0.6437827944755554
0.6437220573425293
0.6423120498657227
0.6412375569343567
0.6404728889465332
0.639252781867981
0.6369235515594482
0.6368131041526794
0.6354795694351196
0.6327435970306396
0.6307733654975891
0.6302300095558167
0.6286240816116333
0.627567708492279
0.627284049987793
0.6262721419334412
0.624991774559021
0.6248891353607178
0.624530017375946
0.6239079236984253
0

In [None]:
similar_matrix = []
index_ls = []
X,Y=[],[]
with open("output/similar_matrix.txt","w") as sfile:
      for i in range(0,len(train_array),6):  #240000
            prob_ls = []
            tmp_index_ls = []
            tmp_index_ls = createList(i,i+5)
            index_ls.append(tmp_index_ls)
            sim_output = ""
            sim_output += train_array[i] + ","
            sim = find_similar_doc(sim_output)
            for tup in sim:
                  prob_ls.append(tup[1])   #240000
            #similar_matrix.append(prob_ls)
            sfile.write(prob_ls)
sfile.close()
X = index_ls
Y = similar_matrix
X = np.array(X)
Y = np.array(Y)
print(X.shape,Y.shape)
print(X[:5],Y[:5])



  after removing the cwd from sys.path.


TypeError: write() argument must be str, not list

In [None]:
np.savetxt("output/similar_matrix.txt",Y,fmt="%s")

In [None]:
import numpy as np
vec = Y
vt = vec[::6]
v1 = vec[1::6]
v2 = vec[2::6]
v3 = vec[3::6]
v4 = vec[4::6]
v5 = vec[5::6]
print(np.array(vt).shape)
print(np.array(v1).shape)
print(np.array(v2).shape)
print(np.array(v3).shape)
print(np.array(v4).shape)
print(np.array(v5).shape)

['1']
(1,)
(1,)
(1,)
(1,)
(1,)
(1,)


In [None]:
v0 = np.zeros(4800).tolist()
X = []
Y = []
m = min(len(vt),len(v1),len(v2),len(v3),len(v4),len(v5))
for i in range(m):
    X.append([v0 , v0 , v0 , v1[i]])
    Y.append(v2[i])
    X.append([v0, v0, v1[i], v2[i]])
    Y.append(v3[i])
    X.append([v0, v1[i], v2[i], v3[i]])
    Y.append(v4[i])
    X.append([v1[i], v2[i], v3[i], v4[i]])
    Y.append(v5[i])
    
X = np.asarray(X)  # X.shape is (samples, timesteps, features)
Y = np.asarray(Y)  

  from ipykernel import kernelapp as app


In [None]:
maxvec = 10
for i in vec:
    if min(i) < maxvec:
        maxvec = min(i)
maxvec

[ 43542  22278 178548 140628 141942  12840 101172  35052 230778  93060]


In [None]:
char = []
for story in train_stories.values[:,2:7]:
      for sentence in story:
            for word in sentence.split(' '):
                  if any(c in '"!@#$%^&*()-+?_=,<>/"0123456789' for c in word):
                        continue
                  word = re.sub('[^a-zA-Z0-9 \n\.]','', word.lower().replace('.',''))
                  if not word == "":
                        char.append(word)
char.append('.')
print('Vocab before sorted:',len(char))
print(char[:100])
sorted_char = sorted(list(set(char)))
print('\nVocab after sorted:',len(sorted_char))
print(sorted_char[:100])

Vocab before sorted: 1667842
['david', 'noticed', 'he', 'had', 'put', 'on', 'a', 'lot', 'of', 'weight', 'recently', 'he', 'examined', 'his', 'habits', 'to', 'try', 'and', 'figure', 'out', 'the', 'reason', 'he', 'realized', 'hed', 'been', 'eating', 'too', 'much', 'fast', 'food', 'lately', 'he', 'stopped', 'going', 'to', 'burger', 'places', 'and', 'started', 'a', 'vegetarian', 'diet', 'after', 'a', 'few', 'he', 'started', 'to', 'feel', 'much', 'better', 'tom', 'had', 'a', 'very', 'short', 'temper', 'one', 'day', 'a', 'guest', 'made', 'him', 'very', 'angry', 'he', 'punched', 'a', 'hole', 'in', 'the', 'wall', 'of', 'his', 'house', 'toms', 'guest', 'became', 'afraid', 'and', 'left', 'quickly', 'tom', 'sat', 'on', 'his', 'couch', 'filled', 'with', 'regret', 'about', 'his', 'actions', 'marcus', 'needed', 'clothing', 'for', 'a', 'business']

Vocab after sorted: 24888
['.', 'a', 'aa', 'aaa', 'aaliyah', 'aardvark', 'aardvarks', 'aaron', 'aarons', 'aason', 'aback', 'abandon', 'abandoned', 'abbey'

In [None]:
#create sequences
wordlist = char
seq_length = 30
vocab_size = len(sorted_char)
sequences_step = 1
sequences = []
next_words = []
for i in range(0, len(wordlist) - seq_length, sequences_step):
    sequences.append(wordlist[i: i + seq_length])
    next_words.append(wordlist[i + seq_length])

print('nb sequences:', len(sequences))

nb sequences: 1667812


In [None]:
X = np.zeros((len(sequences), seq_length, vocab_size), dtype=np.bool)
Y = np.zeros((len(sequences), vocab_size), dtype=np.bool)
for i, sentence in enumerate(sequences):
    for t, word in enumerate(sentence):
        X[i, t, vocab[word]] = 1
        Y[i, vocab[next_words[i]]] = 1

In [None]:
learning_rate = 0.001
rnn_size = 256
batch_size = 64
num_epochs = 10

240000


In [None]:
def lstm(seq_length, vocab_size):
      print('Build LSTM model.')
      '''model = Sequential()
      model.add(LSTM(rnn_size, activation="relu"),input_shape=(seq_length, vocab_size))
      model.add(Dropout(0.6))
      model.add(Dense(vocab_size))
      model.add(Activation('softmax'))
      optimizer = Adam(lr=learning_rate)
      model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[categorical_accuracy])'''
      md = Sequential()
      md.add(LSTM(4800, dropout=0.2, recurrent_dropout=0.2, input_shape=(None,6))) 
      # model.add(Dense(250, activation='relu')) 
      # model.add(Dropout(0.2)) 
      # model.add(Dense(1, activation='sigmoid'))
      md.compile(loss='mean_squared_error', optimizer='rmsprop') 

      return md

In [None]:
md = lstm(seq_length, vocab_size)
md.summary()

Build LSTM model.
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 4800)              92294400  
                                                                 
Total params: 92,294,400
Trainable params: 92,294,400
Non-trainable params: 0
_________________________________________________________________


In [None]:
#fit the model
history = md.fit(X, Y, 
                 batch_size=batch_size,
                 epochs=num_epochs,
                 validation_split=0.25)

Epoch 1/10


ValueError: in user code:

    File "d:\Downloads\python3.7\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "d:\Downloads\python3.7\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "d:\Downloads\python3.7\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "d:\Downloads\python3.7\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "d:\Downloads\python3.7\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "d:\Downloads\python3.7\lib\site-packages\keras\engine\input_spec.py", line 296, in assert_input_compatibility
        f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "sequential_2" is incompatible with the layer: expected shape=(None, 1, 6), found shape=(None, 6)


In [None]:
def generate_stories(original_doc,similar_index,train_array):
      txt = "Story Title: " + original_doc[0]
      txt += "Original Story:" + "\n" + original_doc[1:]
      txt += "Generated Story:" + "\n"
      for i in similar_index: 
            txt += train_array[i] + " "
      return txt + "\n\n"

In [None]:
sim_index = 0
with open("output/story-generated.txt","a") as tfile:
      for i in range(0,len(test_array),6):
            story = generate_stories(test_array[i:i+6],similar_index[sim_index],train_array)
            sim_index += 1
            tfile.write(story)


In [None]:
'''n = 10 
train_array = train_stories.values[:n,1:].reshape(-1).tolist()
vecn = np.load("test_doc2vec.model")
vec = vecn.tolist()
len(vec)

def nn(qvec, vectors, train_array, k=5):
    qvec /= np.linalg.norm(qvec)
    vectors /= np.linalg.norm(vectors)
    scores = np.dot(qvec, vectors.T).flatten()
    sorted_args = np.argsort(scores)[::-1]
    sentences = [train_array[a] for a in sorted_args[:k]]
    for i, s in enumerate(sentences):
        print (s, sorted_args[i])'''

ValueError: Cannot load file containing pickled data when allow_pickle=False