Story Generation using LSTM Model

In [6]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM, Activation
from keras.optimizers import Adam
from keras.metrics import categorical_accuracy
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
import requests
import re
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [7]:
train_stories = pd.read_csv("story_generation_dataset/ROCStories_train.csv", encoding="utf8")
test_stories = pd.read_csv("story_generation_dataset/ROCStories_test.csv", encoding="utf8")
val_stories = pd.read_csv("story_generation_dataset/ROCStories_val.csv", encoding="utf8")

In [8]:
train_stories = train_stories.append(val_stories)

In [9]:
train_stories.head()

Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,David Drops the Weight,David noticed he had put on a lot of weight re...,He examined his habits to try and figure out t...,He realized he'd been eating too much fast foo...,He stopped going to burger places and started ...,"After a few weeks, he started to feel much bet..."
1,0beabab2-fb49-460e-a6e6-f35a202e3348,Frustration,Tom had a very short temper.,One day a guest made him very angry.,He punched a hole in the wall of his house.,Tom's guest became afraid and left quickly.,Tom sat on his couch filled with regret about ...
2,87da1a22-df0b-410c-b186-439700b70ba6,Marcus Buys Khakis,Marcus needed clothing for a business casual e...,All of his clothes were either too formal or t...,He decided to buy a pair of khakis.,The pair he bought fit him perfectly.,Marcus was happy to have the right clothes for...
3,2d16bcd6-692a-4fc0-8e7c-4a6f81d9efa9,Different Opinions,Bobby thought Bill should buy a trailer and ha...,Bill thought a truck would be better for what ...,Bobby pointed out two vehicles were much more ...,Bill was set in his ways with conventional thi...,He ended up buying the truck he wanted despite...
4,c71bb23b-7731-4233-8298-76ba6886cee1,Overcoming shortcomings,John was a pastor with a very bad memory.,He tried to memorize his sermons many days in ...,He decided to learn to sing to overcome his ha...,He then made all his sermons into music and sa...,His congregation was delighted and so was he.


In [10]:
test_stories.head()

Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,a34dd5ad-761f-4369-acaf-42e146479c9b,Bad Dream,Tommy was very close to his dad and loved him ...,His was a cop and was shot and killed on duty.,Tommy cried in his mother's arms at the funeral.,Tommy suddenly woke up in a cold sweat.,"Realizing he had just had a bad dream, he went..."
1,d14fc434-01da-4b39-9e7b-3733c510ac29,Scary Movies,Tim was dating a girl who was easily scared.,He decided to have a horror movie night.,She reluctantly agreed.,Tim's girlfriend was scared and stayed close t...,Tim's plan worked perfectly.
2,56800cdf-d149-489f-9ee4-a87ea3138533,Red Butterfly,Samuel collected butterflies for his collection.,One afternoon he spotted a bright red butterfly.,He tried to catch it but it kept flying higher.,Samuel got a ladder and went to the roof of hi...,He missed his footing and went tumbling off th...
3,0ba922c2-afe4-444f-a9b7-8bcf202ebf65,Dirty Feet,She ran outside without her shoes.,She was excited to catch the ice cream man.,She ordered her ice cream and ran home.,She ran into the house.,Her mother yelled because her feet were dirty.
4,f81f29cd-b6f4-4faf-af9a-b779787a6b34,Bullfrog,There once was a man named Larry Butterfrog.,He went down to buy World of Warcraft.,"However, he lacked the money.","So, he had to go to his mom, who was miffed.",And she told him to get a job.


In [11]:
print('Train Dataset')
print(train_stories.values.shape)
train_array = train_stories.values[:,1:].reshape(-1).tolist()
print(len(train_array))
train_array[:10]

Train Dataset
(40000, 7)
240000


['David Drops the Weight',
 'David noticed he had put on a lot of weight recently.',
 'He examined his habits to try and figure out the reason.',
 "He realized he'd been eating too much fast food lately.",
 'He stopped going to burger places and started a vegetarian diet.',
 'After a few weeks, he started to feel much better.',
 'Frustration',
 'Tom had a very short temper.',
 'One day a guest made him very angry.',
 'He punched a hole in the wall of his house.']

In [12]:
print('Test Dataset')
print(test_stories.values.shape)
test_array = test_stories.values[:,1:].reshape(-1).tolist()
print(len(test_array))

Test Dataset
(10000, 7)
60000


In [13]:
def tokenize(sent):
      tokenized_sent = []
      sent = re.split(r'[`\=~!@#^&*()_+\[\]{};\\:"|<,./<>?]|\n| ', sent)
      for w in sent:
            w.lower().replace('.','')
            tokenized_sent.append(w)
      return tokenized_sent

In [14]:
# Tokenization of each document
def tokenize_doc(doc):
    tokenized_doc = []
    for sent in train_array:
        tokenized_doc.append(tokenize(sent))
    return tokenized_doc


In [15]:
train_tokenized_doc = tokenize_doc(train_array)
test_tokenized_doc = tokenize_doc(test_array)
print(train_tokenized_doc[:10])

[['David', 'Drops', 'the', 'Weight'], ['David', 'noticed', 'he', 'had', 'put', 'on', 'a', 'lot', 'of', 'weight', 'recently', ''], ['He', 'examined', 'his', 'habits', 'to', 'try', 'and', 'figure', 'out', 'the', 'reason', ''], ['He', 'realized', "he'd", 'been', 'eating', 'too', 'much', 'fast', 'food', 'lately', ''], ['He', 'stopped', 'going', 'to', 'burger', 'places', 'and', 'started', 'a', 'vegetarian', 'diet', ''], ['After', 'a', 'few', 'weeks', '', 'he', 'started', 'to', 'feel', 'much', 'better', ''], ['Frustration'], ['Tom', 'had', 'a', 'very', 'short', 'temper', ''], ['One', 'day', 'a', 'guest', 'made', 'him', 'very', 'angry', ''], ['He', 'punched', 'a', 'hole', 'in', 'the', 'wall', 'of', 'his', 'house', '']]


In [16]:
# Convert tokenized document into gensim formated tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(train_tokenized_doc)]
tagged_data[:10]

[TaggedDocument(words=['David', 'Drops', 'the', 'Weight'], tags=[0]),
 TaggedDocument(words=['David', 'noticed', 'he', 'had', 'put', 'on', 'a', 'lot', 'of', 'weight', 'recently', ''], tags=[1]),
 TaggedDocument(words=['He', 'examined', 'his', 'habits', 'to', 'try', 'and', 'figure', 'out', 'the', 'reason', ''], tags=[2]),
 TaggedDocument(words=['He', 'realized', "he'd", 'been', 'eating', 'too', 'much', 'fast', 'food', 'lately', ''], tags=[3]),
 TaggedDocument(words=['He', 'stopped', 'going', 'to', 'burger', 'places', 'and', 'started', 'a', 'vegetarian', 'diet', ''], tags=[4]),
 TaggedDocument(words=['After', 'a', 'few', 'weeks', '', 'he', 'started', 'to', 'feel', 'much', 'better', ''], tags=[5]),
 TaggedDocument(words=['Frustration'], tags=[6]),
 TaggedDocument(words=['Tom', 'had', 'a', 'very', 'short', 'temper', ''], tags=[7]),
 TaggedDocument(words=['One', 'day', 'a', 'guest', 'made', 'him', 'very', 'angry', ''], tags=[8]),
 TaggedDocument(words=['He', 'punched', 'a', 'hole', 'in', 't

In [17]:
## Train doc2vec model
model = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, workers=4, epochs = 100)
# Save trained doc2vec model
model.save("train_doc2vec.model")
## Load saved doc2vec model
model= Doc2Vec.load("train_doc2vec.model")

In [18]:
# find most similar doc 
def find_similar_doc(doc):
      train_doc = tokenize(doc)
      sim = model.docvecs.most_similar(positive=[model.infer_vector(train_doc)],topn=len(train_array))
      return sim

In [19]:
def createList(r1, r2):
    return [item for item in range(r1, r2+1)]

In [20]:
d = find_similar_doc("I owns a cat.")
len(d)
d

  after removing the cwd from sys.path.


[(43103, 0.8031928539276123),
 (119573, 0.7792018055915833),
 (189879, 0.7780933380126953),
 (116900, 0.7777941823005676),
 (192325, 0.7692453265190125),
 (15751, 0.764326810836792),
 (122662, 0.7580335736274719),
 (82489, 0.7576169371604919),
 (188549, 0.7564620971679688),
 (173071, 0.7453251481056213),
 (65813, 0.7435972690582275),
 (28251, 0.742857813835144),
 (25719, 0.7423835396766663),
 (69875, 0.7355471849441528),
 (178436, 0.7331219911575317),
 (80102, 0.7316989302635193),
 (108297, 0.7283289432525635),
 (235549, 0.7249078750610352),
 (102736, 0.7243520021438599),
 (175995, 0.7242211699485779),
 (65071, 0.7233837842941284),
 (95499, 0.7176923751831055),
 (219411, 0.7132023572921753),
 (160457, 0.7124772071838379),
 (35175, 0.7120435237884521),
 (91561, 0.7102269530296326),
 (191894, 0.7100791335105896),
 (185024, 0.7098234295845032),
 (234459, 0.7085542678833008),
 (229150, 0.7075707912445068),
 (38539, 0.7075539827346802),
 (132478, 0.707243025302887),
 (90115, 0.7067975997924

In [21]:
for prob in d:
      print(prob[1])

0.8031928539276123
0.7792018055915833
0.7780933380126953
0.7777941823005676
0.7692453265190125
0.764326810836792
0.7580335736274719
0.7576169371604919
0.7564620971679688
0.7453251481056213
0.7435972690582275
0.742857813835144
0.7423835396766663
0.7355471849441528
0.7331219911575317
0.7316989302635193
0.7283289432525635
0.7249078750610352
0.7243520021438599
0.7242211699485779
0.7233837842941284
0.7176923751831055
0.7132023572921753
0.7124772071838379
0.7120435237884521
0.7102269530296326
0.7100791335105896
0.7098234295845032
0.7085542678833008
0.7075707912445068
0.7075539827346802
0.707243025302887
0.7067975997924805
0.7063732743263245
0.7060926556587219
0.7056400179862976
0.7053185701370239
0.7049493193626404
0.7033212184906006
0.7032384872436523
0.7008915543556213
0.7000528573989868
0.700025200843811
0.698521077632904
0.6966884732246399
0.6966288685798645
0.6961260437965393
0.6960119009017944
0.6958354711532593
0.6947956085205078
0.694766104221344
0.694656252861023
0.6937248706817627


In [22]:
similar_matrix = []
index_ls = []
X,Y=[],[]
with open("output/similar_matrix.txt","w") as sfile:
      for i in range(0,len(train_array),6):  #240000
            prob_ls = []
            tmp_index_ls = []
            tmp_index_ls = createList(i,i+5)
            index_ls.append(tmp_index_ls)
            sim_output = ""
            sim_output += train_array[i] + ","
            sim = find_similar_doc(sim_output)
            for tup in sim:
                  prob_ls.append(tup[1])   #240000
            #similar_matrix.append(prob_ls)
            sfile.write(prob_ls)
sfile.close()
X = index_ls
Y = similar_matrix
X = np.array(X)
Y = np.array(Y)
print(X.shape,Y.shape)
print(X[:5],Y[:5])



  after removing the cwd from sys.path.


TypeError: write() argument must be str, not list

In [None]:
np.savetxt("output/similar_matrix.txt",Y,fmt="%s")

In [None]:
import numpy as np
vec = Y
vt = vec[::6]
v1 = vec[1::6]
v2 = vec[2::6]
v3 = vec[3::6]
v4 = vec[4::6]
v5 = vec[5::6]
print(np.array(vt).shape)
print(np.array(v1).shape)
print(np.array(v2).shape)
print(np.array(v3).shape)
print(np.array(v4).shape)
print(np.array(v5).shape)

['1']
(1,)
(1,)
(1,)
(1,)
(1,)
(1,)


In [None]:
v0 = np.zeros(4800).tolist()
X = []
Y = []
m = min(len(vt),len(v1),len(v2),len(v3),len(v4),len(v5))
for i in range(m):
    X.append([v0 , v0 , v0 , v1[i]])
    Y.append(v2[i])
    X.append([v0, v0, v1[i], v2[i]])
    Y.append(v3[i])
    X.append([v0, v1[i], v2[i], v3[i]])
    Y.append(v4[i])
    X.append([v1[i], v2[i], v3[i], v4[i]])
    Y.append(v5[i])
    
X = np.asarray(X)  # X.shape is (samples, timesteps, features)
Y = np.asarray(Y)  

  from ipykernel import kernelapp as app


In [None]:
maxvec = 10
for i in vec:
    if min(i) < maxvec:
        maxvec = min(i)
maxvec

[ 43542  22278 178548 140628 141942  12840 101172  35052 230778  93060]


In [None]:
char = []
for story in train_stories.values[:,2:7]:
      for sentence in story:
            for word in sentence.split(' '):
                  if any(c in '"!@#$%^&*()-+?_=,<>/"0123456789' for c in word):
                        continue
                  word = re.sub('[^a-zA-Z0-9 \n\.]','', word.lower().replace('.',''))
                  if not word == "":
                        char.append(word)
char.append('.')
print('Vocab before sorted:',len(char))
print(char[:100])
sorted_char = sorted(list(set(char)))
print('\nVocab after sorted:',len(sorted_char))
print(sorted_char[:100])

Vocab before sorted: 1667842
['david', 'noticed', 'he', 'had', 'put', 'on', 'a', 'lot', 'of', 'weight', 'recently', 'he', 'examined', 'his', 'habits', 'to', 'try', 'and', 'figure', 'out', 'the', 'reason', 'he', 'realized', 'hed', 'been', 'eating', 'too', 'much', 'fast', 'food', 'lately', 'he', 'stopped', 'going', 'to', 'burger', 'places', 'and', 'started', 'a', 'vegetarian', 'diet', 'after', 'a', 'few', 'he', 'started', 'to', 'feel', 'much', 'better', 'tom', 'had', 'a', 'very', 'short', 'temper', 'one', 'day', 'a', 'guest', 'made', 'him', 'very', 'angry', 'he', 'punched', 'a', 'hole', 'in', 'the', 'wall', 'of', 'his', 'house', 'toms', 'guest', 'became', 'afraid', 'and', 'left', 'quickly', 'tom', 'sat', 'on', 'his', 'couch', 'filled', 'with', 'regret', 'about', 'his', 'actions', 'marcus', 'needed', 'clothing', 'for', 'a', 'business']

Vocab after sorted: 24888
['.', 'a', 'aa', 'aaa', 'aaliyah', 'aardvark', 'aardvarks', 'aaron', 'aarons', 'aason', 'aback', 'abandon', 'abandoned', 'abbey'

In [None]:
#create sequences
wordlist = char
seq_length = 30
vocab_size = len(sorted_char)
sequences_step = 1
sequences = []
next_words = []
for i in range(0, len(wordlist) - seq_length, sequences_step):
    sequences.append(wordlist[i: i + seq_length])
    next_words.append(wordlist[i + seq_length])

print('nb sequences:', len(sequences))

nb sequences: 1667812


In [None]:
X = np.zeros((len(sequences), seq_length, vocab_size), dtype=np.bool)
Y = np.zeros((len(sequences), vocab_size), dtype=np.bool)
for i, sentence in enumerate(sequences):
    for t, word in enumerate(sentence):
        X[i, t, vocab[word]] = 1
        Y[i, vocab[next_words[i]]] = 1

In [None]:
learning_rate = 0.001
rnn_size = 256
batch_size = 64
num_epochs = 10

240000


In [None]:
def lstm(seq_length, vocab_size):
      print('Build LSTM model.')
      '''model = Sequential()
      model.add(LSTM(rnn_size, activation="relu"),input_shape=(seq_length, vocab_size))
      model.add(Dropout(0.6))
      model.add(Dense(vocab_size))
      model.add(Activation('softmax'))
      optimizer = Adam(lr=learning_rate)
      model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[categorical_accuracy])'''
      md = Sequential()
      md.add(LSTM(4800, dropout=0.2, recurrent_dropout=0.2, input_shape=(None,6))) 
      # model.add(Dense(250, activation='relu')) 
      # model.add(Dropout(0.2)) 
      # model.add(Dense(1, activation='sigmoid'))
      md.compile(loss='mean_squared_error', optimizer='rmsprop') 

      return md

In [None]:
md = lstm(seq_length, vocab_size)
md.summary()

Build LSTM model.
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 4800)              92294400  
                                                                 
Total params: 92,294,400
Trainable params: 92,294,400
Non-trainable params: 0
_________________________________________________________________


In [None]:
#fit the model
history = md.fit(X, Y, 
                 batch_size=batch_size,
                 epochs=num_epochs,
                 validation_split=0.25)

Epoch 1/10


ValueError: in user code:

    File "d:\Downloads\python3.7\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "d:\Downloads\python3.7\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "d:\Downloads\python3.7\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "d:\Downloads\python3.7\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "d:\Downloads\python3.7\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "d:\Downloads\python3.7\lib\site-packages\keras\engine\input_spec.py", line 296, in assert_input_compatibility
        f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "sequential_2" is incompatible with the layer: expected shape=(None, 1, 6), found shape=(None, 6)


In [None]:
def generate_stories(original_doc,similar_index,train_array):
      txt = "Story Title: " + original_doc[0]
      txt += "Original Story:" + "\n" + original_doc[1:]
      txt += "Generated Story:" + "\n"
      for i in similar_index: 
            txt += train_array[i] + " "
      return txt + "\n\n"

In [None]:
sim_index = 0
with open("output/story-generated.txt","a") as tfile:
      for i in range(0,len(test_array),6):
            story = generate_stories(test_array[i:i+6],similar_index[sim_index],train_array)
            sim_index += 1
            tfile.write(story)


In [None]:
'''n = 10 
train_array = train_stories.values[:n,1:].reshape(-1).tolist()
vecn = np.load("test_doc2vec.model")
vec = vecn.tolist()
len(vec)

def nn(qvec, vectors, train_array, k=5):
    qvec /= np.linalg.norm(qvec)
    vectors /= np.linalg.norm(vectors)
    scores = np.dot(qvec, vectors.T).flatten()
    sorted_args = np.argsort(scores)[::-1]
    sentences = [train_array[a] for a in sorted_args[:k]]
    for i, s in enumerate(sentences):
        print (s, sorted_args[i])'''

ValueError: Cannot load file containing pickled data when allow_pickle=False