Story Generation using GRU Model

In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils
import requests
import re

Using TensorFlow backend.


In [13]:
stories = pd.read_csv("story_generation_dataset/ROCStories_train.csv", encoding="utf8")

In [14]:
stories.head()

Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,David Drops the Weight,David noticed he had put on a lot of weight re...,He examined his habits to try and figure out t...,He realized he'd been eating too much fast foo...,He stopped going to burger places and started ...,"After a few weeks, he started to feel much bet..."
1,0beabab2-fb49-460e-a6e6-f35a202e3348,Frustration,Tom had a very short temper.,One day a guest made him very angry.,He punched a hole in the wall of his house.,Tom's guest became afraid and left quickly.,Tom sat on his couch filled with regret about ...
2,87da1a22-df0b-410c-b186-439700b70ba6,Marcus Buys Khakis,Marcus needed clothing for a business casual e...,All of his clothes were either too formal or t...,He decided to buy a pair of khakis.,The pair he bought fit him perfectly.,Marcus was happy to have the right clothes for...
3,2d16bcd6-692a-4fc0-8e7c-4a6f81d9efa9,Different Opinions,Bobby thought Bill should buy a trailer and ha...,Bill thought a truck would be better for what ...,Bobby pointed out two vehicles were much more ...,Bill was set in his ways with conventional thi...,He ended up buying the truck he wanted despite...
4,c71bb23b-7731-4233-8298-76ba6886cee1,Overcoming shortcomings,John was a pastor with a very bad memory.,He tried to memorize his sermons many days in ...,He decided to learn to sing to overcome his ha...,He then made all his sermons into music and sa...,His congregation was delighted and so was he.


In [15]:
stories.values.shape

(30000, 7)

In [73]:
array = stories.values[:,2:].reshape(-1).tolist()
print(len(array))

150000


In [74]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [75]:
# Tokenization of each document
tokenized_doc = []
sent_ls=[]
for sent in array:
    sent_ls=[]
    for w in sent.split(' '):
        w = w.lower().replace('.','')
        sent_ls.append(w)
    tokenized_doc.append(sent_ls)

print(tokenized_doc[:10])


[['david', 'noticed', 'he', 'had', 'put', 'on', 'a', 'lot', 'of', 'weight', 'recently'], ['he', 'examined', 'his', 'habits', 'to', 'try', 'and', 'figure', 'out', 'the', 'reason'], ['he', 'realized', "he'd", 'been', 'eating', 'too', 'much', 'fast', 'food', 'lately'], ['he', 'stopped', 'going', 'to', 'burger', 'places', 'and', 'started', 'a', 'vegetarian', 'diet'], ['after', 'a', 'few', 'weeks,', 'he', 'started', 'to', 'feel', 'much', 'better'], ['tom', 'had', 'a', 'very', 'short', 'temper'], ['one', 'day', 'a', 'guest', 'made', 'him', 'very', 'angry'], ['he', 'punched', 'a', 'hole', 'in', 'the', 'wall', 'of', 'his', 'house'], ["tom's", 'guest', 'became', 'afraid', 'and', 'left', 'quickly'], ['tom', 'sat', 'on', 'his', 'couch', 'filled', 'with', 'regret', 'about', 'his', 'actions']]


In [76]:
# Convert tokenized document into gensim formated tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]
tagged_data[:10]

[TaggedDocument(words=['david', 'noticed', 'he', 'had', 'put', 'on', 'a', 'lot', 'of', 'weight', 'recently'], tags=[0]),
 TaggedDocument(words=['he', 'examined', 'his', 'habits', 'to', 'try', 'and', 'figure', 'out', 'the', 'reason'], tags=[1]),
 TaggedDocument(words=['he', 'realized', "he'd", 'been', 'eating', 'too', 'much', 'fast', 'food', 'lately'], tags=[2]),
 TaggedDocument(words=['he', 'stopped', 'going', 'to', 'burger', 'places', 'and', 'started', 'a', 'vegetarian', 'diet'], tags=[3]),
 TaggedDocument(words=['after', 'a', 'few', 'weeks,', 'he', 'started', 'to', 'feel', 'much', 'better'], tags=[4]),
 TaggedDocument(words=['tom', 'had', 'a', 'very', 'short', 'temper'], tags=[5]),
 TaggedDocument(words=['one', 'day', 'a', 'guest', 'made', 'him', 'very', 'angry'], tags=[6]),
 TaggedDocument(words=['he', 'punched', 'a', 'hole', 'in', 'the', 'wall', 'of', 'his', 'house'], tags=[7]),
 TaggedDocument(words=["tom's", 'guest', 'became', 'afraid', 'and', 'left', 'quickly'], tags=[8]),
 Tagg

In [77]:
## Train doc2vec model
model = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, workers=4, epochs = 100)
# Save trained doc2vec model
model.save("test_doc2vec.model")
## Load saved doc2vec model
model= Doc2Vec.load("test_doc2vec.model")

In [78]:
test_stories = pd.read_csv("story_generation_dataset/ROCStories_test.csv", encoding="utf8")
test_stories.head()

Unnamed: 0,storyid,storytitle,sentence1,sentence2,sentence3,sentence4,sentence5
0,a34dd5ad-761f-4369-acaf-42e146479c9b,Bad Dream,Tommy was very close to his dad and loved him ...,His was a cop and was shot and killed on duty.,Tommy cried in his mother's arms at the funeral.,Tommy suddenly woke up in a cold sweat.,"Realizing he had just had a bad dream, he went..."
1,d14fc434-01da-4b39-9e7b-3733c510ac29,Scary Movies,Tim was dating a girl who was easily scared.,He decided to have a horror movie night.,She reluctantly agreed.,Tim's girlfriend was scared and stayed close t...,Tim's plan worked perfectly.
2,56800cdf-d149-489f-9ee4-a87ea3138533,Red Butterfly,Samuel collected butterflies for his collection.,One afternoon he spotted a bright red butterfly.,He tried to catch it but it kept flying higher.,Samuel got a ladder and went to the roof of hi...,He missed his footing and went tumbling off th...
3,0ba922c2-afe4-444f-a9b7-8bcf202ebf65,Dirty Feet,She ran outside without her shoes.,She was excited to catch the ice cream man.,She ordered her ice cream and ran home.,She ran into the house.,Her mother yelled because her feet were dirty.
4,f81f29cd-b6f4-4faf-af9a-b779787a6b34,Bullfrog,There once was a man named Larry Butterfrog.,He went down to buy World of Warcraft.,"However, he lacked the money.","So, he had to go to his mom, who was miffed.",And she told him to get a job.


In [79]:
print(test_stories.values.shape)
test_array = test_stories.values[:,2:].reshape(-1).tolist()
print(len(test_array))

(10000, 7)
50000


In [80]:
def tokenize(sent):
      tokenized_sent = []
      for w in sent.split(' '):
            w.lower().replace('.','')
            tokenized_sent.append(w)
      return tokenized_sent

In [81]:
# find most similar doc 
test_doc = tokenize(test_array[0])
print(test_doc)
sim = model.docvecs.most_similar(positive=[model.infer_vector(test_doc)],topn=10)
sim

['Tommy', 'was', 'very', 'close', 'to', 'his', 'dad', 'and', 'loved', 'him', 'greatly.']


  after removing the cwd from sys.path.


[(149995, 0.9008253216743469),
 (432, 0.8353145122528076),
 (61706, 0.8109379410743713),
 (24838, 0.8096643686294556),
 (54473, 0.8055285811424255),
 (83629, 0.7830244898796082),
 (11907, 0.7824154496192932),
 (106179, 0.7814722657203674),
 (87761, 0.7702282667160034),
 (115938, 0.765677809715271)]

In [82]:
similar_index = []
for tup in sim:
      similar_index.append(tup[0])
print(similar_index)

[149995, 432, 61706, 24838, 54473, 83629, 11907, 106179, 87761, 115938]


In [85]:
txt =test_array[0]
new_txt = ""
print('Original Story:')
print(txt)
print('Generated Story:')
for i in similar_index:
      new_txt += array[i] + " "
print(new_txt)

Original Story:
Tommy was very close to his dad and loved him greatly.
Generated Story:
Tommy was very close to his dad and loved him greatly. His dad turned around and started to laugh. She grew up without a mom and was responsible for her brothers. His parents were not happy about the arrest. She enjoyed the one nurse who always joked around with her. Ariel made new friends quickly. Her mom started by teaching her how to cook rice. She was disappointed her son wasn't more like her. He always kept a flashlight with him to see ahead of him. Dan's friends eventually stopped hanging out with him. 


In [51]:
n = 10 
array = stories.values[:n,1:].reshape(-1).tolist()
vecn = np.load("test_doc2vec.model")
vec = vecn.tolist()
len(vec)

def nn(qvec, vectors, array, k=5):
    qvec /= np.linalg.norm(qvec)
    vectors /= np.linalg.norm(vectors)
    scores = np.dot(qvec, vectors.T).flatten()
    sorted_args = np.argsort(scores)[::-1]
    sentences = [array[a] for a in sorted_args[:k]]
    for i, s in enumerate(sentences):
        print (s, sorted_args[i])

ValueError: Cannot load file containing pickled data when allow_pickle=False