In [47]:
# pip install -U spacy
# python -m spacy download en_core_web_sm
import spacy
import markovify
import re

# Load spaCy's small English pipeline (tokenizer, tagger, parser and NER).
# The model must already be installed; see the download command at the top of this cell.
nlp = spacy.load("en_core_web_sm")

In [48]:
from urllib import request

# Project Gutenberg plain-text file (ebook 5200) used as the corpus for this notebook.
url = "https://www.gutenberg.org/files/5200/5200-0.txt"

# The context manager closes the HTTP response even if reading it raises.
with request.urlopen(url) as response:
    raw = response.read()

# "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark when one is present.
text = raw.decode("utf-8-sig")

In [49]:
def strip_gutenberg_burger(text):
    """Return only the book body of a Project Gutenberg plain-text file.

    The input is cut into segments on runs of "~", carriage returns and line
    feeds. Segments strictly between the line that starts with the START
    marker and the line that starts with the END marker are kept and joined
    with single spaces; the marker lines themselves are dropped.

    Returns an empty string when no START marker is found.
    """
    start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
    end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"

    kept = []
    collecting = False
    for segment in re.split(r"[~\r\n]+", text):
        # The END marker switches collection off before the segment is considered.
        if segment.startswith(end_marker):
            collecting = False

        if collecting:
            kept.append(segment)
        elif segment.startswith(start_marker):
            # Collection begins with the segment *after* the START marker.
            collecting = True

    return " ".join(kept)

# Keep only the book body. NOTE: this overwrites `text`, so re-running the cell on the
# already-stripped text returns "" (the START marker is gone); re-run the download cell first.
text = strip_gutenberg_burger(text)

In [67]:
def text_cleaner(text):
    """Replace typographic (curly) quotation marks with plain ASCII quotes.

    Both the opening and the closing form of each mark are converted, so a
    quoted passage ends up with matching ASCII delimiters.

    Args:
        text: The string to normalise.

    Returns:
        A copy of ``text`` in which left/right double quotation marks become
        ``"`` and left/right single quotation marks (including the
        apostrophe form) become ``'``.
    """
    quote_map = str.maketrans({
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark / typographic apostrophe
    })
    return text.translate(quote_map)

# Swap typographic quotes for ASCII ones; the result overwrites `text`.
text = text_cleaner(text)

In [68]:
# Run the spaCy pipeline over the whole cleaned book; `doc` feeds the analysis cells below.
doc = nlp(text)

In [69]:
# Syntax overview: every noun chunk in the document, then the lemma of every
# token that spaCy tagged as a verb.
noun_phrases = [chunk.text for chunk in doc.noun_chunks]
print("Noun phrases:", noun_phrases)

verb_lemmas = [token.lemma_ for token in doc if token.pos_ == "VERB"]
print("Verbs:", verb_lemmas)



Verbs: ['translate', 'wake', 'find', 'transform', 'lie', 'lift', 'see', 'dome', 'divide', 'bed', 'cover', 'seem', 'slide', 'compare', 'wave', 'look', 'happen', 'think', 'lie', 'lie', 'spread', 'travel', 'hang', 'cut', 'house', 'show', 'fit', 'sit', 'raise', 'cover', 'turn', 'look', 'hear', 'hit', 'make', 'feel', 'sleep', 'forget', 'think', 'be', 'be', 'do', 'use', 'sleep', 'get', 'throw', 'roll', 'be', 'try', 'shut', 'have', 'look', 'flounder', 'stop', 'begin', 'feel', 'feel', 'think', 'choose', 'travel', 'do', 'take', 'do', 'be', 'travel', 'make', 'get', 'know', 'become', 'go', 'feel', 'push', 'lift', 'find', 'see', 'cover', 'know', 'make', 'try', 'feel', 'draw', 'touch', 'overcome', 'slide', 'get', 'think', 'make', 'get', 'get', 'travel', 'live', 'go', 'copy', 'sit', 'eat', 'try', 'kick', 'know', 'be', 'have', 'think', 'give', 'go', 'tell', 'think', 'tell', 'let', 'know', 'feel', 'fall', 'sit', 'talk', 'have', 'go', 'be', 'get', 'pay', 'suppose', 'do', 'make', 'get', 'get', 'leave', 

In [70]:
# List each named entity spaCy recognised, one per line, followed by its label.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Franz Kafka Translated PERSON
David Wyllie PERSON
One morning TIME
Gregor Samsa PERSON
four CARDINAL
Samsa PRODUCT
Gregor PERSON
a hundred CARDINAL
one CARDINAL
morning TIME
First ORDINAL
five CARDINAL
half past six DATE
more like quarter DATE
seven CARDINAL
four o'clock TIME
seven CARDINAL
Gregor PERSON
five years DATE
Gregor PERSON
quarter DATE
seven CARDINAL
Gregor PERSON
seven CARDINAL
Gregor PERSON
first ORDINAL
Gregor PERSON
Gregor GPE
Gregor PERSON
Gregor GPE
first ORDINAL
today DATE
first ORDINAL
first ORDINAL
Gregor PERSON
first ORDINAL
first ORDINAL
morning TIME
seven o'clock TIME
quarter past DATE
seven CARDINAL
seven o'clock TIME
Gregor PERSON
half CARDINAL
Two CARDINAL
ten past seven DATE
Gregor PERSON
Gregor PERSON
first ORDINAL
at least a couple of hours CARDINAL
Gregor PERSON
Gregor PERSON
today DATE
Gregor PERSON
Gregor PERSON
morning TIME
Samsa PERSON
a week DATE
every evening TIME
two or three CARDINAL
Gregor PERSON
this morning TIME
Samsa PERSON
Gregor PERSON
Gregor

In [71]:
# Rebuild the corpus from spaCy's sentence segmentation, dropping fragments of
# one character or less, and join the remaining sentences with single spaces.
sentence_texts = (span.text for span in doc.sents)
sents = " ".join(piece for piece in sentence_texts if len(piece) > 1)

In [72]:
# Word-level Markov model over the sentence text; state_size=3 keys each
# transition on the three preceding words (see the markovify documentation).
generator_1 = markovify.Text(sents, state_size=3)

In [80]:
# Randomly generate three sentences of unrestricted length.
# make_sentence() can return None when no acceptable sentence is found, so skip
# those instead of printing "None" (the POS-based cells below use the same guard).
for attempt in range(3):
    sentence = generator_1.make_sentence()
    if sentence is not None:
        print(sentence)

# Randomly generate three more sentences of no more than 1000 characters each.
for attempt in range(3):
    sentence = generator_1.make_short_sentence(max_chars=1000)
    if sentence is not None:
        print(sentence)

If they were shocked then it would no longer be Gregor's responsibility and he could only hear their feet as they stepped heavily on the floor.
His sister began to cry.
Gregor's mother did once thoroughly clean his room, and could not understand how he had behaved this time and whether, perhaps, any slight improvement could be seen.
Gregor's wish to see his mother was to be up at six to get to work.
He was merely fixed on the idea that Gregor should be got back into his room, although that was in a position to bear the costs of the whole performance and it was out of the room was alright; and only then did she let her mother dissuade her.
It seemed remarkable to Gregor that Grete had not said enough and that his father would stop that unbearable hissing!


In [79]:
#next we will use spacy's part of speech to generate more legible text
class POSifiedText(markovify.Text):
    """Markov text model whose states are ``token::POS`` pairs rather than bare words.

    Tagging each token with its spaCy part of speech makes the chain distinguish
    between different grammatical uses of the same word. Relies on the
    module-level ``nlp`` pipeline.
    """

    def word_split(self, sentence):
        """Tokenise ``sentence`` with spaCy and return ``"text::POS"`` strings."""
        return ["::".join((token.orth_, token.pos_)) for token in nlp(sentence)]

    def word_join(self, words):
        """Strip the POS tags from ``words`` and join the tokens with single spaces."""
        # Split on the LAST "::" only: the POS tag never contains the separator,
        # but the token text may (a literal "::" would otherwise be truncated or emptied).
        return " ".join(tagged.rsplit("::", 1)[0] for tagged in words)
# Build the POS-aware model from the same sentence text, again with state_size=3.
generator_2 = POSifiedText(sents, state_size=3)

In [87]:
def tidy(text):
    """Remove the spaces that token-by-token joining leaves before punctuation.

    Sentences rebuilt from tokens are joined with single spaces, which yields
    output such as ``"Gregor 's room , he said !"``. This re-attaches the
    punctuation marks ``; ' , . ? ! :`` and the contraction token ``n't`` to
    the word before them.

    Note: every space directly before an apostrophe is removed, so an opening
    single quote is also pulled onto the preceding word.

    Args:
        text: A space-joined sentence.

    Returns:
        The sentence with those spaces removed.
    """
    for mark in (";", "'", ",", ".", "?", "!", ":"):
        text = text.replace(" " + mark, mark)
    # The apostrophe in "n't" is not preceded by a space, so handle the token explicitly.
    text = text.replace(" n't", "n't")
    return text

In [88]:
# Make five attempts at a POS-aware sentence; print each success after tidying
# the punctuation spacing. Attempts that return None are skipped.
for attempt in range(5):
    sentence = generator_2.make_sentence()
    if sentence is None:
        continue
    print(tidy(sentence))


So she refused to let her out of his shocked state.
Whenever they began to talk of the need to earn money in quite different ways.
But who knows, maybe that was a good sign.
But now the two of them, then that was the first word she had spoken to him directly since his transformation.


In [90]:
# Make five attempts at a POS-aware sentence capped at 200 characters; print
# each non-empty result after tidying the punctuation spacing.
for attempt in range(5):
    sentence = generator_2.make_short_sentence(max_chars=200)
    if not sentence:
        continue
    print(tidy(sentence))

But his sister was not there, Gregor would always first let go of the door and lifted it towards his father.
Gregor made a run for him; he wanted to say something would invariably stop and gather his companions around him.
In her alarm, which was something they had not been touched in the meantime and some interest had accumulated.
This meant that his mother's attention.
Gregor drew his head back from the door and he would remain immobile for hours afterwards.
