In [1]:
import pandas as pd
import string
import re
import numpy as np
from collections import defaultdict 

In [2]:
# Raw observations collected from the corpus (plain lists of tokens at this
# stage, not probabilities).
transitions_initial = []                 # first token of every non-empty line
transitions_order1 = defaultdict(list)   # token -> tokens observed right after it
transitions_order2 = defaultdict(list)   # (token, token) -> tokens observed right after the pair

punctuation_table = str.maketrans('', '', string.punctuation)

# `with` guarantees the file handle is closed once the corpus has been read.
with open('robert_frost.txt', encoding="utf-8") as corpus:
    for line in corpus:
        ##########################################
        # Normalise: lowercase, remove punctuation
        ##########################################
        line = line.lower().translate(punctuation_table)

        ##########################################
        # Tokenize
        ##########################################
        # str.split() with no arguments splits on *runs* of whitespace and
        # never yields empty strings, so gaps left behind by removed
        # punctuation (e.g. "a - b" -> "a  b") do not create '' tokens.
        tokens = line.split()

        ##########################################
        # Skip lines that are blank or punctuation-only
        ##########################################
        if not tokens:
            continue

        # Sentinel marking the end of a line. The corpus text is lowercased,
        # so the upper-case sentinel cannot collide with a real word.
        tokens.append('END')

        ##############
        # Update words
        ##############
        transitions_initial.append(tokens[0])

        # zip() stops at the shortest argument, so these produce every
        # consecutive pair / triple of tokens.
        for t1, t2 in zip(tokens, tokens[1:]):
            transitions_order1[t1].append(t2)

        for t1, t2, t3 in zip(tokens, tokens[1:], tokens[2:]):
            transitions_order2[(t1, t2)].append(t3)

#######################################
# Convert frequency to probabilities
######################################
def normalize(l):
    """Turn a list of observations into a relative-frequency mapping.

    Parameters
    ----------
    l : list
        Observed values; duplicates are counted.

    Returns
    -------
    dict
        Maps each distinct value to its count divided by the total number
        of counted observations.
    """
    counts = pd.Series(l).value_counts()
    total = counts.sum()
    probabilities = counts.div(total)
    return probabilities.to_dict()

# Replace each raw observation list with a {value: probability} mapping.
# The two conditional tables become plain dicts keyed by their context.
transitions_initial = normalize(transitions_initial)
transitions_order1 = {
    previous: normalize(followers)
    for previous, followers in transitions_order1.items()
}
transitions_order2 = {
    context: normalize(followers)
    for context, followers in transitions_order2.items()
}

In [3]:
#######################################################
# Sentence generator
# 1. Sample the first word from the line-initial distribution
# 2. Sample the second word conditioned on the first word
#    (first-order transitions)
# 3. Sample the 3rd word onwards conditioned on the previous
#    two words (second-order transitions)
#######################################################
def generate_sentence(max_length, initial=None):
    """Generate a random sentence from the module-level transition tables.

    The first word is ``initial`` when given (truthy), otherwise it is drawn
    from ``transitions_initial``. The second word is drawn from
    ``transitions_order1[first_word]``; every later word is drawn from
    ``transitions_order2[(second_to_last_word, last_word)]``. Generation
    stops as soon as the sentinel ``'END'`` is drawn or ``max_length`` words
    have been produced.

    Parameters
    ----------
    max_length : int
        Maximum number of words in the result. Values <= 0 give ``''``.
    initial : str, optional
        Word to start the sentence with instead of sampling one.

    Returns
    -------
    str
        The generated words joined by single spaces (``'END'`` excluded).

    Raises
    ------
    KeyError
        If a second word is needed and the first word has no entry in
        ``transitions_order1`` (e.g. ``initial`` never occurs in the corpus).
    """
    def sample_from(distribution):
        # `distribution` maps candidate word -> probability.
        candidates = list(distribution.keys())
        weights = list(distribution.values())
        return np.random.choice(candidates, p=weights)

    sentence = []
    # Iterating positions directly means max_length <= 0 simply yields an
    # empty sentence instead of reading a never-assigned variable.
    for position in range(max_length):
        if position == 0:
            word = initial if initial else sample_from(transitions_initial)
        elif position == 1:
            first_word = sentence[0]
            if first_word not in transitions_order1:
                raise KeyError(
                    f"no known transitions from starting word {first_word!r}"
                )
            word = sample_from(transitions_order1[first_word])
        else:
            word = sample_from(transitions_order2[(sentence[-2], sentence[-1])])

        # The sentinel ends the sentence and is never part of the output.
        if word == 'END':
            break
        sentence.append(word)

    return ' '.join(sentence)

In [4]:
# Demo: for each (heading, starting word) pair print the heading followed by
# 30 generated sentences of at most 10 words. A starting word of None lets
# the generator sample the first word itself.
demo_runs = [
    ('Random sentences:', None),
    ('Random sentences start with "i":', 'i'),
    ('Random sentences start with "the":', 'the'),
]

for heading, first_word in demo_runs:
    print(heading)
    for _ in range(30):
        print('\t', generate_sentence(10, first_word))

Random sentences:
	 they were beside the wall stands bare
	 in the dark to say which buds are leaf and
	 in the cellar
	 and besides
	 of getting home again because
	 my going forth
	 an opening
	 only house
	 oh for some miss
	 dont stint her
	 hes celebrating something strange
	 and downy flake
	 enough
	 on slippery rocks beside a waterfall
	 and lie in stones and bushes unretrieved
	 is now this box put it away
	 but if i havent brought you to the attic since
	 as married thats what you see in such a wild
	 only house
	 if it rains
	 one level higher than the cellar in spring
	 all through them
	 i can just see my tent pegged
	 you like is books
	 on it again
	 i know and it was all about when did she
	 there
	 you how she tended both or had them tended
	 my advantage on a lake before a storm
	 someone now and then someone
Random sentences start with "i":
	 i staid the night
	 i had the courage
	 i did
	 i think its going to be salted
	 i took him out on a sort of bakeshop meals
	 