# Goal: Generate poems that would sound like Robert Frost

In [9]:
import numpy as np
import string

# Seed NumPy's global random generator so that draws made through
# np.random (used later when sampling words) are repeatable across runs.
np.random.seed(1234)

In [10]:
# create our dicts
# Model tables, filled in while scanning the corpus line by line.
initial = dict()        # first word of a line -> how often it starts a line
first_order = dict()    # first word -> list of second words seen after it
second_order = dict()   # (word i-2, word i-1) -> list of words seen next, or 'END'

In [11]:
def remove_punctuation(s):
    """Return `s` with every character found in `string.punctuation` dropped."""
    return ''.join(ch for ch in s if ch not in string.punctuation)

In [12]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

File ‘robert_frost.txt’ already there; not retrieving.



In [15]:
# the key: starting word, or pairs of words in second order case
# value: list, storing all possible next words
def add2dict(d, k, v):
  """Append `v` to the list stored under `d[k]`, creating the list on first use."""
  d.setdefault(k, []).append(v)

In [16]:
# looping over each line
# Scan the corpus once and collect raw counts / follower lists.
# The file is opened in a `with` block so the handle is closed as soon as
# the scan finishes (the bare `open(...)` in a for-loop never closed it).
with open('robert_frost.txt') as corpus:
    for line in corpus:
        # tokenize the line: strip trailing whitespace, lowercase, drop punctuation
        tokens = remove_punctuation(line.rstrip().lower()).split()

        T = len(tokens)
        for i in range(T):
            t = tokens[i]
            if i == 0:
                # count how often each word opens a line
                # NOTE(review): a one-word line is counted here but produces no
                # first_order entry for that word -- confirm the sampling code
                # tolerates a first word with no recorded follower.
                initial[t] = initial.get(t, 0.) + 1
            else:
                # the word immediately before the current one
                t_1 = tokens[i-1]
                if i == T - 1:
                    # last word of the line: record that the final pair is
                    # followed by the artificial 'END' token
                    add2dict(second_order, (t_1, t), 'END')
                if i == 1:
                    # second word of the line: conditioned on the first word only
                    add2dict(first_order, t_1, t)
                else:
                    # third word onwards: conditioned on the previous two words
                    t_2 = tokens[i-2]
                    add2dict(second_order, (t_2, t_1), t)



In [20]:
first_order

{'two': ['roads',
  'roads',
  'miles',
  'oldbelievers',
  'winds',
  'weeks',
  'of',
  'at'],
 'and': ['sorry',
  'be',
  'looked',
  'having',
  'both',
  'that',
  'miles',
  'miles',
  'would',
  'dropped',
  'further',
  'when',
  'tell',
  'the',
  'caught',
  'put',
  'threw',
  'birds',
  'suddenly',
  'scurf',
  'sorry',
  'since',
  'whats',
  'tell',
  'many',
  'blew',
  'stamped',
  'sometimes',
  'some',
  'then',
  'came',
  'this',
  'then',
  'politician',
  'thatd',
  'rode',
  'if',
  'from',
  'i',
  'he',
  'full',
  'experts',
  'built',
  'both',
  'thats',
  'spoke',
  'anyway',
  'had',
  'the',
  'how',
  'taken',
  'lie',
  'left',
  'stroked',
  'the',
  'a',
  'me',
  'a',
  'between',
  'wont',
  'hes',
  'his',
  'nothing',
  'better',
  'kick',
  'carried',
  'thought',
  'swollen',
  'swollen',
  'hold',
  'all',
  'fell',
  'set',
  'sit',
  'bring',
  'push',
  'that',
  'those',
  'sproutlands',
  'perhaps',
  'see',
  'dangle',
  'disappeared',
  

In [17]:
# normalize the distribution
# Divide every first-word count by the grand total, in place, so the
# values of `initial` become probabilities.
initial_total = sum(initial.values())
for word in list(initial):
    initial[word] = initial[word] / initial_total

In [18]:
# param is a list of tokens
def list2pdict(ts):
    """Turn a list of tokens into a {token: relative frequency} dict.

    Each distinct token maps to (number of occurrences) / len(ts); keys keep
    the order in which the tokens first appear. An empty list gives {}.
    """
    total = len(ts)
    counts = {}
    for tok in ts:
        counts[tok] = counts.get(tok, 0.) + 1
    return {tok: cnt / total for tok, cnt in counts.items()}

In [21]:
# Replace each list of observed second words with a dict of probabilities.
# Only list values are converted: if this cell is run again, the entries are
# already probability dicts, and passing a dict to list2pdict would iterate
# its keys and silently overwrite the model with a uniform distribution.
for t_1, ts in first_order.items():
    if isinstance(ts, list):
        first_order[t_1] = list2pdict(ts)

In [23]:
first_order

{'two': {'roads': 0.25,
  'miles': 0.125,
  'oldbelievers': 0.125,
  'winds': 0.125,
  'weeks': 0.125,
  'of': 0.125,
  'at': 0.125},
 'and': {'sorry': 0.015503875968992248,
  'be': 0.007751937984496124,
  'looked': 0.007751937984496124,
  'having': 0.007751937984496124,
  'both': 0.015503875968992248,
  'that': 0.015503875968992248,
  'miles': 0.015503875968992248,
  'would': 0.007751937984496124,
  'dropped': 0.007751937984496124,
  'further': 0.007751937984496124,
  'when': 0.007751937984496124,
  'tell': 0.023255813953488372,
  'the': 0.031007751937984496,
  'caught': 0.007751937984496124,
  'put': 0.015503875968992248,
  'threw': 0.007751937984496124,
  'birds': 0.007751937984496124,
  'suddenly': 0.007751937984496124,
  'scurf': 0.007751937984496124,
  'since': 0.015503875968992248,
  'whats': 0.007751937984496124,
  'many': 0.007751937984496124,
  'blew': 0.007751937984496124,
  'stamped': 0.007751937984496124,
  'sometimes': 0.007751937984496124,
  'some': 0.007751937984496124,

In [24]:
# Replace each list of observed next words with a dict of probabilities.
# Only list values are converted: if this cell is run again, the entries are
# already probability dicts, and passing a dict to list2pdict would iterate
# its keys and silently overwrite the model with a uniform distribution.
for k, ts in second_order.items():
    if isinstance(ts, list):
        second_order[k] = list2pdict(ts)

In [25]:
second_order

{('two', 'roads'): {'diverged': 1.0},
 ('roads', 'diverged'): {'in': 1.0},
 ('diverged', 'in'): {'a': 1.0},
 ('in', 'a'): {'yellow': 0.07692307692307693,
  'wood': 0.07692307692307693,
  'window': 0.07692307692307693,
  'packing': 0.07692307692307693,
  'byroad': 0.07692307692307693,
  'family': 0.07692307692307693,
  'new': 0.07692307692307693,
  'row': 0.07692307692307693,
  'time': 0.07692307692307693,
  'town': 0.07692307692307693,
  'book': 0.07692307692307693,
  'smother': 0.07692307692307693,
  'glass': 0.07692307692307693},
 ('yellow', 'wood'): {'END': 1.0},
 ('a', 'yellow'): {'wood': 1.0},
 ('and', 'sorry'): {'i': 1.0},
 ('sorry', 'i'): {'could': 0.5, 'ever': 0.5},
 ('i', 'could'): {'not': 0.2, 'END': 0.2, 'have': 0.2, 'see': 0.2, 'do': 0.2},
 ('could', 'not'): {'travel': 0.5, 'say': 0.5},
 ('travel', 'both'): {'END': 1.0},
 ('not', 'travel'): {'both': 1.0},
 ('and', 'be'): {'one': 0.5, 'whole': 0.5},
 ('be', 'one'): {'traveler': 1.0},
 ('one', 'traveler'): {'long': 1.0},
 ('t

In [26]:
def sample_word(d):
    """Draw one token at random from a {token: probability} dict.

    A uniform number in [0, 1) is drawn with np.random.random() and the
    probabilities are accumulated in dict order; the first token whose
    cumulative probability exceeds the draw is returned.

    Raises:
        ValueError: if `d` is empty, or if the draw falls past the accumulated
            total and that total is not (approximately) 1.
    """
    if not d:
        raise ValueError('cannot sample from an empty distribution')

    # draw a sample, this will be a number between 0 and 1
    p0 = np.random.random()
    # running sum of the probabilities encountered so far
    cumulative = 0
    # t is the candidate token, p its probability
    for t, p in d.items():
        cumulative += p
        if p0 < cumulative:
            return t

    # The draw landed at or beyond the accumulated total. Probabilities that
    # come from floating-point division can add up to slightly less than 1,
    # so when the total is close to 1 the last token absorbs the shortfall.
    if np.isclose(cumulative, 1.0):
        return t
    raise ValueError(
        'probabilities sum to %r, expected 1' % (cumulative,)
    )

In [29]:
def generate(n_lines=4):
    """Print `n_lines` randomly generated lines (4 by default).

    Each line starts with a word sampled from `initial`, continues with a
    second word sampled from `first_order`, and is then extended with
    `second_order` transitions until the 'END' token is drawn.

    A first word that was only ever seen as a one-word line has no entry in
    `first_order`; in that case the line consists of that single word instead
    of failing with a KeyError. A word pair without a `second_order` entry
    likewise ends the line.
    """
    for _ in range(n_lines):
        # initial word
        w0 = sample_word(initial)
        sentence = [w0]

        # second word, conditioned on the first word only
        second_words = first_order.get(w0)
        if second_words is None:
            # nothing was ever observed after this opening word
            print(' '.join(sentence))
            continue
        w1 = sample_word(second_words)
        sentence.append(w1)

        # second-order transitions until the END token
        while True:
            next_words = second_order.get((w0, w1))
            if next_words is None:
                break
            w2 = sample_word(next_words)
            if w2 == 'END':
                break
            sentence.append(w2)
            # slide the two-word context window forward
            w0 = w1
            w1 = w2
        print(' '.join(sentence))

# Use the trained model to generate poems

In [33]:
generate()

as gilt to gold that wouldnt show
that isnt it folks arent afraid of fire
a bill
i guess theyd better drag it through again
