In [1]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split

np.random.seed(1234)

In [2]:
robert_frost_file_path = "../data/robert_frost.txt" #robert_frost_small robert_frost

In [3]:
def preprocessing_word(lst):
    return [x.lower().translate(str.maketrans('', '', string.punctuation)) for x in lst]

In [4]:
def file_to_tokenize(file_path):
    '''
    This function purpose is to read file and convert to list of tokenize
    '''
    tokenize_list = []
    f = open(file_path, "r")
    for x in f:
        tokenize_list.append(x.split())
        
    # remove empty array
    tokenize_list = [ x for x in tokenize_list if len(x) != 0]
    
    #preprocessing
    tmp = []
    for x in tokenize_list:
        tmp.append(preprocessing_word(x))
        
    tokenize_list = tmp
        
    return tokenize_list

In [5]:
# read file and convert to tokenize by function file_to_tokenize
robert_frost_tokenize_list = file_to_tokenize(robert_frost_file_path)
len(robert_frost_tokenize_list)

1436

In [6]:
robert_frost_tokenize_list[:5]

[['two', 'roads', 'diverged', 'in', 'a', 'yellow', 'wood'],
 ['and', 'sorry', 'i', 'could', 'not', 'travel', 'both'],
 ['and', 'be', 'one', 'traveler', 'long', 'i', 'stood'],
 ['and', 'looked', 'down', 'one', 'as', 'far', 'as', 'i', 'could'],
 ['to', 'where', 'it', 'bent', 'in', 'the', 'undergrowth']]

In [21]:
# train model using dict
first_order_state_transition  = None
first_order_initial_state = None

second_order_state_transition  = None
second_order_initial_state = None

In [24]:
# compute counting for state transition and initial state 
def compute_counting(lst):
    first_order_state_transition  = {}
    initial_state = {}

    second_order_state_transition  = {}
    
    for tokens in lst:    
         
        first_index = None
        second_index = None
        counting_word = 1
        for idx in tokens:
                
            if counting_word == 1:
                if idx not in initial_state:
                    initial_state[idx] = 1
                else:
                    initial_state[idx] += 1
                    

            if counting_word == 2:
                if first_index not in first_order_state_transition:
                    first_order_state_transition[first_index] = {}
                if idx not in first_order_state_transition[first_index]:
                    first_order_state_transition[first_index][idx] = 1
                else:
                    first_order_state_transition[first_index][idx] += 1
                    
            if counting_word == len(tokens)-1:
                key = (first_index,idx)
                if key not in second_order_state_transition:
                    second_order_state_transition[key] = {}
                second_order_state_transition[key]['end'] = 1
                    
                    
            if counting_word >= 3:
                
                key = (second_index,first_index)
                if key not in second_order_state_transition:
                    second_order_state_transition[key] = {}
                
                if idx not in second_order_state_transition[key]:
                    second_order_state_transition[key][idx] = 1
                else:
                    second_order_state_transition[key][idx] += 1
            

                                     

#             if counting_word <= 5:
#                 print(counting_word,second_index,first_index,idx)
            second_index = first_index    
            first_index = idx
                
            counting_word += 1
                

    
    return initial_state,first_order_state_transition,second_order_state_transition

In [48]:
initial_state,first_order_state_transition,second_order_state_transition = compute_counting(robert_frost_tokenize_list)

In [49]:
dict(list(initial_state.items())[0:5])

{'two': 8, 'and': 129, 'to': 50, 'then': 12, 'because': 1}

In [50]:
dict(list(first_order_state_transition.items())[0:2])

{'two': {'roads': 2,
  'miles': 1,
  'oldbelievers': 1,
  'winds': 1,
  'weeks': 1,
  'of': 1,
  'at': 1},
 'and': {'sorry': 2,
  'be': 1,
  'looked': 1,
  'having': 1,
  'both': 2,
  'that': 2,
  'miles': 2,
  'would': 1,
  'dropped': 1,
  'further': 1,
  'when': 1,
  'tell': 3,
  'the': 4,
  'caught': 1,
  'put': 2,
  'threw': 1,
  'birds': 1,
  'suddenly': 1,
  'scurf': 1,
  'since': 2,
  'whats': 1,
  'many': 1,
  'blew': 1,
  'stamped': 1,
  'sometimes': 1,
  'some': 1,
  'then': 3,
  'came': 1,
  'this': 2,
  'politician': 1,
  'thatd': 1,
  'rode': 1,
  'if': 2,
  'from': 1,
  'i': 4,
  'he': 1,
  'full': 1,
  'experts': 1,
  'built': 1,
  'thats': 1,
  'spoke': 2,
  'anyway': 1,
  'had': 1,
  'how': 1,
  'taken': 1,
  'lie': 1,
  'left': 1,
  'stroked': 1,
  'a': 2,
  'me': 1,
  'between': 1,
  'wont': 1,
  'hes': 1,
  'his': 1,
  'nothing': 1,
  'better': 1,
  'kick': 1,
  'carried': 1,
  'thought': 1,
  'swollen': 2,
  'hold': 1,
  'all': 2,
  'fell': 1,
  'set': 1,
  'sit': 

In [51]:
dict(list(second_order_state_transition.items())[0:5])

{('two', 'roads'): {'diverged': 2},
 ('roads', 'diverged'): {'in': 2},
 ('diverged', 'in'): {'a': 2},
 ('in', 'a'): {'yellow': 1,
  'wood': 1,
  'end': 1,
  'window': 1,
  'packing': 1,
  'byroad': 1,
  'family': 1,
  'new': 1,
  'row': 1,
  'time': 1,
  'town': 1,
  'book': 1,
  'smother': 1,
  'glass': 2},
 ('a', 'yellow'): {'end': 1, 'wood': 1}}

In [42]:
def convert_to_prob(initial_state,first_order_state_transition,second_order_dict):
    for i in second_order_dict:
#         print(i)
#         print(second_order_dict[i])
        total_word_in_key = sum(second_order_dict[i].values())
        for j in second_order_dict[i]:
                tmp_get = second_order_dict.get(i, {}).get(j,None)
                if not tmp_get is None:
                    second_order_dict[i][j] = tmp_get/total_word_in_key
                else:
                    second_order_dict[i][j] = 0
#             print(i,j,k,tmp_get)
#         print('------')
        
    for i in first_order_state_transition:
#         print(i)
#         print(first_order_state_transition[i])
        total_word_in_key = sum(first_order_state_transition[i].values())
#         print(total_word_in_key)
        for j in first_order_state_transition[i]:
            tmp_get = first_order_state_transition[i].get(j, None)
            if not tmp_get is None:
                first_order_state_transition[i][j] = tmp_get/total_word_in_key
            else:
                first_order_state_transition[i][j] = 0
#             print(j,tmp_get)
#         print('------')

    all_word_length = sum(initial_state.values())
    for i in initial_state:
#         print(i,initial_state[i],all_word_length)
        initial_state[i] = initial_state[i]/all_word_length

In [52]:
convert_to_prob(initial_state,first_order_state_transition,second_order_state_transition)

In [53]:
dict(list(initial_state.items())[0:5])

{'two': 0.005571030640668524,
 'and': 0.08983286908077995,
 'to': 0.034818941504178275,
 'then': 0.008356545961002786,
 'because': 0.0006963788300835655}

In [54]:
dict(list(first_order_state_transition.items())[0:2])

{'two': {'roads': 0.25,
  'miles': 0.125,
  'oldbelievers': 0.125,
  'winds': 0.125,
  'weeks': 0.125,
  'of': 0.125,
  'at': 0.125},
 'and': {'sorry': 0.015503875968992248,
  'be': 0.007751937984496124,
  'looked': 0.007751937984496124,
  'having': 0.007751937984496124,
  'both': 0.015503875968992248,
  'that': 0.015503875968992248,
  'miles': 0.015503875968992248,
  'would': 0.007751937984496124,
  'dropped': 0.007751937984496124,
  'further': 0.007751937984496124,
  'when': 0.007751937984496124,
  'tell': 0.023255813953488372,
  'the': 0.031007751937984496,
  'caught': 0.007751937984496124,
  'put': 0.015503875968992248,
  'threw': 0.007751937984496124,
  'birds': 0.007751937984496124,
  'suddenly': 0.007751937984496124,
  'scurf': 0.007751937984496124,
  'since': 0.015503875968992248,
  'whats': 0.007751937984496124,
  'many': 0.007751937984496124,
  'blew': 0.007751937984496124,
  'stamped': 0.007751937984496124,
  'sometimes': 0.007751937984496124,
  'some': 0.007751937984496124,

In [56]:
dict(list(second_order_state_transition.items())[0:5]) 

{('two', 'roads'): {'diverged': 1.0},
 ('roads', 'diverged'): {'in': 1.0},
 ('diverged', 'in'): {'a': 1.0},
 ('in', 'a'): {'yellow': 0.06666666666666667,
  'wood': 0.06666666666666667,
  'end': 0.06666666666666667,
  'window': 0.06666666666666667,
  'packing': 0.06666666666666667,
  'byroad': 0.06666666666666667,
  'family': 0.06666666666666667,
  'new': 0.06666666666666667,
  'row': 0.06666666666666667,
  'time': 0.06666666666666667,
  'town': 0.06666666666666667,
  'book': 0.06666666666666667,
  'smother': 0.06666666666666667,
  'glass': 0.13333333333333333},
 ('a', 'yellow'): {'end': 0.5, 'wood': 0.5}}

In [66]:
def sample_word(input_dict):
    # random value between 0-1
    random_prob = np.random.rand()
    
    cumulative_sum = 0
    
    for k,v in input_dict.items():
        cumulative_sum += v
        if random_prob <= cumulative_sum:
            return k

In [70]:
sample_word(second_order_state_transition[('in', 'a')])

'glass'

In [67]:
sample_word(first_order_state_transition['two'])

'miles'

In [78]:
def generate_poem(initial_state,first_order_state_transition,second_order_state_transition):
    for i in range(4):
        sentence = []
        
        # sample start word
        w0 =  sample_word(initial_state)
        sentence.append(w0)
        
        # sample second_word
        w1 =  sample_word(first_order_state_transition[w0])
        sentence.append(w1)
        
        while True:
            tmp = second_order_state_transition.get((w0,w1),None)
            if tmp is None:
                break
            w2 = sample_word(second_order_state_transition[(w0,w1)])
            if w2 =='end':
                break
        
            sentence.append(w2)
            w0 = w1
            w1 = w2
            
        print(' '.join(sentence))

In [79]:
generate_poem(initial_state,first_order_state_transition,second_order_state_transition)

with whose vast wheels
time someone came with arms
but i must go we cant stay here for
i like your going to say   there shed better
