In [4]:
#Need string to read the text and numpy to do the Markov Model
import string
import numpy as np

In [5]:
#Taking a dataset of health headlines that I found online - data source had several new sources - this is only NYT
data = 'nytimeshealth.txt'

In [6]:
#Trying to get out some of the punctuation from the words so that 'the,' and 'the' are not different, etc. 
def remove_punctuation(sentence):
    return sentence.translate(str.maketrans('','', string.punctuation))

In [7]:
#Setting up the word dictionary here
def add2dict(dictionary, key, value):
    if key not in dictionary:
        dictionary[key] = []
    dictionary[key].append(value)

In [8]:
#Want to generate the probability distribution of the words
def gen_next_prob(given_list):
    prob_dict = {}
    given_list_length = len(given_list)
    for item in given_list:
        prob_dict[item] = prob_dict.get(item, 0) + 1
    for key, value in prob_dict.items():
        prob_dict[key] = value / given_list_length
    return prob_dict

In [9]:
first_words = {}
second_words = {}
transitions = {}

In [12]:
# Training the Markov model based on the data for NY Times Health Headlines
def train_markov_model():
    for line in open(data):
        tokens = remove_punctuation(line.rstrip().lower()).split()
        tokens_length = len(tokens)
        for i in range(tokens_length):
            token = tokens[i]
            if i == 0:
                first_words[token] = first_words.get(token, 0) + 1
            else:
                prev_token = tokens[i - 1]
                if i == tokens_length - 1:
                    add2dict(transitions, (prev_token, token), 'END')
                if i == 1:
                    add2dict(second_words, prev_token, token)
                else:
                    prev_prev_token = tokens[i - 2]
                    add2dict(transitions, (prev_prev_token, prev_token), token)
    
    # Normalize the distributions
    first_words_total = sum(first_words.values())
    for key, value in first_words.items():
        first_words[key] = value / first_words_total
        
    for prev_word, next_word_list in second_words.items():
        second_words[prev_word] = gen_next_prob(next_word_list)
        
    for word_pair, next_word_list in transitions.items():
        transitions[word_pair] = gen_next_prob(next_word_list)
    
    print('Training successful.')

In [13]:
train_markov_model()

Training successful.


In [14]:
#The rest of the code will try to run in order to predict the words
def sample_word(dictionary):
    p0 = np.random.random()
    cumulative = 0
    for key, value in dictionary.items():
        cumulative += value
        if p0 < cumulative:
            return key
    assert(False)


In [15]:
number_of_sentences = 10


In [18]:
# Function to generate sample text
def generate():
    for i in range(number_of_sentences):
        sentence = []
        # First word
        word0 = sample_word(first_words)
        sentence.append(word0)
        # Second word
        word1 = sample_word(second_words[word0])
        sentence.append(word1)
        # Subsequent words until end
        while True:
            word2 = sample_word(transitions[(word0, word1)])
            if word2 == 'END':
                break
            sentence.append(word2)
            word0 = word1
            word1 = word2
        print(' '.join(sentence))


In [19]:
generate()

399767754305445888mon nov 11 011159 0000 2014oped contributor why do dying old people httpnewoldageblogsnytimescom20140717warmingtimesinacoldland
399921943417073664mon nov 11 173747 0000 2014with intensive swim lessons a man attacks his fear of ebola in monrovia and is that always a good death httpnytims1twvoo3
398681512687976448fri nov 08 041356 0000 2014montefiore’s new bronx medical center plans to unveil initiative to map the human race’ provides transparency on our guard doctor adds
544694650234474497tue dec 16 051307 0000 2013new york state httpnytimsp3uvu9
534409055205609472mon nov 17 185439 0000 2014study finds that teen pregnancy and abortion rates plummet with longacting female contraception httpwwwnytimescom20141002scienceteenagepregnancyandabortionratesplummetwithlongactingfemalecontraceptionstudysayshtml
480355001118629888sat jun 21 044420 0000 2013national briefing midatlantic pennsylvania transplant recipient gets second set of new york times httppbstwimgcommediabyolyi2c