In [1]:
import numpy as np
import pandas as pd

In [2]:
# Name of the CSV file with the headline tokens; the load cell prefixes it with '../Data/'.
filename = 'examiner-date-tokens.csv'

In [3]:
# Load the headlines and keep a random sample of at most SAMPLE_SIZE rows.
# A fixed random_state makes the sample -- and therefore the vocabulary that
# later cells index with hard-coded words such as 'happy' or 'not' -- the same
# on every Restart & Run All. min() avoids the ValueError that sample() raises
# when the file holds fewer rows than requested.
SAMPLE_SIZE = 10000
SAMPLE_SEED = 42
df_text = pd.read_csv('../Data/'+filename)
df_text = df_text.sample(min(SAMPLE_SIZE, len(df_text)), random_state=SAMPLE_SEED)
df_text.head()

Unnamed: 0,publish_date,headline_tokens
1389065,20110821,metro detroits summer art fair guide august 29...
130072,20100222,ice storms create slippery slopes for lawrencians
1347814,20110724,why the debt ceiling debate is crucial to los ...
885010,20101201,green craft round up north woods ornaments win...
757416,20101008,resume writing workshop


We don't care about the publication date, so we focus on the headline tokens. Our first task is to calculate the size of our transition matrix. We do this by splitting each headline into words and counting the number of unique words.

In [4]:
# Collect every token of every headline (wordArray) and the first token of each
# headline (firstWords).
# Missing headlines (NaN) are dropped because they have no .split(), and
# headlines without any token are skipped because they have no first word.
firstWords = []
wordArray = []
headlines = df_text['headline_tokens'].dropna()
for headline in headlines:
    tokens = headline.split()  # split once, reuse for both lists
    if not tokens:
        continue
    firstWords.append(tokens[0])
    wordArray.extend(tokens)

In [5]:
# Vocabulary: every distinct token, in sorted order, plus its size.
uniqueWords = sorted(set(wordArray))
uniqueWordCount = len(uniqueWords)

In [6]:
# Report the total token count and the vocabulary size.
totalWordCount = len(wordArray)
print("Number of words in all headlines:", totalWordCount)
print("Number of unique words used:", uniqueWordCount)

Number of words in all headlines: 87320
Number of unique words used: 16702


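From this we see that the 10,000-headline sample contains about 87 thousand words in total and about 16.7 thousand unique words; the figures of about 26.8 million words and 273 thousand unique words apply when the whole dataset is used instead of a sample. In our algorithm we treat each number in a sentence as a word, and hopefully the sentences we generate will include numbers and be coherent.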

In [7]:
# Map each vocabulary word to its position in uniqueWords; the counting cell
# uses that position as the word's row/column index in the transition matrices.
words = {word: position for position, word in enumerate(uniqueWords)}

In [8]:
# Two square count matrices with one row/column more than there are words.
# np.zeros defaults to float64, so each matrix takes stateCount**2 * 8 bytes.
stateCount = uniqueWordCount + 1
matrixShape = (stateCount, stateCount)
transitionCount = np.zeros(matrixShape)
transition2Count = np.zeros(matrixShape)

We increase the size of the transition matrix by 1 to account for the null state, or a transition from a word to the end of a sentence.

In [9]:
# Fill the count matrices from the headlines.
#   transitionCount[a, b]  : how often word b directly follows word a
#   transition2Count[a, b] : how often word b appears two positions after word a
# When there is no such following word, the count goes to the extra last column
# (index uniqueWordCount), which stands for the end of the headline.
endState = uniqueWordCount
for headline in headlines:
    tokenIds = [words[token] for token in headline.split()]
    lastPosition = len(tokenIds) - 1
    for position, current in enumerate(tokenIds):
        following = tokenIds[position + 1] if position < lastPosition else endState
        transitionCount[current, following] += 1

        skipTarget = tokenIds[position + 2] if position < lastPosition - 1 else endState
        transition2Count[current, skipTarget] += 1
# Give the end-state row a single self-transition so its row sum is not zero.
transitionCount[endState, endState] = 1
transition2Count[endState, endState] = 1

In [10]:
# Spot-check a few counts: happy->new and new->year (adjacent), happy->year (two apart).
spotChecks = [
    (transitionCount, 'happy', 'new'),
    (transitionCount, 'new', 'year'),
    (transition2Count, 'happy', 'year'),
]
for matrix, source, target in spotChecks:
    print(matrix[words[source], words[target]])

0.0
14.0
0.0


In [11]:
def _normalizeRows(counts):
    """Return a new array in which every row of `counts` is divided by its row sum.

    `counts` itself is left untouched. A row whose sum is zero stays all-zero
    instead of turning into NaN through a 0/0 division.
    """
    rowSums = counts.sum(axis=1, keepdims=True)
    rowSums[rowSums == 0] = 1
    return counts / rowSums


# Turn the counts into row-wise probabilities. Plain assignment
# (transitionNorm = transitionCount) would only create a second name for the
# same array, so normalising it would silently overwrite the raw counts.
# NOTE: this allocates two more dense matrices; if memory is tight and the raw
# counts are no longer needed, `del transitionCount, transition2Count` afterwards.
transitionNorm = _normalizeRows(transitionCount)
transition2Norm = _normalizeRows(transition2Count)

In [12]:
# Probability that 'not' is directly followed by 'not'.
notId = words['not']
print(transitionNorm[notId, notId])

0.0


In [13]:
# Add None as the last vocabulary entry so that uniqueWords lines up with the
# rows of transitionNorm, whose last column is the end-of-headline state.
# The guard keeps the cell idempotent: appending a second None on a re-run would
# make uniqueWords longer than the probability rows and break np.random.choice.
if not uniqueWords or uniqueWords[-1] is not None:
    uniqueWords.append(None)
print(np.random.choice(uniqueWords, size=1,p=transitionNorm[words['happy']]))

['hours']


In [31]:
def generateSentence(seed=None, targetLength=7, generatedSentence=None):
    """Generate a headline word by word from the transition matrices.

    Parameters
    ----------
    seed : str, optional
        Word to start from; it must be a key of `words`. When omitted, a word is
        drawn from `firstWords` on every call (a default evaluated in the
        signature would be drawn only once, when the function is defined).
    targetLength : int
        Soft target for the number of words: ending the sentence is made less
        likely while it is shorter than this and more likely once it is longer.
    generatedSentence : list, optional
        List to extend. The seed is appended to it and the same list object is
        returned. When omitted, a fresh list is created for each call (a `[]`
        default would be shared by all calls and keep growing).

    Returns
    -------
    list
        The generated words, starting with the seed.
    """
    if seed is None:
        seed = np.random.choice(firstWords, size=1)[0]
    if generatedSentence is None:
        generatedSentence = []
    generatedSentence.append(seed)

    # First step uses the one-step probabilities only. When more than one word is
    # wanted, the end marker is excluded by zeroing its probability and
    # renormalising; this is equivalent to redrawing until a real word comes up,
    # but cannot loop forever when the end marker is the seed's only successor.
    firstProbabilitys = transitionNorm[words[generatedSentence[-1]]].copy()
    if targetLength > 1 and firstProbabilitys[:-1].sum() > 0:
        firstProbabilitys[-1] = 0
        firstProbabilitys /= firstProbabilitys.sum()
    nextWord = np.random.choice(uniqueWords, size=1, p=firstProbabilitys)[0]

    while nextWord is not None:
        generatedSentence.append(nextWord)
        oneStep = transitionNorm[words[generatedSentence[-1]]]
        twoStep = transition2Norm[words[generatedSentence[-2]]]
        # Combine both matrices; the arithmetic creates a new array, so the
        # in-place edits below never touch transitionNorm itself.
        nextProbabilitys = oneStep * twoStep + oneStep/4
        # Keep a small chance of ending so the end marker is never impossible.
        nextProbabilitys[-1] += 0.00001
        nextProbabilitys /= nextProbabilitys.sum()
        if len(generatedSentence) < targetLength - 1:
            # Too short: make ending less likely, unless ending is the only option.
            if nextProbabilitys.sum() > nextProbabilitys[-1]:
                nextProbabilitys[-1] /= 10
            nextProbabilitys /= nextProbabilitys.sum()
        if len(generatedSentence) > targetLength + 1:
            # Too long: make ending more likely.
            nextProbabilitys[-1] *= 2
            nextProbabilitys /= nextProbabilitys.sum()
        nextWord = np.random.choice(uniqueWords, size=1,p=nextProbabilitys)[0]

    return generatedSentence

In [32]:
# Generate one headline with the default arguments and print it as a
# space-separated string, leaving out any None entries.
generatedSentence = generateSentence()
visibleWords = [str(token) for token in generatedSentence if token is not None]
print(' '.join(visibleWords))

super bowl next home run fun the houses


This sentence generator is currently implemented as a Markov Model as opposed to a Hidden Markov Model. To change this to an HMM, we are going to introduce hidden states that the transition probabilities depend on. In many use cases the hidden state is based on the part of speech of the word. Because this dataset does not give us the part of speech of each word, we are going to implement a toy hidden state for this assignment. Our hidden state gives preference to words based on their length. The average English word length is 4.5 letters. We will have 2 hidden states, which we will refer to as S and L. If our hidden state is S, standing for shorter, we will double the probability of all words of 4 letters and under, and halve the probability of all words longer than 4 letters; vice versa for state L, standing for longer. This isn't ideal for generating coherent sentences, as parts of speech would probably work better, but it will demonstrate the Hidden Markov Model and help our model adapt in the future to people's word-length preferences.

In [33]:
def emissionProbability(probability, state):
    """Re-weight next-word probabilities by word length for a hidden state.

    For state 'S' every word shorter than 5 letters has its probability doubled
    and every other word has it halved; for any other state it is the other way
    round. The last entry of `probability` (the end marker, which has no word
    length) is not re-weighted. The whole vector is then renormalised to sum to 1.

    Parameters
    ----------
    probability : numpy.ndarray
        Float vector aligned with `uniqueWords`; it is modified in place.
    state : str
        Hidden state; 'S' favours short words, anything else favours long words.

    Returns
    -------
    numpy.ndarray
        The same `probability` array, for convenience.
    """
    wordCount = len(probability) - 1
    if wordCount > 0:
        isShort = np.fromiter(
            (len(word) < 5 for word in uniqueWords[:wordCount]),
            dtype=bool,
            count=wordCount,
        )
        # Compare by value: `state is 'S'` tests object identity, which is not
        # guaranteed to hold for equal strings.
        favoured = isShort if state == 'S' else ~isShort
        probability[:wordCount] *= np.where(favoured, 2.0, 0.5)
    # Renormalise the whole vector (not a single element), guarding against 0/0.
    total = probability.sum()
    if total > 0:
        probability /= total
    return probability
    
    
# Hidden-state tables: the state labels, each label's row index, and the
# row-stochastic matrix of state-to-state probabilities (row = current state).
hiddenStates = ['S','L']
hiddenStateDict = {'S': 0, 'L': 1}
hiddenStateTransitionMatrix = [[.7, .3],[.8, .2]]
def nextHiddenState(hiddenState):
    """Draw the next hidden state given the current one.

    The row of hiddenStateTransitionMatrix belonging to `hiddenState` is used as
    the sampling distribution over hiddenStates. Raises KeyError for a state
    that is not in hiddenStateDict.
    """
    currentRow = hiddenStateTransitionMatrix[hiddenStateDict[hiddenState]]
    drawn = np.random.choice(hiddenStates, size=1, p=currentRow)
    return drawn[0]

In [40]:
def generateSentenceHidden(seed=None, targetLength=7, generatedSentence=None, hiddenState = 'S'):
    """Generate a headline word by word, re-weighting each step by a hidden state.

    Parameters
    ----------
    seed : str, optional
        Starting word, used only when `generatedSentence` is empty; it must be a
        key of `words`. When omitted, a word is drawn from `firstWords` on every
        call (a default evaluated in the signature would be drawn only once,
        when the function is defined).
    targetLength : int
        Soft target for the number of words: ending the sentence is made less
        likely while it is shorter than this and more likely once it is longer.
    generatedSentence : list, optional
        Words to continue from; the list is extended in place and returned. Its
        last word must be a key of `words`. When omitted, a fresh list is created
        for each call (a `[]` default would be shared by all calls and keep
        growing).
    hiddenState : str
        Initial hidden state, passed to emissionProbability and then advanced
        with nextHiddenState after every generated word.

    Returns
    -------
    list
        The given words (or the seed) followed by the generated words.
    """
    if generatedSentence is None:
        generatedSentence = []
    if generatedSentence == []:
        if seed is None:
            seed = np.random.choice(firstWords, size=1)[0]
        generatedSentence.append(seed)

    # First step uses the one-step probabilities only. When more than one word is
    # wanted, the end marker is excluded by zeroing its probability and
    # renormalising; this is equivalent to redrawing until a real word comes up,
    # but cannot loop forever when the end marker is the only possible successor.
    firstProbabilitys = transitionNorm[words[generatedSentence[-1]]].copy()
    if targetLength > 1 and firstProbabilitys[:-1].sum() > 0:
        firstProbabilitys[-1] = 0
        firstProbabilitys /= firstProbabilitys.sum()
    nextWord = np.random.choice(uniqueWords, size=1, p=firstProbabilitys)[0]

    while nextWord is not None:
        generatedSentence.append(nextWord)
        oneStep = transitionNorm[words[generatedSentence[-1]]]
        twoStep = transition2Norm[words[generatedSentence[-2]]]
        # The arithmetic creates a new array, so the in-place edits below never
        # touch transitionNorm itself.
        nextProbabilitys = oneStep * twoStep + oneStep/4
        # Called for its in-place effect on nextProbabilitys.
        emissionProbability(nextProbabilitys, hiddenState)
        # Keep a small chance of ending so the end marker is never impossible.
        nextProbabilitys[-1] += 0.00001
        nextProbabilitys /= nextProbabilitys.sum()
        if len(generatedSentence) < targetLength - 1:
            # Too short: make ending less likely, unless ending is the only option.
            if nextProbabilitys.sum() > nextProbabilitys[-1]:
                nextProbabilitys[-1] /= 10
            nextProbabilitys /= nextProbabilitys.sum()
        if len(generatedSentence) > targetLength + 1:
            # Too long: make ending more likely.
            nextProbabilitys[-1] *= 2
            nextProbabilitys /= nextProbabilitys.sum()
        nextWord = np.random.choice(uniqueWords, size=1,p=nextProbabilitys)[0]

        hiddenState = nextHiddenState(hiddenState)
    return generatedSentence

In [41]:
# Generate one headline with the hidden-state generator's default arguments and
# print it as a space-separated string, leaving out any None entries.
generatedSentence = generateSentenceHidden()
visibleWords = [str(token) for token in generatedSentence if token is not None]
print(' '.join(visibleWords))

tweeter accused of time foods market wraps up


Now we will demonstrate how to generate a sentence given a starting sequence of words.

In [44]:
# Continue an existing phrase: its words are passed as the sentence so far and
# its last word as the seed; the result is printed without any None entries.
sentence = "To be or not to"
promptWords = sentence.split()
generatedSentence = generateSentenceHidden(seed=promptWords[-1], generatedSentence=promptWords)
print(' '.join(str(token) for token in generatedSentence if token is not None))

To be or not to find the modern day cookie
