In [1]:
import numpy as np
import pandas as pd

In [2]:
# Name of the input CSV; the loading cell prepends '../Data/' to it.
filename = 'examiner-date-tokens.csv'

In [3]:
# Load the headline CSV and keep a reproducible random subset of it.
SAMPLE_SIZE = 100000  # number of headlines to keep
SAMPLE_SEED = 0       # fixed so Restart & Run All draws the same sample every time

# Rows without headline tokens cannot be split into words later, so drop them up front.
df_text = pd.read_csv('../Data/' + filename).dropna(subset=['headline_tokens'])
# Never request more rows than exist: DataFrame.sample raises ValueError in that case.
df_text = df_text.sample(n=min(SAMPLE_SIZE, len(df_text)), random_state=SAMPLE_SEED)
df_text.head()

Unnamed: 0,publish_date,headline_tokens
2442459,20130703,hassayampa inn steps up to the plate as presco...
2305868,20130315,do weight loss supplements trigger eating diso...
517376,20100713,lacma director wallis annenberg joins with cha...
1249070,20110527,real housewives of new jersey teresa giudice t...
190251,20100317,khloe kardashian angers shia labeouf with unso...


We don't care about the publication date, so we focus on the headline tokens. Our first task is to calculate the size of our transition matrix. We do this by splitting each headline into words and counting the number of unique words.

In [4]:
# Collect every word of every headline (wordArray) and, separately, the opening
# word of each headline (firstWords), which is later used to seed generation.
firstWords = []
wordArray = []
headlines = df_text['headline_tokens']
for headline in headlines:
    tokens = headline.split()  # split once and reuse the result
    if not tokens:
        # An empty or whitespace-only headline has no first word; indexing [0] would raise.
        continue
    firstWords.append(tokens[0])
    wordArray.extend(tokens)

In [5]:
# Alphabetically ordered vocabulary; a word's position in this list becomes
# its row/column index in the transition matrices.
uniqueWords = sorted(set(wordArray))
uniqueWordCount = len(uniqueWords)

In [6]:
# Report the corpus size and the vocabulary size.
for label, count in (
    ("Number of words in all headlines:", len(wordArray)),
    ("Number of unique words used:", uniqueWordCount),
):
    print(label, count)

Number of words in all headlines: 870254
Number of unique words used: 54846


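In the run shown above, the 100,000-headline sample contains about 870 thousand words in total, of which about 55 thousand are unique. (The figures of about 26.8 million words and 273 thousand unique words presumably come from a run on the full dataset.) In our algorithm we treat each number in a headline as a word, so hopefully the sentences we generate will include numbers and still be coherent.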

In [7]:
# Map each word to its matrix index. dict.fromkeys keeps the order of
# uniqueWords, so the indices follow the order of that list.
words = {word: position for position, word in enumerate(dict.fromkeys(uniqueWords))}

In [8]:
# One state per unique word plus one extra state at the end.
# NOTE: these are dense float64 matrices (the np.zeros default dtype); with the
# ~55k-word vocabulary printed above, each one spans roughly 24 GB.
stateCount = uniqueWordCount + 1
matrixShape = (stateCount, stateCount)
transitionCount = np.zeros(matrixShape)   # counts of word -> next word
transition2Count = np.zeros(matrixShape)  # counts of word -> word two positions later

We increase the size of each transition matrix by 1 to account for the null state, i.e. a transition from a word to the end of a sentence.

In [9]:
# Fill both count matrices from the headlines.
# transitionCount[a, b]  : how often b directly follows a.
# transition2Count[a, b] : how often b appears two positions after a.
# When no such following word exists, the count goes to the end state instead.
endState = uniqueWordCount
for headline in headlines:
    tokenIds = [words[token] for token in headline.split()]
    lastPosition = len(tokenIds) - 1
    for position, current in enumerate(tokenIds):
        following = tokenIds[position + 1] if position < lastPosition else endState
        twoAhead = tokenIds[position + 2] if position < lastPosition - 1 else endState
        transitionCount[current, following] += 1
        transition2Count[current, twoAhead] += 1

# Give the end state a single self-transition so its row has a non-zero sum.
transitionCount[endState, endState] = 1
transition2Count[endState, endState] = 1

In [10]:
# Spot-check a few counts: "happy new", "new year", and "happy _ year".
happyId, newId, yearId = words['happy'], words['new'], words['year']
print(transitionCount[happyId, newId])
print(transitionCount[newId, yearId])
print(transition2Count[happyId, yearId])

12.0
134.0
11.0


In [11]:
# Convert counts to probabilities by dividing every row by its own sum.
# NOTE: these assignments create aliases, not copies, so the in-place division
# below also overwrites transitionCount and transition2Count; the raw counts
# are no longer available after this cell has run.
transitionNorm = transitionCount
transition2Norm = transition2Count
for firstOrderRow, secondOrderRow in zip(transitionNorm, transition2Norm):
    # Iterating a 2-D array yields row views, so "/=" updates the matrix itself.
    firstOrderRow /= firstOrderRow.sum()
    secondOrderRow /= secondOrderRow.sum()

In [12]:
# Probability that "not" is immediately followed by "not".
notId = words['not']
print(transitionNorm[notId, notId])

0.0007052186177715092


In [13]:
# Append None as the end-of-sentence candidate so uniqueWords has one entry per
# matrix column (the extra last column is the end state).
# Guarded so that re-running this cell does not append a second None, which
# would make the candidate list longer than the probability vector and cause
# np.random.choice to raise.
if len(uniqueWords) == uniqueWordCount:
    uniqueWords.append(None)
# Draw one successor of "happy" as a quick sanity check.
print(np.random.choice(uniqueWords, size=1, p=transitionNorm[words['happy']]))

[None]


In [46]:
# Generate one headline: start from a random observed first word and keep
# sampling successors until the end marker (None) is drawn.
maxLength = 100  # safety cap on printed words so the walk is guaranteed to stop

# Build the candidate array once instead of letting np.random.choice convert
# the Python list on every draw; dtype=object keeps the trailing None as None.
vocabulary = np.array(uniqueWords, dtype=object)

seed = np.random.choice(firstWords, size=1)[0]
print(seed)
length = 1
previousWord = seed
nextWord = np.random.choice(vocabulary, size=1, p=transitionNorm[words[previousWord]])[0]

while nextWord is not None and length < maxLength:
    print(nextWord)
    length += 1
    # Keep only successors of nextWord that were also seen two positions after
    # previousWord; the multiplication yields a new array, so transitionNorm
    # itself is not modified.
    nextProbabilitys = transitionNorm[words[nextWord]] * (transition2Norm[words[previousWord]] != 0)
    # Limit the (not yet normalized) end-of-sentence weight to at most 0.01.
    nextProbabilitys[-1] = min(transitionNorm[words[nextWord]][-1], transition2Norm[words[previousWord]][-1], .01)
    nextProbabilitys /= nextProbabilitys.sum()
    previousWord = nextWord
    nextWord = np.random.choice(vocabulary, size=1, p=nextProbabilitys)[0]

easter
fun
at
new
vegas
magazine
anniversary
edition
at
the
comic
book
for
part
1
no
one
is
the
final
fantasy
xiv
a


In [None]:
# Inspection cell: current length of the candidate list. Expected to be
# uniqueWordCount + 1 once the None end marker has been appended exactly once.
len(uniqueWords)

In [None]:
# Inspection cell: number of distinct words, as recorded when uniqueWords was built.
uniqueWordCount