### Reading Data

In [1]:
import pandas as p

# Load the raw headlines; the code below relies on the `publish_date` and
# `headline_text` columns being present in the CSV.
news_headlines_df = p.read_csv("abcnews-date-text.csv")

# Parse publish_date with an explicit format. Without `format`, integer values
# such as 20030219 are interpreted by pandas as nanoseconds since the Unix
# epoch, which silently maps every row to 1970-01-01.
# NOTE(review): assumes publish_date is stored as YYYYMMDD -- confirm against the CSV.
news_headlines_df['Date'] = p.to_datetime(
    news_headlines_df['publish_date'].astype(str), format='%Y%m%d'
)
news_headlines_df = news_headlines_df.sort_values(by=['Date'])

# Headline strings as an array, ordered by the parsed date.
news_text = news_headlines_df['headline_text'].values
print(news_text[0:2])

['aba decides against community broadcasting licence'
 'patterson snubs health meeting to avoid lions den']


In [2]:
# Number of headlines available for topic modelling.
len(news_text)

1186018

#### Trying with 2000 max features & min_df = 2

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw term-count vectorizer: vocabulary capped at 2000 terms, English stop
# words removed, and terms ignored when they occur in fewer than 2 documents
# or in more than 80% of them.
unigram_tf_vectorizer = CountVectorizer(
    encoding='latin-1',
    max_features=2000,
    binary=False,
    stop_words='english',
    min_df=2,
    max_df=0.8,
)

In [31]:
import numpy as np

# Learn the vocabulary from the headlines and build the document-term count matrix.
news_unigram_tf_matrix = unigram_tf_vectorizer.fit_transform(
    np.array(news_text)
)

# Size of the learned vocabulary (shown as the cell output).
len(unigram_tf_vectorizer.vocabulary_)

2000

In [32]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of each topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated one topic row at a time.
        feature_names: Sequence indexable by the integer positions produced by
            ``argsort`` on a topic row.
        no_top_words: How many terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in top_indices]
        print(" ".join(top_terms))


# Number of terms printed per topic by the cells that call display_topics.
no_top_words = 20

In [33]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 20

# NMF alternative, currently disabled:
#nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Fit LDA with online learning on the term-count matrix and keep the
# per-document output of fit_transform.
lda_tf = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
lda_news_tf_docs = lda_tf.fit_transform(news_unigram_tf_matrix)

In [34]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old method on older releases.
feature_names = (
    unigram_tf_vectorizer.get_feature_names_out()
    if hasattr(unigram_tf_vectorizer, "get_feature_names_out")
    else unigram_tf_vectorizer.get_feature_names()
)
display_topics(lda_tf, feature_names, no_top_words)

Topic 0:
man court murder charged years accused child life face work jail park assault david gets run faces land told sentenced
Topic 1:
sydney perth deal time public workers talks return crisis rail green research close light sale rio footage august military gives
Topic 2:
queensland rural hospital brisbane house family missing test png children street mp students parliament concerns baby storm union video wants
Topic 3:
health news year labor state minister help rise port climate chinese hobart regional wall hill mental monday abbott air team
Topic 4:
australian melbourne qld adelaide school sex west final people business ban tax fight alleged jailed review arrested finds emergency warns
Topic 5:
trump china afl drum john australias pay turnbull body million coal energy gas job morrison russia history star nrn stop
Topic 6:
new crash car abc market darwin media weather laws takes second prices fatal white town beach probe plane outback thousands
Topic 7:
says day interview death win 

In [35]:
# Log likelihood of the fitted model on the training matrix: higher is better.
print(
    "Log Likelihood: ",
    lda_tf.score(news_unigram_tf_matrix),
)

# Perplexity = exp(-1. * log-likelihood per word): lower is better.
print(
    "Perplexity: ",
    lda_tf.perplexity(news_unigram_tf_matrix),
)

Log Likelihood:  -30093912.38864605
Perplexity:  2262.5655515904773


#### 4000 features and min_df = 5; Only Unigrams

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

# Same unigram count vectorizer as before, but with the vocabulary cap raised
# to 4000 terms and the minimum document count raised to 5.
unigram_tf_vectorizer = CountVectorizer(
    encoding='latin-1',
    max_features=4000,
    binary=False,
    stop_words='english',
    min_df=5,
    max_df=0.8,
)

In [37]:
import numpy as np

# Refit on the headlines with the 4000-term configuration.
news_unigram_tf_matrix = unigram_tf_vectorizer.fit_transform(
    np.array(news_text)
)

# Vocabulary size after fitting (shown as the cell output).
len(unigram_tf_vectorizer.vocabulary_)

4000

In [38]:
def display_topics(model, feature_names, no_top_words):
    """Print, for every row of ``model.components_``, its top-weighted terms.

    Args:
        model: Object with a ``components_`` attribute whose rows support ``argsort``.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: Number of terms to list for each topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        # Column indices ordered from the largest weight to the smallest.
        descending = topic.argsort()[::-1]
        print(" ".join(feature_names[i] for i in descending[:no_top_words]))


# Terms listed per topic when display_topics is called below.
no_top_words = 20

In [39]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 20

# NMF alternative, currently disabled:
#nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Refit LDA (online learning, fixed random_state) on the 4000-term count matrix.
lda_tf = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
lda_news_tf_docs = lda_tf.fit_transform(news_unigram_tf_matrix)

In [40]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old method on older releases.
feature_names = (
    unigram_tf_vectorizer.get_feature_names_out()
    if hasattr(unigram_tf_vectorizer, "get_feature_names_out")
    else unigram_tf_vectorizer.get_feature_names()
)
display_topics(lda_tf, feature_names, no_top_words)

Topic 0:
court perth win guilty drug man face png charges support jail michael david gets run india prison teen sentenced company
Topic 1:
council market victoria rise farm turnbull week abbott security air debate amid outback investigation threat staff rio challenge hunt creek
Topic 2:
open killed live darwin island chinese pay cattle mining hits price boost injured energy newcastle continues james dollar thousands international
Topic 3:
australian election melbourne qld adelaide accused sex minister final people business funding tax fight jailed arrested end season street trade
Topic 4:
car change record 2016 2015 coal share bid anti job australians dairy england indonesia kids smith aussie marriage september loses
Topic 5:
day report centre public claims speaks driver hobart campaign christmas peter urged local youth race black questions urges shark suicide
Topic 6:
Topic 7:
man sydney murder woman charged years trial dead life power pm leader research boy charge start stabbing hear

In [41]:
# Log likelihood on the 4000-term matrix: higher is better.
print(
    "Log Likelihood: ",
    lda_tf.score(news_unigram_tf_matrix),
)

# Perplexity = exp(-1. * log-likelihood per word): lower is better.
print(
    "Perplexity: ",
    lda_tf.perplexity(news_unigram_tf_matrix),
)

Log Likelihood:  -39003105.808877245
Perplexity:  4072.6532038111072


#### Adding Bigrams with the same 4000 max_features filter

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

# Count vectorizer over unigrams and bigrams (ngram_range=(1, 2)), still capped
# at 4000 vocabulary entries. The variable keeps its "unigram" name because the
# following cells refer to it.
unigram_tf_vectorizer = CountVectorizer(
    encoding='latin-1',
    max_features=4000,
    binary=False,
    lowercase=True,
    stop_words='english',
    min_df=5,
    ngram_range=(1, 2),
    max_df=0.8,
)

In [26]:
import numpy as np

# Refit on the headlines with the unigram + bigram configuration.
news_unigram_tf_matrix = unigram_tf_vectorizer.fit_transform(
    np.array(news_text)
)

# Vocabulary size after fitting (shown as the cell output).
len(unigram_tf_vectorizer.vocabulary_)

4000

In [27]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 20

# NMF alternative, currently disabled:
#nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Refit LDA (online learning, fixed random_state) on the unigram + bigram counts.
lda_tf = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
lda_news_tf_docs = lda_tf.fit_transform(news_unigram_tf_matrix)

In [28]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old method on older releases.
feature_names = (
    unigram_tf_vectorizer.get_feature_names_out()
    if hasattr(unigram_tf_vectorizer, "get_feature_names_out")
    else unigram_tf_vectorizer.get_feature_names()
)
display_topics(lda_tf, feature_names, no_top_words)

Topic 0:
nsw election nt market funding farm laws party govt week share mental chief debate sea announces mental health defence force release
Topic 1:
queensland woman test royal hour darwin trial commission near country hour students program body union cricket royal commission energy told assault sexual
Topic 2:
wa report school missing open live alleged chinese future second cattle price cyclone injured peter monday newcastle super star uk
Topic 3:
government child dead set deal island work workers talks mining faces return friday team rail green boy close light best
Topic 4:
police court accused man drug tax face fight charges shooting trade station old run make free aboriginal dog western teen
Topic 5:
health news year labor state minister women time centre claims rise speaks port hobart campaign wall hill christmas appeal defends
Topic 6:
man murder charged coast national years gold tasmanian life victoria search nrl jail park david gets gold coast east storm coal
Topic 7:
trump r

In [29]:
# Log likelihood on the unigram + bigram matrix: higher is better.
print(
    "Log Likelihood: ",
    lda_tf.score(news_unigram_tf_matrix),
)

# Perplexity = exp(-1. * log-likelihood per word): lower is better.
print(
    "Perplexity: ",
    lda_tf.perplexity(news_unigram_tf_matrix),
)

Log Likelihood:  -39618571.12494908
Perplexity:  3749.25734151968
