In [1]:
import pandas as pd

In [2]:
npr = pd.read_csv("npr.csv")

In [3]:
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [5]:
# npr["Article"][0]

In [6]:
len(npr)

11992

In [7]:
# preprocessing

from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# max_df -> upper bound on document frequency; 0.9 drops terms that appear in more than 90% of the documents
# min_df -> lower bound on document frequency; 2 keeps only terms that appear in at least 2 articles
# stop_words -> "english" removes scikit-learn's built-in English stop words
cv = CountVectorizer(max_df = 0.9, min_df = 2, stop_words = "english")

In [9]:
# document term matrix -> dtm
dtm = cv.fit_transform(npr["Article"])

In [10]:
from sklearn.decomposition import LatentDirichletAllocation

In [11]:
# n_components -> number of topics to learn
# random_state -> fixed seed so that repeated fits give the same topics
LDA = LatentDirichletAllocation(n_components = 7, random_state = 42)

In [12]:
LDA.fit(dtm)

## PART TWO

In [13]:
# Grab the vocabulary of words

# Grab the topics

# Grab the highest probability words per topic

In [14]:
# Grab the vocabulary of words

In [18]:
len(cv.get_feature_names_out()) # -> vocabulary size: one entry per term kept by the vectorizer

54777

In [19]:
type(cv.get_feature_names_out())

numpy.ndarray

In [20]:
cv.get_feature_names_out()[50000]

'transcribe'

In [21]:
import random

# Pick a random vocabulary index and show the word stored there.
# random.randint(a, b) includes b, so using the vocabulary size as the upper
# bound can produce an index one past the end (IndexError). randrange(n) draws
# from 0 .. n-1, and the size is read from the fitted vocabulary instead of
# being hard-coded.
feature_names = cv.get_feature_names_out()
random_word_id = random.randrange(len(feature_names))

feature_names[random_word_id]

'compel'

In [22]:
# Grab the topics
# Getting from the trained LDA

len(LDA.components_)

7

In [23]:
type(LDA.components_)

numpy.ndarray

In [24]:
LDA.components_.shape

(7, 54777)

In [25]:
LDA.components_

array([[8.64332806e+00, 2.38014333e+03, 1.42900522e-01, ...,
        1.43006821e-01, 1.42902042e-01, 1.42861626e-01],
       [2.76191749e+01, 5.36394437e+02, 1.42857148e-01, ...,
        1.42861973e-01, 1.42857147e-01, 1.42906875e-01],
       [7.22783888e+00, 8.24033986e+02, 1.42857148e-01, ...,
        6.14236247e+00, 2.14061364e+00, 1.42923753e-01],
       ...,
       [3.11488651e+00, 3.50409655e+02, 1.42857147e-01, ...,
        1.42859912e-01, 1.42857146e-01, 1.42866614e-01],
       [4.61486388e+01, 5.14408600e+01, 3.14281373e+00, ...,
        1.43107628e-01, 1.43902481e-01, 2.14271779e+00],
       [4.93991422e-01, 4.18841042e+02, 1.42857151e-01, ...,
        1.42857146e-01, 1.43760101e-01, 1.42866201e-01]])

In [26]:
# Grab the highest probability words per topic
# we still don't know what this topic represents

single_topic = LDA.components_[0]

In [27]:
# argsort returns index positions that would sort this array
single_topic.argsort()

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993], dtype=int64)

In [28]:
import numpy as np

arr = np.array([10, 200, 1])
arr

array([ 10, 200,   1])

In [29]:
arr.argsort()

array([2, 0, 1], dtype=int64)

In [31]:
# argsort --> index positions sorted from least --> greatest

# top 10 values (10 greatest values)
# last 10 values of argsort
single_topic.argsort()[-10:]

array([33390, 36310, 21228, 10425, 31464,  8149, 36283, 22673, 42561,
       42993], dtype=int64)

In [32]:
top_ten_words = single_topic.argsort()[-10:]

In [33]:
# Fetch the vocabulary once instead of calling get_feature_names_out()
# on every iteration; the printed words are unchanged.
feature_names = cv.get_feature_names_out()
for index in top_ten_words:
    print(feature_names[index])

new
percent
government
company
million
care
people
health
said
says


In [35]:
# argsort is ascending, so the last 20 positions are the 20 largest weights
# for this topic (printed from the smallest of those to the largest).
top_twenty_words = single_topic.argsort()[-20:]

# Fetch the vocabulary once instead of on every iteration.
feature_names = cv.get_feature_names_out()
for index in top_twenty_words:
    print(feature_names[index])

president
state
tax
insurance
trump
companies
money
year
federal
000
new
percent
government
company
million
care
people
health
said
says


In [37]:
# For each topic (one row of LDA.components_) print the 15 highest-weighted words.

# The vocabulary lookup does not change between topics, so fetch it once
# rather than once per word per topic.
feature_names = cv.get_feature_names_out()

for i, topic in enumerate(LDA.components_):
    print(f"The top 15 words for topic #{i}")

    # argsort is ascending, so the last 15 indices are the largest weights.
    print([feature_names[index] for index in topic.argsort()[-15:]])
    print("\n\n")

The top 15 words for topic #0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']



The top 15 words for topic #1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']



The top 15 words for topic #2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']



The top 15 words for topic #3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']



The top 15 words for topic #4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']



The top 15 words for topic #5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know', 'th

In [38]:
# assigning topics to articles

In [39]:
npr

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."
...,...
11987,The number of law enforcement officers shot an...
11988,"Trump is busy these days with victory tours,..."
11989,It’s always interesting for the Goats and Soda...
11990,The election of Donald Trump was a surprise to...


In [40]:
# Per-article topic distribution: one row per article, one column per topic

topic_results = LDA.transform(dtm)

In [41]:
topic_results

array([[1.61040465e-02, 6.83341493e-01, 2.25376318e-04, ...,
        2.99652737e-01, 2.25479379e-04, 2.25497980e-04],
       [3.63424997e-02, 8.86130697e-01, 4.40751747e-04, ...,
        7.57636804e-02, 4.40866779e-04, 4.40835574e-04],
       [3.28569485e-04, 6.96344889e-01, 3.28302105e-04, ...,
        3.02012902e-01, 3.28724083e-04, 3.28352652e-04],
       ...,
       [1.44467964e-02, 1.60696622e-01, 1.73678310e-01, ...,
        2.24636569e-02, 3.98728349e-04, 3.98359730e-04],
       [4.33560738e-04, 3.53196803e-02, 4.33022554e-04, ...,
        9.62512640e-01, 4.33971991e-04, 4.33490254e-04],
       [3.98777533e-01, 2.54376049e-04, 3.59290659e-01, ...,
        2.40914375e-01, 2.54445555e-04, 2.54253739e-04]])

In [42]:
topic_results.shape

(11992, 7)

In [45]:
# probability of article belonging to each of 7 topics
topic_results[0]

array([1.61040465e-02, 6.83341493e-01, 2.25376318e-04, 2.25369288e-04,
       2.99652737e-01, 2.25479379e-04, 2.25497980e-04])

In [46]:
topic_results[0].round(2)

array([0.02, 0.68, 0.  , 0.  , 0.3 , 0.  , 0.  ])

In [47]:
# concat topic with article

topic_results[0].argmax()

1

In [48]:
# Label each article with its most probable topic: argmax over axis 1 gives,
# for every row, the column index (topic number) with the highest probability.
# Note: this adds the "Topic" column to npr in place.
npr["Topic"] = topic_results.argmax(axis = 1)
npr

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
...,...,...
11987,The number of law enforcement officers shot an...,1
11988,"Trump is busy these days with victory tours,...",4
11989,It’s always interesting for the Goats and Soda...,3
11990,The election of Donald Trump was a surprise to...,4
