In [1]:
import pandas as pd

In [2]:
# Load the NPR articles from a CSV file located in the working directory.
npr = pd.read_csv(filepath_or_buffer="npr.csv")

In [3]:
# Change the preprocessing / feature-extraction step:
# NMF works with weighted coefficients -> use TF-IDF vectorization

from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# TF-IDF vectorizer: drop English stop words, terms that occur in more than
# 95% of the documents, and terms that occur in fewer than 2 documents.
tfidf = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.95,
)

In [5]:
# Learn the vocabulary and IDF weights from the article texts and build the
# sparse document-term matrix in a single step.
dtm = tfidf.fit_transform(raw_documents=npr["Article"])

In [6]:
# Display the matrix summary (shape, dtype, number of stored elements).
dtm

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [7]:
# Performing NMF

from sklearn.decomposition import NMF

In [9]:
# NMF model that factorizes the document-term matrix into 7 topics;
# the fixed random_state keeps the factorization reproducible across runs.
nmf_model = NMF(random_state=42, n_components=7)

In [10]:
# Fit the factorization on the TF-IDF document-term matrix.
nmf_model.fit(X=dtm)

In [12]:
# Spot check: show the vocabulary term stored at column index 2300
# (the index appears to be an arbitrary example).
tfidf.get_feature_names_out()[2300]

'albala'

In [13]:
# NMF -> each row of components_ holds one topic's coefficient for every
# vocabulary term; the terms with the highest coefficients characterize it.
# Fetch the vocabulary once: get_feature_names_out() builds the full array of
# terms on every call, so calling it per word inside the loop is wasteful.
feature_names = tfidf.get_feature_names_out()

for i, topic in enumerate(nmf_model.components_):
    # argsort is ascending, so the last 15 positions are the 15 largest
    # coefficients (printed from weakest to strongest).
    top_indices = topic.argsort()[-15:]
    print(f"The top 15 words for topic #{i}")
    print([feature_names[index] for index in top_indices])

The top 15 words for topic #0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']
The top 15 words for topic #1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']
The top 15 words for topic #2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']
The top 15 words for topic #3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']
The top 15 words for topic #4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']
The top 15 words for topic #5
['love', 've', 'don', 'album', 'way

In [17]:
# Project every document onto the learned topics: the result holds one
# weight per (document, topic) pair.
topic_results = nmf_model.transform(X=dtm)


In [18]:
# Index of the highest-weighted topic for each document (row-wise argmax)
topic_results.argmax(axis=1)

array([1, 1, 1, ..., 0, 4, 3], dtype=int64)

In [20]:
# Store each document's dominant topic index in a new "Topic" column.
dominant_topic = topic_results.argmax(axis=1)
npr["Topic"] = dominant_topic

In [21]:
# Preview the first five rows, including the "Topic" column.
npr.head(n=5)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6


In [23]:
mytopic_dict = {0:"health", 1:"election_1", 2:"legislation", 3:"politics", 4:"election", 5:"music", 6:"edu"}

npr["Tpoic label"] = npr["Topic"].map(mytopic_dict)

In [24]:
npr.head()

Unnamed: 0,Article,Topic,Tpoic label
0,"In the Washington of 2016, even when the polic...",1,election_1
1,Donald Trump has used Twitter — his prefe...,1,election_1
2,Donald Trump is unabashedly praising Russian...,1,election_1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,politics
4,"From photography, illustration and video, to d...",6,edu
