In [0]:
#import the libraries
import pandas as pd

In [0]:
# Load the NPR articles into a DataFrame.
# NOTE(review): absolute path (looks like a Colab '/content' location) —
# adjust it when running the notebook elsewhere.
npr = pd.read_csv(filepath_or_buffer='/content/npr.csv')

In [4]:
# Show the loaded frame to inspect its columns and row count.
npr

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."
...,...
11987,The number of law enforcement officers shot an...
11988,"Trump is busy these days with victory tours,..."
11989,It’s always interesting for the Goats and Soda...
11990,The election of Donald Trump was a surprise to...


In [0]:
#LDA depends on per-word count probabilities,
#so we could only use a CountVectorizer for LDA.
#Non-negative Matrix Factorization, however, works with coefficient values,
#so here we can preprocess the text with TF-IDF vectorization instead.
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
# Build the TF-IDF vectorizer:
#   max_df=0.95          -> drop terms found in more than 95% of the documents
#   min_df=2             -> drop terms found in fewer than 2 documents
#   stop_words='english' -> drop scikit-learn's built-in English stop words
tfidf = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    stop_words='english',
)

In [0]:
# Learn the vocabulary from the 'Article' column and build the
# TF-IDF document-term matrix in one step.
dtm = tfidf.fit_transform(raw_documents=npr['Article'])

In [8]:
dtm
# Shape is (documents, terms):
#   11992 rows    -> one per article
#   54777 columns -> one per word kept in the vocabulary

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [0]:
#perform Non Negative Matrix factorization
from sklearn.decomposition import NMF

In [0]:
# Set up the NMF model: 7 components (topics) and a fixed random_state
# so that repeated runs use the same seed.
nmf_model = NMF(
    n_components=7,
    random_state=42,
)

In [11]:
# Factorize the TF-IDF document-term matrix.
# (Author's note: NMF works faster than LDA.)
# The fit call is left as the cell's last expression so the fitted
# estimator is displayed as the cell output.
nmf_model.fit(X=dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [12]:
#Display the topics
#Check the word at index 2300
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when the installed version provides it and
# fall back to the old method otherwise, so the cell runs on both.
if hasattr(tfidf, 'get_feature_names_out'):
  feature_names = tfidf.get_feature_names_out()
else:
  feature_names = tfidf.get_feature_names()
feature_names[2300]

'albala'

In [14]:
#grab top 15 words for the topic
# Resolve the vocabulary ONCE, outside the loop: the original rebuilt the
# full feature-name list on every iteration.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back for older installs.
if hasattr(tfidf, 'get_feature_names_out'):
  feature_names = tfidf.get_feature_names_out()
else:
  feature_names = tfidf.get_feature_names()

# Each row of components_ holds one topic's coefficient for every term.
for index,topic in enumerate(nmf_model.components_):
  print(f"The top 15 words fro TOPIC # {index}")
  # argsort() is ascending, so the last 15 indices are the 15 largest
  # coefficients (the strongest word is printed last).
  print([feature_names[i] for i in topic.argsort()[-15:]])
  print('\n')
  print('\n')

The top 15 words fro TOPIC # 0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']




The top 15 words fro TOPIC # 1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']




The top 15 words fro TOPIC # 2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']




The top 15 words fro TOPIC # 3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']




The top 15 words fro TOPIC # 4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']




The top 15 words fro TOPIC # 5
['love', 

In [0]:
# LDA ranks words by their probability of belonging to a topic, whereas
# NMF ranks them by the size of their coefficients in the factor matrix.
# Project every article onto the learned topics; these per-article topic
# coefficients are what lets us attach a topic to each original article.
topic_results = nmf_model.transform(X=dtm)

In [16]:
# One row per article, one coefficient per topic.
topic_results

array([[0.        , 0.12075603, 0.00140297, ..., 0.01518909, 0.        ,
        0.        ],
       [0.00600706, 0.12631211, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.14147252, 0.        , ..., 0.0226561 , 0.        ,
        0.        ],
       ...,
       [0.03188623, 0.        , 0.00840979, ..., 0.00373073, 0.02440375,
        0.        ],
       [0.        , 0.03796415, 0.0107136 , ..., 0.12669893, 0.01177688,
        0.00099946],
       [0.02172572, 0.006454  , 0.0007123 , ..., 0.0123984 , 0.01282932,
        0.00155022]])

In [17]:
topic_results[0]
# These are the first article's coefficients, one per topic (7 values).
# The largest coefficient marks the most representative topic; what we want
# is the index position of that topic (the target), not the value itself.

array([0.        , 0.12075603, 0.00140297, 0.05919954, 0.01518909,
       0.        , 0.        ])

In [19]:
# For each article (row), the index of the topic with the largest coefficient.
topic_results.argmax(axis=1)

array([1, 1, 1, ..., 0, 4, 3])

In [0]:
# Store each article's dominant topic index in a new 'Topic' column.
npr['Topic'] = topic_results.argmax(axis=1)

In [21]:
# Preview the first rows to confirm the new 'Topic' column.
npr.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6


In [0]:
# Give each topic index a human-readable name (chosen by hand after
# reading the top words printed for each topic).
my_topic_dict = {
    0: 'Health',
    1: 'Election',
    2: 'Legislation',
    3: 'Politics',
    4: 'Election',
    5: 'Music',
    6: 'Education',
}
# Translate the numeric 'Topic' column into its label.
npr['Topic Label'] = npr['Topic'].map(my_topic_dict)

In [23]:
# Show the frame with the 'Topic' and 'Topic Label' columns attached.
npr

Unnamed: 0,Article,Topic,Topic Label
0,"In the Washington of 2016, even when the polic...",1,Election
1,Donald Trump has used Twitter — his prefe...,1,Election
2,Donald Trump is unabashedly praising Russian...,1,Election
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,Politics
4,"From photography, illustration and video, to d...",6,Education
...,...,...,...
11987,The number of law enforcement officers shot an...,3,Politics
11988,"Trump is busy these days with victory tours,...",1,Election
11989,It’s always interesting for the Goats and Soda...,0,Health
11990,The election of Donald Trump was a surprise to...,4,Election
