In [1]:
import numpy as np

import pandas as pd

In [2]:
# Load the articles from npr.csv (path is relative to the notebook's working directory).
data = pd.read_csv("npr.csv")

In [3]:
# Preview the first rows to confirm the file loaded as expected.
data.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
# (rows, columns) of the loaded frame: one row per article.
data.shape

(11992, 1)

In [7]:
#data["Article"][0]

### The dataset contains 11,992 articles — far too many to read and classify by hand — and it has no labels against which a classification could be checked. We therefore use Latent Dirichlet Allocation (LDA) for topic modeling. Because the data is unlabeled, this is an unsupervised learning approach.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Bag-of-words vectorizer for the articles:
#   - stop_words="english": drop scikit-learn's built-in English stop words
#   - min_df=2: ignore words that appear in fewer than 2 documents
#   - max_df=0.95: ignore words that appear in more than 95% of documents
cv = CountVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.95,
)

In [10]:
# Learn the vocabulary from the "Article" column and build the sparse
# document-term matrix of word counts in one pass.
dtm = cv.fit_transform(data["Article"])

In [11]:
from sklearn.decomposition import LatentDirichletAllocation

In [12]:
# Topic model with 7 topics; the fixed random_state makes the fitted topics
# reproducible from run to run.
LDA = LatentDirichletAllocation(
    n_components=7,
    random_state=42,
)

In [13]:
# Fit the topic model on the document-term matrix.
LDA.fit(dtm)

LatentDirichletAllocation(n_components=7, random_state=42)

In [16]:
# Vocabulary size. `vocabulary_` maps each term to its column index, so its
# length is the number of features. This avoids `get_feature_names()`, which
# was deprecated in scikit-learn 1.0 and removed in 1.2, and does not build
# the full list of names just to count it.
len(cv.vocabulary_)

54777

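**There are 54777 features, i.e. the vocabulary contains 54777 distinct words (after removing English stop words and applying the `min_df`/`max_df` filters).**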

In [17]:
# (number of articles, vocabulary size)
dtm.shape

(11992, 54777)

**The document-term matrix has one row per article (11992) and one column per vocabulary word (54777).**

In [19]:
# (number of topics, vocabulary size): one row of word weights per topic.
LDA.components_.shape

(7, 54777)

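**This is the topic-word matrix: one row per topic (7) and one column per vocabulary word (54777). Each row holds the (unnormalized) word weights for that topic; normalizing a row gives that topic's distribution over words.**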

In [26]:
#LDA.components_[0].round(2)

In [27]:
#LDA.components_[0].argsort()[-15:]  #gives index of top 15 words

### Grabbing the highest-probability words for all 7 topics

In [28]:
# Resolve the vocabulary once, outside the loop: calling the accessor inside
# the comprehension rebuilt the full vocabulary list for every printed word.
# Prefer `get_feature_names_out()` (scikit-learn >= 1.0) and fall back to the
# older `get_feature_names()`, which was removed in scikit-learn 1.2.
_get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
feature_names = _get_names()
n_top_words = 15

for i, topic in enumerate(LDA.components_):
    # argsort is ascending, so the last entries are the highest-weighted words
    # (the strongest word for the topic is printed last).
    top_indices = topic.argsort()[-n_top_words:]

    print(f"THE TOP {n_top_words} WORDS FOR TOPIC {i} ARE :")
    print([feature_names[index] for index in top_indices])
    # Same spacing as before: four blank lines between topics.
    print("\n\n\n")

THE TOP 15 WORDS FOR TOPIC 0 ARE :




['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']




THE TOP 15 WORDS FOR TOPIC 1 ARE :
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']




THE TOP 15 WORDS FOR TOPIC 2 ARE :
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']




THE TOP 15 WORDS FOR TOPIC 3 ARE :
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']




THE TOP 15 WORDS FOR TOPIC 4 ARE :
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']




THE TOP 15 WORDS FOR TOPIC 5 ARE :
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know', 'th

In [29]:
# Document-topic matrix: one row per article, one column per topic, holding
# the article's weight for each of the 7 topics.
topic_results = LDA.transform(dtm)

In [32]:
100 * topic_results[0].round(2) # i.e. the first article (row 0) has a ~68% probability of belonging to topic 1

array([ 2., 68.,  0.,  0., 30.,  0.,  0.])

### Assigning Topics

In [34]:
# Label each article with its most probable topic: the column index of the
# largest value in its row of the document-topic matrix.
data["Topic"] = np.argmax(topic_results, axis=1)

In [35]:
# Preview the frame with the "Topic" column assigned in the previous cell.
data.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2


In [36]:
# Hand-assigned, human-readable labels for topic ids 0-6.
dict_topic_names = {
    0: "Insurance",
    1: "International Relations",
    2: "Travel and Food",
    3: "Healthcare",
    4: "Election",
    5: "Music",
    6: "Education",
}

In [37]:
# Translate each numeric topic id into its label; an id missing from the
# dictionary would map to NaN.
data["Topic Names"] = data["Topic"].map(dict_topic_names)

In [38]:
# Preview the frame with the "Topic Names" column assigned in the previous cell.
data.head()

Unnamed: 0,Article,Topic,Topic Names
0,"In the Washington of 2016, even when the polic...",1,International Relations
1,Donald Trump has used Twitter — his prefe...,1,International Relations
2,Donald Trump is unabashedly praising Russian...,1,International Relations
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1,International Relations
4,"From photography, illustration and video, to d...",2,Travel and Food
