# TOPIC MODELLING

In [1]:
import pandas as pd

In [2]:
# Location of the NPR articles CSV. Set the NPR_CSV environment variable to
# point at your own copy of the file; the fallback is the original author's
# local path, so existing behaviour is unchanged when the variable is unset.
import os

NPR_CSV_PATH = os.environ.get("NPR_CSV", "C:\\Users\\saisu\\OneDrive\\Documents\\npr.csv")

# Load the articles into a DataFrame (the text lives in the `Article` column).
npr = pd.read_csv(NPR_CSV_PATH)
npr 

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."
...,...
195,Mothers should feel comfortable infants in p...
196,"In South Korea, preparing for the worst has be..."
197,David Bowie had long wanted to make a record w...
198,Chances are your doctor has stopped taking not...


In [3]:
# Summarise the frame: row count, columns, non-null counts, dtypes and memory usage.
npr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Article  200 non-null    object
dtypes: object(1)
memory usage: 1.7+ KB


# TEXT PREPROCESSING

**Text Cleaning**

**Remove punctuation**

**Remove stopwords**

**Lemmatization** (WordNet lemmatizer; no stemming is applied)

In [4]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Build the lemmatizer and the stopword set once. The stopword list is
# loop-invariant, so constructing the set inside the comprehension (as before)
# re-read and re-hashed the whole list for every single token.
wnl = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# `corpus` holds one cleaned, space-joined string per article, in row order.
corpus = []
for article in npr['Article']:
    # Replace every non-letter character with a space, lowercase, and tokenize
    # on whitespace (this also drops digits, not only punctuation).
    tokens = re.sub('[^a-zA-Z]', " ", article).lower().split()
    # Drop English stopwords, then lemmatize what remains.
    lemmas = [wnl.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(" ".join(lemmas))

# Preview only the start of the first few cleaned articles; printing the whole
# corpus floods the notebook with the full text of every article.
for doc in corpus[:3]:
    print(doc[:200])
    



**VECTORIZATION**

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer, constructed with its default settings.
cv = CountVectorizer()
# Learn the vocabulary from the cleaned corpus and encode each article as a
# row of token counts; X is reused below to fit and apply the LDA model.
X = cv.fit_transform(corpus)

# MODELLING USING LDA (LATENT DIRICHLET ALLOCATION)

In [6]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of latent topics to extract, and a fixed seed. LDA fitting is
# stochastic: without `random_state` the topic numbering, the top words and
# the per-article labels can change on every run, so the outputs shown in the
# notebook could not be reproduced.
N_TOPICS = 4
SEED = 42

model = LatentDirichletAllocation(n_components=N_TOPICS, random_state=SEED)

# Fit the topic model on the document-term count matrix.
model.fit(X)

In [7]:
# Per-article topic distribution: one row per article, one value per topic.
topic_results = model.transform(X)

In [8]:
# Topic weights for the first article (one value per LDA component).
topic_results[0]

array([3.20860993e-01, 3.76995480e-04, 3.85201187e-04, 6.78376810e-01])

In [9]:
# Index of the highest-weighted topic for the first article.
topic_results[0].argmax()

3

**Combining the topic assignments with the original data**

In [10]:
# Label every article with its highest-weighted topic (argmax across each row).
# Note: this adds the `group` column to `npr` in place.
npr['group'] = topic_results.argmax(axis=1)

In [11]:
# Preview the first five articles together with their assigned topic.
npr.head()

Unnamed: 0,Article,group
0,"In the Washington of 2016, even when the polic...",3
1,Donald Trump has used Twitter — his prefe...,3
2,Donald Trump is unabashedly praising Russian...,3
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",2


**showing top words per topic**

In [17]:
# How many of the highest-weighted words to list for each topic.
TOP_N = 10

# Look the vocabulary up once: the original called get_feature_names_out()
# again for every printed word, repeating the same lookup each time.
feature_names = cv.get_feature_names_out()

for index, topic in enumerate(model.components_):
    print(f'THE TOP {TOP_N} WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last TOP_N indices are the heaviest words
    # for this topic, with the single strongest word printed last.
    top_indices = topic.argsort()[-TOP_N:]
    print([feature_names[i] for i in top_indices])
    print('\n')
    

THE TOP 10 WORDS FOR TOPIC #0
['new', 'trump', 'like', 'people', 'could', 'also', 'year', 'would', 'one', 'say']


THE TOP 10 WORDS FOR TOPIC #1
['student', 'victim', 'violence', 'police', 'year', 'woman', 'one', 'people', 'said', 'say']


THE TOP 10 WORDS FOR TOPIC #2
['way', 'would', 'new', 'said', 'time', 'one', 'people', 'year', 'like', 'say']


THE TOP 10 WORDS FOR TOPIC #3
['state', 'intelligence', 'also', 'new', 'year', 'russia', 'president', 'said', 'say', 'trump']


