# Latent Dirichlet Allocation

In [1]:
import zipfile
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Mount Google Drive at /content/drive so the course archive stored there can be read.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Unpack the course archive from Drive into the current working directory.
# The context manager closes the archive even if extraction raises, which the
# manual open/close sequence did not guarantee.
with zipfile.ZipFile("/content/drive/MyDrive/NLP_Vol1/UPDATED_NLP_COURSE.zip", "r") as zip_file:
    zip_file.extractall()

In [3]:
# Read the NPR article corpus, report its dimensions, then preview the first rows.
df = pd.read_csv("/content/UPDATED_NLP_COURSE/05-Topic-Modeling/npr.csv")
print(f"Shape : {df.shape}", end="\n\n")
df.head()

Shape : (11992, 1)



Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [5]:
# Uncomment to inspect the full text of the first article:
#df["Article"][0]

In [16]:
# NOTE(review): this silences every warning category for the rest of the session,
# which also hides deprecation notices raised by the libraries used below —
# consider narrowing the filter to the specific warning being suppressed.
from warnings import filterwarnings
filterwarnings("ignore")

# Bag-of-words vectorizer and the LDA topic model used in this section.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [7]:
# Bag-of-words vectorizer: max_df=0.9 drops terms found in more than 90% of the
# documents, min_df=2 drops terms found in fewer than 2 documents, and the
# built-in English stop-word list is removed.
cv = CountVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.9,
)

In [8]:
# Learn the vocabulary from the articles and build the document-term count matrix.
dtm = cv.fit_transform(df["Article"])
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [10]:
# Fit a 7-topic LDA model on the count matrix; the fixed seed keeps runs repeatable.
# fit() returns the estimator itself, so the fitted model is bound directly.
LDA = LatentDirichletAllocation(random_state=42, n_components=7).fit(dtm)
LDA

LatentDirichletAllocation(n_components=7, random_state=42)

In [17]:
# Grab the vocabulary of words and report its size.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
len(cv.get_feature_names_out())

54777

In [18]:
# Word stored at vocabulary index 50000.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
cv.get_feature_names_out()[50000]

'transcribe'

In [29]:
import random

# Look up a randomly chosen vocabulary word.
# randrange() excludes its upper bound, so the id is always a valid index; the
# previous randint(0, 54777) includes 54777, which is one past the last entry of
# the 54777-word vocabulary. The bound is also derived from the vocabulary itself
# instead of being hard-coded.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
feature_names = cv.get_feature_names_out()
random_word_id = random.randrange(len(feature_names))
feature_names[random_word_id]

'fairview'

In [21]:
# Grab the topics: components_ has one row per topic and one column per vocabulary word.
LDA.components_.shape[0], LDA.components_.shape

(7, (7, 54777))

In [24]:
# argsort() returns the indices that would sort the array in ascending order:
#   1   is the smallest value and sits at index 2
#   10  is the middle value and sits at index 0
#   200 is the largest value and sits at index 1
# so the result is array([2, 0, 1]).

arr = np.array([10, 200, 1])
arr, np.argsort(arr)

(array([ 10, 200,   1]), array([2, 0, 1]))

In [22]:
# Word weights of the first topic (row 0 of components_).
single_topic = LDA.components_[0]
single_topic

array([8.64332806e+00, 2.38014333e+03, 1.42900522e-01, ...,
       1.43006821e-01, 1.42902042e-01, 1.42861626e-01])

In [23]:
# Indices of the 10 largest values: argsort is ascending, so they are the last 10 entries.
single_topic.argsort()[-10:]

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993])

In [30]:
# Vocabulary indices of the 20 highest-weighted words in this topic,
# ordered from lowest to highest weight (despite the name, this is a top-20 selection).
twenty_ten_words = single_topic.argsort()[-20:]

In [31]:
# Print the selected top words for the topic, lowest weight first.
# The vocabulary is fetched once instead of being rebuilt on every iteration, and
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
feature_names = cv.get_feature_names_out()
for index in twenty_ten_words:
    print(feature_names[index])

president
state
tax
insurance
trump
companies
money
year
federal
000
new
percent
government
company
million
care
people
health
said
says


In [35]:
# Grab the highest-weighted words per topic.
# The vocabulary is looked up once outside the loop (previously the whole list was
# rebuilt for every single word), using get_feature_names_out(), the replacement
# for get_feature_names(), which was deprecated in scikit-learn 1.0 and removed in 1.2.
feature_names = cv.get_feature_names_out()
for i, topic in enumerate(LDA.components_):
    # argsort is ascending, so the last 15 indices belong to the largest weights.
    top_indices = topic.argsort()[-15:]

    print(f"The Top 15 Words For Topic #{i}")
    print(feature_names[top_indices].tolist())
    print("\n")
    print("\n")

The Top 15 Words For Topic #0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']




The Top 15 Words For Topic #1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']




The Top 15 Words For Topic #2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']




The Top 15 Words For Topic #3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']




The Top 15 Words For Topic #4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']




The Top 15 Words For Topic #5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know'

In [36]:
# Display the document-term matrix again (shape, dtype and sparsity summary).
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [37]:
# Topic weights for every document, as computed by the fitted LDA model.
topic = LDA.transform(dtm)

In [38]:
# One row per document, one column per topic.
topic.shape

(11992, 7)

In [39]:
# Topic weights of the first document, rounded to 2 decimals for readability.
topic[0].round(2)

array([0.02, 0.68, 0.  , 0.  , 0.3 , 0.  , 0.  ])

In [42]:
# Index of the dominant topic for the first document.
# argmax is taken on the unrounded weights: rounding first can turn two close
# weights into a tie and select the wrong topic, and the column-wide assignment
# elsewhere in this notebook also uses the unrounded values.
topic[0].argmax()

1

In [43]:
# Label each article with the index of its highest-weighted LDA topic.
df["Topic"] = np.argmax(topic, axis=1)
df.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2


# Non-Negative Matrix Factorization

In [44]:
# Reload the NPR corpus from disk so this section starts from the raw single-column frame.
df = pd.read_csv("/content/UPDATED_NLP_COURSE/05-Topic-Modeling/npr.csv")
print(f"Shape : {df.shape}", end="\n\n")
df.head()

Shape : (11992, 1)



Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer: max_df=0.95 drops terms found in more than 95% of the
# documents, min_df=2 drops terms found in fewer than 2 documents, and the
# built-in English stop-word list is removed.
tfidf = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.95,
)

In [47]:
# Learn the vocabulary from the articles and build the TF-IDF document-term matrix.
dtm = tfidf.fit_transform(df["Article"])

In [48]:
# Display the TF-IDF matrix summary (shape, dtype and sparsity).
dtm

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [49]:
from sklearn.decomposition import NMF

In [50]:
# NMF model with 7 components (topics); the fixed seed keeps runs repeatable.
nmf_model = NMF(random_state=42, n_components=7)

In [70]:
# Fit the NMF model on the TF-IDF matrix.
nmf_model.fit(dtm)

NMF(n_components=7, random_state=42)

In [52]:
# Word stored at vocabulary index 2300.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
tfidf.get_feature_names_out()[2300]

'albala'

In [94]:
# Word weights of the first NMF component (row 0 of components_).
nmf_model.components_[0]

array([0.00000000e+00, 2.49950821e-01, 0.00000000e+00, ...,
       1.70313822e-03, 2.37544362e-04, 0.00000000e+00])

In [72]:
# One row per component (topic), one column per vocabulary word.
nmf_model.components_.shape

(7, 54777)

In [73]:
# Print the 15 highest-weighted words of every NMF topic.
# The vocabulary is looked up once outside the loop (previously the whole list was
# rebuilt for every single word), using get_feature_names_out(), the replacement
# for get_feature_names(), which was deprecated in scikit-learn 1.0 and removed in 1.2.
feature_names = tfidf.get_feature_names_out()
for index, topic in enumerate(nmf_model.components_):
    # argsort is ascending, so the last 15 indices belong to the largest weights.
    top_indices = topic.argsort()[-15:]

    print(f"The Top 15 Words For Topic #{index}")
    print(feature_names[top_indices].tolist())
    print("\n")

The Top 15 Words For Topic #0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


The Top 15 Words For Topic #1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


The Top 15 Words For Topic #2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


The Top 15 Words For Topic #3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


The Top 15 Words For Topic #4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


The Top 15 Words For Topic #5
['love', 've', 'don', 'al

In [74]:
# Topic weights for every document from the fitted NMF model, followed by the matrix shape.
topic_results = nmf_model.transform(dtm)
topic_results.shape

(11992, 7)

In [76]:
# Topic weights of the first document together with the index of its strongest topic.
topic_results[0], np.argmax(topic_results[0])

(array([0.        , 0.12075603, 0.00140297, 0.05919954, 0.01518909,
        0.        , 0.        ]), 1)

In [77]:
# Label each article with the index of its highest-weighted NMF topic.
df["Topic"] = np.argmax(topic_results, axis=1)
df.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6


In [91]:
# Hand-assigned, human-readable label for each NMF topic index.
# NOTE(review): topics 1 and 4 share the label "election", so they become
# indistinguishable in the "Topic Label" column — confirm this is intended.
mytopic_dict = {
    0: "health",
    1: "election",
    2: "legis",
    3: "poli",
    4: "election",
    5: "music",
    6: "edu",
}
df["Topic Label"] = df["Topic"].map(mytopic_dict)

In [92]:
# Preview the articles with their topic index and label.
df.head()

Unnamed: 0,Article,Topic,Topic Label
0,"In the Washington of 2016, even when the polic...",1,election
1,Donald Trump has used Twitter — his prefe...,1,election
2,Donald Trump is unabashedly praising Russian...,1,election
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,poli
4,"From photography, illustration and video, to d...",6,edu


# Exercise

In [93]:
# Read the Quora questions dataset, report its dimensions, then preview the first rows.
df = pd.read_csv("/content/UPDATED_NLP_COURSE/05-Topic-Modeling/quora_questions.csv")
print(f"Shape : {df.shape}", end="\n\n")
df.head()

Shape : (404289, 1)



Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [96]:
# TF-IDF features for the Quora questions: terms found in more than 95% of the
# questions or in fewer than 2 questions are dropped, along with English stop words.
tfidf = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.95,
)
dtm = tfidf.fit_transform(df["Question"])
dtm

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

In [98]:
# Fit a 20-topic NMF model on the question TF-IDF matrix; the fixed seed keeps runs repeatable.
nmf_model = NMF(random_state=42, n_components=20)
nmf_model.fit(dtm)

NMF(n_components=20, random_state=42)

In [100]:
# One row per component (topic), one column per vocabulary word.
nmf_model.components_.shape

(20, 38669)

In [101]:
# Print the 15 highest-weighted words of every NMF topic.
# The vocabulary is looked up once outside the loop (previously the whole list was
# rebuilt for every single word), using get_feature_names_out(), the replacement
# for get_feature_names(), which was deprecated in scikit-learn 1.0 and removed in 1.2.
feature_names = tfidf.get_feature_names_out()
for index, topic in enumerate(nmf_model.components_):
    # argsort is ascending, so the last 15 indices belong to the largest weights.
    top_indices = topic.argsort()[-15:]

    print(f"The Top 15 Words For Topic # {index}")
    print(feature_names[top_indices].tolist())
    print("\n")

The Top 15 Words For Topic # 0
['thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']


The Top 15 Words For Topic # 1
['majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']


The Top 15 Words For Topic # 2
['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


The Top 15 Words For Topic # 3
['using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']


The Top 15 Words For Topic # 4
['balance', 'earth', 'day', 'death', 'changed', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']


The Top 15 Words For Topic # 5
['reservation', 'engineering', 'minister', 'president', 'company', 'china', 'business', 'coun

In [102]:
# Topic weights for every question from the fitted NMF model, followed by the matrix shape.
topic_results = nmf_model.transform(dtm)
topic_results.shape

(404289, 20)

In [103]:
# Label each question with the index of its highest-weighted NMF topic.
df["Topic"] = np.argmax(topic_results, axis=1)
df.head()

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16
2,How can I increase the speed of my internet co...,17
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14
