# Latent Dirichlet Allocation

In [19]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only); the dataset is read from MyDrive below.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import pandas as pd

In [21]:
# Location of the Quora questions CSV on the mounted Drive.
questions_csv_path = '/content/drive/MyDrive/Colab Notebooks/Datasets/quora_questions.csv'
df = pd.read_csv(questions_csv_path)

In [22]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


## Preprocessing

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
# Bag-of-words vectorizer: ignore terms that appear in more than 95% of documents (max_df)
# or in fewer than 2 documents (min_df), and remove scikit-learn's built-in English stop words.
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [25]:
# Learn the vocabulary from the 'Question' column and build the document-term count matrix.
dtm = cv.fit_transform(df['Question'])

In [26]:
# Show the sparse matrix summary (dtype, stored elements, shape).
dtm

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 2002912 stored elements and shape (404289, 38669)>

## LDA

In [27]:
from sklearn.decomposition import LatentDirichletAllocation

In [28]:
# LDA model with 7 topics; random_state is fixed so that repeated fits are reproducible.
LDA = LatentDirichletAllocation(n_components=7,random_state=42)

In [None]:
# Fitting can take a while: the document-term matrix covers a large number of documents.
LDA.fit(dtm)

In [None]:
# Vocabulary size: number of terms kept by the vectorizer.
len(cv.get_feature_names_out())

In [None]:
import random

In [None]:
# Print 10 randomly chosen vocabulary terms to get a feel for the feature space.
# The vocabulary array is fetched once instead of being rebuilt twice per iteration.
feature_names = cv.get_feature_names_out()
for _ in range(10):
    # randrange(n) draws from 0..n-1, the same range as randint(0, n - 1).
    random_word_id = random.randrange(len(feature_names))
    print(feature_names[random_word_id])

In [None]:
# Draw a second sample of 10 random vocabulary terms.
# The vocabulary array is fetched once instead of being rebuilt twice per iteration.
feature_names = cv.get_feature_names_out()
for _ in range(10):
    # randrange(n) draws from 0..n-1, the same range as randint(0, n - 1).
    random_word_id = random.randrange(len(feature_names))
    print(feature_names[random_word_id])

### Showing Top Words Per Topic

In [None]:
# Number of rows in components_: one row per topic.
len(LDA.components_)

In [None]:
# Topic-word weights: each row holds one topic's weight for every vocabulary term.
LDA.components_

In [None]:
# Number of weights in a single topic row (one per vocabulary term).
len(LDA.components_[0])

In [None]:
# Take the first topic's row of word weights as a worked example.
single_topic = LDA.components_[0]

In [None]:
# Indices that would sort the weights in ascending order:
# the first entry is the lowest-weighted word, the last entry the highest-weighted.
single_topic.argsort()

In [None]:
# Weight of the word least representative of this topic.
# argsort() is ascending, so its first entry is the index of the smallest weight;
# deriving the index keeps this cell valid for any fit or vocabulary.
single_topic[single_topic.argsort()[0]]

In [None]:
# Weight of the word most representative of this topic
# (argsort() is ascending, so its last entry is the index of the largest weight).
single_topic[single_topic.argsort()[-1]]

In [None]:
# Vocabulary indices of the 10 highest-weighted words for this topic, in ascending order of weight.
single_topic.argsort()[-10:]

In [None]:
# Keep the indices of the 10 highest-weighted words so they can be mapped back to terms.
top_word_indices = single_topic.argsort()[-10:]

In [None]:
# Map each top-word index back to its vocabulary term.
# The vocabulary array is fetched once instead of once per printed word.
feature_names = cv.get_feature_names_out()
for index in top_word_indices:
    print(feature_names[index])

In [None]:
# Print the 15 highest-weighted words of every topic (in ascending order of weight).
# The vocabulary array is fetched once up front rather than once per word per topic.
feature_names = cv.get_feature_names_out()
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

**Attaching Discovered Topic Labels to Original Questions**

In [None]:
# Re-display the document-term matrix before transforming it.
dtm

In [None]:
# (number of documents, vocabulary size)
dtm.shape

In [None]:
# Number of rows in the original frame, for comparison with the matrix's row count.
len(df)

In [None]:
# Per-document topic distribution: one row per document, one column per topic.
topic_results = LDA.transform(dtm)

In [None]:
# Expect (number of documents, number of topics).
topic_results.shape

In [None]:
# Topic distribution for the first question.
topic_results[0]

In [None]:
# Same distribution rounded to 2 decimals for readability.
topic_results[0].round(2)

In [None]:
# Index of the topic with the highest value for the first question.
topic_results[0].argmax()

**This means that our model assigns the first question to topic #2: the topic with the highest probability in its topic distribution.**

### Combining with Original Data

In [None]:
# Look at the original frame again before attaching the topic assignments.
df.head()

In [None]:
# Most probable topic index for every question (argmax across the topic axis).
topic_results.argmax(axis=1)

In [None]:
# Store each question's most probable topic in a new 'Topic' column.
# Assigning to 'Question' would overwrite the original question text with topic ids.
df['Topic'] = topic_results.argmax(axis=1)

In [None]:
# Inspect the first 10 rows after the topic assignment.
df.head(10)