# LDA with Scikit-Learn 

Fitting an LDA model gives us three things, each explored in its own section below:

1. The set of topics and their associated word probabilities
2. The topic of each word
3. The particular topic mixture of each document

- https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
- https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the abstract table from disk (file name suggests JCMC abstracts).
jcmc_df = pd.read_csv(
    '/mnt/hd1/comm_subfields/jcmc_abstract_list.csv',
    low_memory=False,
)

# Keep only the rows whose 'AB' column is present; these are the documents
# fed to the vectorizer (presumably 'AB' holds the abstract text -- confirm).
sentence_list = jcmc_df['AB'].dropna()

# How many documents survived the drop.
sentence_list.shape

(68,)

In [3]:
# turn to term-document frequency matrix
from sklearn.feature_extraction.text import CountVectorizer
tf_vectorizer = CountVectorizer(stop_words='english')
tf = tf_vectorizer.fit_transform(sentence_list)

In [4]:
# (n_documents, n_vocabulary_terms) of the count matrix. The shape is read
# straight from the matrix: calling .toarray() first would allocate a full
# dense copy only to throw it away.
tf.shape

(68, 1969)

In [5]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of latent topics the model is asked to find.
n_components = 10

# A fixed random_state keeps the fitted topics the same from run to run.
lda = LatentDirichletAllocation(
    n_components=n_components,
    random_state=0,
)

# Fit on the document-term counts; as the last expression of the cell this
# also echoes the fitted estimator's parameters.
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

## The set of topics and their associated word probabilities

In [6]:
# Build a table with the n_top_words highest-probability words of every topic.
n_top_words = 5

# Vocabulary in the column order of the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    feature_names = tf_vectorizer.get_feature_names()

# components_[i, j] can be viewed as a pseudocount that
# represents the number of times word j was assigned to topic i.
topic_word_matrix = lda.components_
print(topic_word_matrix.shape)

# A topic cannot list more words than the vocabulary contains.
n_words_shown = min(n_top_words, topic_word_matrix.shape[1])

rows = []
for topic_id, topic in enumerate(topic_word_matrix):
    # Dividing by the row total turns the pseudocounts into word probabilities
    # within this topic; compute the total once instead of once per word.
    topic_total = np.sum(topic)
    # argsort is ascending, so reverse it and keep the leading entries.
    top_word_ids = topic.argsort()[::-1][:n_words_shown]
    row = ["Topic #%d: " % topic_id]
    row += [feature_names[i] + "*" + str(np.round(topic[i] / topic_total, 4))
            for i in top_word_ids]
    rows.append(row)

# Derive the headers from the number of words actually listed, so changing
# n_top_words can no longer desynchronise the rows and the column list.
topic_word_columns = ['Topic'] + ['Top%d Word*Prob' % rank
                                  for rank in range(1, n_words_shown + 1)]
topic_word_df = pd.DataFrame(rows, columns=topic_word_columns)

(10, 1969)


In [7]:
# Display the whole table: one row per topic, listing its top words and their
# probabilities within that topic.
topic_word_df

Unnamed: 0,Topic,Top1 Word*Prob,Top2 Word*Prob,Top3 Word*Prob,Top4 Word*Prob,Top5 Word*Prob
0,Topic #0:,online*0.0238,communication*0.0219,friendship*0.0159,information*0.01,im*0.01
1,Topic #1:,social*0.0165,users*0.0137,online*0.0117,srm*0.009,network*0.009
2,Topic #2:,templates*0.0085,study*0.0071,sharing*0.0071,online*0.0071,findings*0.0057
3,Topic #3:,political*0.0284,information*0.0154,participation*0.0135,news*0.0116,sharing*0.0116
4,Topic #4:,facebook*0.0126,physical*0.0111,information*0.01,social*0.0098,study*0.0087
5,Topic #5:,social*0.012,affordances*0.0105,network*0.0072,data*0.0071,effects*0.0071
6,Topic #6:,social*0.0219,news*0.0182,facebook*0.0169,results*0.0078,data*0.0078
7,Topic #7:,individual*0.0188,level*0.0147,use*0.0106,internet*0.0106,global*0.0106
8,Topic #8:,self*0.0169,disclosure*0.015,media*0.0141,privacy*0.0131,social*0.0121
9,Topic #9:,digital*0.0105,news*0.0101,media*0.0088,mobile*0.0088,studies*0.0088


## The topic of each word

In [8]:
# Build a table with the n_top_topic strongest topics of every vocabulary word.
n_top_topic = 2

# Transposing the topic-word matrix gives one row per word and one column
# per topic.
word_topic_matrix = topic_word_matrix.T
print(word_topic_matrix.shape)

# A word cannot list more topics than the model has.
n_word_topics_shown = min(n_top_topic, word_topic_matrix.shape[1])

rows = []
for word_id, word in enumerate(word_topic_matrix):
    # Share of this word's total pseudocount that falls in each topic;
    # compute the total once instead of once per listed topic.
    word_total = np.sum(word)
    # argsort is ascending, so reverse it and keep the leading entries.
    top_topic_ids = word.argsort()[::-1][:n_word_topics_shown]
    row = [feature_names[word_id]]
    row += ['Topic' + str(i) + "*" + str(np.round(word[i] / word_total, 2))
            for i in top_topic_ids]
    rows.append(row)

# Derive the headers from the number of topics actually listed, so changing
# n_top_topic can no longer desynchronise the rows and the column list.
word_topic_columns = ['Word'] + ['Top%d Topic*Prob' % rank
                                 for rank in range(1, n_word_topics_shown + 1)]
word_topic_df = pd.DataFrame(rows, columns=word_topic_columns)

(1969, 10)


In [9]:
# Inspect the last ten vocabulary entries together with their top-ranked topics.
word_topic_df.tail(10)

Unnamed: 0,Word,Top1 Topic*Prob,Top2 Topic*Prob
1959,worldwide,Topic7*0.55,Topic0*0.05
1960,written,Topic2*0.7,Topic7*0.03
1961,www,Topic7*0.77,Topic0*0.03
1962,year,Topic1*0.55,Topic7*0.05
1963,yielded,Topic7*0.7,Topic0*0.03
1964,york,Topic9*0.55,Topic7*0.05
1965,young,Topic9*0.35,Topic8*0.35
1966,youth,Topic2*0.55,Topic7*0.05
1967,youtube,Topic1*0.55,Topic9*0.05
1968,yuan,Topic0*0.55,Topic7*0.05


## The topic mixture of each document

In [25]:
# Build a table with the n_top_topic dominant topics of every document.
n_top_topic = 2

# One row per document, one column per topic; the values are used directly
# as topic proportions below (no re-normalisation is applied here).
doc_topic_matrix = lda.transform(tf)
print(doc_topic_matrix.shape)

# A document cannot list more topics than the model has.
n_doc_topics_shown = min(n_top_topic, doc_topic_matrix.shape[1])

rows = []
for doc_id, doc in enumerate(doc_topic_matrix):
    # argsort is ascending, so reverse it and keep the leading entries.
    top_topic_ids = doc.argsort()[::-1][:n_doc_topics_shown]
    # doc_id is the document's position in the matrix, not a label taken from
    # the source table.
    row = ['# %d' % doc_id]
    row += ['Topic' + str(i) + "*" + str(np.round(doc[i], 2))
            for i in top_topic_ids]
    rows.append(row)

# Derive the headers from the number of topics actually listed, so changing
# n_top_topic can no longer desynchronise the rows and the column list.
doc_topic_columns = ['Document'] + ['Top%d Topic*Prob' % rank
                                    for rank in range(1, n_doc_topics_shown + 1)]
doc_topic_df = pd.DataFrame(rows, columns=doc_topic_columns)

(68, 10)


In [30]:
# Peek at a positional slice of the table: rows 15 through 24.
doc_topic_df.iloc[15:25]

Unnamed: 0,Document,Top1 Topic*Prob,Top2 Topic*Prob
15,# 15,Topic1*0.99,Topic8*0.0
16,# 16,Topic9*0.99,Topic3*0.0
17,# 17,Topic9*0.99,Topic8*0.0
18,# 18,Topic9*0.99,Topic3*0.0
19,# 19,Topic1*0.94,Topic4*0.05
20,# 20,Topic8*0.99,Topic4*0.0
21,# 21,Topic3*0.74,Topic6*0.23
22,# 22,Topic5*0.99,Topic4*0.0
23,# 23,Topic3*0.99,Topic6*0.0
24,# 24,Topic1*0.99,Topic4*0.0
