In [11]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd

http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation/

> 1. Randomly assign a topic to a document
> 2. Given the words assigned to the topic, calculate the probability of a topic to a document
> 3. Given the words assigned to the topic, calculate the probability of a topic to a word
> 4. Multiply step 2 * step 3, assign topic of document 
> 5. Iteratively repeated the process until the stopping conditions meet


In [2]:
data = fetch_20newsgroups(remove=("headers", "footers", "quotes"))

print(data.data[:1])

['I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.']


In [3]:
max_features = 1000

tf_vectorizer = CountVectorizer(max_features=max_features,
                               stop_words="english")

tf = tf_vectorizer.fit_transform(data.data)

vocab = tf_vectorizer.get_feature_names()

n_top_words = 5

In [4]:
n_topics = 20

model = LatentDirichletAllocation(n_components= n_topics)

model.fit(tf)

LatentDirichletAllocation(n_components=20)

In [5]:
print(model.components_)

[[5.00000001e-02 5.00000006e-02 5.00000002e-02 ... 2.17500011e+00
  5.00000004e-02 1.40296817e+00]
 [2.25452642e+01 5.00000002e-02 5.00000032e-02 ... 5.00000007e-02
  5.00000001e-02 5.00000007e-02]
 [5.00000007e-02 4.05023749e+01 5.00000002e-02 ... 1.47943049e+02
  7.12233171e+00 1.61249399e+02]
 ...
 [5.00000000e-02 1.20611072e+01 5.00000000e-02 ... 1.14021803e+01
  5.00000004e-02 1.71087567e+01]
 [5.00000005e-02 5.00000006e-02 5.00000002e-02 ... 2.57273392e-01
  5.00000005e-02 5.00000108e-02]
 [5.00000004e-02 5.00000007e-02 5.00000023e-02 ... 4.57986394e+01
  4.81710775e+00 1.16020976e+00]]


In [6]:
print(model.transform(tf))

[[0.00208333 0.00208333 0.00208333 ... 0.00208333 0.00208333 0.00208333]
 [0.0025     0.0025     0.0025     ... 0.0025     0.0025     0.0025    ]
 [0.00060241 0.00060241 0.00060241 ... 0.00060241 0.00060241 0.00060241]
 ...
 [0.11039019 0.00454545 0.00454545 ... 0.00454545 0.00454545 0.00454545]
 [0.00294118 0.00294118 0.00294118 ... 0.00294118 0.00294118 0.00294118]
 [0.00357143 0.12888659 0.00357143 ... 0.00357143 0.00357143 0.00357143]]


In [7]:
topic_words = {}

for topic, comp in enumerate(model.components_):
    # for the n-dimensional array "arr":
    # argsort() returns a ranked n-dimensional array of arr, call it "ranked_array"
    # which contains the indices that would sort arr in a descending fashion
    # for the ith element in ranked_array, ranked_array[i] represents the index of the
    # element in arr that should be at the ith index in ranked_array
    # ex. arr = [3,7,1,0,3,6]
    # np.argsort(arr) -> [3, 2, 0, 4, 5, 1]
    # word_idx contains the indices in "topic" of the top num_top_words most relevant
    # to a given topic ... it is sorted ascending to begin with and then reversed (desc. now)    
    # word_idx = np.argsort(comp)[::-1][:9]
    
    word_idx = comp.argsort()[-6:]
    # store the words most relevant to the topic
    topic_words[topic] = [vocab[i] for i in word_idx]

In [8]:
topic_words

{0: ['current', 'ground', 'used', 'use', 'bike', 'power'],
 1: ['section', 'bit', 'program', 'entry', 'output', 'file'],
 2: ['just', 'going', 'don', 'know', 'said', 'people'],
 3: ['guns', 'people', 'state', 'law', 'government', 'gun'],
 4: ['nasa', 'launch', 'armenians', 'turkish', 'armenian', 'space'],
 5: ['pl', 'a86', 'b8f', 'g9v', 'max', 'ax'],
 6: ['edu', 'ftp', 'software', 'version', 'image', 'available'],
 7: ['db', 'jesus', 'believe', 'does', 'people', 'god'],
 8: ['points', 'period', 'team', 'games', 'play', 'game'],
 9: ['use', 'know', 've', 'don', 'just', 'like'],
 10: ['like', 'good', 'year', 'just', 'don', 'think'],
 11: ['looking', 'help', 'mail', 'does', 'know', 'thanks'],
 12: ['memory', 'use', 'window', 'using', 'problem', 'windows'],
 13: ['just', 'read', 'question', 'book', 'years', 'time'],
 14: ['000', 'april', 'san', '1993', 'university', 'new'],
 15: ['new', '20', '10', 'sale', '50', '00'],
 16: ['11', 'scsi', '25', '10', '16', 'drive'],
 17: ['jesus', 'law', '

In [14]:
topics =  model.transform(tf)

topicnames = ["Topic" + str(i) for i in range(model.n_components)]

docnames = ["Doc" + str(i) for i in range(len(data.data))]

df_topic = pd.DataFrame(np.round(topics, 2), columns=topicnames, index=docnames)

df_topic.head()

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19
Doc0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.38,0.0,0.16,0.0,0.43,0.0,0.0,0.0,0.0,0.0,0.0
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.0,0.41,0.0,0.18,0.0,0.0,0.15,0.0,0.0,0.0
Doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.52,0.05,0.29,0.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc3,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.63,0.24
Doc4,0.0,0.0,0.0,0.0,0.09,0.0,0.27,0.0,0.0,0.0,0.47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13
