In [1]:
# LDA is a generative probabilistic model that tries to find
# groups of words that frequently appear together across different
# documents.

# These frequently appearing words represent our topics,
# assuming each document is a mixture of different words.

# Given a bag-of-words matrix as input, LDA decomposes it into two new
# matrices: a document-to-topic matrix and a word-to-topic matrix.

# LDA decomposes the bag-of-words matrix in such a way that if
# we multiply those two matrices together, we are able to reproduce
# the input, the bag-of-words matrix, with the lowest possible error.

## Number of topics is a hyperparameter

In [2]:
import pandas as pd

# Read the review dataset from the working directory into a DataFrame.
MOVIE_DATA_CSV = 'movie_data.csv'
df = pd.read_csv(MOVIE_DATA_CSV, encoding='utf-8')

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the bag-of-words matrix that serves as the input to LDA.
# max_df and max_features are hyperparameters (per the scikit-learn
# CountVectorizer docs): a float max_df is a document-frequency ceiling,
# and max_features caps the vocabulary size.
vectorizer_params = dict(stop_words='english',
                         max_df=.1,
                         max_features=5000)
count = CountVectorizer(**vectorizer_params)

# Learn the vocabulary from the raw review texts and vectorize them.
reviews = df['review'].values
X = count.fit_transform(reviews)

In [5]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 10-topic LDA model on the bag-of-words matrix X.
# NOTE: the number of topics is passed as `n_components`. The former
# `n_topics` keyword was deprecated in scikit-learn 0.19 and removed in
# 0.21, where passing it raises a TypeError.
#   random_state    - fixed seed so the fitted topics are reproducible
#   learning_method - 'batch' uses all of the training data in every update
#                     (as opposed to 'online' mini-batch updates)
lda = LatentDirichletAllocation(n_components = 10,
                                random_state = 123,
                                learning_method = 'batch')

# fit_transform returns the document-topic matrix: one row per document
# in X and one column per topic.
X_topics = lda.fit_transform(X)



In [6]:
# Shape of the fitted topic-word matrix: one row per topic and one column
# per vocabulary word (10 topics x 5000 words, as configured above).
lda.components_.shape

(10, 5000)

In [10]:
n_top_words = 5

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(). Prefer the new method
# and fall back to the old one so the cell also runs on older installations.
if hasattr(count, 'get_feature_names_out'):
    feature_names = count.get_feature_names_out()
else:
    feature_names = count.get_feature_names()

# Each row of lda.components_ holds one topic's weight for every vocabulary
# word; print the n_top_words highest-weighted words of each topic.
for topic_idx, topic in enumerate(lda.components_):
    # argsort() sorts ascending, so reverse it and keep the first n_top_words
    # indices to get the most important words in descending order.
    top_word_ids = topic.argsort()[::-1][:n_top_words]
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join(feature_names[i] for i in top_word_ids))


Topic 1:
worst minutes awful script stupid
Topic 2:
family mother father children girl
Topic 3:
american war dvd music tv
Topic 4:
human audience cinema art sense
Topic 5:
police guy car dead murder
Topic 6:
horror house sex girl woman
Topic 7:
role performance comedy actor performances
Topic 8:
series episode war episodes tv
Topic 9:
book version original read novel
Topic 10:
action fight guy guys cool
