In [1]:
# Colab-only setup: attach Google Drive to the runtime and make the project
# folder the working directory, so relative paths used later in the notebook
# (e.g. the exported HTML file) resolve inside Drive.
import os
from google.colab import drive

drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/finch/sklearn/topic_modelling/imdb')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
!pip install pyldavis



In [3]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

import tensorflow as tf

In [4]:
def get_idx2word(_index_from=3):
  """Build the id -> token lookup used to decode IMDB review sequences.

  Every value in the word index returned by
  ``tf.keras.datasets.imdb.get_word_index()`` is shifted up by
  ``_index_from``; ids 0, 1 and 2 are then assigned to the special tokens
  ``<pad>``, ``<start>`` and ``<unk>``.

  Args:
    _index_from: Offset added to every value of the raw word index.

  Returns:
    dict mapping integer id -> token string.
  """
  raw_index = tf.keras.datasets.imdb.get_word_index()

  token_to_id = {}
  for token, rank in raw_index.items():
    token_to_id[token] = rank + _index_from

  # Special tokens are written last so they take precedence over any
  # vocabulary entry that would otherwise share a key or an id.
  token_to_id.update({"<pad>": 0, "<start>": 1, "<unk>": 2})

  return dict((token_id, token) for token, token_id in token_to_id.items())

In [5]:
N_TOPICS = 10        # number of latent topics to fit
MAX_TERMS = 10       # number of top-weighted terms printed per topic
RANDOM_STATE = 42    # fixed seed so the fitted topics are reproducible across runs

# Load the integer-encoded IMDB reviews and decode them back into text.
# The train and test splits are pooled: topic modelling here is unsupervised,
# so the labels are not used.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data()
idx2word = get_idx2word()
# review[1:] skips the first id of every sequence before decoding.
documents = [' '.join(idx2word[idx] for idx in review[1:])
             for split in (X_train, X_test)
             for review in split]

# TF-IDF bag of words: English stop words removed, terms that appear in more
# than 10% of the documents dropped, vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words = 'english',
                        max_df = .1,
                        max_features = 5000)
tfidf_repr = tfidf.fit_transform(documents)

lda = LatentDirichletAllocation(n_components = N_TOPICS,
                                learning_method = 'batch',
                                max_iter = 50,
                                random_state = RANDOM_STATE,
                                verbose = 1)
lda.fit(tfidf_repr)

# Fetch the vocabulary once instead of once per printed term.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(), so use whichever the installed version provides.
if hasattr(tfidf, 'get_feature_names_out'):
  feature_names = tfidf.get_feature_names_out()
else:
  feature_names = tfidf.get_feature_names()

for topic_idx, term_vals in enumerate(lda.components_):
  # argsort() is ascending; the reversed slice yields the MAX_TERMS
  # highest-weighted term indices, largest first.
  top_term_ids = term_vals.argsort()[:-MAX_TERMS-1:-1]
  message = "Topic #{}: ".format(topic_idx)
  message += " ".join(feature_names[i] for i in top_term_ids)
  print(message)

iteration: 1 of max_iter: 50
iteration: 2 of max_iter: 50
iteration: 3 of max_iter: 50
iteration: 4 of max_iter: 50
iteration: 5 of max_iter: 50
iteration: 6 of max_iter: 50
iteration: 7 of max_iter: 50
iteration: 8 of max_iter: 50
iteration: 9 of max_iter: 50
iteration: 10 of max_iter: 50
iteration: 11 of max_iter: 50
iteration: 12 of max_iter: 50
iteration: 13 of max_iter: 50
iteration: 14 of max_iter: 50
iteration: 15 of max_iter: 50
iteration: 16 of max_iter: 50
iteration: 17 of max_iter: 50
iteration: 18 of max_iter: 50
iteration: 19 of max_iter: 50
iteration: 20 of max_iter: 50
iteration: 21 of max_iter: 50
iteration: 22 of max_iter: 50
iteration: 23 of max_iter: 50
iteration: 24 of max_iter: 50
iteration: 25 of max_iter: 50
iteration: 26 of max_iter: 50
iteration: 27 of max_iter: 50
iteration: 28 of max_iter: 50
iteration: 29 of max_iter: 50
iteration: 30 of max_iter: 50
iteration: 31 of max_iter: 50
iteration: 32 of max_iter: 50
iteration: 33 of max_iter: 50
iteration: 34 of ma

In [6]:
import pyLDAvis
import pyLDAvis.sklearn

In [7]:
# Prepare the pyLDAvis visualisation data from the fitted LDA model, the
# document-term matrix and the vectorizer, then write it to lda.html in the
# current working directory.
vis_data = pyLDAvis.sklearn.prepare(lda, tfidf_repr, tfidf)
pyLDAvis.save_html(vis_data, 'lda.html')