## 1. Setup

In [18]:
import pandas as pd
from time import time
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import word_tokenize
import pyLDAvis
import pyLDAvis.sklearn
# Switch on pyLDAvis's notebook integration before any visualisation is built
# (presumably what lets prepared visualisations render inline — confirm in the pyLDAvis docs).
pyLDAvis.enable_notebook()

In [19]:
# Path to the preprocessed workbook, relative to this notebook.
dbase = '../assets/Preprocessed Data.xlsx'

# The sheet selector of read_excel is `sheet_name`; `sheet` is not a valid
# keyword and raises TypeError on current pandas.
data = pd.read_excel(dbase, sheet_name='Sheet1')

In [20]:
# Tokenise every description with NLTK and store the token lists in a new column.
data['Token description'] = data['Description'].apply(word_tokenize)

In [21]:
# Glue each token list back into one space-separated string; these strings
# are what the vectorisers below consume.
descs = data['Token description'].map(' '.join)

In [22]:
# Term-count vectoriser. Its settings are reused for the TF-IDF vectoriser
# later on via get_params().
tf_vectorizer = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b',  # only runs of 3+ ASCII letters count as tokens
    max_df=0.5,  # document-frequency ceiling (see the CountVectorizer docs for float vs int)
    min_df=10,   # document-frequency floor
)

# Document-term count matrix for the re-joined descriptions.
dtm_tf = tf_vectorizer.fit_transform(descs)

print(dtm_tf.shape)

(6064, 15963)


## 2. LDA and Visualization

In [23]:
# Build the TF-IDF vectoriser from exactly the same settings as the count
# vectoriser, so both matrices are produced under identical tokenisation rules.
shared_params = tf_vectorizer.get_params()
tfidf_vectorizer = TfidfVectorizer(**shared_params)

# TF-IDF weighted document-term matrix over the same descriptions.
dtm_tfidf = tfidf_vectorizer.fit_transform(descs)
print(dtm_tfidf.shape)

(6064, 15963)


### 20 topics

In [24]:
# Fit one 20-topic LDA model per document-term matrix (raw counts and TF-IDF).
# The topic count is passed as `n_components`: the old `n_topics` keyword was
# deprecated in scikit-learn 0.19 and later removed.
# random_state is fixed so that re-running the cell reproduces the same topics.
lda_tf = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tf.fit(dtm_tf)

lda_tfidf = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tfidf.fit(dtm_tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=20, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [25]:
# Visualise the count-based model. The prepared object gets its own name:
# binding it to `data` would overwrite the DataFrame loaded from the workbook
# and break any re-run of the tokenisation cell.
vis_tf = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
pyLDAvis.display(vis_tf)

In [26]:
# Visualise the TF-IDF-based model. The prepared object gets its own name so
# the DataFrame held in `data` is not overwritten, and it is displayed so the
# work done by prepare() is not thrown away.
vis_tfidf = pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)
pyLDAvis.display(vis_tfidf)

### 10 topics

In [27]:
# Refit both models with 10 topics (raw counts and TF-IDF).
# The topic count is passed as `n_components`: the old `n_topics` keyword was
# deprecated in scikit-learn 0.19 and later removed.
# random_state is fixed so that re-running the cell reproduces the same topics.
lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
lda_tf.fit(dtm_tf)

lda_tfidf = LatentDirichletAllocation(n_components=10, random_state=0)
lda_tfidf.fit(dtm_tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=10, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [28]:
# Visualise the count-based model. The prepared object gets its own name:
# binding it to `data` would overwrite the DataFrame loaded from the workbook
# and break any re-run of the tokenisation cell.
vis_tf = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
pyLDAvis.display(vis_tf)

In [29]:
# Visualisation of the TF-IDF-based model; left as the cell's final bare
# expression so the notebook renders it as the cell output.
pyLDAvis.sklearn.prepare(
    lda_model=lda_tfidf,
    dtm=dtm_tfidf,
    vectorizer=tfidf_vectorizer,
)

### 15 topics

In [30]:
# Refit the count-based model with 15 topics.
# The topic count is passed as `n_components`: the old `n_topics` keyword was
# deprecated in scikit-learn 0.19 and later removed.
lda_tf = LatentDirichletAllocation(n_components=15, random_state=0)
lda_tf.fit(dtm_tf)

# The prepared visualisation gets its own name so the DataFrame held in
# `data` is not overwritten.
vis_tf = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
pyLDAvis.display(vis_tf)

