In [6]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline
# Run in terminal or command prompt
# python3 -m spacy download en

import numpy as np
import pandas as pd
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [33]:
data = open("html_text.txt", encoding='utf8')

In [3]:
cf = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             max_features=50000,             # max number of uniq words
                            )

df = cf.fit_transform(data)

In [36]:
df.shape

(60958, 1850)

In [4]:
# Materialize the sparse data
data_dense = df.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

Sparsicity:  0.055502104683508274 %


In [13]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=15,           # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(df)

print(lda_model)  # Model attributes


LatentDirichletAllocation(learning_method='online', n_components=15, n_jobs=-1,
                          random_state=100)


In [14]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(df))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(df))

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -532825.6007237053
Perplexity:  1207.0333488362025
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 15,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [15]:
# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(df)

GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [10, 15, 20, 25, 30]})

In [16]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(df))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 10}
Best Log Likelihood Score:  -127872.24296902132
Model Perplexity:  872.5977100214018


In [19]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(best_lda_model, df, cf, mds='tsne')
panel

In [43]:
#place text you want tested here, then run the rest of the notebook.
test_corpus = ['''If you struggle with acne prone skin, then you’ve probably been recommended exfoliating acids (like Salicylic and Glycolic) to decongest pores and clear your complexion. However these acids are often misused and end up damaging the skin barrier – causing it to become ultra sensitive!
As much as it feels like the right route, scrubbing your face and using clay masks is actually counter productive for acne-prone skin! You actually need to be EXTRA gentle with your skin to reduce inflammation and heal your skin barrier. I recommend exfoliating with Konjac 
Sponges or cleansing pads like Face Halo and using hydrating sheets masks to calm and soothe the skin. Clay masks can be beneficial, but when they are left to go dry they can damage the skin barrier by stripping the skin of it’s protective oils, all my favourite exfoliating and mask products are listed here.
The key to repairing your skin barrier is to strip your routine RIGHT back to basics! You really don’t need tonnes of different products and you definitely want to avoid products aimed at acne-prone skin as these are often way too harsh and contain ingredients that actually damage the skin barrier further.
My best advice is to use a simple cleanser that’s SLS-free and not too stripping on the skin, followed by a gentle serum that ideally contains Hyaluronic Acid and Niacinamide which both help to repair the skin barrier.
I then recommend finishing off with a facial oil (my favourite is Emu oil) or gentle moisturiser and ALWAYS use an SPF during the day! Just like every other wound or scratch on your skin. Your immune system will fight the infection that is within the pore and work to clear it in a few days/weeks depending on the depth and severity of infection.
To speed up healing, the key is to improve your immunity. Ensuring you’re getting enough sleep, working on stress management and eating a diet rich in nutrient rich foods is the best way to ensure your body can protect you against infection and inflammation before it gets out of control!''']

In [48]:
test_feature_matrix = cf.transform(test_corpus)

In [49]:
test_lda_output = best_lda_model.transform(test_feature_matrix)

In [50]:
#your results here show the precentage of the document belonging to the above topics. 
#So higher precentage means more aligned to that topic.
test_lda_output

array([[0.00080013, 0.00080002, 0.65517014, 0.00080005, 0.00080008,
        0.18598673, 0.00080001, 0.00080007, 0.00080005, 0.15324271]])