In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.sklearn

# spacy for lemmatization
import spacy

# Plotting tools
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import nltk

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import matplotlib.pyplot as plt
%matplotlib inline

  from imp import reload
  import distutils  # noqa
  from scipy.sparse.base import spmatrix
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1


In [3]:
df = pd.read_csv('./dataset/NMSendataset_wordtoken_lda_2023.csv', error_bad_lines=False);

In [4]:
eval(df['wordtoken'][4])

['proper', 'multiplayer', 'introduce', 'lonely', 'experience', 'either']

In [5]:
def convert(s):
    s = eval(s)
    s = ' '.join(s)
    return s
df['wordtoken'] = df['wordtoken'].apply(convert)

In [6]:
data = df.wordtoken.values.tolist()
data[:3]

['good',
 'difference year',
 'messy origin no_man_sky grown deeper polished developer work something like initial promise']

# Vectorizer

## BoW vectorizer

In [7]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=8,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
#                              token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

%time data_vectorized = vectorizer.fit_transform(data)
print(data_vectorized.shape)

CPU times: total: 3.81 s
Wall time: 3.91 s
(476343, 10518)


In [8]:
# vectorizer.get_feature_names()

In [9]:
data_feature_names=vectorizer.get_feature_names()

In [10]:

data_dense = data_vectorized.todense()

print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")


Sparsicity:  0.06216141314489166 %


# Build LDA Model

In [11]:
lda_model = LatentDirichletAllocation(n_components=10,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(learning_method='online', n_jobs=-1, random_state=100)


In [12]:
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=-1, perp_tol=0.1,
             random_state=100, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

LatentDirichletAllocation(learning_method='online', n_jobs=-1, random_state=100)

In [13]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -24990390.21933443
Perplexity:  2153.2292984183987
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 10,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [14]:
# Define Search Param
search_params = {'n_components': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50], 'max_iter': [5, 10, 15, 20]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)

GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'max_iter': [5, 10, 15, 20],
                         'n_components': [5, 10, 15, 20, 25, 30, 35, 40, 45,
                                          50]})

In [15]:
GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1,
             perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0),
        n_jobs=1,
       param_grid={'n_components': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50], 'max_iter': [5, 10, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

GridSearchCV(error_score='raise',
             estimator=LatentDirichletAllocation(learning_method=None,
                                                 n_jobs=1),
             n_jobs=1,
             param_grid={'max_iter': [5, 10, 15, 20],
                         'n_components': [5, 10, 15, 20, 25, 30, 35, 40, 45,
                                          50]},
             return_train_score='warn')

In [16]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'max_iter': 20, 'n_components': 5}
Best Log Likelihood Score:  -4857594.809423642
Model Perplexity:  1471.718719133914
