In [1]:
import pandas as pd
import datatable as dt
import numpy as np
import re


import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
import pickle

from joblib import dump, load

import matplotlib.pyplot as plt

import random

def _six_decimals(value):
    """Render a float with exactly six digits after the decimal point."""
    return '%.6f' % value


# Show floats in DataFrame output with a fixed six-decimal format.
pd.set_option('display.float_format', _six_decimals)

In [2]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Switch pyLDAvis to notebook mode so the visualization built later in this
# notebook is displayed inside the output cell.
pyLDAvis.enable_notebook()

In [3]:
import multiprocessing
from pandarallel import pandarallel

num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

# Keep one core free for the notebook itself, but never go below one worker:
# on a single-CPU machine `num_processors - 1` would be 0.
workers = max(1, num_processors - 1)

Available CPUs: 8


In [4]:
def _read_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    # NOTE: unpickling can execute arbitrary code; only load files you created.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Inputs used further down as the coherence texts, the id2word dictionary and
# the LDA training corpus, respectively.
news_list = _read_pickle('news_list.pkl')
nw_dictionary = _read_pickle('nw_dictionary.pkl')
nw_doc_term_matrix = _read_pickle('nw_doc_term_matrix.pkl')

In [5]:
# Calculate the number of elements to select (10% of the total)
sample_size = int(len(nw_doc_term_matrix) * 0.1)

# Randomly select 10% of the elements.
# A dedicated, seeded generator makes the sample (and every coherence score
# computed from it) reproducible across kernel restarts, and leaves the global
# `random` state untouched. The seed matches the `random_state` used for LDA.
nw_doc_term_matrix_sample = random.Random(100).sample(nw_doc_term_matrix, sample_size)

In [9]:
%%time

grid = {}
grid['Validation_Set'] = {}

# Candidate numbers of topics
topics_range = [15, 20, 25, 30]

# Alpha parameter: values handed to LdaMulticore as `alpha`
alpha = [0.01, 0.51, 0.91, 'asymmetric']

# Beta parameter: values handed to LdaMulticore as `eta`.
# Only "auto" is searched; a wider grid would be
# [0.01, 0.51, 0.91, 'symmetric'].
beta = ["auto"]

# Validation sets
num_of_docs = len(nw_doc_term_matrix)
corpus_sets = [nw_doc_term_matrix_sample]
# The only corpus searched is the 10% random sample, so label it accordingly.
corpus_title = ['10% Corpus']

# One list per column; the grid-search loop appends one entry per run.
model_results = {
    'Topics': [],
    'Alpha': [],
    'Beta': [],
    'Coherence': [],
}

# Progress bookkeeping: runs completed so far and total runs in the grid.
itr = 0
itr_total = len(beta) * len(alpha) * len(topics_range) * len(corpus_sets)
print(f'LDA will execute {itr_total} iterations')

LDA will execute 16 iterations
CPU times: total: 0 ns
Wall time: 0 ns


In [10]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train an LDA model and return its c_v coherence score.

    Parameters
    ----------
    corpus
        Corpus the LDA model is trained on (passed to `LdaMulticore` as `corpus`).
    dictionary
        Dictionary passed as `id2word` to the model and as `dictionary` to
        `CoherenceModel`.
    k
        Number of topics.
    a
        Value passed as `alpha`.
    b
        Value passed as `eta`.
    texts
        Texts handed to `CoherenceModel`. Defaults to the notebook-level
        `news_list` when omitted, which keeps existing calls working unchanged.

    Returns
    -------
    The value of `CoherenceModel.get_coherence()` for the trained model.
    """
    if texts is None:
        texts = news_list

    # Train on the corpus that was actually passed in, not on a notebook global.
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=dictionary,
                             num_topics=k,
                             random_state=100,
                             passes=10,
                             alpha=a,
                             eta=b,
                             workers=workers)

    # Score with the same dictionary the model was trained with.
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=texts,
                                         dictionary=dictionary,
                                         coherence='c_v')

    return coherence_model_lda.get_coherence()

In [13]:
# Start from a clean slate so this cell can be re-run on its own: otherwise a
# second run would append duplicate rows and report progress above 100%.
itr = 0
for column_values in model_results.values():
    column_values.clear()

# iterate through hyperparameters
for i, corpus_set in enumerate(corpus_sets):
    # iterate through number of topics
    for k in topics_range:
        # iterate through alpha values
        for a in alpha:
            # iterate through beta values
            for b in beta:
                # get the coherence score for the given parameters
                itr += 1
                cv = compute_coherence_values(corpus=corpus_set, dictionary=nw_dictionary,
                                              k=k, a=a, b=b)
                # Save the model results
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)
                pct_completed = round((itr / itr_total * 100), 1)
                # Report progress after every second run to keep the output short.
                if itr % 2 == 0:
                    print(f'Finished {pct_completed}% of LDA runs')


Finished 12.5% of LDA runs
Finished 25.0% of LDA runs
Finished 37.5% of LDA runs
Finished 50.0% of LDA runs
Finished 62.5% of LDA runs
Finished 75.0% of LDA runs
Finished 87.5% of LDA runs
Finished 100.0% of LDA runs


In [14]:
# Turn the column-wise result lists into a table with one row per LDA run.
lda_tuning = pd.DataFrame.from_dict(model_results)
# Optional CSV export, left disabled (`os` and `path_lda` are not defined in this notebook):
# lda_tuning.to_csv(os.path.join(path_lda, 'lda_tuning_results.csv'), index=False)

In [15]:
# Persist the tuning table so the grid search does not have to be repeated.
with open('fine_tune.pkl', 'wb') as out_file:
    pickle.Pickler(out_file).dump(lda_tuning)

In [5]:
# Read the saved tuning results back from disk.
with open('fine_tune.pkl', 'rb') as in_file:
    fine_tune = pickle.Unpickler(in_file).load()

In [8]:
# Rank the tuning runs from highest to lowest coherence.
fine_tune.sort_values('Coherence', ascending=False)

Unnamed: 0,Topics,Alpha,Beta,Coherence
4,20,0.010000,auto,0.437246
7,20,asymmetric,auto,0.43672
13,30,0.510000,auto,0.423792
5,20,0.510000,auto,0.412215
6,20,0.910000,auto,0.410979
11,25,asymmetric,auto,0.410261
9,25,0.510000,auto,0.40826
8,25,0.010000,auto,0.406927
0,15,0.010000,auto,0.405379
1,15,0.510000,auto,0.403053


The 10- and 20-topic models in my other notebook each achieved better coherence scores with default parameters, so I will use those instead of the tuned models above.

### 3-topic model

In [5]:
%%time

# Settings for the 3-topic model: trained on the full corpus, with `alpha`
# not passed (library default) and `eta` set to 'auto'.
three_topic_settings = {
    'corpus': nw_doc_term_matrix,
    'id2word': nw_dictionary,
    'num_topics': 3,
    'random_state': 100,
    'passes': 10,
    'eta': 'auto',
    'workers': workers,
}
tw_lda_model = LdaMulticore(**three_topic_settings)

CPU times: total: 18min 25s
Wall time: 20min 34s


In [6]:
%%time

# Build the pyLDAvis data for the 3-topic model, keeping the model's own topic
# order (sort_topics=False), then render it in the notebook.
lda_display_tw = gensimvis.prepare(
    topic_model=tw_lda_model,
    corpus=nw_doc_term_matrix,
    dictionary=nw_dictionary,
    sort_topics=False,
    mds='mmds',
)
pyLDAvis.display(lda_display_tw)

CPU times: total: 4min 1s
Wall time: 5min 27s
