In [3]:

import re
import numpy as np
import pandas as pd
from pprint import pprint
import pickle
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
from nltk.corpus import wordnet as wn
#nltk.download('stopwords')
#nltk.download('wordnet')
# spacy for lemmatization
import spacy
from spacy.lang.en import English
parser = English()

# Plotting tools

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import pickle

with open("../data/tech_review_sent_corpus.pkl","rb") as f:
    tech_review_corpus = pickle.load(f)
    
display(pd.DataFrame(tech_review_corpus).head(10))

reviews = pd.DataFrame(tech_review_corpus).review.tolist()

#reviews = list(map(str.split, reviews))

Unnamed: 0,index,review
0,299,great company people care make difference
1,299,management care employee well providing contin...
2,1145,powerhouse epc went heavy worksharing
3,1145,send much work overseas possible keep core person
4,1145,heavy loaded company management
5,1145,chase project hire thousand oil booming
6,1145,turn lay thousand going
7,1145,sharpest spike hiring laying
8,1145,employee clearly greatest asset
9,1375,much could add


In [4]:
reviews

['great company people care make difference',
 'management care employee well providing continued learning opportunity',
 'powerhouse epc went heavy worksharing',
 'send much work overseas possible keep core person',
 'heavy loaded company management',
 'chase project hire thousand oil booming',
 'turn lay thousand going',
 'sharpest spike hiring laying',
 'employee clearly greatest asset',
 'much could add',
 'enjoyed working people working',
 'loved',
 'hour great flexibility needed',
 'place pretty boring time interesting facility uranium enrichment almost half century',
 'everyone hurry get thing done hurry part make sure get need get done',
 'typically 3 level removed customer actually work',
 'hire',
 'term would rate 4',
 'hired nice intelligent people',
 'reasonably pleasant place spend 40 hr per work',
 'infrastructure adequately maintained except maybe escalator',
 'could also rate bos time rated',
 'benefit better average',
 'project reasonably well managed',
 'returned rate

In [5]:
#NLTK Stop words
#nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
#extend stop word based on top 50 words
stop_words.extend(["new","lot","get","many","also","day","part","one","things","always","years","really"])

In [6]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [7]:
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [8]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    #tokens = [token for token in tokens if len(token) > 4]
    #tokens = [token for token in tokens if token not in stop_words]
    #tokens = [get_lemma(token) for token in tokens]
    return tokens

In [9]:
tokenize(reviews[0])

['great', 'company', 'people', 'care', 'make', 'difference']

In [10]:
processed_docs = []
for i in range(len(reviews)):
    processed_docs.append(tokenize(reviews[i]))
processed_docs[:10]


[['great', 'company', 'people', 'care', 'make', 'difference'],
 ['management',
  'care',
  'employee',
  'well',
  'providing',
  'continued',
  'learning',
  'opportunity'],
 ['powerhouse', 'epc', 'went', 'heavy', 'worksharing'],
 ['send', 'much', 'work', 'overseas', 'possible', 'keep', 'core', 'person'],
 ['heavy', 'loaded', 'company', 'management'],
 ['chase', 'project', 'hire', 'thousand', 'oil', 'booming'],
 ['turn', 'lay', 'thousand', 'going'],
 ['sharpest', 'spike', 'hiring', 'laying'],
 ['employee', 'clearly', 'greatest', 'asset'],
 ['much', 'could', 'add']]

In [11]:
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(processed_docs)
# Create Corpus
texts = processed_docs
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]]


In [12]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           chunksize=100,
                                           workers = 30,
                                           passes=10,
                                           alpha=a,
                                           eta=b,
                                           per_word_topics=True)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [13]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import tqdm
grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 2
max_topics = 15
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets
num_of_docs = len(corpus)
corpus_sets = [#gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               #gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               #gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]
corpus_title = ['100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=390)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()

100%|████████████████████████████████████████████████████████████████████████████| 390/390 [83:44:44<00:00, 773.04s/it]


In [14]:
pd.DataFrame(model_results).sort_values(by = ['Coherence','Topics'],ascending=False)

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
118,100% Corpus,5,asymmetric,0.91,0.460634
173,100% Corpus,7,symmetric,0.91,0.454982
178,100% Corpus,7,asymmetric,0.91,0.451349
352,100% Corpus,13,symmetric,0.61,0.451092
23,100% Corpus,2,symmetric,0.91,0.445184
...,...,...,...,...,...
95,100% Corpus,5,0.31,0.01,0.354034
68,100% Corpus,4,0.31,0.91,0.352335
159,100% Corpus,7,0.31,symmetric,0.350764
365,100% Corpus,14,0.31,0.01,0.349799


In [15]:

final_lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=7, 
                                           random_state=100,
                                           chunksize=100,
                                           workers = 10,
                                           passes=10,
                                           alpha='symmetric',
                                           eta=0.91,
                                           per_word_topics=True)

In [16]:
# Select the model and print the topics
optimal_model = final_lda_model
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=20))

[(0,
  '0.100*"job" + 0.058*"part" + 0.027*"benefit" + 0.023*"salary" + '
  '0.022*"enjoyable" + 0.021*"pay" + 0.021*"hardest" + 0.016*"security" + '
  '0.015*"le" + 0.009*"compensation" + 0.008*"term" + 0.008*"low" + '
  '0.008*"company" + 0.007*"hike" + 0.007*"time" + 0.006*"average" + '
  '0.006*"high" + 0.006*"decent" + 0.005*"long" + 0.005*"level"'),
 (1,
  '0.016*"support" + 0.016*"skill" + 0.014*"client" + 0.013*"system" + '
  '0.010*"software" + 0.010*"technical" + 0.010*"development" + '
  '0.010*"project" + 0.009*"customer" + 0.009*"application" + 0.009*"issue" + '
  '0.008*"service" + 0.008*"knowledge" + 0.008*"business" + 0.007*"learned" + '
  '0.006*"process" + 0.006*"product" + 0.006*"working" + 0.005*"worked" + '
  '0.005*"experience"'),
 (2,
  '0.057*"lot" + 0.045*"opportunity" + 0.043*"learn" + 0.037*"technology" + '
  '0.031*"many" + 0.030*"thing" + 0.025*"project" + 0.020*"learned" + '
  '0.018*"company" + 0.016*"learning" + 0.015*"different" + 0.014*"got" + '
  '0.0

In [18]:
lda_display = pyLDAvis.gensim.prepare(optimal_model, corpus, id2word, sort_topics=False)
pyLDAvis.display(lda_display)

In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=reviews):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=final_lda_model, corpus=corpus, texts=reviews)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()

df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

#df_dominant_topic["Review"] = reviews_["review"] 
df_dominant_topic.head(10)