In [1]:
from multiprocessing import Pool
from tqdm import tqdm
from spacy.lang.en import English
import gensim.corpora as corpora
from gensim.models import LdaMulticore
import multiprocessing as mp

import re
import numpy as np
import pandas as pd
from pprint import pprint
import pickle
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
from nltk.corpus import wordnet as wn
#nltk.download('stopwords')
#nltk.download('wordnet')
# spacy for lemmatization
import spacy
from spacy.lang.en import English
parser = English()

# Plotting tools

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
num_cpus = mp.cpu_count() - 1

parser = English()

with open("../Data/tech_review_sent_corpus.pkl","rb") as f:
    tech_review_corpus = pickle.load(f)
    
reviews = pd.DataFrame(tech_review_corpus).review.tolist()

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens


In [30]:
def compute_coherence_values(param):
    lda_model = LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=param["k"], 
                                           random_state=100,
                                           chunksize=1000,
                                           workers = num_cpus,
                                           passes=10,
                                           alpha=param["alpha"],
                                           eta=param["beta"],
                                           per_word_topics=True)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=id2word, coherence='c_v')
    
    param["coherence"] = coherence_model_lda.get_coherence()

    return param

if __name__ == "__main__":

    print("Total workers:", num_cpus)

    print("Tokenize the corpus")
    with Pool() as p:
        processed_docs = list(tqdm(p.imap(tokenize, reviews), total=len(reviews)))

    # Create Dictionary
    id2word = corpora.Dictionary(processed_docs)
    # Term Document Frequency
    print("Create a Dictionary")
    corpus = [id2word.doc2bow(text) for i, text in tqdm(enumerate(processed_docs), total=len(processed_docs))]

    grid = {}
    grid['Validation_Set'] = {}
    # Topics range
    min_topics = 5
    max_topics = 15
    step_size = 1
    topics_range = range(max_topics, min_topics, -1)
    # Alpha parameter
    alpha = list(np.arange(0.01, 1, 0.3))
    alpha.append('symmetric')
    alpha.append('asymmetric')
    # Beta parameter
    beta = list(np.arange(0.01, 1, 0.3))
    beta.append('symmetric')

    parameters = []
    for k in topics_range:
        for a in alpha:
            for b in beta:
                parameters.append({
                        "k":k
                        ,"alpha":a
                        ,"beta":b
                        ,"workers":4
                    })

    print("Running modeling")
    print("Total Paramters", len(parameters))

    results = list(map(compute_coherence_values, tqdm(parameters)))
    
    df = pd.DataFrame(results).to_csv("../Data/lda_sent_modeling.csv",index = False)

Total workers: 15
Tokenize the corpus


100%|██████████| 156991/156991 [00:06<00:00, 24622.96it/s]
  8%|▊         | 12715/156991 [00:00<00:01, 127076.50it/s]

Create a Dictionary


100%|██████████| 156991/156991 [00:01<00:00, 129638.49it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

Running modeling
Total Paramters 60


100%|██████████| 60/60 [1:10:27<00:00, 70.46s/it]


In [33]:
pd.read_csv('lda_sent_modeling.csv').sort_values(by = ['coherence','k'],ascending=False)

Unnamed: 0,k,alpha,beta,workers,coherence
298,6,asymmetric,0.91,4,0.475331
237,8,asymmetric,0.61,4,0.456776
28,15,asymmetric,0.91,4,0.456197
232,8,symmetric,0.61,4,0.446326
88,13,asymmetric,0.91,4,0.444676
...,...,...,...,...,...
286,6,0.91,0.31,4,0.326012
287,6,0.91,0.61,4,0.325265
288,6,0.91,0.91,4,0.324374
289,6,0.91,symmetric,4,0.322756


In [14]:
final_lda_model = LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=8, 
                                           random_state=100,
                                           chunksize=1000,
                                           workers = 4,
                                           passes=10,
                                           alpha='asymmetric',
                                           eta=0.61,
                                           per_word_topics=True)

In [15]:
# Select the model and print the topics
optimal_model = final_lda_model
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=20))

[(0,
  '0.136*"work" + 0.053*"great" + 0.030*"people" + 0.027*"environment" + '
  '0.026*"part" + 0.026*"job" + 0.023*"enjoy" + 0.021*"place" + '
  '0.019*"management" + 0.016*"worker" + 0.016*"co" + 0.015*"friendly" + '
  '0.014*"fun" + 0.012*"time" + 0.012*"culture" + 0.010*"manager" + '
  '0.010*"really" + 0.009*"hardest" + 0.008*"always" + 0.008*"supportive"'),
 (1,
  '0.044*"life" + 0.042*"work" + 0.034*"balance" + 0.033*"employee" + '
  '0.025*"benefit" + 0.022*"management" + 0.017*"job" + 0.017*"salary" + '
  '0.014*"pay" + 0.011*"le" + 0.010*"long" + 0.010*"care" + 0.010*"security" + '
  '0.009*"level" + 0.008*"hour" + 0.007*"high" + 0.007*"company" + '
  '0.007*"policy" + 0.007*"much" + 0.006*"compensation"'),
 (2,
  '0.115*"learn" + 0.058*"lot" + 0.047*"work" + 0.047*"project" + '
  '0.042*"technology" + 0.033*"year" + 0.031*"thing" + 0.024*"many" + '
  '0.018*"different" + 0.015*"experience" + 0.015*"got" + 0.011*"get" + '
  '0.010*"2" + 0.009*"3" + 0.009*"knowledge" + 0.008

In [16]:
lda_display = pyLDAvis.gensim.prepare(optimal_model, corpus, id2word, sort_topics=False)
pyLDAvis.display(lda_display)

In [17]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=reviews):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=final_lda_model, corpus=corpus, texts=reviews)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()

df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

#df_dominant_topic["Review"] = reviews_["review"] 
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,0.0,0.703,"work, great, people, environment, part, job, e...",great people care make difference
1,1,1.0,0.447,"life, work, balance, employee, benefit, manage...",management care employee well providing contin...
2,2,5.0,0.5735,"manager, get, management, time, would, job, ma...",powerhouse epc went heavy worksharing
3,3,5.0,0.6568,"manager, get, management, time, would, job, ma...",send much work overseas possible keep core person
4,4,1.0,0.7913,"life, work, balance, employee, benefit, manage...",heavy loaded management
5,5,2.0,0.5553,"learn, lot, work, project, technology, year, t...",chase project hire thousand oil booming
6,6,5.0,0.817,"manager, get, management, time, would, job, ma...",turn lay thousand going
7,7,5.0,0.8094,"manager, get, management, time, would, job, ma...",sharpest spike hiring laying
8,8,0.0,0.3927,"work, great, people, environment, part, job, e...",employee clearly great asset
9,9,5.0,0.7698,"manager, get, management, time, would, job, ma...",much could add


In [18]:
with open("../Data/all_reviews.pkl","rb") as f:
    reviews_ = pickle.load(f)
    
job_filter = pd.read_csv("../Data/filter_job_titles.csv")

job_filters = job_filter.clean_job_title.tolist()
idx = (reviews_.language == "en")
reviews_ = reviews_.loc[idx,:]

idx = (reviews_.clean_job_title.isin(job_filters))
reviews_ = reviews_.loc[idx,:].reset_index()

print(reviews_.shape)

(45251, 16)


In [19]:
df_sent = pd.DataFrame(pd.DataFrame(tech_review_corpus).groupby('index'))
df_sent.columns=['Indexes','reviews']
display(df_sent.head(10))

Unnamed: 0,Indexes,reviews
0,299,index ...
1,1145,index ...
2,1375,index review 9 ...
3,1874,index ...
4,2136,index ...
5,2209,index ...
6,2685,index ...
7,3054,index rev...
8,3082,index ...
9,3134,index ...


In [20]:
sent_corpus= pd.DataFrame(tech_review_corpus)

In [21]:
sent_corpus

Unnamed: 0,index,review
0,299,great people care make difference
1,299,management care employee well providing contin...
2,1145,powerhouse epc went heavy worksharing
3,1145,send much work overseas possible keep core person
4,1145,heavy loaded management
...,...,...
156986,2587643,enjoy teamwork
156987,2587648,work reporting make sure everyone work
156988,2587648,processed claim
156989,2587648,enjoy work processor


In [22]:
df_dominant_topic['Index'] = sent_corpus['index']
index_list = list(set(df_dominant_topic["Index"].values))
df_dominant_topic.head(10)
reviews_.review.loc[df_sent[df_sent["Indexes"] == [i for i in index_list]].index]

36301    oracle dba ,i am working here from past 7 yrs ...
Name: review, dtype: object

In [23]:
index_list = list(set(df_dominant_topic["Index"].values))
index_list.sort(reverse=False)
df_dominant_topic_copy = df_dominant_topic.copy()

In [66]:
Review_List = []
for i in index_list:
    if [item for item in (df_dominant_topic.Index.isin([i]).values)]:
        sent_idx = df_dominant_topic['Index'].loc[df_dominant_topic[df_dominant_topic["Index"] == i].index].index
        for x in range(len(sent_idx)):
            Review_List.append(reviews_.review.loc[df_sent[df_sent["Indexes"] == i].index].tolist()[0])
           

In [67]:
df_dominant_topic_copy = df_dominant_topic_copy.assign(Review=Review_List)

In [68]:
df_dominant_topic_copy.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,Index,Review
0,0,0.0,0.703,"work, great, people, environment, part, job, e...",great people care make difference,299,Fluor is a great company with people that care...
1,1,1.0,0.447,"life, work, balance, employee, benefit, manage...",management care employee well providing contin...,299,Fluor is a great company with people that care...
2,2,5.0,0.5735,"manager, get, management, time, would, job, ma...",powerhouse epc went heavy worksharing,1145,Fluor was the first powerhouse EPC that went h...
3,3,5.0,0.6568,"manager, get, management, time, would, job, ma...",send much work overseas possible keep core person,1145,Fluor was the first powerhouse EPC that went h...
4,4,1.0,0.7913,"life, work, balance, employee, benefit, manage...",heavy loaded management,1145,Fluor was the first powerhouse EPC that went h...
5,5,2.0,0.5553,"learn, lot, work, project, technology, year, t...",chase project hire thousand oil booming,1145,Fluor was the first powerhouse EPC that went h...
6,6,5.0,0.817,"manager, get, management, time, would, job, ma...",turn lay thousand going,1145,Fluor was the first powerhouse EPC that went h...
7,7,5.0,0.8094,"manager, get, management, time, would, job, ma...",sharpest spike hiring laying,1145,Fluor was the first powerhouse EPC that went h...
8,8,0.0,0.3927,"work, great, people, environment, part, job, e...",employee clearly great asset,1145,Fluor was the first powerhouse EPC that went h...
9,9,5.0,0.7698,"manager, get, management, time, would, job, ma...",much could add,1375,There is not much that I could add here. I en...


In [69]:
with open("topic_sent_reviews.pkl","wb") as f:
    pickle.dump(df_dominant_topic_copy,f)