In [1]:

from multiprocessing import Pool
from tqdm import tqdm
from spacy.lang.en import English
import gensim.corpora as corpora
from gensim.models import LdaMulticore
import multiprocessing as mp

import re
import numpy as np
import pandas as pd
from pprint import pprint
import pickle
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
from nltk.corpus import wordnet as wn
#nltk.download('stopwords')
#nltk.download('wordnet')
# spacy for lemmatization
import spacy
from spacy.lang.en import English
parser = English()

# Plotting tools

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
with open("../data/tech_review_word_corpus.pkl","rb") as f:
    tech_review_corpus = pickle.load(f)
    
reviews = pd.DataFrame(tech_review_corpus).review.tolist()

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [2]:
num_cpus = mp.cpu_count() - 1

parser = English()


def compute_coherence_values(param):
    lda_model = LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=param["k"], 
                                           random_state=100,
                                           chunksize=100,
                                           workers = num_cpus,
                                           passes=10,
                                           alpha=param["alpha"],
                                           eta=param["beta"],
                                           per_word_topics=True)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=id2word, coherence='c_v')
    
    param["coherence"] = coherence_model_lda.get_coherence()

    return param

if __name__ == "__main__":

    print("Total workers:", num_cpus)

    print("Tokenize the corpus")
    with Pool() as p:
        processed_docs = list(tqdm(p.imap(tokenize, reviews), total=len(reviews)))

    # Create Dictionary
    id2word = corpora.Dictionary(processed_docs)
    # Term Document Frequency
    print("Create a Dictionary")
    corpus = [id2word.doc2bow(text) for i, text in tqdm(enumerate(processed_docs), total=len(processed_docs))]

    grid = {}
    grid['Validation_Set'] = {}
    # Topics range
    min_topics = 7
    max_topics = 13
    step_size = 1
    topics_range = range(max_topics, min_topics, -1)
    # Alpha parameter
    alpha = list(np.arange(0.01, 1, 0.3))
    alpha.append('symmetric')
    alpha.append('asymmetric')
    # Beta parameter
    beta = list(np.arange(0.01, 1, 0.3))
    beta.append('symmetric')

    parameters = []
    for k in topics_range:
        for a in alpha:
            for b in beta:
                parameters.append({
                        "k":k
                        ,"alpha":a
                        ,"beta":b
                        ,"workers":4
                    })

    print("Running modeling")
    print("Total Paramters", len(parameters))

    results = list(map(compute_coherence_values, tqdm(parameters)))

    df = pd.DataFrame(results).to_csv("../data/lda_word_modeling.csv",index = False)

Total workers: 15
Tokenize the corpus


100%|██████████| 45251/45251 [00:02<00:00, 17070.34it/s]
  2%|▏         | 690/45251 [00:00<00:10, 4251.66it/s]

Create a Dictionary


100%|██████████| 45251/45251 [00:01<00:00, 38315.66it/s]
  0%|          | 0/180 [00:00<?, ?it/s]

Running modeling
Total Paramters 180


100%|██████████| 180/180 [7:02:39<00:00, 140.88s/it]  


In [3]:
pd.DataFrame(results).sort_values(by = ['coherence','k'],ascending=False)

Unnamed: 0,k,alpha,beta,workers,coherence
115,10,asymmetric,0.01,4,0.624038
157,8,0.31,0.61,4,0.623279
172,8,symmetric,0.61,4,0.618415
128,9,0.31,0.91,4,0.616429
170,8,symmetric,0.01,4,0.608220
...,...,...,...,...,...
92,10,0.01,0.61,4,0.507157
88,11,asymmetric,0.91,4,0.499703
113,10,symmetric,0.91,4,0.499092
33,12,0.01,0.91,4,0.485647


In [5]:

final_lda_model = LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           chunksize=100,
                                           workers = 4,
                                           passes=10,
                                           alpha='asymmetric',
                                           eta=0.01,
                                           per_word_topics=True)

In [6]:
# Select the model and print the topics
optimal_model = final_lda_model
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=20))

[(0,
  '0.053*"wa" + 0.028*"great" + 0.024*"work" + 0.021*"year" + 0.015*"worked" + '
  '0.015*"job" + 0.015*"working" + 0.014*"people" + 0.013*"time" + '
  '0.013*"would" + 0.010*"ha" + 0.009*"place" + 0.009*"benefit" + 0.008*"many" '
  '+ 0.008*"position" + 0.008*"manager" + 0.008*"experience" + '
  '0.007*"employee" + 0.007*"management" + 0.007*"one"'),
 (1,
  '0.056*"wa" + 0.050*"part" + 0.048*"work" + 0.041*"job" + 0.029*"worker" + '
  '0.028*"co" + 0.028*"management" + 0.020*"learned" + 0.019*"enjoyable" + '
  '0.018*"hardest" + 0.018*"working" + 0.016*"typical" + 0.016*"time" + '
  '0.010*"task" + 0.008*"always" + 0.008*"meeting" + 0.008*"environment" + '
  '0.007*"project" + 0.007*"friendly" + 0.007*"helpful"'),
 (2,
  '0.031*"get" + 0.026*"make" + 0.023*"people" + 0.020*"work" + 0.020*"time" + '
  '0.014*"one" + 0.012*"need" + 0.011*"help" + 0.011*"want" + 0.011*"always" + '
  '0.011*"like" + 0.011*"manager" + 0.010*"know" + 0.010*"thing" + 0.009*"way" '
  '+ 0.008*"come" + 0.

In [7]:
lda_display = pyLDAvis.gensim.prepare(optimal_model, corpus, id2word, sort_topics=False)
pyLDAvis.display(lda_display)

In [6]:
with open("../data/all_reviews.pkl","rb") as f:
    reviews_ = pickle.load(f)
    
job_filter = pd.read_csv("../data/filter_job_titles.csv")

job_filters = job_filter.clean_job_title.tolist()
idx = (reviews_.language == "en")
reviews_ = reviews_.loc[idx,:]

idx = (reviews_.clean_job_title.isin(job_filters))
reviews_ = reviews_.loc[idx,:].reset_index()

print(reviews_.shape)
#display(reviews_.head())


(45251, 16)


In [7]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=reviews):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=final_lda_model, corpus=corpus, texts=reviews)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()

df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_dominant_topic["Review"] = reviews_["review"] 
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,Review
0,0,1.0,0.6569,"company, great, work, employee, management, pe...",great company people care make difference mana...,Fluor is a great company with people that care...
1,1,1.0,0.4597,"company, great, work, employee, management, pe...",powerhouse epc went heavy worksharing send muc...,Fluor was the first powerhouse EPC that went h...
2,2,1.0,0.3941,"company, great, work, employee, management, pe...",much could add enjoyed working people working ...,There is not much that I could add here. I en...
3,3,3.0,0.4926,"worked, year, working, time, contract, experie...",place pretty boring time interesting facility ...,The place can be pretty boring most of the tim...
4,4,5.0,0.6341,"get, time, work, job, manager, people, hour, l...",everyone hurry get thing done hurry part make ...,"Everyone is in a hurry to get things done, but..."
5,5,5.0,0.4096,"get, time, work, job, manager, people, hour, l...",hire term would rate hired nice intelligent pe...,Was a re-hire. \rFirst term would rate them a...
6,6,1.0,0.4723,"company, great, work, employee, management, pe...",hated working company pay great management utt...,I hated working for this company. The pay was ...
7,7,0.0,0.345,"work, good, place, lot, learn, great, environm...",interesting day work enjoyable environment use...,Interesting days at work in a enjoyable enviro...
8,8,1.0,0.3201,"company, great, work, employee, management, pe...",challenging place work manager five office 200...,Challenging place to work as IT Manager with f...
9,9,3.0,0.4207,"worked, year, working, time, contract, experie...",total thirty seven 37 year successful engineer...,Total of thirty-seven (37) years of successful...


In [8]:
with open("tech_review_with_topics.pkl","wb") as f:
    pickle.dump(df_dominant_topic, f)

In [9]:
df_dominant_topic.groupby('Dominant_Topic').size()

Dominant_Topic
0.0    19209
1.0    10335
2.0      884
3.0      487
4.0      595
5.0     4953
6.0      335
7.0     1475
8.0     1973
9.0     5005
dtype: int64

### The most representative sentence for each topic¶

In [10]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf = pd.concat([sent_topics_sorteddf, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf.head(10)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0.0,0.9681,"work, good, place, lot, learn, great, environment, working, company, technology",good company work learn lot good work culture great work get kind support get chance work latest...
1,1.0,0.966,"company, great, work, employee, management, people, place, benefit, job, environment",great benefit great place seek long term employment excellent educational support program help w...
2,2.0,0.992,"development, application, software, developer, using, requirement, testing, web, involved, test",involved complete sdlc software development life cycle plan analyze design build test rollout im...
3,3.0,0.7006,"worked, year, working, time, contract, experience, enjoyed, project, client, different",year exprince streem project 3years expriance stil
4,4.0,0.9276,"system, support, network, user, software, computer, server, engineer, information, service",managing department hardware wing site preparation troubleshooting network problem preventive ma...
5,5.0,0.9658,"get, time, work, job, manager, people, hour, like, would, one",left hired employee paid per hour people year tell hiring higher rate month still refused give p...
6,6.0,0.9148,"service, business, product, customer, company, world, based, technology, become, solution",home retailer home improvement construction product service operates many box format store acros...
7,7.0,0.8955,"project, skill, task, client, technical, management, daily, meeting, process, knowledge",learned handle client also present project third client
8,8.0,0.941,"part, job, worker, co, enjoyable, hardest, work, management, typical, learned",enjoyable part job typical work learned management co worker hardest part job enjoyable part job
9,9.0,0.9567,"work, good, life, balance, salary, company, management, benefit, job, culture",good work culture healthy work environment work life balance priority descent salary benefit goo...
