In [1]:

from multiprocessing import Pool
from tqdm import tqdm
from spacy.lang.en import English
import gensim.corpora as corpora
from gensim.models import LdaMulticore
import multiprocessing as mp
import json
import re
import numpy as np
import pandas as pd
from pprint import pprint
import pickle
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
from nltk.corpus import wordnet as wn
#nltk.download('stopwords')
#nltk.download('wordnet')
# spacy for lemmatization
import spacy
from spacy.lang.en import English
parser = English()

#Sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
# Plotting tools

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
def tokenize(doc):
    tokens = doc.split(" ")
    tokens = [word for word in tokens if len(word.strip()) > 2]
    return tokens

print("Loading tech corpus")
with open("../data/tech_review_word_corpus.pkl","rb") as f:
    tech_review_corpus = pickle.load(f)
    
reviews = pd.DataFrame(tech_review_corpus).review.tolist()
print("Tokenize the corpus")

with open(( "../data/stop_words.json"), "r") as f:
    stop_words = json.load(f)

vectorizer = CountVectorizer( min_df = 5,max_df=.90, tokenizer=tokenize, stop_words=stop_words, ngram_range=(1, 2))



Loading tech corpus
Tokenize the corpus


In [3]:
len(reviews)

45251

## Sampling

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test= train_test_split(reviews,test_size=0.01, random_state=100)

In [5]:
print(len(X_train))
print(len(X_test))

44798
453


In [6]:
X = vectorizer.fit_transform(X_train)
print("Total Vocab Size", len(vectorizer.vocabulary_))

Total Vocab Size 28806


In [7]:
from gensim.corpora.dictionary import Dictionary
def vect2gensim(vectorizer, dtmatrix):
     # transform sparse matrix into gensim corpus and dictionary
    corpus_vect_gensim = gensim.matutils.Sparse2Corpus(dtmatrix, documents_columns=False)
    dictionary = Dictionary.from_corpus(corpus_vect_gensim,
        id2word=dict((id, word) for word, id in vectorizer.vocabulary_.items()))

    return (corpus_vect_gensim, dictionary)
corpus, id2word = vect2gensim(vectorizer, X)

In [None]:
final_lda_model = LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=13, 
                                           random_state=100,
                                           chunksize=50,
                                           workers = 2,
                                           passes=10,
                                           alpha="asymmetric",
                                           eta=0.91,
                                           per_word_topics=True)

In [None]:
final_lda_model.save('../results/word-gensim-lda.model')

In [None]:
# Select the model and print the topics
optimal_model = final_lda_model
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=30))

In [None]:
lda_display = pyLDAvis.gensim.prepare(optimal_model, corpus, id2word, sort_topics=False)
pyLDAvis.display(lda_display)

In [None]:
aspect = {}
for t in range(0, optimal_model.num_topics):
    aspect['topic {}: '.format(t)] = [v[0] for v in optimal_model.show_topic(t, 50)]

In [None]:
with open("../results/gensim/word-gensim-lda-topic-k-13.json", "w") as f:
                json.dump(aspect, f)

In [45]:
with open("../data/all_reviews.pkl","rb") as f:
    reviews_ = pickle.load(f)
    
job_filter = pd.read_csv("../data/filter_job_titles.csv")

job_filters = job_filter.clean_job_title.tolist()
idx = (reviews_.language == "en")
reviews_ = reviews_.loc[idx,:]

idx = (reviews_.clean_job_title.isin(job_filters))
reviews_ = reviews_.loc[idx,:].reset_index()

print(reviews_.shape)
#display(reviews_.head())


(45251, 16)


In [46]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=reviews):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=final_lda_model, corpus=corpus, texts=reviews)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()

df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

df_dominant_topic["Review"] = reviews_["review"] 
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,Review
0,0,0.0,0.8605,"great, employee, opportunity, work, management, provide, benefit, care, health, organization",fluor be a great company with people that care to make a difference management care about their ...,Fluor is a great company with people that care to make a difference. Management cares about the...
1,1,5.0,0.7994,"work, get, job, people, management, n't, time, manager, go, 's",fluor be the first powerhouse epc that go heavy on the worksharing they send as much work overse...,Fluor was the first powerhouse EPC that went heavy on the worksharing. They send as much work o...
2,2,9.0,0.4359,"work, place, good, great, environment, place work, work environment, great place, people, friendly",there be not much that i could add here i enjoy work at fluor and the people that i be work with...,There is not much that I could add here. I enjoyed working at Fluor and the people that I was w...
3,3,5.0,0.6665,"work, get, job, people, management, n't, time, manager, go, 's",the place can be pretty bore most of the time but it have some very interesting facility where t...,The place can be pretty boring most of the time but it has some very interesting facilities wher...
4,4,5.0,0.6034,"work, get, job, people, management, n't, time, manager, go, 's",everyone be in a hurry to get thing do but they be not in a hurry to do their part to make sure ...,"Everyone is in a hurry to get things done, but they are not in a hurry to do their part to make ..."
5,5,5.0,0.5522,"work, get, job, people, management, n't, time, manager, go, 's",be a re-hire first term would rate them a 4 they hire nice intelligent people reasonable pleasan...,"Was a re-hire. \rFirst term would rate them a 4. They hired nice, intelligent people. Reasona..."
6,6,5.0,0.8384,"work, get, job, people, management, n't, time, manager, go, 's",i hat work for this company the pay be great but the management be utterly horrible people would...,"I hated working for this company. The pay was great, but the management was utterly horrible. Pe..."
7,7,1.0,0.7111,"work, part, job, enjoy, hard, part job, learn, management, help, time",interesting day at work in a enjoy environment user be coddle so they have no understanding of t...,Interesting days at work in a enjoyable environment. User are coddled so they have no understand...
8,8,5.0,0.339,"work, get, job, people, management, n't, time, manager, go, 's",challenge place to work a it manager with five office and over 200 user and me be the sole it su...,Challenging place to work as IT Manager with five offices and over 200 users and me being the so...
9,9,11.0,0.6443,"system, •, development, application, software, use, support, project, work, test",total of thirty-seven 37 year of successful engineering experience in electrical engineering and...,Total of thirty-seven (37) years of successful engineering experiences in electrical engineering...


In [47]:
df_dominant_topic.drop(columns="Text",inplace=True)

In [48]:
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Review
0,0,0.0,0.8605,"great, employee, opportunity, work, management, provide, benefit, care, health, organization",Fluor is a great company with people that care to make a difference. Management cares about the...
1,1,5.0,0.7994,"work, get, job, people, management, n't, time, manager, go, 's",Fluor was the first powerhouse EPC that went heavy on the worksharing. They send as much work o...
2,2,9.0,0.4359,"work, place, good, great, environment, place work, work environment, great place, people, friendly",There is not much that I could add here. I enjoyed working at Fluor and the people that I was w...
3,3,5.0,0.6665,"work, get, job, people, management, n't, time, manager, go, 's",The place can be pretty boring most of the time but it has some very interesting facilities wher...
4,4,5.0,0.6034,"work, get, job, people, management, n't, time, manager, go, 's","Everyone is in a hurry to get things done, but they are not in a hurry to do their part to make ..."
5,5,5.0,0.5522,"work, get, job, people, management, n't, time, manager, go, 's","Was a re-hire. \rFirst term would rate them a 4. They hired nice, intelligent people. Reasona..."
6,6,5.0,0.8384,"work, get, job, people, management, n't, time, manager, go, 's","I hated working for this company. The pay was great, but the management was utterly horrible. Pe..."
7,7,1.0,0.7111,"work, part, job, enjoy, hard, part job, learn, management, help, time",Interesting days at work in a enjoyable environment. User are coddled so they have no understand...
8,8,5.0,0.339,"work, get, job, people, management, n't, time, manager, go, 's",Challenging place to work as IT Manager with five offices and over 200 users and me being the so...
9,9,11.0,0.6443,"system, •, development, application, software, use, support, project, work, test",Total of thirty-seven (37) years of successful engineering experiences in electrical engineering...


In [49]:
with open("tech_review_with_topics.pkl","wb") as f:
    pickle.dump(df_dominant_topic, f)

In [50]:
df_dominant_topic.groupby('Dominant_Topic').size()

Dominant_Topic
0.0     5505
1.0     5189
2.0      368
3.0      125
4.0     7269
5.0     9280
6.0     7633
7.0       50
8.0      166
9.0     6604
10.0      32
11.0    2975
12.0      55
dtype: int64

### The most representative sentence for each topic¶

In [51]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf = pd.concat([sent_topics_sorteddf, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf.head(10)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0.0,0.9815,"great, employee, opportunity, work, management, provide, benefit, care, health, organization",at tech mahindra employee be reward for innovate and encourage to take on leader ship role the f...
1,1.0,0.9829,"work, part, job, enjoy, hard, part job, learn, management, help, time",a very structured day with pending email to be answer and reply to the client with issue and dai...
2,2.0,0.8779,"year, recommend, 3, 2, 5, 1, 4, month, would recommend, late",i have 4 + year of exp in it industry and 2+ year of exp in datapower
3,3.0,0.807,"customer, service, customer service, provider, fortune, table, 500, work customer, help customer...",enjoy the food court varity of selection to eat- gym membership- ping pong
4,4.0,0.9784,"learn, work, good, lot, experience, technology, thing, learn lot, get, opportunity",tc provide awesome opportunity for learn i have learn many thing inside tc tc provide me chance ...
5,5.0,0.9919,"work, get, job, people, management, n't, time, manager, go, 's",so this company be a subcontractor for a large company on this overall contract of every contrac...
6,6.0,0.9753,"good, work, balance, life, work life, life balance, good work, management, culture, benefit",management be not good and work culture also not good management be not good and work culture al...
7,7.0,0.7242,"network, computer, remote, curve, learn curve, switch, networking, shoot, trouble, cisco",work on fotinet firewall cisco asa juniper f5 add and remove firewall configure device such a ro...
8,8.0,0.7242,"skill, home, work home, improve, learn skill, technical, skill set, enhance, set, technical skill",improve my technical stability management and presentation skill
9,9.0,0.9749,"work, place, good, great, environment, place work, work environment, great place, people, friendly",great work place good team work excellent management good work environment excellent work place ...


### Sklearn Topic Modeling

In [52]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV 


In [53]:

lda = LatentDirichletAllocation(
                learning_method="batch",
                random_state=100,
                n_components=7,
                doc_topic_prior=0.01,
                topic_word_prior=0.31,
                n_jobs=-2
            )
lda.fit(X)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=0.01,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=7, n_jobs=-2,
                          perp_tol=0.1, random_state=100, topic_word_prior=0.31,
                          total_samples=1000000.0, verbose=0)

In [54]:
def print_topics(model, vectorizer, top_n=50):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
[('•', 2152.3043183731656), ('development', 1730.0918149780796), ('work', 1473.2847012084758), ('system', 1457.0119540512298), ('use', 1425.1357544326597), ('application', 1391.1172856916562), ('support', 1212.6857484482955), ('software', 1169.3874339943602), ('test', 917.8518795823762), ('issue', 879.8888915237427), ('project', 874.5352259273773), ('server', 776.6539095713795), ('design', 766.4158022574471), ('network', 754.6627604125773), ('user', 744.0942142497814), ('client', 674.6770678451837), ('experience', 650.6240976933344), ('business', 598.0986940791116), ('implement', 588.7212301902206), ('&', 574.7780458279402), ('database', 557.4207393155214), ('management', 556.1743915970684), ('requirement', 556.0817790424478), ('process', 544.420606179033), ('report', 542.8321989617361), ('involve', 524.7963207415515), ('code', 489.04390144066673), ('service', 467.07465160550277), ('create', 463.18197149801205), ('developer', 441.6234514420656), ('manage', 429.87590

[('work', 10392.353177745086), ('great', 5370.733354160755), ('employee', 2715.459069728282), ('benefit', 2467.7090289603602), ('place', 2390.7375114337424), ('management', 2043.3726178089642), ('place work', 1536.6564625462163), ('environment', 1445.1097338282866), ('hour', 1349.2187289460333), ('balance', 1329.1694301086848), ('culture', 1224.420864097882), ('time', 1065.5473205088151), ('pay', 1042.8836855528596), ('great place', 1015.4643868322244), ('well', 1005.0291481525225), ('people', 997.9322165042648), ('life', 904.2763668001636), ("'s", 885.4578887642748), ('flexible', 882.3676588851167), ('good', 870.698314616632), ('great work', 858.3601608209251), ('opportunity', 833.5919583938993), ('care', 764.4202211100444), ('need', 763.6882229722637), ('home', 698.561504992571), ('work environment', 678.2519329026813), ('like', 659.0393635460014), ('make', 649.1010295495588), ('advancement', 595.5452098553545), ('one', 588.122356651096), ('excellent', 587.908569696159), ('lot', 572.

In [56]:
aspect = {}
for idx, topic in enumerate(lda.components_):
    aspect['Aspect {0}'.format(str(idx))] = [vectorizer.get_feature_names()[i]
                                                         for i in topic.argsort()[:-100 - 1:-1]]

with open("../results/Sklearn-LDA/sklearn-word-lda-topic-k-7-a-0.01-b-0.31.json", "w") as f:
    json.dump(aspect, f)

TypeError: list indices must be integers or slices, not str

In [None]:
# Build a Non-Negative Matrix Factorization Model
nmf = NMF(n_components=7
                  ,init='nndsvd'
                 ,random_state=100,
                 alpha=0.61)

nmf.fit(X)


In [None]:
print("NMF Model:")
print_topics(nmf, vectorizer)
print("=" * 20)

In [None]:
spect = {}
for idx, topic in enumerate(nmf.components_):
    aspect['Aspect {0}'.format(str(idx))] = {vectorizer.get_feature_names()[i]
                                                         for i in topic.argsort()[:-100 - 1:-1]}

with open("../results/Sklearn-LDA/sklearn-word-nmf-topic-k-7-a-0.61.json", "w") as f:
    json.dump(aspect, f)