In [67]:
import numpy as np
import pandas as pd
import os
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Gensim reports progress through the standard logging module (optional).
# With the threshold at ERROR, only error-level records and above are emitted.
import logging
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Silence DeprecationWarning messages for the rest of the session.
import warnings
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
# NLTK's English stop-word list plus a few extra tokens to drop.
# NOTE(review): 'from', 'subject', 're', 'edu' look like e-mail/newsgroup
# artefacts rather than review vocabulary -- confirm they are wanted here.
from nltk.corpus import stopwords
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

import re


In [68]:
# Load the employee reviews; the path is relative to the notebook's working
# directory.
rev_data = pd.read_csv('employee_reviews.csv')

# Names of the columns at positions 10-14.
# NOTE(review): selected by position, so this silently changes if the CSV's
# column order changes -- presumably these are rating sub-categories; confirm.
topics = rev_data.columns[10:15].tolist()

# Preview the first rows. A notebook cell only displays its final expression,
# so a single call at the end is sufficient.
rev_data.head()

Unnamed: 0.1,Unnamed: 0,company,location,dates,job-title,summary,pros,cons,advice-to-mgmt,overall ratings,work balance,culture values,career opportunities,company benefits,senior management,helpful-count,link
0,1,google,none,"Dec 11, 2018",Current Employee - Anonymous Employee,Best Company to work for,People are smart and friendly,Bureaucracy is slowing things down,none,5,4,5,5,4,5,0,https://www.glassdoor.com/Reviews/Google-Revie...
1,2,google,"Mountain View, CA","Jun 21, 2013",Former Employee - Program Manager,"Moving at the speed of light, burn out is inev...","1) Food, food, food. 15+ cafes on main campus ...",1) Work/life balance. What balance? All those ...,1) Don't dismiss emotional intelligence and ad...,4,2,3,3,5,3,2094,https://www.glassdoor.com/Reviews/Google-Revie...
2,3,google,"New York, NY","May 10, 2014",Current Employee - Software Engineer III,Great balance between big-company security and...,"* If you're a software engineer, you're among ...","* It *is* becoming larger, and with it comes g...",Keep the focus on the user. Everything else wi...,5,5,4,5,5,4,949,https://www.glassdoor.com/Reviews/Google-Revie...
3,4,google,"Mountain View, CA","Feb 8, 2015",Current Employee - Anonymous Employee,The best place I've worked and also the most d...,You can't find a more well-regarded company th...,I live in SF so the commute can take between 1...,Keep on NOT micromanaging - that is a huge ben...,5,2,5,5,4,5,498,https://www.glassdoor.com/Reviews/Google-Revie...
4,5,google,"Los Angeles, CA","Jul 19, 2018",Former Employee - Software Engineer,"Unique, one of a kind dream job",Google is a world of its own. At every other c...,"If you don't work in MTV (HQ), you will be giv...",Promote managers into management for their man...,5,5,5,5,5,5,49,https://www.glassdoor.com/Reviews/Google-Revie...


In [69]:
# Concatenate the free-text fields of each review into one space-separated
# string, coercing every value with str() first.
# NOTE(review): company name and job title are part of the text, so they will
# surface as tokens downstream -- confirm that is intended.
text_columns = ['company', 'job-title', 'summary', 'pros', 'cons', 'advice-to-mgmt']
combined_text = rev_data[text_columns[0]].map(str)
for column_name in text_columns[1:]:
    combined_text = combined_text + ' ' + rev_data[column_name].map(str)
rev_data['Text'] = combined_text



In [70]:
# Pull the combined review text out of the frame as a plain Python list.
data = rev_data['Text'].tolist()

In [71]:
# Normalise the raw review text in a single pass per document.
# The patterns are raw strings so that regex escapes such as \s are not parsed
# as (invalid) Python string escapes.
#   1. Collapse newlines, tabs and repeated spaces into one space.
#   2. Delete single quotes and hyphens outright; note this fuses the halves of
#      a hyphenated word ("big-company" -> "bigcompany").
data = [re.sub(r"['\-]", "", re.sub(r"\s+", " ", sent)) for sent in data]


In [72]:
def sent_to_words(sentences):
    """Lazily tokenise each document in ``sentences``.

    Every item is coerced with ``str()`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True``; one token list is yielded per
    input item, in input order.
    """
    for raw_document in sentences:
        document_text = str(raw_document)
        # deacc=True asks gensim to strip accent marks from the tokens.
        tokens = gensim.utils.simple_preprocess(document_text, deacc=True)
        yield tokens
        
        
# Run the tokeniser over the whole corpus and keep the result as a list
# (one token list per review) so it can be traversed more than once.
data_words = [tokens for tokens in sent_to_words(data)]

# Show the first tokenised review as a sanity check.
print(data_words[:1])

[['google', 'current', 'employee', 'anonymous', 'employee', 'best', 'company', 'to', 'work', 'for', 'people', 'are', 'smart', 'and', 'friendly', 'bureaucracy', 'is', 'slowing', 'things', 'down', 'none']]


In [73]:
# Fit the phrase detectors: the first on the raw tokens, the second on the
# bigram-transformed corpus. A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(data_words, threshold=100, min_count=5)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Wrap each fitted model in a Phraser, a faster way to apply it to documents.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: apply the bigram model, then the trigram model, to the first review.
first_doc_bigrams = bigram_mod[data_words[0]]
print(trigram_mod[first_doc_bigrams])

['google', 'current', 'employee', 'anonymous', 'employee', 'best', 'company', 'to', 'work', 'for', 'people', 'are', 'smart', 'and', 'friendly', 'bureaucracy', 'is', 'slowing', 'things', 'down', 'none']


In [74]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is rendered with ``str()`` and re-tokenised with
    ``simple_preprocess`` before filtering, so the result is always a list of
    token lists -- one per input document, in input order.

    Uses the module-level ``stop_words`` collection.
    """
    # Build the lookup set once per call: set membership is O(1), whereas
    # scanning the stop-word list costs O(len(stop_words)) for every token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Return each document in ``texts`` after applying ``bigram_mod`` to it."""
    merged_docs = []
    for doc in texts:
        merged_docs.append(bigram_mod[doc])
    return merged_docs

def make_trigrams(texts):
    """Return each document after applying ``bigram_mod`` and then ``trigram_mod``."""
    merged_docs = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    POS tag reference: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of list of str
        Tokenised documents; each one is joined with spaces before parsing.
    allowed_postags : collection of str
        Values of ``token.pos_`` to keep. The default is an immutable tuple so
        it cannot be altered between calls.

    Returns
    -------
    list of list of str
        ``token.lemma_`` of every kept token, one list per input document.

    Relies on the module-level spaCy pipeline ``nlp``, which must be loaded
    before this function is called.
    """
    # Feed the documents through nlp.pipe so spaCy can process them in batches
    # instead of one pipeline call per document.
    joined_docs = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [75]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the small English spaCy pipeline with the parser and NER components
# disabled (for efficiency).
# The full package name is used because the 'en' shortcut link no longer exists
# in spaCy 3. Install with: python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['google', 'current', 'employee', 'anonymous', 'employee', 'good', 'company', 'work', 'people', 'smart', 'friendly', 'bureaucracy', 'slow', 'thing', 'none']]


In [76]:
# Build the gensim Dictionary from the lemmatised documents.
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatised documents serve as the corpus texts.
texts = data_lemmatized

# Bag-of-words encoding: each document becomes a list of (token_id, count) pairs.
corpus = list(map(id2word.doc2bow, texts))

# Inspect the first encoded document.
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)]]


In [77]:
# Fit a 6-topic LDA model on the bag-of-words corpus.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=6,
    passes=10,
    chunksize=100,
    update_every=1,
    alpha='auto',
    per_word_topics=True,
    random_state=100,  # fixed seed
)


In [79]:
# Print the weighted keyword summary of each topic.
topic_summaries = lda_model.print_topics()
print(topic_summaries)

# Apply the trained model to the corpus.
doc_lda = lda_model[corpus]

[(0, '0.039*"manager" + 0.035*"company" + 0.025*"team" + 0.020*"product" + 0.018*"many" + 0.017*"career" + 0.016*"people" + 0.015*"change" + 0.014*"senior" + 0.014*"opportunity"'), (1, '0.242*"microsoft" + 0.050*"management" + 0.028*"need" + 0.025*"review" + 0.023*"new" + 0.018*"system" + 0.017*"high" + 0.016*"support" + 0.012*"always" + 0.012*"performance"'), (2, '0.054*"people" + 0.044*"make" + 0.026*"really" + 0.025*"take" + 0.025*"thing" + 0.022*"well" + 0.018*"way" + 0.016*"much" + 0.016*"want" + 0.015*"also"'), (3, '0.078*"get" + 0.053*"not" + 0.045*"do" + 0.031*"time" + 0.027*"year" + 0.021*"go" + 0.021*"be" + 0.021*"job" + 0.015*"bad" + 0.014*"even"'), (4, '0.122*"employee" + 0.090*"work" + 0.066*"good" + 0.052*"great" + 0.049*"current" + 0.032*"none" + 0.030*"anonymous" + 0.026*"former" + 0.024*"company" + 0.022*"place"'), (5, '0.101*"customer" + 0.060*"service" + 0.052*"training" + 0.036*"meeting" + 0.030*"global" + 0.029*"home" + 0.029*"late" + 0.023*"test" + 0.015*"enjoy" +

In [80]:
# Value returned by log_perplexity over the training corpus.
# NOTE(review): gensim documents log_perplexity as returning the per-word
# likelihood bound (perplexity = 2^(-bound)), so the figure printed here is
# that bound rather than the perplexity itself -- confirm the label.
corpus_bound = lda_model.log_perplexity(corpus)
print('\nPerplexity: ', corpus_bound)

# Topic coherence ('c_v' measure) computed against the lemmatised texts.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.1623098284682225

Coherence Score:  0.37998780959140355


In [81]:
# Switch pyLDAvis to notebook rendering, then build the visualisation data
# from the model, corpus and dictionary and display it.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
