In [1]:
import statsmodels.api as sm
import pandas as pd
import re,string
import nltk
from patsy import dmatrices
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
import matplotlib.pyplot as plt
import warnings
%pylab inline
import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics import euclidean_distances
import spacy

Populating the interactive namespace from numpy and matplotlib


## Starting with IBM

In [2]:
# Load the review table from ibm.csv; the file is decoded as ISO-8859-1 (Latin-1).
df_ibm = pd.read_csv('ibm.csv', encoding="iso-8859-1")

In [3]:
# Preview the first rows of the raw table to see which columns are available.
df_ibm.head()

Unnamed: 0,Title,Date Written,Rating,Current/Former,Job Title,Location,Recommendation?,Outlook,Main Text,Pros,Cons,Advice to management
0,"""Offering Manager""",no-date,5.0,Current Employee,Offering Manager,"Westbury, NY",Recommends,Neutral Outlook,I have been working at IBM full-time (More tha...,IBM is good to their employees,Lacks strategy in certain areas of the business,
1,"""Senior Software Engineer""",no-date,4.0,Former Employee,Anonymous Employee,,,,I worked at IBM full-time,laid back atmosphere with great work life balance,upper management pushes out middle management ...,
2,"""Great Company, excellent management""",no-date,4.0,Former Employee,Anonymous Employee,,Recommends,Positive Outlook,I worked at IBM full-time,Lot of empowerment. I was given really good wo...,"As per the public news, IBM had yearly layoffs...",
3,"""IBM""","Nov 4, 2018",5.0,Former Employee,Various,"Austin, TX",Recommends,Neutral Outlook,I worked at IBM full-time (More than 10 years),"Interesting work, opportunities to work on div...",Not sure if pay is industry-competitive,Review salary structure to ensure industry com...
4,"""Not amenable""",no-date,5.0,Former Employee,Anonymous Employee,,Doesn't Recommend,Neutral Outlook,No opinion of CEO,Tools were always cutting edge,would have been nice to know evaluation criteria,Get to know your employees who actually are in...


In [4]:
# (rows, columns) of the raw table.
df_ibm.shape

(10000, 12)

In [5]:
# Copy the free-text 'Pros' and 'Cons' columns into their own single-column
# frames, each keeping the row index of df_ibm.
pros, cons = (
    pd.DataFrame(df_ibm[column], index=df_ibm.index)
    for column in ('Pros', 'Cons')
)

In [6]:
# Remove rows whose review text is missing; both frames are modified in place.
for review_frame in (pros, cons):
    review_frame.dropna(axis=0, inplace=True)

In [7]:
# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# read once instead of once per review.
_STOPWORD_CACHE = {}


def _stopword_set(language='english'):
    """Return the NLTK stop-word list for `language` as a set, loading it once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = set(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def clean_tokenize(s, stop=None):
    """Lower-case `s`, strip punctuation, tokenize, and drop stop words.

    Parameters
    ----------
    s : str
        Raw text. Must be a string (callers drop missing values first).
    stop : collection of str, optional
        Words to discard. Defaults to the cached NLTK English stop-word set.

    Returns
    -------
    list of str
        Tokens from `word_tokenize` that are neither stop words nor
        punctuation.

    Notes
    -----
    Punctuation is deleted rather than replaced by a space, so hyphenated
    words are fused ('industry-competitive' -> 'industrycompetitive').
    """
    if stop is None:
        stop = _stopword_set('english')
    punc = string.punctuation
    # `\w` keeps underscores, so a stand-alone '_' survives the regex and is
    # removed by the punctuation check below.
    s = re.sub(r'[^\w\s]', '', s.lower())
    return [word for word in word_tokenize(s)
            if word not in stop and word not in punc]
def get_lemma(word):
    """Return the base form `wn.morphy` finds for `word`, or `word` itself
    when `wn.morphy` returns None."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
def get_lemma2(word):
    """Lemmatize `word` with a freshly constructed `WordNetLemmatizer`,
    using the lemmatizer's default arguments."""
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(word)
    return lemma
def lemmatize(my_tokens):
    """Lemmatize an already-tokenized document for LDA topic modeling.

    Each token is passed through `get_lemma`; the result is a new list in the
    same order. Tokenization and stop-word removal are not performed here.
    """
    return list(map(get_lemma, my_tokens))

In [8]:
def _tokenized_review_frame(text_data_df, column, prefix):
    """Build the token frame for one review-text column of `text_data_df`."""
    frame = pd.DataFrame(text_data_df[column], index=text_data_df.index)
    # Work on an independent copy so the column assignments below never touch
    # (or warn about) the caller's frame.
    frame = frame.dropna(axis=0).copy()
    frame[prefix + '_tokens'] = frame[column].map(clean_tokenize)
    frame[prefix + '_lemma_tokens'] = frame[prefix + '_tokens'].apply(lemmatize)
    return frame


def dataprep(text_data_df):
    """Tokenize and lemmatize the 'Pros' and 'Cons' text of a review table.

    Parameters
    ----------
    text_data_df : pandas.DataFrame
        Must contain the columns 'Pros' and 'Cons'. It is not modified.

    Returns
    -------
    (pros, cons) : tuple of pandas.DataFrame
        `pros` has the columns 'Pros', 'pros_tokens', 'pros_lemma_tokens';
        `cons` has 'Cons', 'cons_tokens', 'cons_lemma_tokens'. Rows whose text
        is missing are dropped independently in each frame, so the two frames
        may have different indexes.
    """
    pros = _tokenized_review_frame(text_data_df, 'Pros', 'pros')
    cons = _tokenized_review_frame(text_data_df, 'Cons', 'cons')
    return pros, cons

In [10]:
from gensim import corpora
import pickle

# No earlier cell creates `pros_tokens` (execution jumps from In [8] to
# In [10]), so build it here; otherwise this cell raises KeyError on a fresh
# kernel. The guard keeps the cell idempotent when re-run.
if 'pros_tokens' not in pros.columns:
    pros['pros_tokens'] = pros['Pros'].map(clean_tokenize)

# Map each token to an integer id, then encode every review as a bag of words.
dictionary = corpora.Dictionary(pros['pros_tokens'])
corpus = [dictionary.doc2bow(text) for text in pros['pros_tokens']]

# Persist both artifacts; the `with` block closes the file handle.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [11]:
import gensim

NUM_TOPICS = 5
# LDA training is stochastic; a fixed seed makes the printed topics
# reproducible across Restart & Run All.
RANDOM_SEED = 42

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=15,
    random_state=RANDOM_SEED)
ldamodel.save('model5.gensim')

# Show the four highest-weighted words of each topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, u'0.046*"ibm" + 0.029*"good" + 0.017*"company" + 0.013*"years"')
(1, u'0.073*"people" + 0.034*"great" + 0.032*"work" + 0.024*"smart"')
(2, u'0.130*"work" + 0.091*"good" + 0.044*"benefits" + 0.042*"home"')
(3, u'0.112*"great" + 0.054*"work" + 0.035*"place" + 0.034*"good"')
(4, u'0.023*"company" + 0.022*"opportunities" + 0.020*"technology" + 0.019*"new"')


#### TM (topic modeling) for pros and cons

In [12]:
# Tokenize and lemmatize the 'Pros' and 'Cons' text; each returned frame has
# its own token and lemma-token columns.
pros_ibm, cons_ibm = dataprep(df_ibm)

In [13]:
# Compare raw text, tokens, and lemmatized tokens for the first pros.
pros_ibm.head()

Unnamed: 0,Pros,pros_tokens,pros_lemma_tokens
0,IBM is good to their employees,"[ibm, good, employees]","[ibm, good, employee]"
1,laid back atmosphere with great work life balance,"[laid, back, atmosphere, great, work, life, ba...","[lay, back, atmosphere, great, work, life, bal..."
2,Lot of empowerment. I was given really good wo...,"[lot, empowerment, given, really, good, work, ...","[lot, empowerment, given, really, good, work, ..."
3,"Interesting work, opportunities to work on div...","[interesting, work, opportunities, work, diver...","[interest, work, opportunity, work, diverse, p..."
4,Tools were always cutting edge,"[tools, always, cutting, edge]","[tool, always, cutting, edge]"


In [14]:
# Compare raw text, tokens, and lemmatized tokens for the first cons.
cons_ibm.head()

Unnamed: 0,Cons,cons_tokens,cons_lemma_tokens
0,Lacks strategy in certain areas of the business,"[lacks, strategy, certain, areas, business]","[lack, strategy, certain, area, business]"
1,upper management pushes out middle management ...,"[upper, management, pushes, middle, management...","[upper, management, push, middle, management, ..."
2,"As per the public news, IBM had yearly layoffs...","[per, public, news, ibm, yearly, layoffs, bit,...","[per, public, news, ibm, yearly, layoff, bit, ..."
3,Not sure if pay is industry-competitive,"[sure, pay, industrycompetitive]","[sure, pay, industrycompetitive]"
4,would have been nice to know evaluation criteria,"[would, nice, know, evaluation, criteria]","[would, nice, know, evaluation, criterion]"


### Defining generalized functions for TM

In [15]:
def _fit_topics(token_lists, corpus_path, num_topics, num_words, random_state):
    """Fit an LDA model on `token_lists`, pickle the bag-of-words corpus to
    `corpus_path`, and return the model's printed topics."""
    dictionary = corpora.Dictionary(token_lists)
    corpus = [dictionary.doc2bow(text) for text in token_lists]
    # The `with` block closes the file handle instead of leaking it.
    with open(corpus_path, 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus, num_topics=num_topics, id2word=dictionary, passes=15,
        random_state=random_state)
    # Ask for every topic explicitly; print_topics otherwise caps the count.
    return ldamodel.print_topics(num_topics=num_topics, num_words=num_words)


def topics_reveal(pros_df, cons_df, num_topics = 5, num_words = 4, random_state=None):
    """Fit one LDA model on the pros and one on the cons, printing the topics.

    Parameters
    ----------
    pros_df : mapping/DataFrame with a 'pros_lemma_tokens' column of token lists.
    cons_df : mapping/DataFrame with a 'cons_lemma_tokens' column of token lists.
    num_topics : int
        Number of topics per model.
    num_words : int
        Number of top words shown per topic (applied to both pros and cons).
    random_state : optional
        Forwarded to `LdaModel`; pass a fixed value for reproducible topics.

    Returns
    -------
    (topics_pros, topics_cons)
        The `print_topics` results of the two models.

    Side effects
    ------------
    Writes 'corpus_pros.pkl' and 'corpus_cons.pkl' to the working directory
    and prints the topics.
    """
    topics_pros = _fit_topics(pros_df['pros_lemma_tokens'], 'corpus_pros.pkl',
                              num_topics, num_words, random_state)
    print(" For Pros, %d topics are defined as follows: " % num_topics)
    for topic in topics_pros:
        print(topic)

    topics_cons = _fit_topics(cons_df['cons_lemma_tokens'], 'corpus_cons.pkl',
                              num_topics, num_words, random_state)
    print(" For Cons, %d topics are defined as follows: " % num_topics)
    for topic in topics_cons:
        print(topic)
    return topics_pros, topics_cons

In [16]:
# Fit and print the default 5 topics (4 words each) for pros and cons;
# `t` holds the (topics_pros, topics_cons) tuple.
t = topics_reveal(pros_ibm, cons_ibm)

 For Pros, 5 topics are defined as follows: 
(0, u'0.097*"company" + 0.045*"opportunity" + 0.030*"great" + 0.024*"learn"')
(1, u'0.136*"work" + 0.096*"good" + 0.053*"great" + 0.051*"benefit"')
(2, u'0.033*"opportunity" + 0.025*"training" + 0.019*"great" + 0.017*"experience"')
(3, u'0.112*"people" + 0.042*"smart" + 0.039*"great" + 0.030*"technology"')
(4, u'0.050*"ibm" + 0.033*"work" + 0.017*"get" + 0.012*"best"')
 For Cons, 5 topics are defined as follows: 
(0, u'0.028*"management" + 0.025*"salary" + 0.022*"poor" + 0.018*"low"')
(1, u'0.022*"company" + 0.019*"pay" + 0.019*"people" + 0.018*"ibm"')
(2, u'0.029*"ibm" + 0.029*"company" + 0.019*"job" + 0.015*"get"')
(3, u'0.027*"management" + 0.025*"change" + 0.024*"company" + 0.020*"process"')
(4, u'0.042*"work" + 0.023*"employee" + 0.018*"long" + 0.016*"hours"')


### Pros: Reviewers talk about great training and opportunities, and say that IBM leverages smart people and advanced technology.
### Cons: Some reviewers complain about salary and about issues with management processes.