### DATA CLEANING AND PREPARATION

In [6]:
#getting libraries
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [7]:
dfm = pd.read_csv('glassdoor_reviews.csv', encoding='cp1252')

In [8]:
dfm = dfm.drop_duplicates()

In [9]:
dfm_sub = dfm[['Negative_Review']]

In [10]:
dfm_sub.head()

Unnamed: 0,Negative_Review
0,nothing to say about for cons
1,compensation is not on par with best
2,does not penalize on poor performance
3,being micro managed in the team
4,rating system if not up to the mark even if y...


In [11]:
dfm_sub.rename(columns={'Negative_Review':'ReviewText',
                          }, 
                 inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [12]:
df = dfm_sub


In [13]:
df.head()

Unnamed: 0,ReviewText
0,nothing to say about for cons
1,compensation is not on par with best
2,does not penalize on poor performance
3,being micro managed in the team
4,rating system if not up to the mark even if y...


In [14]:
df['ReviewTextLower'] = df.ReviewText
df['ReviewTextLower'] = df.ReviewTextLower.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


#### Preparing data for modelling using count vectorizer and tf-idf

In [15]:
count_vectorizer = CountVectorizer(ngram_range=(1,2),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2,3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)

cv_data = count_vectorizer.fit_transform(df.ReviewTextLower)
tfidf_data = tfidf_vectorizer.fit_transform(df.ReviewTextLower)

In [16]:
# def functions for topic modelings
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
def display_topics2(model, feature_names, no_top_words=10, topic_names = None):
    for index, topic in enumerate(model.components_):
        if not topic_names or not topic_names[index]:
            print(f"\nTopic {index}")
        else:
            print(f"\nTopic {topic_names[index]}:")
        msg = ", ".join([f'{feature_names[i]} ({topic[i]:6.4f})' 
                             for i in topic.argsort()[:-no_top_words-1:-1]])
        print(msg)

### Fitting the models to the data

Here we are fitting LSA and NMF Models of Topic Modelling on data transformed via count vectorizer and TFIDF Methods

In [17]:
n_comp = 5
lsa_tfidf = TruncatedSVD(n_components=n_comp)
lsa_cv = TruncatedSVD(n_components=n_comp)
nmf_tfidf = NMF(n_components=n_comp)
nmf_cv = NMF(n_components=n_comp)

lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf_data)
lsa_cv_data = lsa_cv.fit_transform(cv_data)
nmf_tfidf_data = nmf_tfidf.fit_transform(tfidf_data)
nmf_cv_data = nmf_cv.fit_transform(cv_data)

### LSA Model with TFIDF transformation

In [20]:
display_topics2(lsa_tfidf, tfidf_vectorizer.get_feature_names(),2)


Topic 0
work life (0.5783), life balance (0.5655)

Topic 1
long hours (0.9848), hard work (0.0436)

Topic 2
poor management (0.9459), rude customers (0.1962)

Topic 3
rude customers (0.8394), bad management (0.4313)

Topic 4
bad management (0.8746), low pay (0.0536)


### LSA Model with Count Vectorization

In [24]:
display_topics2(lsa_cv, count_vectorizer.get_feature_names(),5)


Topic 0
good (0.7220), work (0.3788), pay (0.3082), great (0.1842), benefits (0.1599)

Topic 1
work (0.6940), great (0.3475), people (0.1522), easy (0.0877), place (0.0783)

Topic 2
great (0.5364), pay (0.5158), hours (0.1861), benefits (0.1708), decent (0.1457)

Topic 3
pay (0.4960), work (0.2595), decent (0.2242), easy (0.1577), hours (0.1557)

Topic 4
hours (0.6161), flexible (0.4482), flexible hours (0.2208), schedule (0.1557), job (0.1439)


### NMF Model with TFIDF transformation

In [25]:
display_topics2(nmf_tfidf, tfidf_vectorizer.get_feature_names(),8)


Topic 0
good pay (2.8019), good pay good (0.3027), pay benefits (0.1538), good pay nice (0.1524), pay nice (0.1475), pay great (0.1447), good pay benefits (0.1369), hours good (0.1360)

Topic 1
place work (1.9599), good place (1.2870), good place work (1.1451), great place (0.5862), great place work (0.5607), best place work (0.0858), best place (0.0844), fun place work (0.0784)

Topic 2
work life (1.3582), life balance (1.2827), work life balance (1.2799), good work (0.6877), good work life (0.4287), great work (0.2215), life balance good (0.2010), balance good (0.2010)

Topic 3
people work (1.9985), great people (1.0546), great people work (0.7966), good people (0.4110), good people work (0.3006), nice people (0.2412), nice people work (0.1949), flexible hours (0.1022)

Topic 4
pay good (1.8862), good benefits (0.7328), good hours (0.5624), good pay good (0.5617), decent pay (0.3706), pay good hours (0.3705), flexible schedule (0.2191), decent pay good (0.1952)


### NMF Model with count vectorization

In [26]:
display_topics2(nmf_cv, count_vectorizer.get_feature_names(),5)


Topic 0
good (8.5611), benefits (0.9793), good pay (0.8792), good benefits (0.7142), pay good (0.4984)

Topic 1
work (6.2287), people (0.7048), life (0.6886), easy (0.6580), place (0.6490)

Topic 2
pay (6.0159), decent (1.5036), good pay (1.1819), decent pay (0.8719), benefits (0.7591)

Topic 3
great (5.5202), people (1.2895), benefits (0.9453), great people (0.5838), great benefits (0.5228)

Topic 4
hours (4.0351), flexible (2.7681), flexible hours (1.3047), schedule (1.0462), job (0.7729)
