### DATA CLEANING AND PREPARATION

In [1]:
#getting libraries
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [2]:
dfm = pd.read_csv('glassdoor_reviews.csv', encoding='cp1252')

In [3]:
dfm = dfm.drop_duplicates()

In [4]:
dfm_sub = dfm[['Positive_Review']]

In [5]:
dfm_sub.head()

Unnamed: 0,Positive_Review
0,good work life balance and nice salary
1,love the work love the sense of involvement k...
2,if you want to work you will find the right se...
3,good benefits being full time employee
4,before wfh office facilities and perks like f...


In [6]:
dfm_sub.rename(columns={'Positive_Review':'ReviewText',
                          }, 
                 inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [7]:
df = dfm_sub


In [8]:
df.head()

Unnamed: 0,ReviewText
0,good work life balance and nice salary
1,love the work love the sense of involvement k...
2,if you want to work you will find the right se...
3,good benefits being full time employee
4,before wfh office facilities and perks like f...


In [9]:
df['ReviewTextLower'] = df.ReviewText
df['ReviewTextLower'] = df.ReviewTextLower.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


#### Preparing data for modelling using count vectorizer and tf-idf

In [10]:
count_vectorizer = CountVectorizer(ngram_range=(1,2),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2,3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6, max_features=4000)

cv_data = count_vectorizer.fit_transform(df.ReviewTextLower)
tfidf_data = tfidf_vectorizer.fit_transform(df.ReviewTextLower)

In [11]:
# def functions for topic modelings
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
def display_topics2(model, feature_names, no_top_words=10, topic_names = None):
    for index, topic in enumerate(model.components_):
        if not topic_names or not topic_names[index]:
            print(f"\nTopic {index}")
        else:
            print(f"\nTopic {topic_names[index]}:")
        msg = ", ".join([f'{feature_names[i]} ({topic[i]:6.4f})' 
                             for i in topic.argsort()[:-no_top_words-1:-1]])
        print(msg)

### Fitting the models to the data

Here we are fitting LSA and NMF Models of Topic Modelling on data transformed via count vectorizer and TFIDF Methods

In [12]:
n_comp = 5
lsa_tfidf = TruncatedSVD(n_components=n_comp)
lsa_cv = TruncatedSVD(n_components=n_comp)
nmf_tfidf = NMF(n_components=n_comp)
nmf_cv = NMF(n_components=n_comp)

lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf_data)
lsa_cv_data = lsa_cv.fit_transform(cv_data)
nmf_tfidf_data = nmf_tfidf.fit_transform(tfidf_data)
nmf_cv_data = nmf_cv.fit_transform(cv_data)

### LSA Model with TFIDF transformation

In [13]:
display_topics2(lsa_tfidf, tfidf_vectorizer.get_feature_names(),2)


Topic 0
good pay (0.8316), pay good (0.3589)

Topic 1
place work (0.6609), good place (0.4373)

Topic 2
work life (0.5122), life balance (0.4837)

Topic 3
people work (0.7920), great people (0.4245)

Topic 4
pay good (0.6583), good benefits (0.3011)


### LSA Model with Count Vectorization

In [14]:
display_topics2(lsa_cv, count_vectorizer.get_feature_names(),5)


Topic 0
good (0.7220), work (0.3788), pay (0.3082), great (0.1842), benefits (0.1599)

Topic 1
work (0.6940), great (0.3475), people (0.1522), easy (0.0877), place (0.0784)

Topic 2
great (0.5364), pay (0.5157), hours (0.1861), benefits (0.1708), decent (0.1461)

Topic 3
pay (0.4957), work (0.2595), decent (0.2253), easy (0.1593), hours (0.1565)

Topic 4
hours (0.6158), flexible (0.4504), flexible hours (0.2214), schedule (0.1569), job (0.1457)


### NMF Model with TFIDF transformation

In [15]:
display_topics2(nmf_tfidf, tfidf_vectorizer.get_feature_names(),8)


Topic 0
good pay (2.8022), good pay good (0.3027), pay benefits (0.1538), good pay nice (0.1524), pay nice (0.1475), pay great (0.1447), good pay benefits (0.1369), hours good (0.1360)

Topic 1
place work (1.9602), good place (1.2872), good place work (1.1452), great place (0.5863), great place work (0.5608), best place work (0.0858), best place (0.0844), fun place work (0.0784)

Topic 2
work life (1.3584), life balance (1.2828), work life balance (1.2800), good work (0.6878), good work life (0.4287), great work (0.2215), balance good (0.2011), life balance good (0.2011)

Topic 3
people work (1.9990), great people (1.0549), great people work (0.7968), good people (0.4112), good people work (0.3007), nice people (0.2412), nice people work (0.1949), flexible hours (0.1022)

Topic 4
pay good (1.8865), good benefits (0.7329), good hours (0.5624), good pay good (0.5618), decent pay (0.3706), pay good hours (0.3705), flexible schedule (0.2191), decent pay good (0.1952)


### NMF Model with count vectorization

In [16]:
display_topics2(nmf_cv, count_vectorizer.get_feature_names(),5)


Topic 0
good (8.5612), benefits (0.9793), good pay (0.8792), good benefits (0.7142), pay good (0.4984)

Topic 1
work (6.2286), people (0.7048), life (0.6886), easy (0.6580), place (0.6490)

Topic 2
pay (6.0160), decent (1.5036), good pay (1.1819), decent pay (0.8719), benefits (0.7591)

Topic 3
great (5.5202), people (1.2895), benefits (0.9453), great people (0.5837), great benefits (0.5228)

Topic 4
hours (4.0354), flexible (2.7683), flexible hours (1.3048), schedule (1.0462), job (0.7730)
