In [6]:
# NLTK
import nltk
from nltk import tokenize, word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.text import Text
from nltk.corpus import brown, stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.classify import NaiveBayesClassifier

# TQDM
from tqdm import tqdm_notebook as tqdm
from tqdm import trange
from distutils.command.build import build

# General Libraries
import requests
import re
import pandas as pd
import spacy
import pymysql
import string
import numpy as np
from sqlalchemy import create_engine

# Sentiment Analyser
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [7]:
host='database-1.cg01lulzqvm7.ap-southeast-1.rds.amazonaws.com'
port=int(3306)
user='zhenyuen'
passw = 'zhenyuen488'
database = 'ANL488_database_2'

In [8]:
connection = pymysql.connect(host = f'{host}',  
                             user = f'{user}', 
                             password = f'{passw}',
                             database = f'{database}')

In [9]:
bytedance_glassdoor = pd.read_sql_query('''select * from Bytedance_Glassdoor_Reviews''', connection)
bytedance_glassdoor.head()



Unnamed: 0,date,year exp,score,pros,cons,position
0,2022-08-20,"Current Employee, less than 1 year",5.0,"flexible working culture, good benefits",limited learning opportunities and unclear car...,Anonymous Employee
1,2022-08-17,"Former Employee, less than 1 year",5.0,"all people are sooo nice in our team, enjoy wo...",so many meetings in our team,Data Analyst
2,2022-08-10,"Former Employee, more than 1 year",5.0,"Good culture, great team and good benefits!","No idea when the company will IPO thus, do not...",Manager
3,2022-08-22,"Former Employee, less than 1 year",3.0,"- very fast-paced fintech company, - generous ...","- zero work-life balance, too many unnecessary...",Global SOP Enablement
4,2022-08-18,"Current Employee, more than 1 year",4.0,"Good boss, Modern tech tools, Free lunch","Irregular working hours, Excessive workload , ...",Data Engineer


### Sentiment Analysis

In [10]:
# =================================
# Preprocessing
# =================================
bytedance_glassdoor['reviews'] = bytedance_glassdoor['pros'] + bytedance_glassdoor['cons']

# Convert to lowercase
bytedance_glassdoor['reviews'] = bytedance_glassdoor['reviews'].str.lower()

# Remove Stop Words
stop = stopwords.words('english')
bytedance_glassdoor['reviews'] = bytedance_glassdoor['reviews'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Remove punctuations
bytedance_glassdoor["reviews"] = bytedance_glassdoor['reviews'].str.replace('[^\w\s]','')

  bytedance_glassdoor["reviews"] = bytedance_glassdoor['reviews'].str.replace('[^\w\s]','')


In [11]:
# Lemmentisation
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

bytedance_glassdoor['reviews'] = bytedance_glassdoor['reviews'].apply(lemmatize_text)
bytedance_glassdoor['reviews'] = bytedance_glassdoor['reviews'].apply(', '.join)

bytedance_glassdoor.head()

Unnamed: 0,date,year exp,score,pros,cons,position,reviews
0,2022-08-20,"Current Employee, less than 1 year",5.0,"flexible working culture, good benefits",limited learning opportunities and unclear car...,Anonymous Employee,"flexible, working, culture, good, benefitslimi..."
1,2022-08-17,"Former Employee, less than 1 year",5.0,"all people are sooo nice in our team, enjoy wo...",so many meetings in our team,Data Analyst,"people, sooo, nice, team, enjoy, working, ther..."
2,2022-08-10,"Former Employee, more than 1 year",5.0,"Good culture, great team and good benefits!","No idea when the company will IPO thus, do not...",Manager,"good, culture, great, team, good, benefitsno, ..."
3,2022-08-22,"Former Employee, less than 1 year",3.0,"- very fast-paced fintech company, - generous ...","- zero work-life balance, too many unnecessary...",Global SOP Enablement,"fastpaced, fintech, company, generous, benefit..."
4,2022-08-18,"Current Employee, more than 1 year",4.0,"Good boss, Modern tech tools, Free lunch","Irregular working hours, Excessive workload , ...",Data Engineer,"good, bos, modern, tech, tool, free, lunchirre..."


In [12]:
analyzer = SentimentIntensityAnalyzer()

sentiments_list = list()

for review in bytedance_glassdoor['reviews'].tolist():
    sentence_list = tokenize.sent_tokenize(review)
    sentiments = {'compound': 0.0, 'neg': 0.0, 'neu': 0.0, 'pos': 0.0}
        
    for sentence in sentence_list:
        vs = analyzer.polarity_scores(sentence)
        sentiments['compound'] += vs['compound']
        sentiments['neg'] += vs['neg']
        sentiments['neu'] += vs['neu']
        sentiments['pos'] += vs['pos']
            
    sentiments['compound'] = sentiments['compound'] / len(sentence_list)
    sentiments['neg'] = sentiments['neg'] / len(sentence_list)
    sentiments['neu'] = sentiments['neu'] / len(sentence_list)
    sentiments['pos'] = sentiments['pos'] / len(sentence_list)
    
    sentiments_list.append(sentiments)  # add this line

In [13]:
bytedance_glassdoor = bytedance_glassdoor.join(pd.DataFrame(sentiments_list))
bytedance_glassdoor.head()

Unnamed: 0,date,year exp,score,pros,cons,position,reviews,compound,neg,neu,pos
0,2022-08-20,"Current Employee, less than 1 year",5.0,"flexible working culture, good benefits",limited learning opportunities and unclear car...,Anonymous Employee,"flexible, working, culture, good, benefitslimi...",0.6808,0.128,0.385,0.487
1,2022-08-17,"Former Employee, less than 1 year",5.0,"all people are sooo nice in our team, enjoy wo...",so many meetings in our team,Data Analyst,"people, sooo, nice, team, enjoy, working, ther...",0.7184,0.0,0.571,0.429
2,2022-08-10,"Former Employee, more than 1 year",5.0,"Good culture, great team and good benefits!","No idea when the company will IPO thus, do not...",Manager,"good, culture, great, team, good, benefitsno, ...",0.872,0.0,0.548,0.452
3,2022-08-22,"Former Employee, less than 1 year",3.0,"- very fast-paced fintech company, - generous ...","- zero work-life balance, too many unnecessary...",Global SOP Enablement,"fastpaced, fintech, company, generous, benefit...",0.9383,0.0,0.786,0.214
4,2022-08-18,"Current Employee, more than 1 year",4.0,"Good boss, Modern tech tools, Free lunch","Irregular working hours, Excessive workload , ...",Data Engineer,"good, bos, modern, tech, tool, free, lunchirre...",0.4767,0.153,0.542,0.305


### Emotion Lexicon

In [14]:
emotion_lexicon_df = pd.read_sql_query('''select * from NRC_Emotion_Lexicon''', connection)
emotion_lexicon_df.head()



Unnamed: 0,Word,Positive,Negative,Anger,Anticipation,Disgust,Fear,Joy,Sadness,Surprise,Trust
0,aback,0,0,0,0,0,0,0,0,0,0
1,abacus,0,0,0,0,0,0,0,0,0,1
2,abandon,0,1,0,0,0,1,0,1,0,0
3,abandoned,0,1,1,0,0,1,0,1,0,0
4,abandonment,0,1,1,0,0,1,0,1,1,0


In [15]:
emotion_lexicon_df.columns

Index(['Word', 'Positive', 'Negative', 'Anger', 'Anticipation', 'Disgust',
       'Fear', 'Joy', 'Sadness', 'Surprise', 'Trust'],
      dtype='object')

In [16]:
emotions_df = pd.DataFrame(0, index=bytedance_glassdoor.index, columns=emotion_lexicon_df.columns)
emotions_df

Unnamed: 0,Word,Positive,Negative,Anger,Anticipation,Disgust,Fear,Joy,Sadness,Surprise,Trust
0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
965,0,0,0,0,0,0,0,0,0,0,0
966,0,0,0,0,0,0,0,0,0,0,0
967,0,0,0,0,0,0,0,0,0,0,0
968,0,0,0,0,0,0,0,0,0,0,0


In [17]:
stemmer = SnowballStemmer('english')

# TQDM as progress bar
with tqdm(total=len(list(bytedance_glassdoor.iterrows()))) as pbar:
    
    # Iterate over reviews
    for index, row in bytedance_glassdoor.iterrows():
        # Update progress bar
        pbar.update(1)
        
        # Tokenise reviews
        document = word_tokenize(bytedance_glassdoor.loc[index]['reviews'])
        
        # Iterate over words in reviews
        for word in document:
            
            # Stem and convert to lower
            word = stemmer.stem(word.lower())
            
            # Match emotion score with NRC emotions database
            emo_score = emotion_lexicon_df[emotion_lexicon_df['Word'] == word]
            if not emo_score.empty:
                for emotion in list(emotion_lexicon_df.columns.drop("Word")):
                    
                    # Append emotions score
                    emotions_df.at[index, emotion] += emo_score[emotion]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  with tqdm(total=len(list(bytedance_glassdoor.iterrows()))) as pbar:


  0%|          | 0/970 [00:00<?, ?it/s]

In [18]:
exclusive_list = ['Anger', 'Fear', 'Disgust']
openness_list = ['Joy', 'Trust', 'Anticipation']

emotions_df['is_exclusive'] = emotions_df[exclusive_list].sum(axis = 1)
emotions_df['is_open'] = emotions_df[openness_list].sum(axis = 1)

In [19]:
emotions_df['exclusive_openness'] = emotions_df.apply(lambda x: 'exclusive' if (x['is_exclusive'] > x['is_open']) else 'open', axis = 1)
emotions_df

Unnamed: 0,Word,Positive,Negative,Anger,Anticipation,Disgust,Fear,Joy,Sadness,Surprise,Trust,is_exclusive,is_open,exclusive_openness
0,0,4,0,0,3,0,0,2,0,1,1,0,6,open
1,0,1,0,0,1,0,0,1,0,0,3,0,5,open
2,0,4,0,1,3,0,1,3,0,2,4,2,10,open
3,0,7,0,0,5,0,0,3,1,0,3,0,11,open
4,0,1,1,0,1,0,0,1,0,1,1,0,3,open
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965,0,0,0,0,0,0,0,0,0,0,0,0,0,open
966,0,1,0,0,0,0,0,0,0,0,0,0,0,open
967,0,2,0,0,1,0,0,2,0,0,2,0,5,open
968,0,5,0,0,2,0,0,2,0,1,2,0,6,open


In [20]:
bytedance_glassdoor = pd.concat([bytedance_glassdoor, emotions_df], axis = 1)

In [21]:
bytedance_glassdoor.head()

Unnamed: 0,date,year exp,score,pros,cons,position,reviews,compound,neg,neu,...,Anticipation,Disgust,Fear,Joy,Sadness,Surprise,Trust,is_exclusive,is_open,exclusive_openness
0,2022-08-20,"Current Employee, less than 1 year",5.0,"flexible working culture, good benefits",limited learning opportunities and unclear car...,Anonymous Employee,"flexible, working, culture, good, benefitslimi...",0.6808,0.128,0.385,...,3,0,0,2,0,1,1,0,6,open
1,2022-08-17,"Former Employee, less than 1 year",5.0,"all people are sooo nice in our team, enjoy wo...",so many meetings in our team,Data Analyst,"people, sooo, nice, team, enjoy, working, ther...",0.7184,0.0,0.571,...,1,0,0,1,0,0,3,0,5,open
2,2022-08-10,"Former Employee, more than 1 year",5.0,"Good culture, great team and good benefits!","No idea when the company will IPO thus, do not...",Manager,"good, culture, great, team, good, benefitsno, ...",0.872,0.0,0.548,...,3,0,1,3,0,2,4,2,10,open
3,2022-08-22,"Former Employee, less than 1 year",3.0,"- very fast-paced fintech company, - generous ...","- zero work-life balance, too many unnecessary...",Global SOP Enablement,"fastpaced, fintech, company, generous, benefit...",0.9383,0.0,0.786,...,5,0,0,3,1,0,3,0,11,open
4,2022-08-18,"Current Employee, more than 1 year",4.0,"Good boss, Modern tech tools, Free lunch","Irregular working hours, Excessive workload , ...",Data Engineer,"good, bos, modern, tech, tool, free, lunchirre...",0.4767,0.153,0.542,...,1,0,0,1,0,1,1,0,3,open


### Personalities

In [22]:
from personality_evaluation import *

In [23]:
MBTI_personalities = pd.read_sql_query('''select * from Myers_Briggs_Type_Indicator''', connection)
MBTI_personalities.head()



Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [24]:
MBTI_personalities['Introvert_Extrovert'] = MBTI_personalities['type'].apply(lambda x: 'Introvert' if "I" in x else "Extrovert")
MBTI_personalities['Intuition_Sensing'] = MBTI_personalities['type'].apply(lambda x: 'Intuition' if "N" in x else "Sensing")
MBTI_personalities['Thinking_Feeling'] = MBTI_personalities['type'].apply(lambda x: 'Thinking' if "T" in x else "Feeling")
MBTI_personalities['Judging_Perceiving'] = MBTI_personalities['type'].apply(lambda x: 'Judging' if "J" in x else "Perceiving")

In [25]:
# Convert to lowercase

MBTI_personalities['posts'] = MBTI_personalities['posts'].apply(lambda x: re.split('https:\/\/.*', str(x))[0])
MBTI_personalities['posts'] = MBTI_personalities['posts'].apply(lambda x: re.split('http:\/\/.*', str(x))[0])

MBTI_personalities['posts'] = MBTI_personalities['posts'].str.lower()

# Remove Stop Words
stop = stopwords.words('english')
MBTI_personalities['posts'] = MBTI_personalities['posts'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Remove punctuations
MBTI_personalities["posts"] = MBTI_personalities['posts'].str.replace('[^\w\s]',' ')

  MBTI_personalities["posts"] = MBTI_personalities['posts'].str.replace('[^\w\s]',' ')


### Introvert-Extrovert

In [26]:
intro_extro_df = balance_dataset(MBTI_personalities, 'Introvert_Extrovert')
bag_of_words_features = build_bag_of_words_features_filtered

features = feature_creation(intro_extro_df, bag_of_words_features, 'Introvert_Extrovert')
train, test = train_test(features)

In [27]:
IntroExtroClassifier = NaiveBayesClassifier.train(train)

training_accuracy = nltk.classify.util.accuracy(IntroExtroClassifier, train)*100
testing_accuracy = nltk.classify.util.accuracy(IntroExtroClassifier, test)*100
print(f"Training accuracy is: {training_accuracy}")

Training accuracy is: 82.39524702939337


In [28]:
IntroExtroClassifier.show_most_informative_features()

Most Informative Features
               competing = 1              Extrov : Introv =      9.7 : 1.0
            msbossypants = 1              Extrov : Introv =      9.7 : 1.0
                    viva = 1              Extrov : Introv =      9.0 : 1.0
                  cancel = 1              Introv : Extrov =      8.3 : 1.0
                  edward = 1              Introv : Extrov =      8.3 : 1.0
                     2w3 = 1              Extrov : Introv =      8.1 : 1.0
                     7w6 = 1              Extrov : Introv =      8.1 : 1.0
                     7w8 = 1              Extrov : Introv =      8.1 : 1.0
               adulthood = 1              Introv : Extrov =      7.7 : 1.0
                 flighty = 1              Extrov : Introv =      7.7 : 1.0


### Intuition-Sensing

In [29]:
intuition_sensing_df = balance_dataset(MBTI_personalities, 'Intuition_Sensing')
bag_of_words_features = build_bag_of_words_features_filtered

features = feature_creation(intuition_sensing_df, bag_of_words_features, 'Intuition_Sensing')
train, test = train_test(features)

In [30]:
IntuitionSensingClassifier = NaiveBayesClassifier.train(train)

training_accuracy = nltk.classify.util.accuracy(IntuitionSensingClassifier, train)*100
testing_accuracy = nltk.classify.util.accuracy(IntuitionSensingClassifier, test)*100
print(f"Training accuracy is: {training_accuracy}")

Training accuracy is: 94.30512016718914


In [31]:
IntuitionSensingClassifier.show_most_informative_features()

Most Informative Features
                    niss = 1              Sensin : Intuit =     13.0 : 1.0
                    rave = 1              Sensin : Intuit =     13.0 : 1.0
              phlegmatic = 1              Sensin : Intuit =      9.7 : 1.0
              domination = 1              Intuit : Sensin =      9.0 : 1.0
                      47 = 1              Sensin : Intuit =      8.3 : 1.0
                 crystal = 1              Intuit : Sensin =      7.7 : 1.0
                  heroes = 1              Intuit : Sensin =      7.7 : 1.0
                shocking = 1              Intuit : Sensin =      7.7 : 1.0
                   teams = 1              Sensin : Intuit =      7.7 : 1.0
               wandering = 1              Intuit : Sensin =      7.7 : 1.0


### Thinking-Feeling

In [32]:
thinking_feeling_df = balance_dataset(MBTI_personalities, 'Thinking_Feeling')
bag_of_words_features = build_bag_of_words_features_filtered

features = feature_creation(thinking_feeling_df, bag_of_words_features, 'Thinking_Feeling')
train, test = train_test(features)

In [33]:
ThinkingFeelingClassifier = NaiveBayesClassifier.train(train)

training_accuracy = nltk.classify.util.accuracy(ThinkingFeelingClassifier, train)*100
testing_accuracy = nltk.classify.util.accuracy(ThinkingFeelingClassifier, test)*100
print(f"Training accuracy is: {training_accuracy}")

Training accuracy is: 92.38379396984925


In [34]:
ThinkingFeelingClassifier.show_most_informative_features()

Most Informative Features
             inefficient = 1              Thinki : Feelin =     20.3 : 1.0
                  adored = 1              Feelin : Thinki =     11.7 : 1.0
                    rand = 1              Thinki : Feelin =     11.7 : 1.0
                    viva = 1              Feelin : Thinki =     11.7 : 1.0
                    cozy = 1              Feelin : Thinki =     11.0 : 1.0
                 reports = 1              Thinki : Feelin =     11.0 : 1.0
                  empath = 1              Feelin : Thinki =     10.3 : 1.0
                 vividly = 1              Feelin : Thinki =      9.7 : 1.0
                  gandhi = 1              Feelin : Thinki =      9.0 : 1.0
                   pixie = 1              Feelin : Thinki =      9.0 : 1.0


### Judging-Perceiving

In [35]:
judging_perceiving_df = balance_dataset(MBTI_personalities, 'Judging_Perceiving')
bag_of_words_features = build_bag_of_words_features_filtered

features = feature_creation(judging_perceiving_df, bag_of_words_features, 'Judging_Perceiving')
train, test = train_test(features)

In [36]:
JudgingPerceivingClassifier = NaiveBayesClassifier.train(train)

training_accuracy = nltk.classify.util.accuracy(JudgingPerceivingClassifier, train)*100
testing_accuracy = nltk.classify.util.accuracy(JudgingPerceivingClassifier, test)*100
print(f"Training accuracy is: {training_accuracy}")

Training accuracy is: 82.6538041499818


In [37]:
JudgingPerceivingClassifier.show_most_informative_features()

Most Informative Features
                bethdeth = 1              Judgin : Percei =     11.0 : 1.0
                doorslam = 1              Judgin : Percei =      8.1 : 1.0
                    jawz = 1              Judgin : Percei =      7.8 : 1.0
                  copies = 1              Judgin : Percei =      7.7 : 1.0
               promotion = 1              Judgin : Percei =      7.7 : 1.0
                     sei = 1              Percei : Judgin =      7.7 : 1.0
                  stoned = 1              Percei : Judgin =      7.7 : 1.0
               assurance = 1              Judgin : Percei =      7.0 : 1.0
                 delving = 1              Judgin : Percei =      7.0 : 1.0
                 gauging = 1              Judgin : Percei =      7.0 : 1.0


### Innovative vs Traditional

In [38]:
innovative_traditional_data = pd.read_sql_query('''select * from Innovative_Traditional_Companies''', connection)
innovative_traditional_data['posts'] = innovative_traditional_data['posts'].astype(str)
innovative_traditional_data.head()



Unnamed: 0,posts,company,type
0,According to our annual Shop Small Research f...,American Express,innovative
1,"Weitian Chan, second generation owner of Pres...",American Express,innovative
2,Local businesses play a huge role in the live...,American Express,innovative
3,"Lawrence Chow, founder of bicycle shop B-Spok...",American Express,innovative
4,"We want to give a shoutout to our customers, ...",American Express,innovative


In [39]:
innovative_traditional_df = balance_dataset(innovative_traditional_data, 'type')
bag_of_words_features = build_bag_of_words_features_filtered

features = feature_creation(innovative_traditional_df, bag_of_words_features, 'type')
train, test = train_test(features)

In [40]:
InnovativeTraditionalClassifier = NaiveBayesClassifier.train(train)

training_accuracy = nltk.classify.util.accuracy(InnovativeTraditionalClassifier, train)*100
testing_accuracy = nltk.classify.util.accuracy(InnovativeTraditionalClassifier, test)*100
print(f"Training accuracy is: {training_accuracy}")

Training accuracy is: 99.38752783964365


### Apply to dataset

In [41]:
bytedance_glassdoor['raw_reviews'] = bytedance_glassdoor['pros'] + bytedance_glassdoor['cons']

In [42]:
for row in bytedance_glassdoor.itertuples():
    input = row.raw_reviews 
    tokenize = build_bag_of_words_features_filtered(input)
    
    # Classify text
    ie = IntroExtroClassifier.classify(tokenize)
    ns = IntuitionSensingClassifier.classify(tokenize)
    tf = ThinkingFeelingClassifier.classify(tokenize)
    jp = JudgingPerceivingClassifier.classify(tokenize)
    it = InnovativeTraditionalClassifier.classify(tokenize)
    
    bytedance_glassdoor.loc[row.Index, 'Introvert_Extrovert'] = ie
    bytedance_glassdoor.loc[row.Index, 'Intuition_Sensing'] = ns
    bytedance_glassdoor.loc[row.Index, 'Thinking_Feeling'] = tf
    bytedance_glassdoor.loc[row.Index, 'Judging_Perceiving'] = jp
    bytedance_glassdoor.loc[row.Index, 'Innovative_Traditional'] = it
    

In [43]:
display(bytedance_glassdoor['Introvert_Extrovert'].value_counts())
display(bytedance_glassdoor['Intuition_Sensing'].value_counts())
display(bytedance_glassdoor['Thinking_Feeling'].value_counts())
display(bytedance_glassdoor['Judging_Perceiving'].value_counts())
display(bytedance_glassdoor['Innovative_Traditional'].value_counts())

Extrovert    742
Introvert    228
Name: Introvert_Extrovert, dtype: int64

Intuition    506
Sensing      464
Name: Intuition_Sensing, dtype: int64

Thinking    712
Feeling     258
Name: Thinking_Feeling, dtype: int64

Judging       799
Perceiving    171
Name: Judging_Perceiving, dtype: int64

innovative     555
traditional    415
Name: Innovative_Traditional, dtype: int64

In [44]:
company_data = bytedance_glassdoor.copy()

In [45]:
company_data.head()

Unnamed: 0,date,year exp,score,pros,cons,position,reviews,compound,neg,neu,...,Trust,is_exclusive,is_open,exclusive_openness,raw_reviews,Introvert_Extrovert,Intuition_Sensing,Thinking_Feeling,Judging_Perceiving,Innovative_Traditional
0,2022-08-20,"Current Employee, less than 1 year",5.0,"flexible working culture, good benefits",limited learning opportunities and unclear car...,Anonymous Employee,"flexible, working, culture, good, benefitslimi...",0.6808,0.128,0.385,...,1,0,6,open,"flexible working culture, good benefitslimited...",Extrovert,Intuition,Thinking,Judging,innovative
1,2022-08-17,"Former Employee, less than 1 year",5.0,"all people are sooo nice in our team, enjoy wo...",so many meetings in our team,Data Analyst,"people, sooo, nice, team, enjoy, working, ther...",0.7184,0.0,0.571,...,3,0,5,open,"all people are sooo nice in our team, enjoy wo...",Extrovert,Intuition,Feeling,Perceiving,innovative
2,2022-08-10,"Former Employee, more than 1 year",5.0,"Good culture, great team and good benefits!","No idea when the company will IPO thus, do not...",Manager,"good, culture, great, team, good, benefitsno, ...",0.872,0.0,0.548,...,4,2,10,open,"Good culture, great team and good benefits!No ...",Extrovert,Intuition,Thinking,Judging,traditional
3,2022-08-22,"Former Employee, less than 1 year",3.0,"- very fast-paced fintech company, - generous ...","- zero work-life balance, too many unnecessary...",Global SOP Enablement,"fastpaced, fintech, company, generous, benefit...",0.9383,0.0,0.786,...,3,0,11,open,"- very fast-paced fintech company, - generous ...",Extrovert,Intuition,Thinking,Judging,traditional
4,2022-08-18,"Current Employee, more than 1 year",4.0,"Good boss, Modern tech tools, Free lunch","Irregular working hours, Excessive workload , ...",Data Engineer,"good, bos, modern, tech, tool, free, lunchirre...",0.4767,0.153,0.542,...,1,0,3,open,"Good boss, Modern tech tools, Free lunchIrregu...",Extrovert,Sensing,Thinking,Judging,innovative


In [46]:
company_data['Personality_IE'] = company_data['Introvert_Extrovert'].apply(lambda x: 'I' if x == 'Introvert' else 'E')
company_data['Personality_NS'] = company_data['Intuition_Sensing'].apply(lambda x: 'N' if x == 'Intuition' else 'S')
company_data['Personality_JP'] = company_data['Judging_Perceiving'].apply(lambda x: 'J' if x == 'Judging' else 'P')
company_data['Personality_TF'] = company_data['Thinking_Feeling'].apply(lambda x: 'T' if x == 'Thinking' else 'F')
company_data['Personality'] = company_data['Personality_IE'] + company_data['Personality_NS'] + company_data['Personality_JP'] + company_data['Personality_TF']
company_data.drop(columns = ['Personality_IE', 'Personality_NS', 'Personality_JP', 'Personality_TF'], inplace = True)

In [47]:
n = 5
top5_list = company_data['position'].value_counts()[:n].index.tolist()
filter_dataframe = company_data[company_data['position'].isin(top5_list)]

In [48]:
avg_score_df = filter_dataframe.groupby(['position'])['score'].mean().round(2).reset_index()
avg_score_df

Unnamed: 0,position,score
0,Anonymous Employee,4.1
1,Content Moderator,4.53
2,Data Analyst,4.36
3,Product Manager,3.7
4,Software Engineer,3.97


In [60]:
count_df = filter_dataframe['position'].value_counts().round(2).reset_index()
count_df.columns = ['position', 'count']

In [61]:
combined_df = pd.merge(avg_score_df, count_df, on = ['position'], how = 'left')
print(combined_df)

             position  score  count
0  Anonymous Employee   4.10    136
1   Content Moderator   4.53     58
2        Data Analyst   4.36     22
3     Product Manager   3.70     27
4   Software Engineer   3.97     72


In [63]:
combined_df.values.tolist()

[['Anonymous Employee', 4.1, 136],
 ['Content Moderator', 4.53, 58],
 ['Data Analyst', 4.36, 22],
 ['Product Manager', 3.7, 27],
 ['Software Engineer', 3.97, 72]]

In [72]:
import plotly.express as px

fig = px.bar(combined_df, x="position", y="count", text="score")
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [73]:
company_data

Unnamed: 0,date,year exp,score,pros,cons,position,reviews,compound,neg,neu,...,is_exclusive,is_open,exclusive_openness,raw_reviews,Introvert_Extrovert,Intuition_Sensing,Thinking_Feeling,Judging_Perceiving,Innovative_Traditional,Personality
0,2022-08-20,"Current Employee, less than 1 year",5.0,"flexible working culture, good benefits",limited learning opportunities and unclear car...,Anonymous Employee,"flexible, working, culture, good, benefitslimi...",0.6808,0.128,0.385,...,0,6,open,"flexible working culture, good benefitslimited...",Extrovert,Intuition,Thinking,Judging,innovative,ENJT
1,2022-08-17,"Former Employee, less than 1 year",5.0,"all people are sooo nice in our team, enjoy wo...",so many meetings in our team,Data Analyst,"people, sooo, nice, team, enjoy, working, ther...",0.7184,0.000,0.571,...,0,5,open,"all people are sooo nice in our team, enjoy wo...",Extrovert,Intuition,Feeling,Perceiving,innovative,ENPF
2,2022-08-10,"Former Employee, more than 1 year",5.0,"Good culture, great team and good benefits!","No idea when the company will IPO thus, do not...",Manager,"good, culture, great, team, good, benefitsno, ...",0.8720,0.000,0.548,...,2,10,open,"Good culture, great team and good benefits!No ...",Extrovert,Intuition,Thinking,Judging,traditional,ENJT
3,2022-08-22,"Former Employee, less than 1 year",3.0,"- very fast-paced fintech company, - generous ...","- zero work-life balance, too many unnecessary...",Global SOP Enablement,"fastpaced, fintech, company, generous, benefit...",0.9383,0.000,0.786,...,0,11,open,"- very fast-paced fintech company, - generous ...",Extrovert,Intuition,Thinking,Judging,traditional,ENJT
4,2022-08-18,"Current Employee, more than 1 year",4.0,"Good boss, Modern tech tools, Free lunch","Irregular working hours, Excessive workload , ...",Data Engineer,"good, bos, modern, tech, tool, free, lunchirre...",0.4767,0.153,0.542,...,0,3,open,"Good boss, Modern tech tools, Free lunchIrregu...",Extrovert,Sensing,Thinking,Judging,innovative,ESJT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965,2021-03-19,Current Employee,4.0,Everything is just more than awesome,Nothing is that needed change,Senior Manager,"everything, awesomenothing, needed, change",0.0000,0.000,1.000,...,0,0,open,Everything is just more than awesomeNothing is...,Extrovert,Sensing,Feeling,Judging,traditional,ESJF
966,2021-03-04,Former Employee,5.0,really great working environment there in,but it is too busy with a great working load,Researcher,"really, great, working, environment, inbut, bu...",0.8588,0.000,0.452,...,0,0,open,really great working environment there inbut i...,Extrovert,Sensing,Feeling,Judging,innovative,ESJF
967,2021-03-05,Former Employee,5.0,"friendly atmosphere, take care of employee nee...",there is nothing to share for it,Anonymous Employee,"friendly, atmosphere, take, care, employee, ne...",0.8486,0.099,0.369,...,0,5,open,"friendly atmosphere, take care of employee nee...",Extrovert,Sensing,Feeling,Judging,traditional,ESJF
968,2021-03-18,"Current Employee, less than 1 year",4.0,"Good salary, career growth opportunities, frie...",Fast-paced and competitive environment with fr...,Community Content Management Specialist,"good, salary, career, growth, opportunity, fri...",0.9337,0.000,0.303,...,0,6,open,"Good salary, career growth opportunities, frie...",Extrovert,Intuition,Thinking,Judging,traditional,ENJT
