# import packages and define functions

In [2]:
import pandas as pd 
import numpy as np
import re
import gensim.corpora as corpora
from scipy import stats
from langdetect import detect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import collections
from collections import Counter
import spacy
import math
from lexicalrichness import LexicalRichness
en_core = spacy.load('en_core_web_sm')

# Define functions

In [3]:
def clean(text):
    text = text.lower() ## lowercase
    text = ''.join([char for char in text if char in 'abcdefghijklmnopqrstuvwxyz ']) ## remove punctuations etc.
    text = ' '.join(text.split())## remove multiple spaces
    return text

def multiple_replace(string):
    rep_dict = {"google":"it", "amazon":"it", "microsoft":"it", "facebook":"it", "apple":"it", "oracle":"it", "salesforce":"it", "intel":"it", "cisco":"it", "uber":"it", "capital one":"it", "ibm":"it", "linkedin":"it"}
    pattern = re.compile("|".join([re.escape(k) for k in sorted(rep_dict,key=len,reverse=True)]), flags=re.DOTALL)
    return pattern.sub(lambda x: rep_dict[x.group(0)], string)

def lemmatized_words_num(text):
    lex = LexicalRichness(text)
    return lex.words

def entropy(bow, prob_dict):
    return sum([-v*math.log2(v)*bow[k] for k, v in prob_dict.items()])

# Load reviews

In [4]:
CS_Reviews_Final = pd.read_csv('CS_Reviews_Final.csv').copy().drop(['Unnamed: 0'], axis=1)
CS_Reviews_Final

Unnamed: 0,title,status,pros,cons,review,rating
0,Dream job,Software Engineer(Current Employee),,,This is one of the best place to work in the w...,5.0
1,"Worked as a contractor software engineer, work...",Software Engineer Contractor(Former Employee),,,"Worked as a contractor software engineer, work...",5.0
2,Productive and fun workplace,Software Engineer(Former Employee),,,It was a fun and interesting experience. Lots ...,5.0
3,Great company to work for!,Quality Assurance Specialist(Current Employee),,,One of the best companies to work for overall....,5.0
4,Best Workplace,Software Engineer(Current Employee),,,Google is a great work place. They pay well an...,5.0
...,...,...,...,...,...,...
7056,Ibm,Delivery Architect(Current Employee),,,I’ve been with IBM for many years of my career...,1.0
7057,5 stars,Sr. Technical Program Manager(Current Employee),,,What is the best part of working at the compan...,5.0
7058,Great company,Software Engineer(Current Employee),,,LinkedIn's food is great. There is a InDay eve...,5.0
7059,"Great WLB, pay and technical challenges",Software Engineer(Current Employee),,,"Great pay, tons of time off.Food is top notch,...",5.0


# Preprocessing reviews

In [5]:
CS_Reviews_Final['processed_review'] = CS_Reviews_Final['review'].apply(clean)
CS_Reviews_Final["processed_review"] = CS_Reviews_Final["processed_review"].apply(multiple_replace)

## lemmatize

In [6]:
stop = stopwords.words('english')
CS_Reviews_Final['wo_stop'] = CS_Reviews_Final['processed_review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
CS_Reviews_Final["lemmatized"] = CS_Reviews_Final['wo_stop'].apply(lambda x: " ".join([y.lemma_ for y in en_core(x)]))

In [7]:
CS_Reviews_Final['total_word_num'] = CS_Reviews_Final['lemmatized'].apply(lemmatized_words_num)
CS_Reviews_Final['raw_word_count'] = CS_Reviews_Final['processed_review'].apply(lambda text: len(text.split()))
CS_Reviews_Final = CS_Reviews_Final.loc[(CS_Reviews_Final['total_word_num'] != 0)]
CS_Reviews_Final

Unnamed: 0,title,status,pros,cons,review,rating,processed_review,wo_stop,lemmatized,total_word_num,raw_word_count
0,Dream job,Software Engineer(Current Employee),,,This is one of the best place to work in the w...,5.0,this is one of the best place to work in the w...,one best place work world couldnt ask culture ...,one good place work world could not ask cultur...,19,34
1,"Worked as a contractor software engineer, work...",Software Engineer Contractor(Former Employee),,,"Worked as a contractor software engineer, work...",5.0,worked as a contractor software engineer work ...,worked contractor software engineer work time ...,work contractor software engineer work time fl...,17,26
2,Productive and fun workplace,Software Engineer(Former Employee),,,It was a fun and interesting experience. Lots ...,5.0,it was a fun and interesting experience lots o...,fun interesting experience lots handson truly ...,fun interesting experience lot handson truly l...,13,29
3,Great company to work for!,Quality Assurance Specialist(Current Employee),,,One of the best companies to work for overall....,5.0,one of the best companies to work for overall ...,one best companies work overall stable company...,one good company work overall stable company p...,16,32
4,Best Workplace,Software Engineer(Current Employee),,,Google is a great work place. They pay well an...,5.0,it is a great work place they pay well and res...,great work place pay well respect employees te...,great work place pay well respect employee tea...,24,51
...,...,...,...,...,...,...,...,...,...,...,...
7056,Ibm,Delivery Architect(Current Employee),,,I’ve been with IBM for many years of my career...,1.0,ive been with it for many years of my career i...,ive many years career facet every step done la...,I ve many year career facet every step do layo...,27,58
7057,5 stars,Sr. Technical Program Manager(Current Employee),,,What is the best part of working at the compan...,5.0,what is the best part of working at the compan...,best part working companythe culturewhat stres...,good part work companythe culturewhat stressfu...,17,34
7058,Great company,Software Engineer(Current Employee),,,LinkedIn's food is great. There is a InDay eve...,5.0,its food is great there is a inday every month...,food great inday every month lot learn culture...,food great inday every month lot learn culture...,13,30
7059,"Great WLB, pay and technical challenges",Software Engineer(Current Employee),,,"Great pay, tons of time off.Food is top notch,...",5.0,great pay tons of time offfood is top notch pr...,great pay tons time offfood top notch probably...,great pay ton time offfood top notch probably ...,22,31


# Entropy

## Get Global probability

In [8]:
total_word_collection = ' '.join(CS_Reviews_Final.lemmatized.to_list())
word_tokenize_total_word = word_tokenize(total_word_collection)
len_tok = len(word_tokenize_total_word)
total_word_Counter = Counter(word_tokenize_total_word)
global_key_mapping = {k:v/len_tok for k,v in total_word_Counter.items()}
global_key_mapping

{'one': 0.003810371843722716,
 'good': 0.018837923318434913,
 'place': 0.009627115508039942,
 'work': 0.045435359556863585,
 'world': 0.0010060769359753453,
 'could': 0.0017346154068540436,
 'not': 0.005712666739905984,
 'ask': 0.0006186794951112756,
 'culture': 0.00634291033772962,
 'amazing': 0.001347217965989974,
 'benefit': 0.004197769284586786,
 'unbeatable': 1.1564102712360291e-05,
 'well': 0.004729718009355359,
 'that': 0.0004278718003573308,
 's': 0.0009771666791944446,
 'I': 0.0018444743826214664,
 'm': 0.00046834615985059177,
 'bless': 2.8910256780900728e-05,
 'contractor': 0.0014859871985382974,
 'software': 0.002422679518239481,
 'engineer': 0.0020815384882248524,
 'time': 0.007620743687445432,
 'flexibility': 0.0006013333410427352,
 'great': 0.015495897634562791,
 'worker': 0.0008788718061393822,
 'empowerment': 1.1564102712360291e-05,
 'low': 0.0009656025764820843,
 'treat': 0.0008383974466461211,
 'secondclass': 1.1564102712360291e-05,
 'citizen': 4.6256410849441164e-05,

In [9]:
len_tok

172949

In [10]:
len(global_key_mapping)

13519

In [11]:
# Retrieve key-value pairs with the highest 5 values
highest_pairs = sorted(global_key_mapping.items(), key=lambda x: x[1], reverse=True)[:5]

print("Key-value pairs with the highest 5 values:")
for pair in highest_pairs:
    print(pair)


Key-value pairs with the highest 5 values:
('work', 0.045435359556863585)
('good', 0.018837923318434913)
('great', 0.015495897634562791)
('company', 0.014963948909794217)
('team', 0.01146580783930523)


## Get entropy for each sample

In [12]:
CS_Reviews_Final['tok'] = CS_Reviews_Final['lemmatized'].apply(word_tokenize)
CS_Reviews_Final['bow'] = CS_Reviews_Final['tok'].apply(Counter)
CS_Reviews_Final['num_unique_word'] = CS_Reviews_Final['bow'].apply(len)
CS_Reviews_Final['key'] = CS_Reviews_Final['bow'].apply(lambda l:list(l.keys()))
CS_Reviews_Final['prob'] = [{k:v for k,v in global_key_mapping.items() if k in x} for x in CS_Reviews_Final['key']]
CS_Reviews_Final['entropy'] = CS_Reviews_Final.apply(lambda x: entropy(x.bow, x.prob), axis=1)
CS_Reviews_Final

Unnamed: 0,title,status,pros,cons,review,rating,processed_review,wo_stop,lemmatized,total_word_num,raw_word_count,tok,bow,num_unique_word,key,prob,entropy
0,Dream job,Software Engineer(Current Employee),,,This is one of the best place to work in the w...,5.0,this is one of the best place to work in the w...,one best place work world couldnt ask culture ...,one good place work world could not ask cultur...,19,34,"[one, good, place, work, world, could, not, as...","{'one': 1, 'good': 1, 'place': 1, 'work': 2, '...",18,"[one, good, place, work, world, could, not, as...","{'one': 0.003810371843722716, 'good': 0.018837...",0.849380
1,"Worked as a contractor software engineer, work...",Software Engineer Contractor(Former Employee),,,"Worked as a contractor software engineer, work...",5.0,worked as a contractor software engineer work ...,worked contractor software engineer work time ...,work contractor software engineer work time fl...,17,26,"[work, contractor, software, engineer, work, t...","{'work': 2, 'contractor': 3, 'software': 1, 'e...",14,"[work, contractor, software, engineer, time, f...","{'work': 0.045435359556863585, 'contractor': 0...",0.679704
2,Productive and fun workplace,Software Engineer(Former Employee),,,It was a fun and interesting experience. Lots ...,5.0,it was a fun and interesting experience lots o...,fun interesting experience lots handson truly ...,fun interesting experience lot handson truly l...,13,29,"[fun, interesting, experience, lot, handson, t...","{'fun': 1, 'interesting': 1, 'experience': 2, ...",11,"[fun, interesting, experience, lot, handson, t...","{'work': 0.045435359556863585, 'fun': 0.002578...",0.589490
3,Great company to work for!,Quality Assurance Specialist(Current Employee),,,One of the best companies to work for overall....,5.0,one of the best companies to work for overall ...,one best companies work overall stable company...,one good company work overall stable company p...,16,32,"[one, good, company, work, overall, stable, co...","{'one': 1, 'good': 1, 'company': 2, 'work': 1,...",15,"[one, good, company, work, overall, stable, pr...","{'one': 0.003810371843722716, 'good': 0.018837...",0.818592
4,Best Workplace,Software Engineer(Current Employee),,,Google is a great work place. They pay well an...,5.0,it is a great work place they pay well and res...,great work place pay well respect employees te...,great work place pay well respect employee tea...,24,51,"[great, work, place, pay, well, respect, emplo...","{'great': 1, 'work': 2, 'place': 2, 'pay': 1, ...",20,"[great, work, place, pay, well, respect, emplo...","{'good': 0.018837923318434913, 'place': 0.0096...",1.214855
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7056,Ibm,Delivery Architect(Current Employee),,,I’ve been with IBM for many years of my career...,1.0,ive been with it for many years of my career i...,ive many years career facet every step done la...,I ve many year career facet every step do layo...,27,58,"[I, ve, many, year, career, facet, every, step...","{'I': 1, 've': 1, 'many': 1, 'year': 1, 'caree...",25,"[I, ve, many, year, career, facet, every, step...","{'place': 0.009627115508039942, 'work': 0.0454...",0.602547
7057,5 stars,Sr. Technical Program Manager(Current Employee),,,What is the best part of working at the compan...,5.0,what is the best part of working at the compan...,best part working companythe culturewhat stres...,good part work companythe culturewhat stressfu...,17,34,"[good, part, work, companythe, culturewhat, st...","{'good': 1, 'part': 2, 'work': 3, 'companythe'...",14,"[good, part, work, companythe, culturewhat, st...","{'good': 0.018837923318434913, 'work': 0.04543...",0.922879
7058,Great company,Software Engineer(Current Employee),,,LinkedIn's food is great. There is a InDay eve...,5.0,its food is great there is a inday every month...,food great inday every month lot learn culture...,food great inday every month lot learn culture...,13,30,"[food, great, inday, every, month, lot, learn,...","{'food': 2, 'great': 1, 'inday': 1, 'every': 1...",12,"[food, great, inday, every, month, lot, learn,...","{'good': 0.018837923318434913, 'culture': 0.00...",0.429123
7059,"Great WLB, pay and technical challenges",Software Engineer(Current Employee),,,"Great pay, tons of time off.Food is top notch,...",5.0,great pay tons of time offfood is top notch pr...,great pay tons time offfood top notch probably...,great pay ton time offfood top notch probably ...,22,31,"[great, pay, ton, time, offfood, top, notch, p...","{'great': 1, 'pay': 1, 'ton': 1, 'time': 1, 'o...",22,"[great, pay, ton, time, offfood, top, notch, p...","{'good': 0.018837923318434913, 'time': 0.00762...",0.484396


# Save Processed Data

In [None]:
columns_to_exclude = ['tok', 'bow', 'key', 'prob']  # Replace with the actual column names you want to exclude
selected_columns = CS_Reviews_Final.drop(columns=columns_to_exclude)
selected_columns.to_csv("Reviews_complexity.csv")