In [1]:
%load_ext autoreload
%autoreload 2

In [138]:
import pandas as pd
import numpy as np
from WorkforceSentimentMonitoring.data import get_data, merge, holdout, drop_wrong_language

## Import & Clean Data

In [3]:
# Rating columns that are later selected from `df` as the label frame.
target = [
    "work-balance", "culture-values", "career-opportunities",
    "comp-benefits", "senior-mgmt", "overall",
]

# Fetch the three raw pieces, merge them into a single frame, then run the
# project's wrong-language filter over the `review` column.
submission, train, test = get_data()
df = drop_wrong_language(merge(submission, train, test), 'review')

  0%|          | 0/10 [00:00<?, ?it/s]

Identifying entries in other languages...


100%|██████████| 10/10 [08:51<00:00, 53.17s/it]


Drop 434 entries? [y] / n

Dropping 434 entries...


### Deal with Contractions

In [9]:
import contractions

In [13]:
df.review[1]

'Moving at the speed of light, burn out is inevitable 1) Food, food, food. 15+ cafes on main campus (MTV) alone. Mini-kitchens, snacks, drinks, free breakfast/lunch/dinner, all day, errr\'day.  2) Benefits/perks. Free 24:7 gym access (on MTV campus). Free (self service) laundry (washer/dryer) available. Bowling alley. Volley ball pit. Custom-built and exclusive employee use only outdoor sport park (MTV). Free health/fitness assessments. Dog-friendly. Etc. etc. etc.  3) Compensation. In ~2010 or 2011, Google updated its compensation packages so that they were more competitive.  4) For the size of the organization (30K+), it has remained relatively innovative, nimble, and fast-paced and open with communication but, that is definitely changing (for the worse).  5) With so many departments, focus areas, and products, *in theory*, you should have plenty of opportunity to grow your career (horizontally or vertically). In practice, not true.  6) You get to work with some of the brightest, mos

In [16]:
def expand_contractions(text_series):
    """Run ``contractions.fix`` over every entry of a text Series.

    Parameters
    ----------
    text_series : pandas.Series
        Series whose values are handed one by one to ``contractions.fix``.

    Returns
    -------
    pandas.Series
        The element-wise results, carried on the input's index.
    """
    fix_one = contractions.fix
    expanded = text_series.apply(fix_one)
    return expanded

In [17]:
# Overwrite the review column with the output of `expand_contractions`.
df['review'] = expand_contractions(df.review)

In [20]:
df.review[1]

'Moving at the speed of light, burn out is inevitable 1) Food, food, food. 15+ cafes on main campus (MTV) alone. Mini-kitchens, snacks, drinks, free breakfast/lunch/dinner, all day, errr\'day.  2) Benefits/perks. Free 24:7 gym access (on MTV campus). Free (self service) laundry (washer/dryer) available. Bowling alley. Volley ball pit. Custom-built and exclusive employee use only outdoor sport park (MTV). Free health/fitness assessments. Dog-friendly. Etc. etc. etc.  3) Compensation. In ~2010 or 2011, Google updated its compensation packages so that they were more competitive.  4) For the size of the organization (30K+), it has remained relatively innovative, nimble, and fast-paced and open with communication but, that is definitely changing (for the worse).  5) With so many departments, focus areas, and products, *in theory*, you should have plenty of opportunity to grow your career (horizontally or vertically). In practice, not true.  6) You get to work with some of the brightest, mos

### Preprocess

In [30]:
# Define X and y
X = pd.DataFrame(df.review)
y = df[target].copy()

In [31]:
from WorkforceSentimentMonitoring.encoders import Preprocessor

In [32]:
X

Unnamed: 0,review
0,Best Company to work for People are smart and ...
1,"Moving at the speed of light, burn out is inev..."
2,Great balance between big-company security and...
3,The best place I have worked and also the most...
4,Execellent for engineers Impact driven. Best t...
...,...
52376,great place to grow! Great health benefits. Ma...
52377,An ocean of opportunities diverse set of peopl...
52378,Tech Gaint Equip its employees wid huge salari...
52379,Terrible They had great health benefits (no lo...


In [33]:
# Keep the transformed frame instead of only displaying it. Without this
# assignment the following whitespace / stop-word cells see the cleaned text
# only if `fit_transform` happens to mutate `X` in place (or via leftover
# kernel state), which does not survive a Restart & Run All.
preprocessor = Preprocessor()
X = preprocessor.fit_transform(X)
X

Unnamed: 0,review
0,best company to work for people are smart and ...
1,moving at the speed of light burn out is inev...
2,great balance between big company security and...
3,the best place i have worked and also the most...
4,execellent for engineer impact driven best te...
...,...
52376,great place to grow great health benefit man...
52377,an ocean of opportunity diverse set of people ...
52378,tech gaint equip it employee wid huge salary ...
52379,terrible they had great health benefit no lon...


In [53]:
# Collapse every run of whitespace into a single space. The raw string avoids
# the invalid `\s` escape sequence, and `regex=True` is spelled out because
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# turn this cell into a silent no-op.
X['review'] = X['review'].str.replace(r'\s+', ' ', regex=True)

In [72]:
# Trim leading and trailing whitespace from each review.
X['review'] = X['review'].str.strip()

## Import Lexicon

In [35]:
# Tab-separated emotion lexicon, read from a path relative to this notebook.
lexicon = pd.read_csv(
    '../lexicon/EmotionIntensityLexicon.txt',
    sep='\t',
)

In [36]:
lexicon.emotion.unique()

array(['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness',
       'surprise', 'trust'], dtype=object)

In [37]:
lexicon

Unnamed: 0,word,emotion,emotion-intensity-score
0,outraged,anger,0.964
1,brutality,anger,0.959
2,hatred,anger,0.953
3,hateful,anger,0.940
4,terrorize,anger,0.939
...,...,...,...
9916,fugitive,trust,0.141
9917,divorce,trust,0.133
9918,mistakes,trust,0.133
9919,bait,trust,0.133


## Bag of Words

In [74]:
# Scratch check: space-split tokens of the review stored under index label 1.
tmp = X.loc[1, 'review'].split(' ')

In [75]:
# Discard the empty strings that consecutive spaces leave behind.
tmp = list(filter(None, tmp))

In [76]:
from sklearn.feature_extraction.text import CountVectorizer

In [79]:
from WorkforceSentimentMonitoring.preprocessing import remove_stopwords

In [83]:
# Pass every review through the project's `remove_stopwords` helper.
X['review'] = X['review'].apply(remove_stopwords)

In [88]:
# Number of space-separated tokens per review; used later as the normaliser
# for the emotion scores.
X['length'] = X['review'].str.split(' ').map(len)

In [92]:
# Token-count vectorizer left at its default settings.
vectorizer = CountVectorizer()

In [93]:
# Learn the vocabulary from the `review` column and build its count matrix
# in one call.
X_vectorized = vectorizer.fit_transform(X.review)

In [95]:
X_vectorized.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [99]:
# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old spelling only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense count table with one column per vocabulary term.
X_vectorized = pd.DataFrame(X_vectorized.toarray(), columns=feature_names)

In [107]:
# Lexicon words that also occur as columns of the count table. The lexicon
# lists a word once per emotion it is scored for, so repeats are dropped
# (keeping first-seen order); otherwise `X_vectorized[emo_words]` selects the
# same count column several times.
emo_words = [
    word
    for word in dict.fromkeys(lexicon.word)
    if word in X_vectorized.columns
]

In [214]:
emo_words

['outraged',
 'brutality',
 'hatred',
 'hateful',
 'furious',
 'enraged',
 'furiously',
 'execution',
 'angered',
 'rage',
 'loathe',
 'hostile',
 'murder',
 'pissed',
 'explosive',
 'vengeful',
 'ferocious',
 'killing',
 'combative',
 'vengeance',
 'wrath',
 'torment',
 'vicious',
 'threatening',
 'abhorrent',
 'fighting',
 'attacking',
 'bloodshed',
 'assault',
 'assassination',
 'strangle',
 'explode',
 'malicious',
 'hostility',
 'attack',
 'hell',
 'murderous',
 'malice',
 'beating',
 'outrage',
 'irate',
 'tumultuous',
 'destroying',
 'violent',
 'stab',
 'slaughter',
 'abomination',
 'obliterate',
 'belligerent',
 'prick',
 'cruelty',
 'horrid',
 'rabid',
 'torture',
 'hate',
 'hating',
 'tyrannical',
 'demonic',
 'despicable',
 'angry',
 'livid',
 'madman',
 'vindictive',
 'terrorist',
 'venomous',
 'threaten',
 'savage',
 'atrocity',
 'fierce',
 'abuse',
 'tyrant',
 'anger',
 'slam',
 'punching',
 'punched',
 'destructive',
 'ruthless',
 'slap',
 'destroyed',
 'retaliatory',
 

In [215]:
X_vectorized[emo_words]

Unnamed: 0,outraged,brutality,hatred,hateful,furious,enraged,furiously,execution,angered,rage,...,deny,mislead,unaccountable,unreliable,addict,falsehood,thang,divorce,mistakes,bait
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52376,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52377,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52378,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52379,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [216]:
X

Unnamed: 0,review,length
0,best company work people smart friendly bureau...,9
1,moving speed light burn inevitable food food f...,389
2,great balance big company security fun fast mo...,440
3,best place worked also demanding find well reg...,387
4,execellent engineer impact driven best tech wo...,13
...,...,...
52376,great place grow great health benefit many int...,27
52377,ocean opportunity diverse set people problem s...,32
52378,tech gaint equip employee wid huge salary high...,23
52379,terrible great health benefit longer told many...,57


In [217]:
lexicon

Unnamed: 0,word,emotion,emotion-intensity-score
0,outraged,anger,0.964
1,brutality,anger,0.959
2,hatred,anger,0.953
3,hateful,anger,0.940
4,terrorize,anger,0.939
...,...,...,...
9916,fugitive,trust,0.141
9917,divorce,trust,0.133
9918,mistakes,trust,0.133
9919,bait,trust,0.133


In [218]:
# Wide view of the lexicon: one row per word, one column per emotion, with 0
# wherever a word has no score for that emotion.
table = lexicon.pivot_table(
    index='word',
    columns='emotion',
    values='emotion-intensity-score',
    fill_value=0,
)

In [219]:
table

emotion,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
aaaaaaah,0.000,0.00,0.0,0.344,0.000,0.000,0.000,0.000
aaaah,0.000,0.00,0.0,0.234,0.000,0.000,0.000,0.000
abacus,0.000,0.00,0.0,0.000,0.000,0.000,0.000,0.406
abandon,0.000,0.00,0.0,0.531,0.000,0.703,0.000,0.000
abandoned,0.222,0.00,0.0,0.534,0.000,0.828,0.000,0.000
...,...,...,...,...,...,...,...,...
zany,0.000,0.00,0.0,0.000,0.000,0.000,0.555,0.000
zeal,0.000,0.50,0.0,0.000,0.547,0.000,0.484,0.328
zealous,0.000,0.00,0.0,0.000,0.393,0.000,0.000,0.258
zen,0.000,0.00,0.0,0.000,0.515,0.000,0.000,0.000


In [222]:
lexicon

Unnamed: 0,word,emotion,emotion-intensity-score
0,outraged,anger,0.964
1,brutality,anger,0.959
2,hatred,anger,0.953
3,hateful,anger,0.940
4,terrorize,anger,0.939
...,...,...,...
9916,fugitive,trust,0.141
9917,divorce,trust,0.133
9918,mistakes,trust,0.133
9919,bait,trust,0.133


In [223]:
# Spot check: the lexicon entries for the word 'hatred' under 'anger'.
select_row = lexicon[lexicon['word'].eq('hatred') & lexicon['emotion'].eq('anger')]

In [224]:
select_row.iloc[0]['emotion-intensity-score']

0.953

In [225]:
def get_emotion_score(row, lexicon, emotion):
    """Length-normalised intensity of one emotion in a single review.

    Every space-separated token of ``row['review']`` that the lexicon lists
    under ``emotion`` contributes its ``emotion-intensity-score``; the total
    is then divided by ``row['length']``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'review'`` (str) and ``'length'`` (the normaliser).
    lexicon : pandas.DataFrame
        Needs the columns ``word``, ``emotion`` and
        ``emotion-intensity-score``.
    emotion : str
        Value of the ``emotion`` column to score against.

    Returns
    -------
    float
        Summed intensity divided by ``row['length']``, or ``0.0`` when the
        length is zero (instead of dividing by zero).
    """
    length = row['length']
    if not length:
        return 0.0

    # Filter the lexicon once per call rather than once per token, and turn it
    # into a plain dict so each token costs a single hash lookup.
    emotion_rows = lexicon[lexicon.emotion == emotion]
    # If a word is listed more than once for this emotion, the first entry
    # wins (same as taking `.iloc[0]` of the per-word matches).
    first_hits = emotion_rows.drop_duplicates(subset='word', keep='first')
    intensity_by_word = dict(
        zip(first_hits['word'], first_hits['emotion-intensity-score'])
    )

    score = 0
    for word in row['review'].split(' '):
        score += intensity_by_word.get(word, 0)
    return score / length

In [226]:
# Score a 100-review sample rather than the full frame. The seed is fixed so
# that re-running the notebook draws the same rows.
tmp = X.sample(100, random_state=42)

In [227]:
# Distinct labels that occur in the lexicon's `emotion` column.
emotions = lexicon.emotion.unique()

In [228]:
emotions

array(['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness',
       'surprise', 'trust'], dtype=object)

In [229]:
tmp

Unnamed: 0,review,length
32194,sorted helped manage key function day day help...,40
12171,great company general correct base salary envi...,14
37695,amazon india good place high growth structured...,28
3462,great company learn grow endless possibility l...,59
38895,great experience love job smart motivated cowo...,49
...,...,...
13216,business manager worked business manager apple...,34
52380,microsoft good place work structured organizat...,34
50513,learn load prepared hand life microsoft always...,79
32121,project manager experience management team sup...,44


In [230]:
from tqdm import tqdm

In [231]:
# Add one `<emotion>_score` column to the sample for every lexicon emotion.
for emotion in tqdm(emotions):
    tmp[f'{emotion}_score'] = tmp.apply(
        lambda review_row: get_emotion_score(review_row, lexicon, emotion),
        axis=1,
    )

100%|██████████| 8/8 [01:40<00:00, 12.58s/it]


In [232]:
# NOTE(review): 'anger' is among the values displayed for `emotions`, so the
# loop in the preceding cell has already filled this column; this cell
# recomputes the same values.
tmp['anger_score'] = tmp.apply(
    lambda review_row: get_emotion_score(review_row, lexicon, 'anger'),
    axis=1,
)

In [233]:
tmp

Unnamed: 0,review,length,anger_score,anticipation_score,disgust_score,fear_score,joy_score,sadness_score,surprise_score,trust_score
32194,sorted helped manage key function day day help...,40,0.000000,0.039075,0.000000,0.011325,0.049475,0.013675,0.019925,0.114275
12171,great company general correct base salary envi...,14,0.007286,0.068643,0.000000,0.000000,0.048000,0.000000,0.000000,0.165786
37695,amazon india good place high growth structured...,28,0.016179,0.027357,0.013964,0.013393,0.075429,0.025429,0.034893,0.044071
3462,great company learn grow endless possibility l...,59,0.008441,0.075627,0.000000,0.010068,0.054780,0.014559,0.005831,0.111102
38895,great experience love job smart motivated cowo...,49,0.015694,0.017551,0.000000,0.014980,0.075776,0.007735,0.000000,0.065694
...,...,...,...,...,...,...,...,...,...,...
13216,business manager worked business manager apple...,34,0.000000,0.020706,0.000000,0.000000,0.025294,0.000000,0.000000,0.049412
52380,microsoft good place work structured organizat...,34,0.000000,0.039765,0.000000,0.010559,0.106441,0.000000,0.023000,0.139000
50513,learn load prepared hand life microsoft always...,79,0.021418,0.037696,0.013152,0.022937,0.029949,0.019785,0.000000,0.015025
32121,project manager experience management team sup...,44,0.010591,0.058955,0.000000,0.006386,0.072773,0.011364,0.000000,0.112386
