In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
from WorkforceSentimentMonitoring.data import get_data, merge, holdout, drop_wrong_language

In [6]:
import contractions

## Import & Clean Data

In [5]:
submission, train, test = get_data()
df = merge(submission, train, test)
df = drop_wrong_language(df, 'review')
target = [
    "work-balance",
    "culture-values",
    "career-opportunities",
    "comp-benefits",
    "senior-mgmt",
    "overall",
]

  0%|          | 0/10 [00:00<?, ?it/s]

Identifying entries in other languages...


100%|██████████| 10/10 [07:24<00:00, 44.41s/it]


Drop 437 entries? [y] / n

Dropping 437 entries...


### Deal with Contractions

In [7]:
df.review[1]

'Moving at the speed of light, burn out is inevitable 1) Food, food, food. 15+ cafes on main campus (MTV) alone. Mini-kitchens, snacks, drinks, free breakfast/lunch/dinner, all day, errr\'day.  2) Benefits/perks. Free 24:7 gym access (on MTV campus). Free (self service) laundry (washer/dryer) available. Bowling alley. Volley ball pit. Custom-built and exclusive employee use only outdoor sport park (MTV). Free health/fitness assessments. Dog-friendly. Etc. etc. etc.  3) Compensation. In ~2010 or 2011, Google updated its compensation packages so that they were more competitive.  4) For the size of the organization (30K+), it has remained relatively innovative, nimble, and fast-paced and open with communication but, that is definitely changing (for the worse).  5) With so many departments, focus areas, and products, *in theory*, you should have plenty of opportunity to grow your career (horizontally or vertically). In practice, not true.  6) You get to work with some of the brightest, mos

In [8]:
def expand_contractions(text_series):
    return text_series.apply(contractions.fix)

In [9]:
df['review'] = expand_contractions(df['review'])

In [10]:
df.review[1]

'Moving at the speed of light, burn out is inevitable 1) Food, food, food. 15+ cafes on main campus (MTV) alone. Mini-kitchens, snacks, drinks, free breakfast/lunch/dinner, all day, errr\'day.  2) Benefits/perks. Free 24:7 gym access (on MTV campus). Free (self service) laundry (washer/dryer) available. Bowling alley. Volley ball pit. Custom-built and exclusive employee use only outdoor sport park (MTV). Free health/fitness assessments. Dog-friendly. Etc. etc. etc.  3) Compensation. In ~2010 or 2011, Google updated its compensation packages so that they were more competitive.  4) For the size of the organization (30K+), it has remained relatively innovative, nimble, and fast-paced and open with communication but, that is definitely changing (for the worse).  5) With so many departments, focus areas, and products, *in theory*, you should have plenty of opportunity to grow your career (horizontally or vertically). In practice, not true.  6) You get to work with some of the brightest, mos

### Preprocess

In [11]:
# Define X and y
X = pd.DataFrame(df.review)
y = df[target].copy()

In [15]:
from WorkforceSentimentMonitoring.encoders import Preprocessor

In [16]:
X

Unnamed: 0,review
0,Best Company to work for People are smart and ...
1,"Moving at the speed of light, burn out is inev..."
2,Great balance between big-company security and...
3,The best place I have worked and also the most...
4,Execellent for engineers Impact driven. Best t...
...,...
52373,great place to grow! Great health benefits. Ma...
52374,An ocean of opportunities diverse set of peopl...
52375,Tech Gaint Equip its employees wid huge salari...
52376,Terrible They had great health benefits (no lo...


In [17]:
preprocessor = Preprocessor()
preprocessor.fit_transform(X)

Unnamed: 0,review
0,best company work people smart friendly bureau...
1,moving speed light burn inevitable food food f...
2,great balance big company security fun fast mo...
3,best place worked also demanding find well reg...
4,execellent engineer impact driven best tech wo...
...,...
52373,great place grow great health benefit many int...
52374,ocean opportunity diverse set people problem s...
52375,tech gaint equip employee wid huge salary high...
52376,terrible great health benefit longer told many...


In [20]:
X['review'] = X.review.str.replace('\s+', ' ')

In [21]:
X['review'] = X.review.str.strip()

## Import Lexicon

In [22]:
lexicon = pd.read_csv('../lexicon/EmotionIntensityLexicon.txt',sep='\t')

In [23]:
lexicon.emotion.unique()

array(['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness',
       'surprise', 'trust'], dtype=object)

In [24]:
lexicon

Unnamed: 0,word,emotion,emotion-intensity-score
0,outraged,anger,0.964
1,brutality,anger,0.959
2,hatred,anger,0.953
3,hateful,anger,0.940
4,terrorize,anger,0.939
...,...,...,...
9916,fugitive,trust,0.141
9917,divorce,trust,0.133
9918,mistakes,trust,0.133
9919,bait,trust,0.133


## Bag of Words

In [25]:
tmp = X.review[1].split(' ')

In [27]:
tmp = [word for word in tmp if word]

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
from WorkforceSentimentMonitoring.preprocessing import remove_stopwords

In [30]:
X.review = X.review.apply(remove_stopwords)

In [33]:
X['length'] = X.review.str.split(' ').apply(len)

In [36]:
X.head()

Unnamed: 0,review,length
0,best company work people smart friendly bureau...,9
1,moving speed light burn inevitable food food f...,385
2,great balance big company security fun fast mo...,436
3,best place worked also demanding find well reg...,384
4,execellent engineer impact driven best tech wo...,13


In [37]:
lexicon.head()

Unnamed: 0,word,emotion,emotion-intensity-score
0,outraged,anger,0.964
1,brutality,anger,0.959
2,hatred,anger,0.953
3,hateful,anger,0.94
4,terrorize,anger,0.939


In [38]:
table = pd.pivot_table(lexicon, values='emotion-intensity-score', index='word', columns='emotion', fill_value=0)

table.head()

emotion,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
aaaaaaah,0.0,0.0,0.0,0.344,0.0,0.0,0.0,0.0
aaaah,0.0,0.0,0.0,0.234,0.0,0.0,0.0,0.0
abacus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.406
abandon,0.0,0.0,0.0,0.531,0.0,0.703,0.0,0.0
abandoned,0.222,0.0,0.0,0.534,0.0,0.828,0.0,0.0


In [39]:
select_row = lexicon[(lexicon.word == 'hatred') & (lexicon.emotion == 'anger')]

In [40]:
select_row.iloc[0]['emotion-intensity-score']

0.953

In [225]:
def get_emotion_score(row, lexicon, emotion):
    review = row['review']
    review_words = review.split(' ')
    score = 0
    for word in review_words:
        select_row = lexicon[(lexicon.word == word) & (lexicon.emotion == emotion)]
        if len(select_row) > 0:
            score += select_row.iloc[0]['emotion-intensity-score']
    return score / row['length']

In [226]:
tmp = X.sample(100)

In [227]:
emotions = lexicon.emotion.unique()

In [228]:
emotions

array(['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness',
       'surprise', 'trust'], dtype=object)

In [55]:
from tqdm import tqdm

In [231]:
for emotion in tqdm(emotions):
    tmp[f'{emotion}_score'] = tmp.apply(lambda x: get_emotion_score(x, lexicon, emotion), axis=1)

100%|██████████| 8/8 [01:40<00:00, 12.58s/it]


In [232]:
tmp['anger_score'] = tmp.apply(lambda x: get_emotion_score(x, lexicon, 'anger'), axis=1)

## Function implementations

In [46]:
def create_emotion_dictionary(lexicon):
    """Create dict with word : emo_array pairs"""
    # create pivot table to better extract the word : array pairs
    table = pd.pivot_table(lexicon, values='emotion-intensity-score',
                           index='word', columns='emotion', fill_value=0)
    # create dictionary
    emo_scores_dict = {word : value for word , value in zip(table.index, table.values)}
    return emo_scores_dict

In [47]:
def create_wordcount_vector(corpus):
    """Vectorize corpus. Corpus is a pd.Series with texts"""
    vectorizer = CountVectorizer(stop_words='english', strip_accents='ascii')
    X_vectorized = vectorizer.fit_transform(corpus)
    X_vectorized = X_vectorized.toarray()
    columns = vectorizer.get_feature_names()
    X_vectorized = pd.DataFrame(X_vectorized, columns=columns)
    return X_vectorized

In [42]:
def simplify_emotion_dict_and_wordcount(emo_scores_dict, word_count_vec):
    """Deletes the emotion keys in the dictionary that aren't present in the dataset"""
    # Create set intersection with words appearing in both the dic and the vector
    columns_intersection = set(word_count_vec.columns).intersection(set(emo_scores_dict.keys()))
    # Drop unnecessary word columns in word_count_vec
    word_count_vec = word_count_vec[columns_intersection]
    # Drop innecessary entries in emo_scores_dict
    keys_to_drop = set(emo_scores_dict.keys()).difference(columns_intersection)
    for key in keys_to_drop:
        emo_scores_dict.pop(key)
    return emo_scores_dict, word_count_vec

In [65]:
def get_emotion_score(X, lexicon):
    """Extract emotion scores"""

    X_vectorized = create_wordcount_vector(X['review'])
    emo_scores_dict = create_emotion_dictionary(lexicon)
    emo_scores_dict, X_vectorized = simplify_emotion_dict_and_wordcount(emo_scores_dict,
                                                                        X_vectorized)
    X['length'] = X.review.str.split(' ').apply(len)
    emotions = lexicon.emotion.unique()

    # Create new empty columns for emotion_scores
    for emo in table.columns:
        X[f'{emo}_score'] = np.nan
    # iterate through every row
    for i in tqdm(range(len(X))):
        # select columns containing words in the word count vector
        col_selector = X_vectorized.loc[i] > 0
        review = X_vectorized.loc[i, col_selector]
        # create an empty np.array with 8 spaces to add the results to
        emo_score = np.zeros(8)
        # iterate over the words contained in the review
        for j in range(len(review)):
            # select the word (string)
            word = review.index[j]
            # select the count (int)
            word_count = review[j]
            # compute emo_score by multiplying the array from the dict with the
            # word count
            emo_array = emo_scores_dict[word] * word_count
            # add emo_array to emo_score array
            emo_score += emo_array
        # compute the average emo_array for the entire review
        emo_score_avg = emo_score / X.length[i]
        # iterate over the emotion columns to append the corresponding value
        for idx, emo in enumerate(emotions):
            X[f'{emo}_score'][i] = emo_score_avg[idx]

    return X
    

In [66]:
X = get_emotion_score(X, lexicon)

100%|██████████| 52378/52378 [01:33<00:00, 557.31it/s]


## Logistic Regression Test

### Holdout

In [86]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [96]:
X_train.head()

Unnamed: 0,length,anger_score,anticipation_score,disgust_score,fear_score,joy_score,sadness_score,surprise_score,trust_score
6811,0.019149,0.046875,0.0,0.030875,0.04375,0.0,0.03155,0.00585,0.01485
52314,0.010638,0.018875,0.059917,0.016292,0.015625,0.12825,0.017583,0.01825,0.134417
11789,0.007447,0.0,0.056833,0.0,0.037333,0.091889,0.019944,0.0,0.082056
12313,0.00266,0.0,0.062444,0.0,0.0,0.087556,0.0,0.0,0.0
42192,0.030319,0.0,0.03277,0.0,0.018213,0.035852,0.00718,0.007934,0.018951


In [81]:
X_train = X_train.drop(columns='review')
X_test = X_test.drop(columns='review')

### Scale Length

In [83]:
scaler = MinMaxScaler()
X_train['length'] = scaler.fit_transform(X_train[['length']])
X_test['length'] = scaler.transform(X_test[['length']])

### Model

In [95]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train['overall'])
model.score(X_test, y_test['overall'])

0.316851215476645

## Tfidf

In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [98]:
def create_tfidf_vector(corpus):
    """Vectorize corpus. Corpus is a pd.Series with texts"""
    vectorizer = TfidfVectorizer(strip_accents='ascii')
    X_vectorized = vectorizer.fit_transform(corpus)
    X_vectorized = X_vectorized.toarray()
    columns = vectorizer.get_feature_names()
    X_vectorized = pd.DataFrame(X_vectorized, columns=columns)
    return X_vectorized

def get_emotion_score(X, lexicon):
    """Extract emotion scores"""

    X_vectorized = create_tfidf_vector(X['review'])
    emo_scores_dict = create_emotion_dictionary(lexicon)
    emo_scores_dict, X_vectorized = simplify_emotion_dict_and_wordcount(emo_scores_dict,
                                                                        X_vectorized)
    X['length'] = X.review.str.split(' ').apply(len)
    emotions = lexicon.emotion.unique()

    # Create new empty columns for emotion_scores
    for emo in table.columns:
        X[f'{emo}_score'] = np.nan
    # iterate through every row
    for i in tqdm(range(len(X))):
        # select columns containing words in the word count vector
        col_selector = X_vectorized.loc[i] > 0
        review = X_vectorized.loc[i, col_selector]
        # create an empty np.array with 8 spaces to add the results to
        emo_score = np.zeros(8)
        # iterate over the words contained in the review
        for j in range(len(review)):
            # select the word (string)
            word = review.index[j]
            # select the count (int)
            word_count = review[j]
            # compute emo_score by multiplying the array from the dict with the
            # word count
            emo_array = emo_scores_dict[word] * word_count
            # add emo_array to emo_score array
            emo_score += emo_array
        # iterate over the emotion columns to append the corresponding value
        for idx, emo in enumerate(emotions):
            X[f'{emo}_score'][i] = emo_score[idx]

    return X

In [105]:
X = get_emotion_score(X, lexicon)

100%|██████████| 52378/52378 [01:38<00:00, 531.42it/s]


In [108]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
scaler = MinMaxScaler()
X_train['length'] = scaler.fit_transform(X_train[['length']])
X_test['length'] = scaler.transform(X_test[['length']])
X_train = X_train.drop(columns='review')
X_test = X_test.drop(columns='review')

In [109]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train['overall'])
model.score(X_test, y_test['overall'])

0.3193967163039328

In [122]:
X = pd.DataFrame(df['review'])

In [123]:
X = get_emotion_score(X, lexicon)

100%|██████████| 52378/52378 [01:37<00:00, 537.66it/s]


In [124]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
scaler = MinMaxScaler()
X_train['length'] = scaler.fit_transform(X_train[['length']])
X_test['length'] = scaler.transform(X_test[['length']])
X_train = X_train.drop(columns='review')
X_test = X_test.drop(columns='review')

In [125]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train['overall'])
model.score(X_test, y_test['overall'])

0.32092401680030547

In [129]:
mapping = {1 : 0,
           2 : 0,
           3 : 1,
           4 : 2,
           5 : 2}
y_three = y.replace(mapping)

In [133]:
X_train, X_test, y_three_train, y_three_test = train_test_split(X, y_three, test_size=0.3)
scaler = MinMaxScaler()
X_train['length'] = scaler.fit_transform(X_train[['length']])
X_test['length'] = scaler.transform(X_test[['length']])
X_train = X_train.drop(columns='review')
X_test = X_test.drop(columns='review')

In [143]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_three_train['overall'])
model.score(X_test, y_three_test['overall'])

0.5125365915743922

In [135]:
mapping = {1 : 0,
           2 : 0,
           3 : 0,
           4 : 1,
           5 : 1}
y_bin = y.replace(mapping)

In [136]:
X_train, X_test, y_bin_train, y_bin_test = train_test_split(X, y_bin, test_size=0.3)
scaler = MinMaxScaler()
X_train['length'] = scaler.fit_transform(X_train[['length']])
X_test['length'] = scaler.transform(X_test[['length']])
X_train = X_train.drop(columns='review')
X_test = X_test.drop(columns='review')

In [144]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_bin_train['overall'])
model.score(X_test, y_bin_test['overall'])

0.5771923125875016

In [145]:
from sklearn.svm import SVC

In [147]:
model = SVC()
model.fit(X_train, y_bin_train['overall'])
model.score(X_test, y_bin_test['overall'])

0.5785923380425099

In [148]:
model = SVC(class_weight='balanced', verbose=True, kernel='linear')
model.fit(X_train, y_bin_train['overall'])
model.score(X_test, y_bin_test['overall'])

[LibSVM]

0.5736922489499809

In [149]:
model = SVC(class_weight='balanced')
model.fit(X_train, y_bin_train['overall'])
model.score(X_test, y_bin_test['overall'])

0.577637775232277

In [150]:
model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': 'balanced',
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [152]:
from scipy.stats import loguniform, uniform
from sklearn.model_selection import RandomizedSearchCV

In [168]:
grid = {'C': loguniform(0.1, 1.0),
        'break_ties': [False, True],
        #'coef0': loguniform(0.0, 0.5),
        'gamma': ['scale', 'auto'],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'shrinking': [True, False]}

In [169]:
grid

{'C': <scipy.stats._distn_infrastructure.rv_frozen at 0x1299ddcd0>,
 'break_ties': [False, True],
 'gamma': ['scale', 'auto'],
 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
 'shrinking': [True, False]}

In [170]:
model = SVC(class_weight='balanced')
gridsearch = RandomizedSearchCV(model, grid,
                                scoring=['f1', 'balanced_accuracy', 'f1_weighted'],
                                cv=10, n_jobs=-1, verbose=1, refit=False)

In [171]:
gridsearch.fit(X_train, y_bin_train['overall'])

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 20.2min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 290.1min finished


RandomizedSearchCV(cv=10, estimator=SVC(class_weight='balanced'), n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1299ddcd0>,
                                        'break_ties': [False, True],
                                        'gamma': ['scale', 'auto'],
                                        'kernel': ['linear', 'poly', 'rbf',
                                                   'sigmoid'],
                                        'shrinking': [True, False]},
                   refit=False,
                   scoring=['f1', 'balanced_accuracy', 'f1_weighted'],
                   verbose=1)