In [94]:
import pandas as pd
from tqdm import tqdm
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from scipy.stats import loguniform, uniform
from sklearn.model_selection import RandomizedSearchCV
import os

In [95]:
from WorkforceSentimentMonitoring.data import get_prepaired_data
from WorkforceSentimentMonitoring.encoders import Preprocessor
from WorkforceSentimentMonitoring.encoders import EmotionScoresExtractor
from WorkforceSentimentMonitoring.encoders import CustomMinMaxScaler
from WorkforceSentimentMonitoring.encoders import FeatureEngineer

from sklearn.pipeline import Pipeline
import joblib

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Load the prepared train/test split (features and targets).
# NOTE(review): the recorded output shows a "Drop 431 entries? [y] / n" prompt,
# so this call presumably waits for user input and would block an unattended
# Restart & Run All — confirm.
X_train, X_test, y_train, y_test = get_prepaired_data()

Reading data...
Merging data into a single DataFrame...
Dropping initial text columns...
Identifying entries in other languages...


HBox(children=(HTML(value='Dask Apply'), FloatProgress(value=0.0, max=8.0), HTML(value='')))


Drop 431 entries? [y] / n

Dropping 431 entries...
Splitting train and test...
Encoding targets...
Done!


In [9]:
# Fit the preprocessor on the training split only, then apply the fitted
# instance to the test split.
# Both frames are rebound to their transformed versions, so re-running this
# cell would preprocess already-preprocessed data.
preprocessor = Preprocessor()
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [13]:
# Fit the emotion-score extractor on the training split only and reuse that
# same fitted instance on the test split. This mirrors the preprocessor and
# scaler cells and the single 'emo_extractor' step of the exported Pipeline,
# so nothing is ever fitted on test data.
emo_extractor = EmotionScoresExtractor()
X_train = emo_extractor.fit_transform(X_train)
X_test = emo_extractor.transform(X_test)

100%|██████████| 36668/36668 [01:02<00:00, 589.17it/s]
100%|██████████| 15716/15716 [00:26<00:00, 593.07it/s]


In [16]:
# Fit the scaler on the training split only, then apply the fitted instance
# to the test split.
scaler = CustomMinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [36]:
# Fit the feature engineer on the training split and replace X_train with the
# engineered frame.
# NOTE(review): unlike the preceding cells, X_test is not passed through
# engineer.transform here — confirm whether that is intentional.
engineer = FeatureEngineer()
X_train = engineer.fit_transform(X_train)

In [37]:
# Display the engineered training frame (pandas shows a truncated view).
X_train

Unnamed: 0,review,length,anger_score,anticipation_score,disgust_score,fear_score,joy_score,sadness_score,surprise_score,trust_score,subjectivity_review,polarity_review
0,review at apple you will meet some of the most...,0.007648,0.000000,0.000000,0.000000,0.000000,0.092231,0.000000,0.237910,0.000000,0.611310,0.385714
1,amazing best company in the world there are ab...,0.001055,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.700000,0.600000
2,senior manager operation smart people interest...,0.021361,0.000000,0.201113,0.057583,0.083206,0.150040,0.036093,0.066779,0.164266,0.540377,0.174802
3,pay well but they own you the pay is good you ...,0.010285,0.000000,0.464431,0.000000,0.000000,0.237933,0.046479,0.145582,0.107076,0.534776,0.247258
4,great pay benefit but the pay for an entry lev...,0.033755,0.122394,0.086545,0.000000,0.053487,0.121497,0.137045,0.008245,0.288837,0.524580,0.231863
...,...,...,...,...,...,...,...,...,...,...,...,...
36663,at home advisor love the people management is ...,0.006857,0.013411,0.000000,0.000000,0.074641,0.485499,0.000000,0.000000,0.178611,0.783333,0.266667
36664,need longer break and lunch good pay benefit a...,0.003692,0.000000,0.167541,0.000000,0.000000,0.144773,0.000000,0.227597,0.090964,0.433333,0.216667
36665,senior escalation engineer amazing package mer...,0.005538,0.000000,0.303007,0.000000,0.000000,0.368046,0.029116,0.112620,0.365320,0.650000,0.150000
36666,fine get paid job security ok all about fast n...,0.003692,0.041655,0.122823,0.018566,0.022375,0.116205,0.018090,0.074265,0.306324,0.475000,0.329167


In [100]:
# Chain the four feature-preparation steps, in the order they were applied in
# the cells above, into one sklearn Pipeline and write it to 'final_pipe.joblib'
# (a path relative to the current working directory).
# NOTE(review): the pipeline is built from fresh instances and dumped without
# fit being called in this cell — confirm whether the consumer is expected to
# fit it after loading, or whether the already-fitted objects should be saved.
pipeline = Pipeline([
    ('preprocessor', Preprocessor()),
    ('emo_extractor', EmotionScoresExtractor()),
    ('scaler', CustomMinMaxScaler()),
    ('engineer', FeatureEngineer())
])
joblib.dump(pipeline, 'final_pipe.joblib')

['final_pipe.joblib']

In [42]:
from WorkforceSentimentMonitoring.trainer import MultiNBFeaturesExtractor

In [43]:
# Fit the project's multi-target NB feature extractor on the training frame;
# the returned value is kept in `tmp`.
# NOTE(review): the later X_train display includes the *_nb columns although
# only `tmp` is assigned here, which suggests fit_transform also modifies
# X_train in place — confirm.
nb_extractor = MultiNBFeaturesExtractor()
tmp = nb_extractor.fit_transform(X_train)



In [72]:
# Display the training frame again; the recorded output now also lists the
# six *_nb prediction columns.
X_train

Unnamed: 0,review,length,anger_score,anticipation_score,disgust_score,fear_score,joy_score,sadness_score,surprise_score,trust_score,subjectivity_review,polarity_review,work-balance_nb,culture-values_nb,career-opportunities_nb,comp-benefits_nb,senior-mgmt_nb,overall_nb
0,review at apple you will meet some of the most...,0.007648,0.000000,0.000000,0.000000,0.000000,0.092231,0.000000,0.237910,0.000000,0.611310,0.385714,0,1,1,1,0,0
1,amazing best company in the world there are ab...,0.001055,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.700000,0.600000,1,1,1,1,1,1
2,senior manager operation smart people interest...,0.021361,0.000000,0.201113,0.057583,0.083206,0.150040,0.036093,0.066779,0.164266,0.540377,0.174802,1,1,1,1,1,1
3,pay well but they own you the pay is good you ...,0.010285,0.000000,0.464431,0.000000,0.000000,0.237933,0.046479,0.145582,0.107076,0.534776,0.247258,0,1,1,1,1,1
4,great pay benefit but the pay for an entry lev...,0.033755,0.122394,0.086545,0.000000,0.053487,0.121497,0.137045,0.008245,0.288837,0.524580,0.231863,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36663,at home advisor love the people management is ...,0.006857,0.013411,0.000000,0.000000,0.074641,0.485499,0.000000,0.000000,0.178611,0.783333,0.266667,0,1,1,1,1,1
36664,need longer break and lunch good pay benefit a...,0.003692,0.000000,0.167541,0.000000,0.000000,0.144773,0.000000,0.227597,0.090964,0.433333,0.216667,0,0,0,0,0,0
36665,senior escalation engineer amazing package mer...,0.005538,0.000000,0.303007,0.000000,0.000000,0.368046,0.029116,0.112620,0.365320,0.650000,0.150000,0,1,1,1,1,0
36666,fine get paid job security ok all about fast n...,0.003692,0.041655,0.122823,0.018566,0.022375,0.116205,0.018090,0.074265,0.306324,0.475000,0.329167,0,0,0,0,0,0


## Testing Pipe for NB and saving new models

In [124]:
def set_pipe_dfidf_nb():
    """Build an unfitted TF-IDF + Multinomial Naive Bayes pipeline.

    The first step is a ``ColumnTransformer`` that applies a
    ``TfidfVectorizer`` to the ``'review'`` column and drops every other
    column; the second step is a ``MultinomialNB`` classifier.

    Returns:
        Pipeline: unfitted pipeline whose steps are named ``'vectorizer'``
        and ``'classifier'``.
    """
    # Only the 'review' text is vectorized; all remaining columns are dropped.
    review_tfidf = ColumnTransformer(
        transformers=[('vectorizer', TfidfVectorizer(), 'review')],
        remainder='drop',
    )
    steps = [
        ('vectorizer', review_tfidf),
        ('classifier', MultinomialNB()),
    ]
    return Pipeline(steps=steps)

def random_gridsearch_pipe(pipe, X, y, **kwargs):
    """Tune ``pipe`` with a randomized hyper-parameter search.

    The search space addresses parameters through the step names
    ``vectorizer__vectorizer__*`` and ``classifier__*``, i.e. it expects a
    pipeline laid out like the one returned by ``set_pipe_dfidf_nb``.

    Args:
        pipe: unfitted pipeline to tune.
        X: training features passed to ``RandomizedSearchCV.fit``.
        y: target passed to ``RandomizedSearchCV.fit``.
        **kwargs: extra keyword arguments forwarded to ``RandomizedSearchCV``.
            They override the defaults used here (``n_iter=10``,
            ``verbose=1``, ``refit=True``, ``scoring='balanced_accuracy'``,
            ``n_jobs=-1``), e.g. ``random_state=0`` or ``cv=3``.

    Returns:
        The ``best_estimator_`` of the search. With the default
        ``refit=True`` this is the best candidate refitted on all of
        ``X``/``y``; passing ``refit=False`` makes that attribute unavailable.
    """
    param_distributions = {
        'vectorizer__vectorizer__analyzer': ['char', 'word'],
        'vectorizer__vectorizer__ngram_range': [(1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7)],
        'vectorizer__vectorizer__max_df': loguniform(0.7, 1.0),
        'vectorizer__vectorizer__min_df': loguniform(0.001, 0.1),
        'vectorizer__vectorizer__stop_words': [None, 'english'],
        'vectorizer__vectorizer__norm': ['l1', 'l2'],
        'classifier__alpha': loguniform(0.001, 1),
    }

    # Defaults first, then caller overrides: previously **kwargs was accepted
    # but silently ignored.
    search_params = dict(n_iter=10, verbose=1, refit=True,
                         scoring='balanced_accuracy', n_jobs=-1)
    search_params.update(kwargs)

    gridsearch = RandomizedSearchCV(pipe, param_distributions, **search_params)
    gridsearch.fit(X, y)

    return gridsearch.best_estimator_

def export_joblib(pipe, name):
    """Persist ``pipe`` with joblib as ``../joblib_files/<name>.joblib``.

    The location is resolved relative to the current working directory, and
    the target folder is created when it does not exist yet.

    Args:
        pipe: object to serialize (here: a fitted sklearn pipeline).
        name: file name without the ``.joblib`` extension.
    """
    # os.path.abspath('') yields the current working directory; normalise the
    # '..' component so the folder can be created safely.
    target_dir = os.path.normpath(
        os.path.join(os.path.abspath(''), '..', 'joblib_files')
    )
    # Create the folder up front: dumping into a missing directory fails.
    os.makedirs(target_dir, exist_ok=True)
    filename = os.path.join(target_dir, f'{name}.joblib')
    joblib.dump(pipe, filename)

def extract_NB_predictions(X, y, pipe, targets):
    """Fit ``pipe`` once per target and append its predictions as features.

    For every ``target`` the pipeline is fitted on ``y[target]``, exported via
    ``export_joblib`` under the name ``'<target>_nb'``, and its predictions on
    the same (training) rows are stored in a column of that name.

    Args:
        X: feature DataFrame; it is not modified.
        y: DataFrame holding one column per target.
        pipe: estimator exposing ``fit`` and ``predict``; the same object is
            refitted for each target.
        targets: iterable of column names of ``y`` to process.

    Returns:
        A copy of ``X`` extended with one ``'<target>_nb'`` column per target.
    """
    # Work on a copy so the caller's frame is left untouched and repeated
    # calls start from the same input.
    X = X.copy()
    for target in tqdm(targets):
        feature_name = f'{target}_nb'
        pipe.fit(X, y[target])
        export_joblib(pipe, feature_name)
        X[feature_name] = pipe.predict(X)
    return X

def iterative_gridsearch_pipe(X, y):
    """Tune, export and apply one TF-IDF + NB model per target column of ``y``.

    For each column of ``y`` a fresh pipeline from ``set_pipe_dfidf_nb`` is
    tuned with ``random_gridsearch_pipe``, the best estimator is exported via
    ``export_joblib`` as ``'<target>_nb'``, and its predictions on the same
    (training) rows are stored in a column of that name.

    Args:
        X: feature DataFrame; must contain the ``'review'`` column read by the
            pipeline. It is not modified.
        y: DataFrame with one column per target.

    Returns:
        A copy of ``X`` extended with one ``'<target>_nb'`` column per target.
    """
    # Work on a copy: the caller's frame (possibly a column slice of a larger
    # frame) is left untouched and re-running the cell stays idempotent.
    X = X.copy()
    for target in tqdm(y.columns):
        feature_name = f'{target}_nb'
        best_estimator = random_gridsearch_pipe(set_pipe_dfidf_nb(), X, y[target])
        export_joblib(best_estimator, feature_name)
        X[feature_name] = best_estimator.predict(X)
    return X

In [None]:
# First attempt at the per-target search on the full training set.
# NOTE(review): this cell has no execution count and its recorded log stops
# during the sixth target, so the run apparently never finished; the same
# call is repeated in the last cell of the notebook — confirm whether this
# cell can be removed.
X = iterative_gridsearch_pipe(X_train, y_train)

  0%|          | 0/6 [00:00<?, ?it/s][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 29.4min finished
 17%|█▋        | 1/6 [29:38<2:28:11, 1778.38s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 92.0min finished
 33%|███▎      | 2/6 [2:04:39<3:17:01, 2955.34s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 23.5min finished
 50%|█████     | 3/6 [2:31:25<2:07:31, 2550.50s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 35.5min finished
 67%|██████▋   | 4/6 [4:40:28<2:16:56, 4108.09s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 501.1min finished
 83%|████████▎ | 5/6 [13:04:38<3:19:10, 11950.71s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


## Pipeline testing

In [129]:
# Build an unfitted TF-IDF + NB pipeline; no later cell shown here reads tmp_pipe.
tmp_pipe = set_pipe_dfidf_nb()

In [130]:
# Draw a 100-row subset of features and targets for a quick smoke test.
# NOTE(review): the two frames are sampled independently with the same seed;
# rows stay paired only if X_train and y_train have the same length and row
# order — confirm.
X_tmp, y_tmp = X_train.sample(100, random_state=2), y_train.sample(100, random_state=2)

In [131]:
# Keep only the 'review' text column, the only column the ColumnTransformer
# in set_pipe_dfidf_nb reads.
X_tmp = X_tmp[['review']]

In [132]:
# Preview the sampled subset.
X_tmp.head()

Unnamed: 0,review
19852,a a first job out of college it is amazing per...
14071,learning google better place to learn and have...
13159,customer service it is big and fantstic compan...
5308,not for normal people great people to work wit...
19397,thankful to work at amazon learning opportunit...


In [133]:
# Smoke-test the full search/export/predict loop on the 100-row subset.
# The models written by export_joblib in this run come from that subset only.
X = iterative_gridsearch_pipe(X_tmp, y_tmp)

  0%|          | 0/6 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    6.6s finished
 17%|█▋        | 1/6 [00:07<00:38,  7.70s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    2.4s finished
 33%|███▎      | 2/6 [00:10<00:24,  6.18s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    3.9s finished
 50%|█████     | 3/6 [00:14<00:16,  5.66s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    2.9s finished
 67%|██████▋   | 4/6 [00:17<00:09,  4.87s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    3.9s finished
 83%|████████▎ | 5/6 [00:22<00:04,  4.80s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    2.9s finished
100%|██████████| 6/6 [00:25<00:00,  4.31s/it]


In [134]:
# Display a tuned pipeline.
# NOTE(review): best_estimator_tmp is not assigned in any cell shown in this
# notebook, so this presumably relies on leftover kernel state and would fail
# on a fresh kernel — confirm.
best_estimator_tmp

Pipeline(steps=[('vectorizer',
                 ColumnTransformer(transformers=[('vectorizer',
                                                  TfidfVectorizer(analyzer='char',
                                                                  max_df=0.9922969356692659,
                                                                  min_df=0.004481460859411207,
                                                                  ngram_range=(1,
                                                                               7),
                                                                  norm='l1',
                                                                  stop_words='english'),
                                                  'review')])),
                ('classifier', MultinomialNB(alpha=0.0017667011850638827))])

In [135]:
# Preview the smoke-test result: the review text plus one *_nb column per target.
X.head()

Unnamed: 0,review,work-balance_nb,culture-values_nb,career-opportunities_nb,comp-benefits_nb,senior-mgmt_nb,overall_nb
19852,a a first job out of college it is amazing per...,1,1,1,1,1,1
14071,learning google better place to learn and have...,1,1,0,1,1,1
13159,customer service it is big and fantstic compan...,0,0,0,0,0,0
5308,not for normal people great people to work wit...,0,0,0,0,0,0
19397,thankful to work at amazon learning opportunit...,1,1,1,1,1,1


## Export the models again, trained on the full training set

In [136]:
# Run the per-target search on the full training set. export_joblib writes
# '<target>_nb' files again (same names as in the 100-row smoke test), and
# X_train is rebound to the returned frame with the *_nb prediction columns.
X_train = iterative_gridsearch_pipe(X_train, y_train)

  0%|          | 0/6 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 14.7min finished
 17%|█▋        | 1/6 [15:59<1:19:56, 959.33s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 13.5min finished
 33%|███▎      | 2/6 [32:35<1:04:41, 970.50s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 10.9min finished
 50%|█████     | 3/6 [43:50<44:04, 881.61s/it]  [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 10.5min finished
 67%|██████▋   | 4/6 [57:13<28:36, 858.21s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 13.7min finished
 83%|████████▎ | 5/6 [1:11:11<14:12, 852.22s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 11.8min finished
100%|██████████| 6/6 [1:23:55<00:00, 839.18s/it]
