# Pipeline

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [129]:
from WorkforceSentimentMonitoring.data import get_prepaired_data
from WorkforceSentimentMonitoring.encoders import Preprocessor, CustomMinMaxScaler, FeatureEngineer
from WorkforceSentimentMonitoring.preprocessing import lemmatize
import numpy as np
import pandas as pd
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from langdetect import detect

from scipy.stats import loguniform, uniform
import joblib
from tqdm import tqdm

## Get data

In [3]:
X_train, X_test, y_train, y_test = get_prepaired_data()

Reading data...
Merging data into a single DataFrame...
Dropping initial text columns...
Identifying entries in other languages...
Drop 442 entries? [y] / n

Dropping 442 entries...
Process completed.
Splitting train and test...
Encoding targets...
Done!


## Preprocess

In [4]:
preprocessor = Preprocessor()
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.fit_transform(X_test)

In [5]:
X_train.head()

Unnamed: 0,review
0,specialist ptgreat benefit work life balance ...
1,amazong place to work a long a you can cope up...
2,customer service associatefriendly environment...
3,shipping clerkamazon when it first opened up w...
4,many benefit and experience to learn fromamazi...


## Feature Engineering

In [6]:
engineer = FeatureEngineer()
X_train = engineer.fit_transform(X_train)
X_test = engineer.fit_transform(X_test)

100%|██████████| 1/1 [01:04<00:00, 64.99s/it]
100%|██████████| 1/1 [00:33<00:00, 33.95s/it]


In [7]:
display(X_train.head(1))
display(X_test.head(1))

Unnamed: 0,review,review_length,subjectivity_review,polarity_review
0,specialist ptgreat benefit work life balance ...,273,0.651515,0.045455


Unnamed: 0,review,review_length,subjectivity_review,polarity_review
0,so much for competative pay good benefit a few...,566,0.351667,-0.018333


In [14]:
y_train

Unnamed: 0,work-balance,culture-values,career-opportunities,comp-benefits,senior-mgmt,overall
0,2,2,1,2,2,2
1,1,2,1,2,2,2
2,0,2,1,0,2,2
3,0,0,2,2,0,0
4,0,2,1,2,2,2
...,...,...,...,...,...,...
36656,1,2,2,2,2,2
36657,2,2,2,2,2,2
36658,1,0,0,2,0,1
36659,0,0,0,1,0,0


### NB function + export joblib

In [114]:
def export_joblib(estimator, name):
    dirname = os.path.abspath('')
    filename = os.path.join(dirname, f'../joblib_files/{name}.joblib')
    joblib.dump(estimator, filename)

In [270]:
targets = y_train.columns
def extract_NB_predictions(X, y, targets):
    for target in tqdm(targets):
        vectorizer = ColumnTransformer([
            ('vectorizer' ,TfidfVectorizer(), 'review')
        ], remainder='drop')

        pipe = make_pipeline(
            (vectorizer),
            (MultinomialNB())
        )
        pipe.fit(X, y[target])
        feature_name = f'{target}_nb'
        export_joblib(pipe, feature_name)
        X[feature_name] = pipe.predict(X)
    return X

### Test 1

In [122]:
X_tmp = X_train.sample(100, random_state=2).copy()
y_tmp = y_train.sample(100, random_state=2).copy()

In [123]:
tmp = extract_NB_predictions(X_tmp, y_tmp, targets)

100%|██████████| 6/6 [00:11<00:00,  1.96s/it]


In [124]:
tmp.iloc[:, -len(targets):].head()

Unnamed: 0,work-balance_nb,culture-values_nb,career-opportunities_nb,comp-benefits_nb,senior-mgmt_nb,overall_nb
625,2,2,2,2,2,2
31184,0,2,2,2,2,2
7969,2,2,2,2,2,2
20621,2,2,2,2,2,2
19382,0,2,2,2,2,2


In [125]:
from sklearn.metrics import f1_score, classification_report
tmp_preds = tmp.iloc[:, -len(targets):]
tmp_preds
reports = {}
for target in targets:
    reports[target] = classification_report(y_tmp[target], tmp_preds[f'{target}_nb'])
    print(target)
    print(reports[target])

work-balance
              precision    recall  f1-score   support

           0       0.97      0.94      0.95        32
           1       1.00      0.17      0.29        29
           2       0.61      1.00      0.76        39

    accuracy                           0.74       100
   macro avg       0.86      0.70      0.67       100
weighted avg       0.84      0.74      0.69       100

culture-values
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.00      0.00      0.00        15
           2       0.66      1.00      0.80        66

    accuracy                           0.66       100
   macro avg       0.22      0.33      0.27       100
weighted avg       0.44      0.66      0.52       100

career-opportunities
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        18
           1       0.00      0.00      0.00        26
           2       0.56   

  _warn_prf(average, modifier, msg_start, len(result))


### Test 2

In [126]:
X_tmp = X_train.sample(10000, random_state=2).copy()
y_tmp = y_train.sample(10000, random_state=2).copy()

tmp = extract_NB_predictions(X_tmp, y_tmp, targets)

tmp_preds = tmp.iloc[:, -len(targets):]

reports = {}
for target in targets:
    reports[target] = classification_report(y_tmp[target], tmp_preds[f'{target}_pred_nb'])
    print(target)
    print(reports[target])

100%|██████████| 6/6 [08:08<00:00, 81.47s/it]


KeyError: 'work-balance_pred_nb'

In [118]:
tmp

Unnamed: 0,review,review_length,subjectivity_review,polarity_review,work-balance_nb,culture-values_nb,career-opportunities_nb,comp-benefits_nb,senior-mgmt_nb,overall_nb
625,great people great work great perk but incr...,220,0.668651,0.283069,2,2,2,2,2,2
31184,high stress ton of hour great paythe pay and...,1547,0.520143,0.307330,2,2,2,2,2,2
7969,they wasted my time nice cafeteria it wa exci...,1263,0.574815,0.166667,2,2,2,2,2,2
20621,best career is in googlesalary is very good in...,91,0.540000,0.955000,2,2,2,2,2,2
19382,senior software designerample opportunity and ...,961,0.482353,0.222549,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...
10746,great companygreat company to work forsome tim...,72,0.675000,0.250000,2,2,2,2,2,2
13305,wonder placesolution oriented on on onafter sa...,65,0.000000,0.000000,2,2,2,2,2,2
10533,great people average managementthe people are...,393,0.630925,0.289367,2,2,2,2,2,2
12259,ok but look beyond the hypemany perk free fo...,195,0.575000,0.450000,2,2,2,2,2,2


### Test 3

In [None]:
X_tmp = X_train.sample(1000, random_state=2).copy()
y_tmp = y_train.sample(1000, random_state=2).copy()

tmp = extract_NB_predictions(X_tmp, y_tmp, targets)

In [280]:
targets

Index(['work-balance', 'culture-values', 'career-opportunities',
       'comp-benefits', 'senior-mgmt', 'overall'],
      dtype='object')

In [119]:
tmp_preds = tmp.iloc[:, -len(targets):]

reports = {}
for target in targets:
    reports[target] = classification_report(y_tmp[target], tmp_preds[f'{target}_nb'], )
    print(target)
    print(reports[target])

100%|██████████| 6/6 [01:33<00:00, 15.53s/it]

work-balance
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       270
           1       0.00      0.00      0.00       235
           2       0.49      1.00      0.66       495

    accuracy                           0.49      1000
   macro avg       0.17      0.33      0.22      1000
weighted avg       0.25      0.49      0.33      1000

culture-values
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       171
           1       0.00      0.00      0.00       169
           2       0.66      1.00      0.80       660

    accuracy                           0.66      1000
   macro avg       0.22      0.33      0.27      1000
weighted avg       0.44      0.66      0.52      1000

career-opportunities
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       178
           1       0.00      0.00      0.00       226
           2       0.60   


  _warn_prf(average, modifier, msg_start, len(result))


## Grid Search

In [None]:
# oversample small sample
# undersample big class
# combine small classes
# grid search params, balanced_accuracy

In [321]:
X_tmp = X_train.sample(10_000, random_state=2).copy()
y_tmp = y_train.sample(10_000, random_state=2).copy()

In [262]:
X_tmp.head()

Unnamed: 0,review,review_length,subjectivity_review,polarity_review
625,great people great work great perk but incr...,220,0.668651,0.283069
31184,high stress ton of hour great paythe pay and...,1547,0.520143,0.30733
7969,they wasted my time nice cafeteria it wa exci...,1263,0.574815,0.166667
20621,best career is in googlesalary is very good in...,91,0.54,0.955
19382,senior software designerample opportunity and ...,961,0.482353,0.222549


In [263]:
y_tmp.head()

Unnamed: 0,work-balance,culture-values,career-opportunities,comp-benefits,senior-mgmt,overall
625,2,2,1,2,2,2
31184,0,1,1,2,2,2
7969,1,1,0,0,1,1
20621,2,2,2,2,2,2
19382,0,2,2,2,2,2


In [264]:
vectorizer = ColumnTransformer([
        ('vectorizer' ,TfidfVectorizer(), 'review')
    ],
    remainder='drop')
    
pipe = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', MultinomialNB())
    ])

In [265]:
grid = dict(
    vectorizer__vectorizer__analyzer = ['char', 'word'],
    vectorizer__vectorizer__ngram_range = [(1,2), (1,3), (1,4), (1,5), (1,6), (1,7)],
    vectorizer__vectorizer__max_df = loguniform(0.7, 1.0),
    vectorizer__vectorizer__min_df = loguniform(0.001, 0.1),
    vectorizer__vectorizer__stop_words = [None, 'english'],
    vectorizer__vectorizer__norm = ['l1', 'l2'],
    classifier__alpha = loguniform(0.001, 1)
)

In [266]:
grid

{'vectorizer__vectorizer__analyzer': ['char', 'word'],
 'vectorizer__vectorizer__ngram_range': [(1, 2),
  (1, 3),
  (1, 4),
  (1, 5),
  (1, 6),
  (1, 7)],
 'vectorizer__vectorizer__max_df': <scipy.stats._distn_infrastructure.rv_frozen at 0x13100b910>,
 'vectorizer__vectorizer__min_df': <scipy.stats._distn_infrastructure.rv_frozen at 0x12f039ad0>,
 'vectorizer__vectorizer__stop_words': [None, 'english'],
 'vectorizer__vectorizer__norm': ['l1', 'l2'],
 'classifier__alpha': <scipy.stats._distn_infrastructure.rv_frozen at 0x12dac3790>}

In [267]:
gridsearch = RandomizedSearchCV(pipe, grid, n_iter=10,
                                scoring = 'balanced_accuracy',
                                verbose=1)

In [268]:
gridsearch.fit(X_tmp, y_tmp['overall'])

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  8.4min finished


RandomizedSearchCV(estimator=Pipeline(steps=[('vectorizer',
                                              ColumnTransformer(transformers=[('vectorizer',
                                                                               TfidfVectorizer(),
                                                                               'review')])),
                                             ('classifier', MultinomialNB())]),
                   param_distributions={'classifier__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x12dac3790>,
                                        'vectorizer__vectorizer__analyzer': ['char',
                                                                             'word'],
                                        'vectorizer__vectorizer_...structure.rv_frozen object at 0x13100b910>,
                                        'vectorizer__vectorizer__min_df': <scipy.stats._distn_infrastructure.rv_frozen object at 0x12f039ad0>,
                     

In [276]:
best_pipe = gridsearch.best_estimator_

### Testing best pipeline

In [319]:
def extract_NB_predictions(X, y, pipe, targets):
    for target in tqdm(targets):
        pipe.fit(X[['review']], y[target])
        feature_name = f'{target}_nb'
        export_joblib(pipe, feature_name)
        X[feature_name] = pipe.predict(X)
    return X

In [None]:
X_tmp = X_train.sample(10_000, random_state=2).copy()
y_tmp = y_train.sample(10_000, random_state=2).copy()

tmp = extract_NB_predictions(X_tmp, y_tmp, best_pipe, targets)

In [330]:
tmp = extract_NB_predictions(X_tmp, y_tmp, best_pipe, targets)

100%|██████████| 6/6 [02:43<00:00, 27.25s/it]


In [332]:
tmp_preds = tmp.iloc[:, -len(targets):]

reports = {}
for target in targets:
    reports[target] = classification_report(y_tmp[target], tmp_preds[f'{target}_nb'],
                                            target_names=['negative','positive'])
    print(target.center(53, '-'))
    print(reports[target])

---------------------work-balance--------------------
              precision    recall  f1-score   support

    negative       0.69      0.69      0.69      5015
    positive       0.69      0.69      0.69      4985

    accuracy                           0.69     10000
   macro avg       0.69      0.69      0.69     10000
weighted avg       0.69      0.69      0.69     10000

--------------------culture-values-------------------
              precision    recall  f1-score   support

    negative       0.70      0.54      0.61      3484
    positive       0.78      0.87      0.83      6516

    accuracy                           0.76     10000
   macro avg       0.74      0.71      0.72     10000
weighted avg       0.75      0.76      0.75     10000

-----------------career-opportunities----------------
              precision    recall  f1-score   support

    negative       0.69      0.59      0.63      4023
    positive       0.75      0.82      0.78      5977

    accuracy        

## New Target encoding (0 - 1)

In [324]:
new_encoding = {0 : 0,
                1 : 0,
                2 : 1}
for col in y_tmp.columns:
    y_tmp[col] = y_tmp[col].map(new_encoding)

In [325]:
y_tmp

Unnamed: 0,work-balance,culture-values,career-opportunities,comp-benefits,senior-mgmt,overall
625,1,1,0,1,1,1
31184,0,0,0,1,1,1
7969,0,0,0,0,0,0
20621,1,1,1,1,1,1
19382,0,1,1,1,1,1
...,...,...,...,...,...,...
35319,0,1,1,0,1,1
3742,0,1,1,1,0,1
24623,1,1,1,0,1,1
33306,1,0,1,1,0,0


In [309]:
gridsearch = RandomizedSearchCV(pipe, grid, n_iter=10,
                                scoring = 'balanced_accuracy',
                                verbose=1)
gridsearch.fit(X_tmp, y_tmp['overall'])

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  5.5min finished


RandomizedSearchCV(estimator=Pipeline(steps=[('vectorizer',
                                              ColumnTransformer(transformers=[('vectorizer',
                                                                               TfidfVectorizer(),
                                                                               'review')])),
                                             ('classifier', MultinomialNB())]),
                   param_distributions={'classifier__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x12dac3790>,
                                        'vectorizer__vectorizer__analyzer': ['char',
                                                                             'word'],
                                        'vectorizer__vectorizer_...structure.rv_frozen object at 0x13100b910>,
                                        'vectorizer__vectorizer__min_df': <scipy.stats._distn_infrastructure.rv_frozen object at 0x12f039ad0>,
                     

In [310]:
best_pipe = gridsearch.best_estimator_

In [311]:
tmp = extract_NB_predictions(X_tmp, y_tmp, best_pipe, targets)

100%|██████████| 6/6 [02:08<00:00, 21.39s/it]


In [312]:
tmp.head()

Unnamed: 0,review,review_length,subjectivity_review,polarity_review,work-balance_nb,culture-values_nb,career-opportunities_nb,comp-benefits_nb,senior-mgmt_nb,overall_nb
625,great people great work great perk but incr...,220,0.668651,0.283069,1,1,1,1,1,1
31184,high stress ton of hour great paythe pay and...,1547,0.520143,0.30733,0,0,0,0,0,0
7969,they wasted my time nice cafeteria it wa exci...,1263,0.574815,0.166667,0,0,0,0,0,0
20621,best career is in googlesalary is very good in...,91,0.54,0.955,1,1,1,1,1,1
19382,senior software designerample opportunity and ...,961,0.482353,0.222549,0,0,1,1,0,0


In [313]:
tmp_preds = tmp.iloc[:, -len(targets):]

reports = {}
for target in targets:
    reports[target] = classification_report(y_tmp[target], tmp_preds[f'{target}_nb'],
                                            target_names=['negative','positive'])
    print(target.center(53, '-'))
    print(reports[target])

---------------------work-balance--------------------
              precision    recall  f1-score   support

    negative       0.69      0.69      0.69      5015
    positive       0.69      0.69      0.69      4985

    accuracy                           0.69     10000
   macro avg       0.69      0.69      0.69     10000
weighted avg       0.69      0.69      0.69     10000

--------------------culture-values-------------------
              precision    recall  f1-score   support

    negative       0.70      0.54      0.61      3484
    positive       0.78      0.87      0.83      6516

    accuracy                           0.76     10000
   macro avg       0.74      0.71      0.72     10000
weighted avg       0.75      0.76      0.75     10000

-----------------career-opportunities----------------
              precision    recall  f1-score   support

    negative       0.69      0.59      0.63      4023
    positive       0.75      0.82      0.78      5977

    accuracy        

In [314]:
gridsearch.best_params_

{'classifier__alpha': 0.03458454818476731,
 'vectorizer__vectorizer__analyzer': 'char',
 'vectorizer__vectorizer__max_df': 0.8307731986661769,
 'vectorizer__vectorizer__min_df': 0.013723547074227254,
 'vectorizer__vectorizer__ngram_range': (1, 4),
 'vectorizer__vectorizer__norm': 'l2',
 'vectorizer__vectorizer__stop_words': 'english'}

In [316]:
gridsearch.n_features_in_

10

## Pipeline

In [None]:
pipe = Pipeline([
    ('preprocessor', Preprocessor()),
    ('engineer', FeatureEngineer()),
    ('scaler', CustomMinMaxScaler())
])

In [125]:
X_train

Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review
0,Amazon gave me a chance,I was asking for employment and they gave me a...,I don't have any down sides,Keep doing what you are doing,Amazon gave me a chance I was asking for emplo...
1,A great company to work for with a few glaring...,"Great co-workers, challenging environment that...",In a crowded retail environment where employee...,The attendance policy is abusive and bordering...,A great company to work for with a few glaring...
2,Senior Program Manager,lots of opportunities for work internally grea...,not the best talent some uninspiring leaders p...,,Senior Program Manager lots of opportunities f...
3,"Great company, lots of politics",- Compensation - Great product - Enhances resume,- No life balance - Lots of internal politics ...,,"Great company, lots of politics - Compensation..."
4,Cloud Support Engineer,1) Great working environment. less work pressu...,1) Need to be active about career growth. Norm...,Keep mixing things up and get every employee i...,Cloud Support Engineer 1) Great working enviro...
...,...,...,...,...,...
36668,Working for a big compay,Large company lots of talk about improvments a...,You need to give up your social and personal l...,,Working for a big compay Large company lots of...
36669,SDE,"Great, smart people to work with who know what...",Windows seems like a really slow org of gettin...,,"SDE Great, smart people to work with who know ..."
36670,Amazon Restaurants,Great sales team & direct managers,Upper management makes the job miserable and m...,,Amazon Restaurants Great sales team & direct m...
36671,Good company for life,Good living benefits and work life balance. Go...,Salary is not so good. Salary is not so good.S...,"Different manger differs, need to be more aggr...",Good company for life Good living benefits and...


# Extra: Implementation wrong language function

In [None]:
def drop_wrong_language(df, column, language = 'en', inplace=False):
    '''drops entries written in languages other thatn the specified'''
    print('Identifying entries in other languages...')
    is_wrong = df[column].apply(detect) != language
    n_rows_to_drop = is_wrong.sum()

    user_confirmation = None
    while not (user_confirmation is 'y' or user_confirmation is 'n'):
        user_confirmation = input(f'Drop {n_rows_to_drop} entries? y / [n]\n') or 'n'
    if user_confirmation is 'y':
        if inplace:
            print(f'Dropping {n_rows_to_drop} entries...')
            df = df[~is_wrong]
            df.reset_index(inplace=True, drop=True)
            print('Process completed.')
            return df
        else:
            print(f'Dropping {n_rows_to_drop} entries...')
            print('Process completed.')
            return df[~is_wrong].reset_index(inplace=True, drop=True)
    else:
        print('Process aborted')
        return None