# Pipeline

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [12]:
from WorkforceSentimentMonitoring.data import get_prepaired_data
from WorkforceSentimentMonitoring.encoders import Preprocessor, CustomMinMaxScaler, FeatureEngineer
from WorkforceSentimentMonitoring.preprocessing import lemmatize
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import f1_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from langdetect import detect

## Get data

In [70]:
X_train, X_test, y_train, y_test = get_prepaired_data()

Reading data...
Merging data into a single DataFrame...
Identifying entries in other languages...
Drop 437 entries? y / [n]
y
Dropping 437 entries...
Process completed.
Splitting train and test...
Done!
Encoding targets...


## Preprocess

In [71]:
preprocessor = Preprocessor()
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.fit_transform(X_test)

In [72]:
X_train.head(1)

Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review
0,great company to work for,amazing benefit a company that value voluntee...,it s hard if you don t have the customer servi...,,great company to work for amazing benefit a ...


## Feature Engineering

In [73]:
engineer = FeatureEngineer()
X_train = engineer.fit_transform(X_train)
X_test = engineer.fit_transform(X_test)

100%|██████████| 5/5 [02:27<00:00, 29.46s/it]
100%|██████████| 5/5 [01:08<00:00, 13.73s/it]


In [74]:
display(X_train.head(1))
display(X_test.head(1))

Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review,summary_length,positives_length,negatives_length,advice_to_mgmt_length,review_length,subjectivity_summary,polarity_summary,subjectivity_positives,polarity_positives,subjectivity_negatives,polarity_negatives,subjectivity_advice_to_mgmt,polarity_advice_to_mgmt,subjectivity_review,polarity_review
0,great company to work for,amazing benefit a company that value voluntee...,it s hard if you don t have the customer servi...,,great company to work for amazing benefit a ...,26,67,65,0,164,0.75,0.8,0.45,0.3,0.541667,-0.291667,,,0.547917,0.277083


Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review,summary_length,positives_length,negatives_length,advice_to_mgmt_length,review_length,subjectivity_summary,polarity_summary,subjectivity_positives,polarity_positives,subjectivity_negatives,polarity_negatives,subjectivity_advice_to_mgmt,polarity_advice_to_mgmt,subjectivity_review,polarity_review
0,fast paced environment,great work culture swift environment your wo...,political too little coordination location n...,,fast paced environment great work culture swi...,22,63,60,0,151,0.6,0.2,0.75,0.8,0.3,-0.09375,,,0.4875,0.203125


## Pipeline

In [76]:
scaler = CustomMinMaxScaler()
scaler.fit_transform(X_test)

Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review,summary_length,positives_length,negatives_length,advice_to_mgmt_length,review_length,subjectivity_summary,polarity_summary,subjectivity_positives,polarity_positives,subjectivity_negatives,polarity_negatives,subjectivity_advice_to_mgmt,polarity_advice_to_mgmt,subjectivity_review,polarity_review
0,fast paced environment,great work culture swift environment your wo...,political too little coordination location n...,,fast paced environment great work culture swi...,0.183333,0.008473,0.005990,0.000000,0.010684,0.600000,0.200000,0.750000,0.800000,0.3000,-0.093750,,,0.487500,0.203125
1,family room specialist,great culture great benefit for part time comp...,feedback can be overbearing at time,keep doing what you re doing and don t be afra...,family room specialist great culture great ben...,0.183333,0.008781,0.003165,0.021173,0.017028,0.000000,0.000000,0.750000,0.800000,0.0000,0.000000,0.900000,-0.600000,0.800000,0.333333
2,great place but money isn t good,great people to work with benefit are great,the pay isn t industry ave or leading,it would be nice since we hear it all the time...,great place but money isn t good great people...,0.275000,0.005546,0.003391,0.054592,0.030718,0.675000,0.750000,0.750000,0.800000,0.0000,0.000000,0.833333,0.378788,0.764286,0.605195
3,best talent best place,free breakfast free lunch free dinner only ...,work life balance you are hiring the best of ...,keep listening to u a we are scaling the crea...,best talent best place free breakfast free lu...,0.183333,0.052072,0.020457,0.037755,0.072565,0.300000,1.000000,0.807143,0.564286,0.3250,0.650000,0.500000,0.500000,0.575000,0.646429
4,specialist,great company work friendly culture is amazi...,no career growth not high enough pay,none,specialist great company work friendly cultu...,0.083333,0.013249,0.003391,0.001020,0.010351,0.000000,0.000000,0.716667,0.591667,0.5200,-0.040000,0.000000,0.000000,0.638000,0.339000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15709,good for junior hard for senior,advanced architecture lot of cool thing for j...,really hard to get promoted in company partic...,,good for junior hard for senior advanced arch...,0.266667,0.009552,0.007007,0.000000,0.013578,0.570833,0.204167,0.625000,0.375000,0.4375,-0.062500,,,0.544444,0.172222
15710,working at google is awesome,great people great work culture,no con it been absolutely amazing,,working at google is awesome great people gr...,0.241667,0.003697,0.003504,0.000000,0.005565,1.000000,1.000000,0.750000,0.800000,0.9000,0.600000,,,0.850000,0.800000
15711,risk analyst,work from home work culture management,rotational shift rotational shift,not applicable,risk analyst work from home work culture manag...,0.100000,0.004622,0.002939,0.003571,0.005008,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000
15712,fast paced world class company with nearly un...,m is an incredible company with numerous highl...,a company go i m not going to through a stone...,,fast paced world class company with nearly un...,0.908333,0.017255,0.028255,0.000000,0.048637,0.533333,0.055556,0.560000,0.360000,0.4750,0.050000,,,0.525000,0.180556


In [78]:
scaler.transform(X_train)

Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review,summary_length,positives_length,negatives_length,advice_to_mgmt_length,review_length,subjectivity_summary,polarity_summary,subjectivity_positives,polarity_positives,subjectivity_negatives,polarity_negatives,subjectivity_advice_to_mgmt,polarity_advice_to_mgmt,subjectivity_review,polarity_review
0,great company to work for,amazing benefit a company that value voluntee...,it s hard if you don t have the customer servi...,,great company to work for amazing benefit a ...,0.216667,0.009090,0.006555,0.000000,0.012131,0.750,0.80,0.450000,0.300000,0.541667,-0.291667,,,0.547917,0.277083
1,great company but doesn t always pay competiti...,great culture on site cafeteria is super good ...,pay well but doe not always pay a well a compe...,provide further opportunity for growth beyond ...,great company but doesn t always pay competiti...,0.416667,0.007703,0.026108,0.017347,0.040289,0.750,0.80,0.654167,0.633333,0.484848,0.149621,0.500000,0.000000,0.591246,0.420244
2,year in and still enjoying it,challenging lot of opportunity great people,if you don t buy into the amazon way you will...,,year in and still enjoying it challenging l...,0.258333,0.005700,0.005425,0.000000,0.009126,0.600,0.50,0.875000,0.650000,0.000000,0.000000,,,0.783333,0.600000
3,so far so awesome,i feel like i fell asleep and woke up in the f...,none that i ve found so far,,so far so awesome i feel like i fell asleep an...,0.141667,0.006779,0.002260,0.000000,0.005231,1.000,0.55,0.125000,0.000000,1.000000,0.100000,,,0.781250,0.300000
4,sr product manager,super smart people i did not say kind and c...,no career progression map for non sde role cu...,well the nyt article say it all do i need to...,sr product manager super smart people i did ...,0.158333,0.015560,0.031759,0.076276,0.073790,0.000,0.00,0.701905,0.329524,0.568750,0.256250,0.587500,0.300000,0.625733,0.297894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36659,call centre work but at home,good benefit healthcare dental etc reductio...,lot of angry customer and very impatient tigh...,,call centre work but at home good benefit he...,0.241667,0.015098,0.010850,0.000000,0.021035,0.000,0.00,0.600000,0.700000,0.671429,-0.272143,,,0.657143,-0.077714
36660,peculiar in a good way,very customer and employee focused with excell...,pace of change and innovation is challenging,easy on the acronym,peculiar in a good way very customer and emplo...,0.183333,0.022185,0.004182,0.004847,0.020590,0.600,0.70,0.590000,0.540000,1.000000,0.500000,0.833333,0.433333,0.672917,0.541667
36661,need to be more agile culture not what it wa,great ceo and senior leadership team excel...,struggle to against against a strategy in ma...,culture is what made google the most iconic co...,need to be more agile culture not what it wa ...,0.375000,0.031120,0.045547,0.077551,0.102059,0.625,0.50,0.690476,0.535714,0.336395,0.136735,0.516667,0.195238,0.516553,0.302041
36662,senior treasury manager,great company technology and innovation drive...,organizational complexity affecting coordinati...,streamline coordination and communication acro...,senior treasury manager great company technol...,0.191667,0.009860,0.011302,0.020918,0.025821,0.000,0.00,0.750000,0.800000,0.000000,0.000000,0.000000,0.000000,0.750000,0.800000


In [79]:
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,7))

In [80]:
def add_multinomial_nb_prediction_feature(df_train, df_test, y_train, y_test):
    """vectorize and predict with Naive Bayes"""
    scores_dict= {}
    for score in y_test.columns:
        result_scores = {}
        for feature in df_train.select_dtypes('object').columns:
            # instantiate vectorizer
            vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,7))
            # vectorize train set
            X_train = vectorizer.fit_transform(df_train[feature].astype('U'))
            # vectorize test set
            X_test = vectorizer.transform(df_test[feature].astype('U'))
            # select target
            target = y_test[score]
            # instantiate model
            model = MultinomialNB()
            # fit model
            model.fit(X_train, y_train[score])
            # predict with train and test sets and store predictions in new col
            df_train[f"{feature}_{score}_nb"] = model.predict(X_train)
            df_test[f"{feature}_{score}_nb"] = model.predict(X_test)
            # evaluate model and append results in dictionary
            result_scores[f'{feature}'] = model.score(X_test, target)
        # append results to scores dictionary
        scores_dict[f'{score}'] = result_scores
    return df_train, df_test, scores_dict

In [80]:
def add_multinomial_nb_prediction_feature(df_train, df_test, y_train, y_test):
    """vectorize and predict with Naive Bayes"""
    scores_dict= {}
    for score in y_test.columns:
        result_scores = {}
        for feature in df_train.select_dtypes('object').columns:
            # instantiate vectorizer
            pipe = make_pipeline(vectorizer)
            vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,7))
            # vectorize train set
            X_train = vectorizer.fit_transform(df_train[feature].astype('U'))
            # vectorize test set
            X_test = vectorizer.transform(df_test[feature].astype('U'))
            # select target
            target = y_test[score]
            # instantiate model
            model = MultinomialNB()
            # fit model
            model.fit(X_train, y_train[score])
            # predict with train and test sets and store predictions in new col
            df_train[f"{feature}_{score}_nb"] = model.predict(X_train)
            df_test[f"{feature}_{score}_nb"] = model.predict(X_test)
            # evaluate model and append results in dictionary
            result_scores[f'{feature}'] = model.score(X_test, target)
        # append results to scores dictionary
        scores_dict[f'{score}'] = result_scores
    return df_train, df_test, scores_dict

In [81]:
X_train_tmp, X_test_tmp, scores_dict = add_multinomial_nb_prediction_feature(X_train, X_test, y_train, y_test)

In [59]:
def encode_target(y):
    encoding = {1 : 0, 2 : 0, 3 : 1, 4 : 2, 5 : 2}
    for col in y.columns:
        y[col] = y[col].map(encoding)
    
    return y

In [68]:
y_test_tmp = encode_target(y_test)

In [69]:
y_test

Unnamed: 0,work-balance,culture-values,career-opportunities,comp-benefits,senior-mgmt,overall
0,2,2,2,2,2,2
1,2,2,1,1,2,2
2,2,1,2,0,0,0
3,1,2,2,2,2,2
4,2,2,2,2,2,2
...,...,...,...,...,...,...
15710,1,0,1,0,2,0
15711,1,1,2,1,0,2
15712,0,0,1,1,0,0
15713,2,2,2,2,1,1


In [9]:
pipe = Pipeline([
    ('preprocessor', Preprocessor()),
    ('engineer', FeatureEngineer()),
    ('scaler', CustomMinMaxScaler()),
    ()
])

ValueError: not enough values to unpack (expected 2, got 0)

# Extra: Implementation wrong language function

In [None]:
def drop_wrong_language(df, column, language = 'en', inplace=False):
    '''drops entries written in languages other thatn the specified'''
    print('Identifying entries in other languages...')
    is_wrong = df[column].apply(detect) != language
    n_rows_to_drop = is_wrong.sum()

    user_confirmation = None
    while not (user_confirmation is 'y' or user_confirmation is 'n'):
        user_confirmation = input(f'Drop {n_rows_to_drop} entries? y / [n]\n') or 'n'
    if user_confirmation is 'y':
        if inplace:
            print(f'Dropping {n_rows_to_drop} entries...')
            df = df[~is_wrong]
            df.reset_index(inplace=True, drop=True)
            print('Process completed.')
            return df
        else:
            print(f'Dropping {n_rows_to_drop} entries...')
            print('Process completed.')
            return df[~is_wrong].reset_index(inplace=True, drop=True)
    else:
        print('Process aborted')
        return None