# Pipeline

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [12]:
from WorkforceSentimentMonitoring.data import get_prepaired_data
from WorkforceSentimentMonitoring.encoders import Preprocessor, CustomMinMaxScaler, FeatureEngineer
from WorkforceSentimentMonitoring.preprocessing import lemmatize
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import f1_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from langdetect import detect

## Get data

In [None]:
X_train, X_test, y_train, y_test = get_prepaired_data()

Reading data...
Merging data into a single DataFrame...


## Preprocess

In [5]:
preprocessor = Preprocessor()
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.fit_transform(X_test)

In [6]:
X_train.head(1)

Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review
0,specialist,great environment to work in,work life is not that balanced,,specialist great environment to work in work ...


## Feature Engineering

In [7]:
engineer = FeatureEngineer()
X_train = engineer.fit_transform(X_train)
X_test = engineer.fit_transform(X_test)

100%|██████████| 5/5 [03:16<00:00, 39.25s/it]
100%|██████████| 5/5 [00:56<00:00, 11.29s/it]


In [8]:
display(X_train.head(1))
display(X_test.head(1))

Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review,summary_length,positives_length,negatives_length,advice_to_mgmt_length,review_length,subjectivity_summary,polarity_summary,subjectivity_positives,polarity_positives,subjectivity_negatives,polarity_negatives,subjectivity_advice_to_mgmt,polarity_advice_to_mgmt,subjectivity_review,polarity_review
0,specialist,great environment to work in,work life is not that balanced,,specialist great environment to work in work ...,10,29,31,0,76,0.0,0.0,0.75,0.8,0.0,0.0,,,0.75,0.8


Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review,summary_length,positives_length,negatives_length,advice_to_mgmt_length,review_length,subjectivity_summary,polarity_summary,subjectivity_positives,polarity_positives,subjectivity_negatives,polarity_negatives,subjectivity_advice_to_mgmt,polarity_advice_to_mgmt,subjectivity_review,polarity_review
0,great,everything in term of benefit,none that i can think of,,great everything in term of benefit none that ...,5,29,24,0,64,0.75,0.8,0.0,0.0,0.0,0.0,,,0.75,0.8


## Pipeline

In [10]:
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,7))

In [34]:
def add_multinomial_nb_prediction_feature(df_train, df_test, y_train, y_test):
    """vectorize and predict with Naive Bayes"""
    scores_dict= {}
    for score in y_test.columns:
        result_scores = {}
        for feature in df_train.select_dtypes('object').columns:
            # instantiate vectorizer
            vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,7))
            # vectorize train set
            X_train = vectorizer.fit_transform(df_train[feature].astype('U'))
            # vectorize test set
            X_test = vectorizer.transform(df_test[feature].astype('U'))
            # select target
            target = y_test[score]
            # instantiate model
            model = MultinomialNB()
            # fit model
            model.fit(X_train, y_train[score])
            # predict with train and test sets and store predictions in new col
            df_train[f"{feature}_{score}_nb"] = model.predict(X_train)
            df_test[f"{feature}_{score}_nb"] = model.predict(X_test)
            # evaluate model and append results in dictionary
            result_scores[f'{feature}'] = model.score(X_test, target)
        # append results to scores dictionary
        scores_dict[f'{score}'] = result_scores
    return df_train, df_test, scores_dict

In [35]:
X_train_tmp, X_test_tmp, scores_dict = add_multinomial_nb_prediction_feature(X_train, X_test, y_train, y_test)

In [59]:
def encode_target(y):
    encoding = {1 : 0, 2 : 0, 3 : 1, 4 : 2, 5 : 2}
    for col in y.columns:
        y[col] = y[col].map(encoding)
    
    return y

In [68]:
y_test_tmp = encode_target(y_test)

In [69]:
y_test

Unnamed: 0,work-balance,culture-values,career-opportunities,comp-benefits,senior-mgmt,overall
0,2,2,2,2,2,2
1,2,2,1,1,2,2
2,2,1,2,0,0,0
3,1,2,2,2,2,2
4,2,2,2,2,2,2
...,...,...,...,...,...,...
15710,1,0,1,0,2,0
15711,1,1,2,1,0,2
15712,0,0,1,1,0,0
15713,2,2,2,2,1,1


In [9]:
pipe = Pipeline([
    ('preprocessor', Preprocessor()),
    ('engineer', FeatureEngineer()),
    ('scaler', CustomMinMaxScaler()),
    ()
])

ValueError: not enough values to unpack (expected 2, got 0)

# Extra: Implementation wrong language function

In [None]:
def drop_wrong_language(df, column, language = 'en', inplace=False):
    '''drops entries written in languages other thatn the specified'''
    print('Identifying entries in other languages...')
    is_wrong = df[column].apply(detect) != language
    n_rows_to_drop = is_wrong.sum()

    user_confirmation = None
    while not (user_confirmation is 'y' or user_confirmation is 'n'):
        user_confirmation = input(f'Drop {n_rows_to_drop} entries? y / [n]\n') or 'n'
    if user_confirmation is 'y':
        if inplace:
            print(f'Dropping {n_rows_to_drop} entries...')
            df = df[~is_wrong]
            df.reset_index(inplace=True, drop=True)
            print('Process completed.')
            return df
        else:
            print(f'Dropping {n_rows_to_drop} entries...')
            print('Process completed.')
            return df[~is_wrong].reset_index(inplace=True, drop=True)
    else:
        print('Process aborted')
        return None