# Pipeline

## Imports

In [82]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [83]:
from WorkforceSentimentMonitoring.data import get_prepaired_data
from WorkforceSentimentMonitoring.encoders import Preprocessor, CustomMinMaxScaler, FeatureEngineer
from WorkforceSentimentMonitoring.preprocessing import lemmatize
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import f1_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression

from langdetect import detect

import joblib

## Get data

In [None]:
# Load the train/test features and labels through the project's data helper.
# NOTE(review): the recorded output stops at the language-identification step and
# the cell has no execution count, so the saved run may not have finished — confirm.
X_train, X_test, y_train, y_test = get_prepaired_data()

Reading data...
Merging data into a single DataFrame...
Dropping initial text columns...
Identifying entries in other languages...


## Preprocess

In [29]:
# Fit the preprocessor on the training split only, then reuse the fitted
# instance on the test split so nothing is learned from held-out data.
# NOTE: both names are overwritten, so re-running this cell processes the
# already-processed frames again; reload the data first if you need to re-run.
preprocessor = Preprocessor()
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [30]:
# Peek at the first training row to eyeball the text columns after preprocessing.
X_train.head(1)

Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review
0,brand specialist,high learning curve high independent,people management work life balance,,brand specialist high learning curve high inde...


## Feature Engineering

In [25]:
# Fit the feature engineer on the training split only and apply the fitted
# instance to the test split, so no state is derived from held-out data.
# NOTE: both names are overwritten, so this cell is not idempotent on re-run.
engineer = FeatureEngineer()
X_train = engineer.fit_transform(X_train)
X_test = engineer.transform(X_test)

100%|██████████| 5/5 [00:00<00:00, 37.58it/s]
100%|██████████| 5/5 [00:00<00:00, 36.35it/s]


In [31]:
# Show the first row of each split, one rich table after the other.
display(X_train.head(1), X_test.head(1))

Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review
0,brand specialist,high learning curve high independent,people management work life balance,,brand specialist high learning curve high inde...


Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review
0,great stepping stone,amazing benefit and employee right everything...,apple talk big about internal growth and devel...,promote internally employee to management you...,great stepping stone amazing benefit and emplo...


## Pipeline

In [None]:
# Chain the three project transformers into one estimator, in the same order
# as the manual steps above. The pipeline is only constructed here, not fitted.
pipe = Pipeline(
    steps=[
        ('preprocessor', Preprocessor()),
        ('engineer', FeatureEngineer()),
        ('scaler', CustomMinMaxScaler()),
    ]
)

In [125]:
# Inspect the training frame.
# NOTE(review): the recorded output below still shows raw, unprocessed text, so this
# cell was presumably run out of order on freshly loaded data — confirm with
# Restart & Run All.
X_train

Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review
0,Amazon gave me a chance,I was asking for employment and they gave me a...,I don't have any down sides,Keep doing what you are doing,Amazon gave me a chance I was asking for emplo...
1,A great company to work for with a few glaring...,"Great co-workers, challenging environment that...",In a crowded retail environment where employee...,The attendance policy is abusive and bordering...,A great company to work for with a few glaring...
2,Senior Program Manager,lots of opportunities for work internally grea...,not the best talent some uninspiring leaders p...,,Senior Program Manager lots of opportunities f...
3,"Great company, lots of politics",- Compensation - Great product - Enhances resume,- No life balance - Lots of internal politics ...,,"Great company, lots of politics - Compensation..."
4,Cloud Support Engineer,1) Great working environment. less work pressu...,1) Need to be active about career growth. Norm...,Keep mixing things up and get every employee i...,Cloud Support Engineer 1) Great working enviro...
...,...,...,...,...,...
36668,Working for a big compay,Large company lots of talk about improvments a...,You need to give up your social and personal l...,,Working for a big compay Large company lots of...
36669,SDE,"Great, smart people to work with who know what...",Windows seems like a really slow org of gettin...,,"SDE Great, smart people to work with who know ..."
36670,Amazon Restaurants,Great sales team & direct managers,Upper management makes the job miserable and m...,,Amazon Restaurants Great sales team & direct m...
36671,Good company for life,Good living benefits and work life balance. Go...,Salary is not so good. Salary is not so good.S...,"Different manger differs, need to be more aggr...",Good company for life Good living benefits and...


# Extra: Implementation of the wrong-language filtering function

In [None]:
def drop_wrong_language(df, column, language = 'en', inplace=False):
    '''Drop the rows of ``df`` whose ``column`` text is not written in ``language``.

    The language of every value in ``df[column]`` is identified with ``detect``
    (imported from ``langdetect`` at the top of the notebook). The user is then
    asked interactively to confirm the deletion; an empty answer counts as "n".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text entries.
    column : str
        Name of the column whose language is checked.
    language : str, default 'en'
        Language code that rows must match to be kept.
    inplace : bool, default False
        If True, remove the rows from ``df`` itself (and reset its index);
        if False, leave ``df`` untouched and build a new frame.

    Returns
    -------
    pandas.DataFrame or None
        The filtered frame with a fresh 0..n-1 index (``df`` itself when
        ``inplace=True``), or None when the user declines the deletion.

    Notes
    -----
    ``detect`` is applied to every value as-is.
    NOTE(review): missing or empty values presumably make it raise — confirm
    and clean the column beforehand if needed.
    '''
    print('Identifying entries in other languages...')
    # Plain boolean array: positional, so it is immune to duplicated index labels.
    is_wrong = (df[column].apply(detect) != language).to_numpy()
    n_rows_to_drop = int(is_wrong.sum())

    # Keep asking until we get an explicit y/n; compare by value, not identity.
    user_confirmation = None
    while user_confirmation not in ('y', 'n'):
        answer = input(f'Drop {n_rows_to_drop} entries? y / [n]\n')
        user_confirmation = answer.strip().lower() or 'n'

    if user_confirmation != 'y':
        print('Process aborted')
        return None

    print(f'Dropping {n_rows_to_drop} entries...')
    if inplace:
        # Reset first so labels are unique positions, then drop by those labels.
        df.reset_index(drop=True, inplace=True)
        df.drop(index=df.index[is_wrong], inplace=True)
        df.reset_index(drop=True, inplace=True)
        result = df
    else:
        # reset_index without inplace returns the new frame (inplace=True returns None).
        result = df.loc[~is_wrong].reset_index(drop=True)
    print('Process completed.')
    return result