# Pipeline

## Imports

In [9]:
%load_ext autoreload
%autoreload 2

In [10]:
from WorkforceSentimentMonitoring.data import get_prepaired_data
from WorkforceSentimentMonitoring.encoders import Preprocessor, CustomMinMaxScaler, FeatureEngineer
from WorkforceSentimentMonitoring.preprocessing import lemmatize
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import f1_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression

from langdetect import detect

In [61]:
# Unpack the four objects returned by the project's data helper.
# NOTE: per the recorded output of this cell, the call prompts interactively
# ("Drop N entries? y / [n]") after identifying entries in other languages,
# so it cannot run unattended.
X_train, X_test, y_train, y_test = get_prepaired_data()

Identifying entries in other languages...
Drop 433 entries? y / [n]
y


In [62]:
# Fit the preprocessor on the training split only, then apply that same fitted
# instance to the test split. Calling fit_transform on X_test would re-fit on
# test data, so the two splits would not be transformed consistently (leakage).
# NOTE(review): assumes Preprocessor exposes a scikit-learn-style `transform`
# alongside `fit_transform` — confirm against WorkforceSentimentMonitoring.encoders.
preprocessor = Preprocessor()
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [13]:
# Preview the first ten rows of X_train.
X_train.head(10)

Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review
0,awesome company,a ton of growth opportunity really great mana...,day to day can get a little repetitive,,awesome company a ton of growth opportunity r...
1,great company,great place to work at,nothing i feel like i need to mention,,great company great place to work at nothing i...
2,great but only if you are great,if you work really hard and make the right peo...,working with other department e g window of...,punish those who thwart one microsoft and so...,great but only if you are great if you wo...
3,very informative learning experience for early...,one of the most recognizable company in the bu...,the compensation wa a bit lower than i felt it...,be extroverted open to teamwork etc,very informative learning experience for early...
4,sde,good company good environment good people,very hectic schedule no work life balance,,sde good company good environment good peopl...
5,reciever,pay is good if you have no experience managem...,it s like working in a huge high school the d...,stop feeding into the heathen they make one c...,reciever pay is good if you have no experience...
6,a good place to grow,dynamic and fast moving company,getting bureaucratic too many reporting goal...,le reporting more real work,a good place to grow dynamic and fast moving c...
7,qae,awesome place with great tool,nothing to find out even after working for year,excellent mangement,qae awesome place with great tool nothing to...
8,game of throne,great benefit with good pay a long a you know...,a day to day struggle working with management ...,learn to be a bit more human take that empath...,game of throne great benefit with good pay a ...
9,at amazon you are given a lot of responsibilit...,at amazon they trust you to finish a project f...,you are almost set up to fail it is a self se...,,at amazon you are given a lot of responsibilit...


In [14]:
# Independent copy of the first ten rows of X_train, so changes to `tmp` do not
# touch X_train. Presumably a scratch sample for experimentation — confirm it
# is still needed.
tmp = X_train.head(10).copy()

In [16]:
# Scratch check of the confirmation prompt: an empty reply (just pressing
# Enter) falls back to the default answer 'n', so the loop body runs once.
user_confirmation = None
while not user_confirmation:
    reply = input('Drop entries? y / [n]\n')
    user_confirmation = reply if reply else 'n'

Drop entries? y / [n]



In [58]:
def drop_wrong_language(df, column, language='en', inplace=False):
    '''Drop the rows of `df` whose `column` text is not detected as `language`.

    The language of every value in `df[column]` is detected with `detect`;
    the user is then asked on stdin to confirm the removal.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to check.
    column : str
        Name of the column whose values are passed to `detect`.
    language : str, default 'en'
        Language code to keep; rows whose detected code differs are dropped.
    inplace : bool, default False
        If True, `df` itself is modified and returned. If False, `df` is left
        untouched and a filtered copy is returned.

    Returns
    -------
    pandas.DataFrame or None
        The filtered frame with a fresh 0..n-1 index when the user answers
        'y'; None when the user declines (an empty reply counts as 'n').

    NOTE(review): `detect` may raise on values it cannot classify (e.g. empty
    strings) — confirm and pre-filter such rows if that is the case.
    '''
    print('Identifying entries in other languages...')
    is_wrong = df[column].apply(detect) != language
    n_rows_to_drop = is_wrong.sum()

    # Compare replies by value (`in`), not identity (`is`): whether two equal
    # strings are the same object is an implementation detail.
    user_confirmation = None
    while user_confirmation not in ('y', 'n'):
        reply = input(f'Drop {n_rows_to_drop} entries? y / [n]\n')
        user_confirmation = reply.strip().lower() or 'n'

    if user_confirmation != 'y':
        print('Process aborted')
        return None

    print(f'Dropping {n_rows_to_drop} entries...')
    if inplace:
        # Work positionally so duplicate index labels cannot cause extra rows
        # to be removed: renumber first, drop by the new labels, renumber again.
        mask = is_wrong.to_numpy()
        df.reset_index(drop=True, inplace=True)
        df.drop(index=df.index[mask], inplace=True)
        df.reset_index(drop=True, inplace=True)
        result = df
    else:
        # reset_index(inplace=True) returns None, so use the copying form to
        # actually hand the filtered frame back to the caller.
        result = df[~is_wrong].reset_index(drop=True)
    print('Process completed.')
    return result

In [56]:
# Build the mask in this cell instead of relying on a variable left over from
# an earlier kernel session: True where the detected language of the review
# text is not 'en'. Without this, the cell fails on Restart & Run All.
is_wrong = X_test['review'].apply(detect) != 'en'
X_test[~is_wrong]

Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review
0,i wouldn t wish it on my worst enemy aws rds,rsus part of the salary is in stock backda...,smart people who are jerk literally no per...,,i wouldn t wish it on my worst enemy aws rds ...
1,solid place to grow a career,great culture of growth and learning respectf...,can be challenging to navigate the size and co...,,solid place to grow a career great culture of ...
2,work lift balance,work with smart people nice working environment,no work life balance high stress,,work lift balance work with smart people nice...
3,great place to work for tech people,great colleague to work with and learn from,typical low light a a large corporation job r...,,great place to work for tech people great coll...
4,cloud solution architect,leading edge technology fair work environment...,a little bit slow reaction,,cloud solution architect leading edge technolo...
...,...,...,...,...,...
15839,engineering paradise,the tooling is incredible for example you ca...,there is a lot of complexity and much of the ...,,engineering paradise the tooling is incredible...
15840,quality expected quality not always given,good pay great benefit phenomenal exercise w...,long hour physically demanding work many esl...,,quality expected quality not always given g...
15841,superb,for fresher it one of the best company,salary is le no good hike,,superb for fresher it one of the best company ...
15842,drama and politics,constantly working on bleeding edge software w...,ton of office politics since the review are b...,get rid of the personal review system base it...,drama and politics constantly working on bleed...


In [60]:
# Interactive: prompts for confirmation on stdin before dropping rows.
# The returned frame is only displayed as the cell output; it is not assigned
# to a name here.
drop_wrong_language(X_test, column='review', inplace=True )

Identifying entries in other languages...
Drop 139 entries? y / [n]
y
Dropping 139 entries...
Process completed.


Unnamed: 0,summary,positives,negatives,advice_to_mgmt,review
0,i wouldn t wish it on my worst enemy aws rds,rsus part of the salary is in stock backda...,smart people who are jerk literally no per...,,i wouldn t wish it on my worst enemy aws rds ...
1,solid place to grow a career,great culture of growth and learning respectf...,can be challenging to navigate the size and co...,,solid place to grow a career great culture of ...
2,work lift balance,work with smart people nice working environment,no work life balance high stress,,work lift balance work with smart people nice...
3,great place to work for tech people,great colleague to work with and learn from,typical low light a a large corporation job r...,,great place to work for tech people great coll...
4,cloud solution architect,leading edge technology fair work environment...,a little bit slow reaction,,cloud solution architect leading edge technolo...
...,...,...,...,...,...
15839,engineering paradise,the tooling is incredible for example you ca...,there is a lot of complexity and much of the ...,,engineering paradise the tooling is incredible...
15840,quality expected quality not always given,good pay great benefit phenomenal exercise w...,long hour physically demanding work many esl...,,quality expected quality not always given g...
15841,superb,for fresher it one of the best company,salary is le no good hike,,superb for fresher it one of the best company ...
15842,drama and politics,constantly working on bleeding edge software w...,ton of office politics since the review are b...,get rid of the personal review system base it...,drama and politics constantly working on bleed...


## 