# Pipeline

## Imports

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the project package are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from WorkforceSentimentMonitoring.data import get_prepaired_data
from WorkforceSentimentMonitoring.encoders import Preprocessor, CustomMinMaxScaler, FeatureEngineer
from WorkforceSentimentMonitoring.preprocessing import lemmatize
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import f1_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression

from langdetect import detect

## Get data

In [3]:
# Load the train/test split from the project helper.
# NOTE: this call is interactive - as the cell output shows, it asks for a
# y/n confirmation on stdin before dropping entries detected as other languages.
X_train, X_test, y_train, y_test = get_prepaired_data()

Reading data...
Merging data into a single DataFrame...
Identifying entries in other languages...
Drop 434 entries? y / [n]
yy
Drop 434 entries? y / [n]
y
Dropping 434 entries...
Process completed.
Splitting train and test...
Done!


## Preprocess

In [None]:
preprocessor = Preprocessor()
# Fit on the training split only, then reuse the already-fitted preprocessor
# on the test split: calling fit_transform on X_test would refit on held-out
# data and could make the two splits inconsistent.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [None]:
# Show the first row of the training features as a quick sanity check.
X_train.head(1)

## Feature Engineering

In [None]:
engineer = FeatureEngineer()
# Fit the feature engineer on the training split only and apply the fitted
# instance to the test split, so no statistics are learned from held-out data.
X_train = engineer.fit_transform(X_train)
X_test = engineer.transform(X_test)

In [None]:
# Render the first row of each split, train first and then test.
display(X_train.head(1), X_test.head(1))

## Pipeline

In [None]:
# Chain the project's transformers so they are fitted and applied as one unit.
# Every step must be a (name, estimator) pair; an empty tuple is not a valid
# step and breaks Pipeline's step validation.
# TODO: append the final estimator step, e.g. ('classifier', LogisticRegression()),
# once the model has been chosen.
pipe = Pipeline([
    ('preprocessor', Preprocessor()),
    ('engineer', FeatureEngineer()),
    ('scaler', CustomMinMaxScaler()),
])

# Extra: Implementation of the wrong-language filter function

In [None]:
def drop_wrong_language(df, column, language='en', inplace=False):
    '''Drop the rows of `df` whose `column` text is not written in `language`.

    Every value of `df[column]` is passed to `detect` and the result is
    compared with `language`. The user is then asked on stdin to confirm the
    drop: 'y' proceeds, 'n' or an empty answer aborts, anything else
    re-prompts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column to check.
    column : label
        Name of the column whose values are passed to `detect`.
    language : str, default 'en'
        Value `detect` must return for a row to be kept.
    inplace : bool, default False
        If True, the rows are removed from `df` itself and its index is
        reset; the same object is returned. If False, `df` is left untouched
        and a new, re-indexed frame is returned.

    Returns
    -------
    pandas.DataFrame or None
        The filtered frame with a fresh 0..n-1 index, or None when the user
        declines the drop.

    NOTE(review): `detect` is applied to every value as-is, so any exception
    it raises propagates to the caller. Presumably it can fail on empty or
    non-text values - confirm, and clean the column beforehand if so.
    '''
    print('Identifying entries in other languages...')
    is_wrong = df[column].apply(detect) != language
    n_rows_to_drop = is_wrong.sum()

    # Compare by value: `is` on strings tests object identity, which only
    # works by accident of interning and warns on modern Python versions.
    user_confirmation = None
    while user_confirmation not in ('y', 'n'):
        answer = input(f'Drop {n_rows_to_drop} entries? y / [n]\n')
        user_confirmation = answer.strip().lower() or 'n'

    if user_confirmation != 'y':
        print('Process aborted')
        return None

    print(f'Dropping {n_rows_to_drop} entries...')
    if inplace:
        # Mutate the caller's frame instead of rebinding the local name.
        # The index is reset first so that row positions equal index labels,
        # which keeps the drop correct even if the original index had
        # duplicate labels.
        wrong_positions = is_wrong.to_numpy().nonzero()[0]
        df.reset_index(drop=True, inplace=True)
        df.drop(index=wrong_positions, inplace=True)
        df.reset_index(drop=True, inplace=True)
        result = df
    else:
        # reset_index(inplace=True) returns None, so build the new frame
        # without inplace and return that.
        result = df[~is_wrong].reset_index(drop=True)
    print('Process completed.')
    return result