# Feature Engineering Front End (FEFE)

## Direct import

FEFE does not have to be installed to be imported this way, but a direct import will not install its dependencies.

In [None]:
import sys
# Put '../scoring' first on the module search path so that `fefe` can be imported from there without being installed.
sys.path.insert(0, '../scoring')
from fefe import notebook_front_end, FeatureEngineeringAPI

# Data import

_Do not forget to cast the datetimes!_

In [None]:
import pandas as pd

# Read the toy dataset; the first CSV column is used as the DataFrame index.
data = pd.read_csv('../scoring/fefe/tests/toy_data.csv', index_col=0)

# cast your datetimes, please!
for time_column in ('TIME', 'TRANSACTION_TIME'):
    data[time_column] = pd.to_datetime(data[time_column])

# Last expression of the cell: shows the first rows as the cell output.
data.head()

# Front End (for new features)

This part should ease config creation.

However, it is not needed - see the next part for how to modify the config.

In [None]:
# Start the config-creation front end for `data`; 8062 is the port passed to it.
notebook_front_end(data, port=8062)

# Config (output from front end)

The config is made of several parts. If you are not able to open the front end, you can still edit the config manually.  

## Config description (tl;dr: look at the example)


<details>
<summary>
Definition of individual parts of the config and individual parameters.  

___
**\~\~\~ Click to expand \~\~\~**
___
</summary>


### meta:

_Definition of global information, to be applied everywhere._

- granularity
    - describes the granularity to be used for aggregation.
    - possible values:
        - 'days' (or 'd', 'day', etc.)
        - 'weeks'
        - 'months'
        - 'years'
        - 'order' 
            - takes into account only the order, not the time.
            - useful to get last transaction amount (order = 1)
            - useful for calculation using `NUM_GROUP` variables
- index
    - name of the index column in the dataset
    - usually 'CUSTOMER_ID', 'SKP_CREDIT_CASE', 'CUID' ...
- inf_value
    - value to replace infinities, if they appear in data
- nan_value
    - value to replace nans, if they appear in data
- target_time
    - time column, which is used as decision time.
- transaction_time
    - time column, to be used as time of the individual transaction.
- order
    - specification of column to be used for subsetting of the data using time ranges.
    - 'TIME_ORDER' means that column with such name will be created from `target_time` and `transaction_time`
    - otherwise FEFE will use this specified column.

### simple:

_Definition of simple variables._

- VARIABLE_NAME:
    - name of the variable to which the following transformations will be applied.
    - transformations:
        - functions
            - list of functions
            - usually, `pandas.DataFrame.GroupBy.[fun]` is applied
        - segmentations
            - list of variable names or `str(None)` (None, but as a string)
            - values from such variables are used in pd.DataFrame.query like `VARIABLE_NAME == 'value'`
        - time_ranges
            - list of tuples (which are saved as strings)
            - single tuple means from-to, which will be used with `order` column
        - queries
            - string to be put into `pd.DataFrame.query()` function
            - possible inputs:
                - `VARIABLE == 'value'` (notice double `=`)
                - `VARIABLE < 20`
                - `VARIABLE_1 == 'value_1';VARIABLE_1 == 'value_2'`
                    - this will iterate over individual queries
                - ```VARIABLE_1 == 'value_1';VARIABLE_1 =='value_2';VARIABLE_2 == 'value_a';VARIABLE_2 == 'value_b'```
                - ...

### ratio:

_Definition of ratio variables_

- DENOMINATOR_VARIABLE_NAME
    - NUMERATOR_VARIABLE_NAME
        - transformations:
            - similar to the simple case
            - functions are defined as tuple of functions
            - time_ranges are tuples of tuples of integers (two time ranges together)
            - segmentations & queries are applied to both variables at the same time

### time_since:

- TIME_VARIABLE_NAME
    - transformations:
        - from - whether to take the first or the last event
        - segmentations  - same as in simple
        - queries - same as in simple
          
</details>

## Config example

In [None]:
# FEFE configuration with four sections: 'meta', 'ratio', 'simple' and 'time_since'.
# Tuples (ratio functions, time ranges) are stored as strings.
CONFIG = {
    # Global settings.
    "meta": {
        "granularity": "days",
        "index": "CUSTOMER_ID",
        "target_time": "TIME",
        "transaction_time": "TRANSACTION_TIME",
        "inf_value": None,
        "nan_value": None,
        "order": "TIME_ORDER",
    },
    # Ratio variables: denominator variable -> numerator variable -> transformations.
    "ratio": {
        "TRANSACTION_AMOUNT": {
            "TRANSACTION_AMOUNT": {
                "functions": [
                    "('min', 'min')",
                    "('max', 'max')",
                    "('mean', 'mean')",
                    "('sum', 'sum')",
                ],
                "segmentations": ["None", "TRANSACTION_PURPOSE"],
                "time_ranges": [
                    "((0, 30), (30, 180))",
                    "((0, 180), (180, 360))",
                ],
            },
        },
    },
    # Simple variables: variable name -> transformations.
    "simple": {
        "TRANSACTION_AMOUNT": {
            "functions": ["min", "max", "sum", "mean"],
            "queries": "TRANSACTION_FEE < 100;TRANSACTION_CLASS == 'ATM'",
            "segmentations": [
                "None",
                "TRANSACTION_PURPOSE",
                "TRANSACTION_TYPE",
                "TRANSACTION_PLACE",
            ],
            "time_ranges": ["(0, 360)", "(0, inf)"],
        },
        "TRANSACTION_FEE": {
            "functions": ["min", "max", "sum", "mean"],
            "queries": "TRANSACTION_FEE < 100;TRANSACTION_CLASS == 'ATM'",
            "segmentations": [
                "None",
                "TRANSACTION_PURPOSE",
                "TRANSACTION_TYPE",
            ],
            "time_ranges": ["(0, 360)", "(0, inf)"],
        },
        "TRANSACTION_PURPOSE": {
            "functions": ["mode", "mode_multicolumn"],
            "segmentations": ["None"],
            "time_ranges": ["(0, 360)"],
        },
    },
    # Time-since variables: time variable name -> transformations.
    "time_since": {
        "TRANSACTION_TIME": {
            "from": ["first", "last"],
            "queries": "TRANSACTION_PURPOSE=='hazard'",
        },
    },
}


# Initialisation of calculation

In [None]:
# Mapping passed to FEFE as `shortening_dictionary` (presumably used to shorten generated feature names).
# NOTE(review): the original entry order is kept, e.g. 'TRANSACTION_FEE' before 'TRANSACTION';
# presumably the order matters when replacements are applied - confirm.
name_shortening_dictionary = {
    'TRANSACTION_FEE': 'FEE',
    'TRANSACTION_AMOUNT': 'AMNT',
    'TRAVELLING': 'TRVL',
    'HAZARD': 'HZRD',
    'TRANSACTION_PURPOSE': 'PRPS',
    'TRANSACTION': 'TNX',
    '_EQ_': '_',
    '_0D_': '_',
    '_INFD': '',
}

# Build the API object from the config, the shortening mapping and the logger settings.
fe = FeatureEngineeringAPI(
    config=CONFIG,
    shortening_dictionary=name_shortening_dictionary,
    logger_kwargs={'log_level': 20},
)

# Calculation of dataframe

In [None]:
from IPython.display import display

# Raise pandas' column display limit to 200 while the result is computed and displayed.
with pd.option_context("display.max_columns", 200):
    feature_frame = fe.dataframe(data, max_nan_share=0.9)
    display(feature_frame)

# Output in sql

In [None]:
# Print what `fe.sql` returns for `data`, using 'data' as the table name.
sql_text = fe.sql(
    data,
    table_name='data',
    feature_subset=[
        # Uncomment entries to pass them in `feature_subset`:
        # 'MAX_TRANSACTION_AMOUNT_0D_INFD',
        # 'MIN_TRANSACTION_AMOUNT_0D_INFD'
    ],
)
print(sql_text)