# E: Feature Engineering

## imports

In [1]:


import pandas as pd
import numpy as np

# for na pipeline
import warnings
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import TransformerMixin  # for custom transformers

from joblib import dump, load

## read in data

In [2]:
df_XY = pd.read_csv("output/c_resulttradewattr.csv")

In [3]:
##  get_feature_names function
# https://johaupt.github.io/scikit-learn/tutorial/python/data%20processing/ml%20pipeline/model%20interpretation/columnTransformer_feature_names.html
def get_feature_names(column_transformer):
    """Get feature names from all transformers.
    Returns
    -------
    feature_names : list of strings
        Names of the features produced by transform.
    """
    # Remove the internal helper function
    # check_is_fitted(column_transformer)

    # Turn loopkup into function for better handling with pipeline later
    def get_names(trans):
        # >> Original get_feature_names() method
        if trans == "drop" or (hasattr(column, "__len__") and not len(column)):
            return []
        if trans == "passthrough":
            if hasattr(column_transformer, "_df_columns"):
                if (not isinstance(column, slice)) and all(
                    isinstance(col, str) for col in column
                ):
                    return column
                else:
                    return column_transformer._df_columns[column]
            else:
                indices = np.arange(column_transformer._n_features)
                return [i for i in indices[column]]
        if not hasattr(trans, "get_feature_names"):
            # >>> Change: Return input column names if no method avaiable
            # Turn error into a warning
            warnings.warn(
                "Transformer %s (type %s) does not "
                "provide get_feature_names. "
                "Will return input column names if available"
                % (str(name), type(trans).__name__)
            )
            # For transformers without a get_features_names method, use the input
            # names to the column transformer
            if column is None:
                return []
            else:
                return [f for f in column]

        return [f for f in trans.get_feature_names()]

    ### Start of processing
    feature_names = []

    # Allow transformers to be pipelines. Pipeline steps are named differently, so preprocessing is needed
    if type(column_transformer) == sklearn.pipeline.Pipeline:
        l_transformers = [
            (name, trans, None) for step, name, trans in column_transformer._iter()
        ]
    else:
        # For column transformers, follow the original method
        l_transformers = column_transformer.transformers_

    for name, trans, column in l_transformers:
        if type(trans) == sklearn.pipeline.Pipeline:
            # Recursive call on pipeline
            _names = get_feature_names(trans)
            # if pipeline has no transformer that returns names
            if len(_names) == 0:
                _names = [f for f in column]
            feature_names.extend(_names)
        else:
            feature_names.extend(get_names(trans))

    return feature_names

## custom transformers

In [4]:
class Numerizer(TransformerMixin):
    "convert numbers and % to numbers as well"
    import pandas as pd
    import numpy as np

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        
#         Y = X.apply(pd.to_numeric, args=({"errors":"coerce"})).fillna(np.nan)

        Y = X.apply((lambda x: (
            pd.to_numeric(x.astype(str).str.replace(r'%', r'e-2'),errors='coerce')
            )
            )
        )

        return Y


class StringTransformer(TransformerMixin):
    import pandas as pd

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        Y = pd.DataFrame(X).astype("string")
        return Y

## Add Weights

In [5]:
df_XY['Age'] = df_XY['Open_Year'] - min(df_XY['Open_Year']-1)
df_XY['Weight'] = 0.8 # hyperparam for exponential weighting
df_XY['Weight'] = df_XY['Weight'].pow(df_XY['Age'],fill_value=0)

In [6]:
df_XY['Age'].value_counts()

2    742
6    437
5    339
7    235
8    205
3    197
4    104
1     86
9     49
Name: Age, dtype: int64

In [7]:
df_XY['Weight'].value_counts()

0.640000    742
0.262144    437
0.327680    339
0.209715    235
0.167772    205
0.512000    197
0.409600    104
0.800000     86
0.134218     49
Name: Weight, dtype: int64

## create na pipeline

In [8]:
df_XY.loc[0,df_XY.columns.duplicated()]

Series([], Name: 0, dtype: object)

In [9]:
# remove all nan columns
df_XY = df_XY.dropna(axis=1, how='all')

In [10]:
df_XY.columns

Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'Open_Date', 'Close_Date',
       'Symbol', 'Quantity', 'Pnl', 'OpenAct', 'CloseAct', 'Open_Price',
       'Close_Price', 'Comm_Tot', 'DATE', 'ACTION', 'QTYCHG', 'PRICE', 'TIME',
       'UNNAMED: 6', 'COMMISSION', 'UNNAMED: 8', 'CASH CHG (PNL)', 'COMMENTS',
       'PCTRETURN', 'STARTDATE', 'COMPANY NAME (IN ALPHABETICAL ORDER)',
       'TICKER', 'STOP', '% TO STOP', 'CURRENT PRICE', '% TO TARGET',
       'AT PRICE', 'TARGET', 'EPS1', 'EPS2', 'FYEND', 'DAYSTOFYEND',
       'FYEPSNXT', 'GROWTH*0.5TO0.75', 'ROIC (BW ROA ROE)', 'TGT FWD P/E',
       'YEARS TO NORMALIZATION', 'LASTUPDATED', 'CATEGORY', 'COMMENTS.1',
       'FILENAME', 'DayOfWeek0Mon', 'Date_YahooFinance', 'Close_^GSPC',
       'Close_^VIX', 'Close_^GSPC_200MA', 'SP500from200MA', 'Open_Year',
       'CONS_SENT_Date', 'CONS_SENT_Index', 'Date',
       'AAII_0_level_0_Reported_Date', 'AAII_1_level_0_Reported_Bullish',
       'AAII_2_level_0_Reported_Neutral', 'AAII_3_level_0

In [11]:
# update columns headers to clean up
df_XY.columns = list(
    pd.Series(df_XY.columns)
    .astype(str)
    .str.replace(" ", "_", regex=True)
    .str.upper()
    .str.strip()
    .str.replace("/", "_")
    .str.replace("*", "_")
)

# avoid duplicates
df_XY = df_XY.loc[:,~df_XY.columns.duplicated()]

# start with numeric, utilizng explore data before
numeric_features = df_XY.convert_dtypes().select_dtypes(include=np.number).columns.tolist()
numeric_features = numeric_features + [
    "%_TO_STOP",
    "%_TO_TARGET",
    "GROWTH_0.5TO0.75",
    "ROIC_(BW_ROA_ROE)",
    "TGT_FWD_P_E",
    "YEARS_TO_NORMALIZATION",
]
numeric_features = list(set(numeric_features))

numeric_transformer = Pipeline(
    steps=[
        ("numerizer", Numerizer()),
        ("imputer", SimpleImputer(missing_values=np.nan,strategy="median")),
    ]
)
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="_NA_")),
        ("stringtransformer", StringTransformer()),
    ]
)

# numerical

# categorical_features = ['embarked', 'sex', 'pclass']
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))])

categorical_features = list(set(df_XY.columns).difference(set(numeric_features)))

preprocessor_na = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    # remainder = 'passthrough' # not needed anymore
)

XY_imputed = preprocessor_na.fit_transform(df_XY)

columns = get_feature_names(preprocessor_na)

df_XY_imputed = pd.DataFrame(XY_imputed, columns=columns).convert_dtypes()

  pd.Series(df_XY.columns)


In [12]:
df_XY_imputed.head()

Unnamed: 0,%_TO_TARGET,AAII_BULLISH_AVERAGE_-_ST._DEV.,AAII_BULLISH_BULL-BEAR_SPREAD,UNNAMED:_0.2,UNNAMED:_0,YEARS_TO_NORMALIZATION,%_TO_STOP,PNL,QTYCHG,PRICE,...,UNNAMED:_8,SYMBOL,DAYSTOFYEND,CONS_SENT_DATE,TICKER,FYEND,OPENACT,STOP,DATE_YAHOOFINANCE,STARTDATE
0,0.0045,0.276547,0.138889,0,0,6,-0.1473,-4851.23,200,46.8,...,_NA_,AER,_NA_,2015-06-30,_NA_,_NA_,B,_NA_,2015-06-30,_NA_
1,0.0045,0.276547,0.138889,1,10,6,-0.1473,-4728.0,2000,16.49,...,_NA_,ABX,_NA_,2015-06-30,_NA_,_NA_,S,_NA_,2015-06-30,_NA_
2,0.0045,0.276547,0.138889,2,35,6,-0.1473,2229.17,20,689.73,...,_NA_,AZO,_NA_,2015-06-30,_NA_,_NA_,B,_NA_,2015-06-30,_NA_
3,0.0045,0.276547,0.138889,3,259,6,-0.1473,-1573.1,100,59.36,...,_NA_,AIG,_NA_,2015-06-30,_NA_,_NA_,B,_NA_,2015-06-30,_NA_
4,0.0045,0.276547,0.138889,4,64,6,-0.1473,6677.8,4000,5.97,...,_NA_,VALE,_NA_,2015-06-30,_NA_,_NA_,S,_NA_,2015-06-30,_NA_


In [13]:
df_XY_imputed.columns

Index(['%_TO_TARGET', 'AAII_BULLISH_AVERAGE_-_ST._DEV.',
       'AAII_BULLISH_BULL-BEAR_SPREAD', 'UNNAMED:_0.2', 'UNNAMED:_0',
       'YEARS_TO_NORMALIZATION', '%_TO_STOP', 'PNL', 'QTYCHG', 'PRICE',
       'AAII_3_LEVEL_0_REPORTED_BEARISH', 'AAII_S&P_500_WEEKLY_CLOSE',
       'QUANTITY', 'COMM_TOT', 'AGE', 'DAYOFWEEK0MON',
       'AAII_BULLISH_8-WEEK_MOV_AVG', 'AAII_2_LEVEL_0_REPORTED_NEUTRAL',
       'CLOSE_^GSPC', 'CLOSE_PRICE', 'AAII_S&P_500_WEEKLY_LOW',
       'AAII_BULLISH_AVERAGE_+ST._DEV.', 'AAII_1_LEVEL_0_REPORTED_BULLISH',
       'CLOSE_^GSPC_200MA', 'AAII_BULLISH_BULLISH_AVERAGE', 'OPEN_YEAR',
       'GROWTH_0.5TO0.75', 'AAII_S&P_500_WEEKLY_HIGH', 'ROIC_(BW_ROA_ROE)',
       'CLOSE_^VIX', 'SP500FROM200MA', 'TGT_FWD_P_E', 'CONS_SENT_INDEX',
       'OPEN_PRICE', 'COMMISSION', 'UNNAMED:_0.1', 'WEIGHT',
       'AAII_4_LEVEL_0_REPORTED_TOTAL', 'OPEN_DATE', 'TARGET', 'COMMENTS.1',
       'ACTION', 'FILENAME', 'CATEGORY', 'CASH_CHG_(PNL)', 'COMMENTS',
       'COMPANY_NAME_(IN_ALPHAB

In [14]:
# df_XY_imputed["%_TO_STOP"].hist()

In [15]:
# create target

df_XY_imputed["PCT_RET_FINAL"] = df_XY_imputed["PNL"] / (
    df_XY_imputed["OPEN_PRICE"] * df_XY_imputed["QUANTITY"]
)

In [16]:
# TODO create moving avg

In [17]:
# Final columns

print(df_XY_imputed.columns)

Index(['%_TO_TARGET', 'AAII_BULLISH_AVERAGE_-_ST._DEV.',
       'AAII_BULLISH_BULL-BEAR_SPREAD', 'UNNAMED:_0.2', 'UNNAMED:_0',
       'YEARS_TO_NORMALIZATION', '%_TO_STOP', 'PNL', 'QTYCHG', 'PRICE',
       'AAII_3_LEVEL_0_REPORTED_BEARISH', 'AAII_S&P_500_WEEKLY_CLOSE',
       'QUANTITY', 'COMM_TOT', 'AGE', 'DAYOFWEEK0MON',
       'AAII_BULLISH_8-WEEK_MOV_AVG', 'AAII_2_LEVEL_0_REPORTED_NEUTRAL',
       'CLOSE_^GSPC', 'CLOSE_PRICE', 'AAII_S&P_500_WEEKLY_LOW',
       'AAII_BULLISH_AVERAGE_+ST._DEV.', 'AAII_1_LEVEL_0_REPORTED_BULLISH',
       'CLOSE_^GSPC_200MA', 'AAII_BULLISH_BULLISH_AVERAGE', 'OPEN_YEAR',
       'GROWTH_0.5TO0.75', 'AAII_S&P_500_WEEKLY_HIGH', 'ROIC_(BW_ROA_ROE)',
       'CLOSE_^VIX', 'SP500FROM200MA', 'TGT_FWD_P_E', 'CONS_SENT_INDEX',
       'OPEN_PRICE', 'COMMISSION', 'UNNAMED:_0.1', 'WEIGHT',
       'AAII_4_LEVEL_0_REPORTED_TOTAL', 'OPEN_DATE', 'TARGET', 'COMMENTS.1',
       'ACTION', 'FILENAME', 'CATEGORY', 'CASH_CHG_(PNL)', 'COMMENTS',
       'COMPANY_NAME_(IN_ALPHAB

In [18]:
## check no na's left in numerical

try:
    assert (
        df_XY_imputed[numeric_features].isna().sum().sum() == 0
    ), "NAs remain in numerical"
except:
    print("NAs remain in numerical")

## API Spec

In [19]:
## import api spec

import yaml
from yaml import Loader

with open("data-tests/_apispecs.yaml") as f:
    api_specs = yaml.load(f, Loader=Loader)

In [20]:
## validate based on api spec

from openapi_schema_validator import validate
import json

schema = api_specs["components"]["schemas"]["Tradelog"]

json_str = df_XY_imputed.to_json(orient="records")
json_test = json.loads(json_str)

i = 0
for row in json_test:
    try:
        validate(row, schema)
    except:
        print(f"failed on {i}th row ")
        break
    i = i + 1

print("validation completed")

validation completed


In [21]:
## save api spec to html

import os

# feed yaml file to swagger python, then create api.html
os.system(
    "python swagger_yaml_to_html.py < data-tests/_apispecs.yaml > templates/api.html"
)

0

## Resort & Save Results

In [22]:
df_XY_imputed = df_XY_imputed.reindex(sorted(df_XY_imputed.columns), axis=1)

In [23]:
## save results

df_XY_imputed.to_csv("output/e_resultcleaned.csv")

In [24]:
## save imputer

dump(preprocessor_na, "output/e_preprocessor_na.joblib")

['output/e_preprocessor_na.joblib']