In [1]:
# <api>
from sklearn2pmml.decoration import ContinuousDomain, CategoricalDomain
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import Imputer, Normalizer
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler
from sklearn.preprocessing import FunctionTransformer

import base64
from six import string_types
from enum import Enum
import numpy as np
import pandas as pd
import logging

# Python 2 exposed the built-in exception classes through the ``exceptions``
# module.  Where that module is unavailable, the built-in ``Exception`` is
# used as-is.  Only an import failure is tolerated here; a bare ``except``
# would also swallow things like KeyboardInterrupt.
try:
    from exceptions import Exception
except ImportError:
    pass

logger = logging.getLogger(__name__)



In [2]:
# <api>
class DataMapperError(Exception):
    """Error raised by the data-mapper helpers in this module.

    Used for invalid feature transformers, a missing replacement value for
    the ``asValue`` treatment, and target/column inconsistencies.
    """

In [12]:
# <api>
def is_binary(ftr_vlst):
    """Tell whether a series holds exactly two distinct non-null values.

    ftr_vlst: object exposing ``unique()`` (e.g. a pandas Series)
    returns: True when, after discarding nulls, two distinct values remain
    """
    distinct = ftr_vlst.unique()
    non_null = distinct[pd.notnull(distinct)]
    return len(non_null) == 2

In [None]:
# <api>
def onehot_encoder_with_missing(trn_series, na='CreditX-NA'):
    """
    pick the label encoder for a categorical training column
    trn_series: training values of the column
    na: marker string that stands for a missing value
    returns: LabelEncoder when the column has a single distinct value, or
             exactly two distinct non-null values one of which is the `na`
             marker; LabelBinarizer otherwise
    """
    # "unary" means the column has one distinct value.  The previous test,
    # `unique()[0] == 1`, compared the first distinct VALUE with 1, which
    # depended on row order and raised IndexError on an empty series.
    unary = (len(trn_series.unique()) == 1)
    binary_with_na = is_binary(trn_series) and na in set(trn_series)
    if unary or binary_with_na:
        return LabelEncoder()
    return LabelBinarizer()


def continuous_feature_transform(ftr_trf, col):
    """
    resolve the transformer configured for a continuous column
    ftr_trf: mapping of column name -> FtrTransFunc member or its string value
    col: column name to look up
    returns: the `method` of the matching FtrTransFunc, or None when `col`
             has no entry in `ftr_trf`
    raises: DataMapperError when the entry is neither a string nor a
            FtrTransFunc
    """
    if col not in ftr_trf:
        return None

    spec = ftr_trf[col]
    if isinstance(spec, string_types):
        # string entries are looked up by enum value
        return FtrTransFunc(spec).method
    if isinstance(spec, FtrTransFunc):
        return spec.method
    raise DataMapperError('invalid feature transformer: {}'.format(spec))

In [3]:
# <api>
class MisValFunc(Enum):
    """
    Missing-value treatment strategies.

    AS_VALUE:  means set as new value (both training and serving)
    AS_MEAN:  means set as asMean (both training and serving)
    AS_MEDIAN:  means set as asMedian (both training and serving)
    AS_MODE:  means set as mode (both training and serving)
    AS_IS:  means drop_row during training but asIs during serving(warning)
    DROP_ROW:  means drop_row during training but asMean/asMode during serving
    DEFAULT: drop_row during training but asMean/asMode during serving

    DEFAULT has the same value as DROP_ROW ('dropRow'), so Enum makes it an
    alias: ``MisValFunc.DEFAULT is MisValFunc.DROP_ROW``.
    """
    AS_VALUE = 'asValue'
    AS_MEAN = 'asMean'
    AS_MEDIAN = 'asMedian'
    AS_MODE = 'asMode'
    AS_IS = 'asIs'
    DROP_ROW = 'dropRow'
    DEFAULT = 'dropRow'

    def categoricalTransform(self,
                             series,
                             missing_value_replacement=None,
                             invalid_value_treatment='asMissing'):
        """
        build the transformer list for a categorical column
        series: training values, handed to `encoder` to choose the encoder
        missing_value_replacement: required when self is AS_VALUE
        invalid_value_treatment: forwarded to CategoricalDomain
        returns: [CategoricalDomain, encoder], or [CategoricalDomain] when
                 the encoder is falsy
        raises: DataMapperError when AS_VALUE has no replacement value
        """
        if self is MisValFunc.AS_VALUE:
            missing_value_treatment = 'asValue'
            if missing_value_replacement is None:
                raise DataMapperError('no missing_value_replacement for {}'.format(self))
        elif self is MisValFunc.AS_MODE:
            missing_value_treatment = 'asMode'
        elif self is MisValFunc.AS_MEAN or self is MisValFunc.AS_MEDIAN:
            # mean/median requests are mapped to the mode for categoricals
            missing_value_treatment = 'asMode'
        elif self is MisValFunc.DROP_ROW or self is MisValFunc.DEFAULT:
            missing_value_treatment = 'asMode'
        elif self is MisValFunc.AS_IS:
            missing_value_treatment = 'asIs'
        else:
            raise DataMapperError('missing_value_treatment for categorical {}'.format(self))
        domain = CategoricalDomain(invalid_value_treatment=invalid_value_treatment,
                                   missing_value_treatment=missing_value_treatment,
                                   missing_value_replacement=missing_value_replacement)
        encoder = self.encoder(series)
        return [domain, encoder] if encoder else [domain]

    def continuousTransform(self,
                            missing_value_replacement=None,
                            invalid_value_treatment='asIs'):
        """
        build the transformer list for a continuous column
        missing_value_replacement: required when self is AS_VALUE
        invalid_value_treatment: forwarded to ContinuousDomain
        returns: [ContinuousDomain, imputer], or [ContinuousDomain] when
                 `imputer` returns None
        raises: DataMapperError when AS_VALUE has no replacement value
        """
        if self is MisValFunc.AS_VALUE:
            missing_value_treatment = 'asValue'
            if missing_value_replacement is None:
                raise DataMapperError('no missing_value_replacement for {}'.format(self))
        elif self is MisValFunc.AS_MEDIAN or self is MisValFunc.AS_MODE:
            # NOTE(review): AS_MODE is declared as 'asMedian' in the domain
            # while `imputer` uses 'most_frequent' for it -- confirm intent.
            missing_value_treatment = 'asMedian'
        elif self is MisValFunc.AS_MEAN:
            missing_value_treatment = 'asMean'
        elif self is MisValFunc.AS_IS or self is MisValFunc.DROP_ROW or self is MisValFunc.DEFAULT:
            missing_value_treatment = 'asMean'
        else:
            raise DataMapperError('missing_value_treatment for continuous {}'.format(self))
        domain = ContinuousDomain(invalid_value_treatment=invalid_value_treatment,
                                  missing_value_treatment=missing_value_treatment,
                                  missing_value_replacement=missing_value_replacement)
        imputer = self.imputer()
        return [domain, imputer] if imputer else [domain]

    def encoder(self, series):
        """return the label encoder chosen by onehot_encoder_with_missing"""
        return onehot_encoder_with_missing(series)

    def imputer(self, missing_values='NaN'):
        """
        return the Imputer matching this treatment, or None
        missing_values: forwarded to Imputer
        """
        # The first branch used to test AS_MEAN, which shadowed the 'mean'
        # branch below and left AS_MEDIAN without any imputer.
        # NOTE(review): DROP_ROW (and its alias DEFAULT) keeps the median
        # imputer although continuousTransform declares 'asMean' for it.
        if self is MisValFunc.DROP_ROW or self is MisValFunc.AS_MEDIAN:
            return Imputer(missing_values=missing_values, strategy='median')
        elif self is MisValFunc.AS_MODE:
            return Imputer(missing_values=missing_values, strategy='most_frequent')
        elif self is MisValFunc.AS_MEAN:
            return Imputer(missing_values=missing_values, strategy='mean')
        else:
            return None

    def apply(self, ftr, data, val=None):
        """
        apply this treatment to column `ftr` of `data`
        ftr: column name
        data: DataFrame, handed to the module-level helper functions
        val: fill value, only used by the DEFAULT branch
        returns: `data`, except for AS_VALUE which returns
                 (self.value, new class value)
        raises: NotImplementedError for treatments without a branch here
        """
        if self is MisValFunc.DEFAULT:
            # NOTE(review): DEFAULT is an alias of DROP_ROW, so this branch
            # also handles DROP_ROW and the `drop_row` branch below is never
            # reached.  Left unchanged; confirm which behaviour is intended.
            set_default_value(data, ftr, val)
            return data
        elif self is MisValFunc.AS_MEAN:
            set_mean(data, ftr)
            return data
        elif self is MisValFunc.AS_MEDIAN:
            set_median(data, ftr)
            return data
        elif self is MisValFunc.DROP_ROW:
            drop_row(data, ftr)
            return data
        elif self is MisValFunc.AS_VALUE:
            newVal = set_as_new_class(data, ftr)
            return self.value, newVal
        else:
            raise NotImplementedError("unsupported missing value transformer")

In [4]:
# <api>
class FtrTransFunc(Enum):
    """
    Feature transformers selectable by name.

    Each member's value is the string used to look it up
    (``FtrTransFunc('log1p')``); `method` builds the matching transformer.
    """
    MIN_MAX_SCALER = 'MinMaxScaler'
    STANDARD_SCALER = 'StandardScaler'
    MAX_ABS_SCALER = 'MaxAbsScaler'
    NORMALIZER = 'Normalizer'
    BINARIZER = 'Binarizer'
    ONE_HOT_ENCODER = 'OneHotEncoder'
    LOG1P = 'log1p'
    LOG = 'log'

    @property
    def method(self):
        """
        a newly constructed transformer for this member
        raises: NotImplementedError for a member without a branch here
        """
        if self is FtrTransFunc.MIN_MAX_SCALER:
            return MinMaxScaler(copy=False)
        elif self is FtrTransFunc.STANDARD_SCALER:
            return StandardScaler(copy=False)
        elif self is FtrTransFunc.MAX_ABS_SCALER:
            return MaxAbsScaler(copy=False)
        elif self is FtrTransFunc.NORMALIZER:
            # NOTE(review): Normalizer is given an `axis` keyword and is then
            # handed to FunctionTransformer as the function -- confirm the
            # installed scikit-learn accepts both.
            ft = FunctionTransformer(Normalizer(axis=0), False)
            ft.name = self.name
            return ft
        elif self is FtrTransFunc.BINARIZER:
            # LabelBinarizer takes no `copy` keyword; passing one raised
            # TypeError before a transformer could be built.
            return LabelBinarizer()
        elif self is FtrTransFunc.ONE_HOT_ENCODER:
            return OneHotEncoder()
        elif self is FtrTransFunc.LOG1P:
            ft = FunctionTransformer(np.log1p, False)
            ft.name = self.name
            return ft
        elif self is FtrTransFunc.LOG:
            ft = FunctionTransformer(np.log, False)
            ft.name = self.name
            return ft
        else:
            raise NotImplementedError("unsupported feature transformer")

    def apply(self, series):
        """
        fit-transform `series` with this member's transformer
        series: input values; must expose `dtype.name`
        returns: the transformed values
        raises: Exception when a non-category input yields nan or inf
        """
        after = self.method.fit_transform(series)
        # `str + Enum` raises TypeError, which used to mask the intended
        # error; the member's value is formatted into the message instead.
        if 'category' != series.dtype.name and contain_nan(after):
            raise Exception('feature contains nan when transformed by {}'.format(self.value))
        if 'category' != series.dtype.name and contain_inf(after):
            raise Exception('feature contains inf when transformed by {}'.format(self.value))
        return after

In [5]:
# <api>
def b64_file_data(fig_path):
    """
    read a file and return its content base64-encoded
    fig_path: path of the file to read
    returns: base64 encoding of the raw file bytes
    """
    # Binary mode: text mode hands `b64encode` a str on Python 3 (TypeError)
    # and may decode or translate bytes of a binary file.
    with open(fig_path, 'rb') as infile:
        fig_data = infile.read()
    return base64.b64encode(fig_data)

In [6]:
# <api>
def drop_row(data, ftr):
    """
    remove, in place, the rows of `data` whose `ftr` value is missing
    data: DataFrame to modify
    ftr: column name to check
    """
    checked_columns = [ftr]
    data.dropna(subset=checked_columns, how='any', inplace=True)


def contain_nan(series):
    """Return True when at least one element of `series` is NaN."""
    nan_mask = np.asarray(np.isnan(series))
    return bool(nan_mask.any())


def contain_inf(series):
    """Return True when at least one element of `series` is +inf or -inf."""
    inf_mask = np.asarray(np.isinf(series))
    return bool(inf_mask.any())

In [7]:
# <api>
def set_as_new_class(data, ftr):
    """
    replace missing values of a categorical column with a new class
    data: DataFrame, modified in place
    ftr: column name
    returns: the value used for the new class -- `ftr + '_NA'` for string
             columns, (largest value + 1) as float32 for numeric columns
    raises: Exception when every value is missing or the value type is
            neither string nor numeric
    """
    uniq_v = data[ftr].unique()
    uniq_v = uniq_v[~pd.isnull(uniq_v)]
    if 0 == len(uniq_v):
        raise Exception('all values of ' + ftr + ' are nan')

    # The first non-null value decides how the column is treated.
    v = uniq_v[0]
    if isinstance(v, string_types):
        # string_types also covers Python 2 unicode values
        new_v = ftr + '_NA'
    elif isinstance(v, (float, int, np.number)):
        # np.number covers numpy integer scalars, which are not `int`
        # instances on Python 3 and used to fall through to the error below.
        new_v = uniq_v.astype('float32').max() + 1
        data[ftr] = data[ftr].astype('float32')
    else:
        raise Exception('categorical value is string or numerical?')
    set_default_value(data, ftr, new_v)
    data[ftr] = data[ftr].astype('category')
    return new_v

In [8]:
# <api>
def set_default_value(data, ftr, v):
    """
    fill the missing values of column `ftr` with `v`, in place on `data`
    data: DataFrame to modify
    ftr: column name
    v: fill value; converted with str() for category columns and with
       float() for every other column
    The column ends up as 'category' if it was a category column, otherwise
    as 'float32'.
    raises: Exception when `v` is None
    """
    if v is None:
        raise Exception('value is None')

    if 'category' == data[ftr].dtype.name:
        # go through 'object' so that a value outside the current
        # categories can be filled in
        old_type = 'category'
        filled = data[ftr].astype('object')
        v = str(v)
    else:
        old_type = 'float32'
        filled = data[ftr].astype('float32')
        v = float(v)

    # Assign the result back instead of `data[ftr].fillna(..., inplace=True)`:
    # an in-place call on a column selection does not reach `data` when
    # pandas copy-on-write is active.
    data[ftr] = filled.fillna(v).astype(old_type)

In [9]:
# <api>
def set_mean(data, ftr):
    """
    fill the missing values of column `ftr` with the column mean, in place
    data: DataFrame to modify
    ftr: column name

    The earlier version used the 'median' strategy here.  It also wrote the
    result back through a Series with a fresh 0..n-1 index, which misaligned
    on frames with any other index.  The fill is now done with pandas and
    assigned back to the column.
    """
    column = data[ftr]
    # Series.mean() skips NaN; if every value is missing the fill is a no-op
    data[ftr] = column.fillna(column.mean())


def set_median(data, ftr):
    """
    fill the missing values of column `ftr` with the column median, in place
    data: DataFrame to modify
    ftr: column name

    The earlier version wrote the result back through a Series with a fresh
    0..n-1 index, which misaligned on frames with any other index.  The fill
    is now done with pandas and assigned back to the column.
    """
    column = data[ftr]
    # Series.median() skips NaN; if every value is missing the fill is a no-op
    data[ftr] = column.fillna(column.median())

In [11]:
# <api>
def move_target_last(data, target_col):
    """
    return `data` with `target_col` moved to the last column position
    data: DataFrame
    target_col: name of the column to move
    returns: `data` itself when `target_col` is not one of its columns,
             otherwise a reindexed DataFrame
    """
    reindex_col = list(data.columns)
    if target_col not in reindex_col:
        return data
    reindex_col.remove(target_col)
    reindex_col.append(target_col)
    # `reindex_axis` no longer exists in current pandas; `reindex(columns=...)`
    # performs the same column reordering.
    return data.reindex(columns=reindex_col)

In [13]:
# <api>
def dataMapperBuilder(trn_d, categ_ftr, conti_ftr,
                      mis_val=None, ftr_trf=None):
    """
    build a DataFrameMapper according to column type
    trn_d: training data in DataFrame format
    categ_ftr: categorical features (to_dummies)
    conti_ftr: continuous features (feature transformer)
    mis_val: missing value treatment, column -> (treatment, replacement);
             columns without an entry use ('asMean', None)
    ftr_trf: feature transformer, column -> FtrTransFunc or its string value
    returns: DataFrameMapper(df_out=True) with one entry per column of
             trn_d; columns in neither feature list are mapped to None
    """
    mis_val = mis_val if mis_val else {}
    ftr_trf = ftr_trf if ftr_trf else {}
    c_map = []
    for col in trn_d.columns:
        if col in categ_ftr:
            # missing imputer
            mis, val = mis_val.get(col, ('asMean', None))
            misValFunc = MisValFunc(mis)
            # The encoder is chosen from the training values of the column.
            # `mis_val[col]` was passed here before, which is the treatment
            # tuple (or a KeyError when the column has no entry).
            op_lst = misValFunc.categoricalTransform(trn_d[col],
                                                     missing_value_replacement=val)
        elif col in conti_ftr:
            # missing imputer
            mis, val = mis_val.get(col, ('asMean', None))
            misValFunc = MisValFunc(mis)
            op_lst = misValFunc.continuousTransform(missing_value_replacement=val)
            # feature transform
            ftr_trans = continuous_feature_transform(ftr_trf, col)
            if ftr_trans:
                op_lst.append(ftr_trans)
        else:
            op_lst = None
        c_map.append((col, op_lst))
    return DataFrameMapper(c_map, df_out=True)

In [14]:
# <api>
def dataMapperPrepare(trn_d, parent_dfm, target_col=None):
    """
    datamapper prepare
    trn_d: train data
    parent_dfm: parent model datamapper
    target_col: target_col would be validated according to mapper
    returns: (data, dfm) -- the data with the target moved last and missing
             values filled, and a mapper rebuilt by dataMapperBuilder
    raises: DataMapperError on a target mismatch, when the parent mapper has
            no target, or when trn_d has columns unknown to the mapper
    """
    # target check: features mapped to None are treated as targets
    target = [feature for feature, mapper in parent_dfm.features
              if mapper is None]
    if target_col and target:
        # The requested target must be one of the mapper's targets.  The
        # test was inverted before and rejected exactly the matching case.
        if target_col not in target:
            raise DataMapperError('df_mapper target mismatch')
        data = move_target_last(trn_d, target_col)
    elif target and not target_col:
        target_col = target[0]
        data = move_target_last(trn_d, target_col)
    else:
        raise DataMapperError('df_mapper no target error')

    # domain ftr check
    # NOTE(review): relies on mapper attributes `domain_`,
    # `missing_value_treatment`, `missingValueReplacement` and `steps`;
    # confirm them against the mapper type stored in `parent_dfm.features`.
    categ_ftr = [feature for feature, mapper in parent_dfm.features
                 if mapper and mapper.domain_ == 'categoricaldomain']
    conti_ftr = [feature for feature, mapper in parent_dfm.features
                 if mapper and mapper.domain_ == 'continuousdomain']

    domain_ftr = set(categ_ftr) | set(conti_ftr)
    if set(trn_d.columns) - set(target) - domain_ftr:
        raise DataMapperError("""
        datamapper inconsistency:
        col = {}
        target = {}
        categorical = {}
        continuous = {}
        """.format(set(trn_d.columns), set(target),
                  set(categ_ftr), set(conti_ftr)))

    # categ_ftr string encode
    for ftr in categ_ftr:
        data[ftr] = data[ftr].str.encode('utf-8')

    missing_value_treatment = {name: mapper.missing_value_treatment
                               for (name, mapper) in parent_dfm.features
                               if mapper}
    missing_value_replacement = {name: mapper.missingValueReplacement
                                 for (name, mapper) in parent_dfm.features
                                 if mapper}
    # name of the last step of each feature's pipeline
    ftr_trf = {name: mapper.steps[-1][0] for (name, mapper) in parent_dfm.features
               if mapper and name in domain_ftr}
    mis_val = {}

    # missing value treatment
    for col, treatment in missing_value_treatment.items():
        defaultVal = missing_value_replacement[col]
        if treatment:
            data[col] = data[col].fillna(defaultVal)
            mis_val[col] = (treatment, defaultVal)

    dfm = dataMapperBuilder(data, categ_ftr, conti_ftr, mis_val, ftr_trf)
    return data, dfm

In [15]:
# <api>
def buildTrainMapper(data, target, id_column=None):
    """
    prepare the training data and build its mapper with default settings
    data: train data
    target: label
    id_column: forwarded to prepare_for_training
    returns: (transformed data, datamapper)
    """
    prepared = prepare_for_training(data, target, id_column)
    # the fourth element (invalid features) is not needed here
    transformed, categorical_features, continueous_features, _ = prepared
    mapper = dataMapperBuilder(transformed, categorical_features, continueous_features)
    return transformed, mapper

In [16]:
# <api>
def prepare_for_training(data, target, id_column=None):
    """
    prepare_for_training shortcuts: using pandas infer
    data: train data (left unmodified; a copy is transformed)
    target: label
    id_column: drop columns
    returns: (transformed, categorical_features, continueous_features,
              invalid_features) -- the feature collections are sets of
              column names; invalid_features holds the values of the
              dropped id column, or is an empty Series when nothing was
              dropped
    """
    transformed = data.copy()

    # move the target to the last column
    tmp = transformed.pop(target)
    transformed.insert(transformed.shape[1], target, tmp)

    # `invalid_features` used to be assigned only inside this `if`, so a
    # call without a usable id_column failed with UnboundLocalError.
    if id_column and id_column in transformed.columns:
        invalid_features = transformed[id_column]
        transformed.drop(id_column, axis=1, inplace=True)
    else:
        invalid_features = pd.Series([], dtype='object')

    contineous_describe = transformed.describe()
    # Only the target has to be excluded: the id column is already dropped.
    # The id VALUES were excluded here before, which could hide a real
    # feature whose name happened to equal an id.
    non_features = {target}
    continueous_features = set(contineous_describe.columns) - non_features
    categorical_features = set(transformed.columns) - set(continueous_features) - non_features
    for feature in categorical_features:
        transformed[feature] = transformed[feature].astype('category')
    for feature in continueous_features:
        transformed[feature] = transformed[feature].astype('float32')

    return transformed, categorical_features, continueous_features, invalid_features