In [3]:
import pandas as pd
import numpy as np
from collections import defaultdict
import gc

class Feature:
    """Namespace of feature-engineering helpers for pandas DataFrames.

    The feature functions share the signature ``(df, cols, col_name, params)``
    and return a new DataFrame; ``df`` itself is never modified. They are
    static functions and are meant to be called on the class, e.g.
    ``Feature.count(df=..., cols=..., col_name=..., params=...)``.

    Functions that aggregate and merge (``count``, ``unique_count``,
    ``variance``, ``count_std_over_mean``, the encodings) and the two
    ``count_in_*_n_time_unit`` functions return a frame with a fresh
    ``RangeIndex`` in the row order of ``df``. ``cumulative_count``,
    ``reverse_cumulative_count``, ``time_to_n_next`` and ``Kernels.square``
    keep the index of ``df``.
    """

    class Utils:
        """Shared helpers: dtype down-casting and pickle persistence."""

        @staticmethod
        def _set_type(series, dtype):
            """Pick the narrowest numpy dtype of a family that can hold the data.

            Parameters
            ----------
            series : array-like
                Values to inspect (list, numpy array, Series or DataFrame).
            dtype : str
                Requested family: ``'uint'``, ``'int'`` or ``'float'``.

            Returns
            -------
            type or None
                A numpy scalar type, or ``None`` for an unknown family.

            Notes
            -----
            * ``'uint'`` falls back to the signed family when the data contains
              a negative value, so such values are never wrapped around.
            * NaNs are ignored when measuring the range; empty (or all-NaN)
              input yields the narrowest type of the family.
            """
            values = np.asarray(series)
            if values.dtype.kind == 'f':
                # NaN would poison min/max; it fits any float type anyway.
                values = values[~np.isnan(values)]
            if values.size == 0:
                return {'uint': np.uint8, 'int': np.int8, 'float': np.float32}.get(dtype)

            _max, _min = values.max(), values.min()
            if dtype == 'uint' and _min < 0:
                dtype = 'int'

            if dtype == 'uint':
                for candidate in (np.uint8, np.uint16, np.uint32):
                    if _max <= np.iinfo(candidate).max:
                        return candidate
                return np.uint64
            if dtype == 'int':
                for candidate in (np.int8, np.int16, np.int32):
                    limits = np.iinfo(candidate)
                    if limits.min <= _min and _max <= limits.max:
                        return candidate
                return np.int64
            if dtype == 'float':
                magnitude = max(abs(float(_min)), abs(float(_max)))
                if magnitude <= float(np.finfo(np.float32).max):
                    return np.float32
                return np.float64
            return None

        @staticmethod
        def _broadcast(df, table, keys, col_name, kind):
            """Down-cast ``table[col_name]`` and left-join it onto the rows of ``df``.

            Only the key columns of ``df`` take part in the merge, so an existing
            column called ``col_name`` in ``df`` cannot clash with the result.
            Returns a one-column DataFrame with a fresh ``RangeIndex``.
            """
            table = table[keys + [col_name]].copy()
            table[col_name] = table[col_name].astype(
                Feature.Utils._set_type(table[col_name], kind))
            return df[keys].merge(table, on=keys, how='left')[[col_name]]

        @staticmethod
        def save(df=None, flg='both', train_len=0, url='./', name='default'):
            """Pickle ``df`` as ``train__<name>.pkl`` and/or ``test__<name>.pkl``.

            Parameters
            ----------
            df : DataFrame
                Frame to persist; its index is reset before writing.
            flg : str
                ``'train'`` or ``'test'`` writes the whole frame to that single
                file. Any other value splits the frame at ``train_len`` and
                writes both files.
            train_len : int
                Number of leading rows that belong to the training part.
            url : str
                Path prefix, concatenated as-is (include the trailing slash).
            name : str
                Feature name used in the file names.
            """
            train_path = url + 'train__' + name + '.pkl'
            test_path = url + 'test__' + name + '.pkl'
            if flg == 'train':
                df.reset_index(drop=True).to_pickle(train_path)
            elif flg == 'test':
                df.reset_index(drop=True).to_pickle(test_path)
            else:
                df[:train_len].reset_index(drop=True).to_pickle(train_path)
                df[train_len:].reset_index(drop=True).to_pickle(test_path)

    @staticmethod
    def count(df=None, cols=None, col_name=None, params=None):
        """Count the rows of every ``cols`` group and attach the count to each row.

        Parameters
        ----------
        df : DataFrame
            The data.
        cols : list of str
            Columns to group by.
        col_name : str
            Name of the returned column.
        params : dict
            ``params['col']`` names the column whose non-null values are counted.

        Returns
        -------
        DataFrame
            One column ``col_name``, one row per row of ``df``.
        """
        keys = list(cols)
        table = df.groupby(by=keys)[params['col']].count().rename(col_name).reset_index()
        result = Feature.Utils._broadcast(df, table, keys, col_name, 'uint')
        gc.collect()
        return result

    @staticmethod
    def unique_count(df=None, cols=None, col_name=None, params=None):
        """Number of distinct ``cols[-1]`` values inside every ``cols[:-1]`` group.

        Returns a one-column DataFrame ``col_name`` aligned with the rows of
        ``df``. ``params`` is unused.
        """
        keys = list(cols[:-1])
        table = df.groupby(by=keys)[cols[-1]].nunique().rename(col_name).reset_index()
        # The aggregated table only carries the grouping keys, so merge on those.
        result = Feature.Utils._broadcast(df, table, keys, col_name, 'uint')
        gc.collect()
        return result

    @staticmethod
    def cumulative_count(df=None, cols=None, col_name=None, params=None):
        """Number of earlier rows (in row order) that share the same ``cols`` values.

        Returns a one-column DataFrame ``col_name`` carrying the index of ``df``.
        ``params`` is unused.
        """
        ranks = df[cols].groupby(by=cols).cumcount().rename(col_name)
        result = ranks.astype(Feature.Utils._set_type(ranks, 'uint')).to_frame()
        gc.collect()
        return result

    @staticmethod
    def reverse_cumulative_count(df=None, cols=None, col_name=None, params=None):
        """Number of later rows (in row order) that share the same ``cols`` values.

        Uses ``cumcount(ascending=False)``, so the result follows the row order
        of ``df`` and keeps its index. ``params`` is unused.
        """
        ranks = df[cols].groupby(by=cols).cumcount(ascending=False).rename(col_name)
        result = ranks.astype(Feature.Utils._set_type(ranks, 'uint')).to_frame()
        gc.collect()
        return result

    @staticmethod
    def variance(df=None, cols=None, col_name=None, params=None):
        """Sample variance of ``cols[-1]`` inside every ``cols[:-1]`` group.

        Groups whose variance is undefined (a single observation) get 0.
        Returns a one-column DataFrame ``col_name`` aligned with the rows of
        ``df``. ``params`` is unused.
        """
        keys = list(cols[:-1])
        table = df.groupby(by=keys)[cols[-1]].var().fillna(0).rename(col_name).reset_index()
        result = Feature.Utils._broadcast(df, table, keys, col_name, 'float')
        gc.collect()
        return result

    @staticmethod
    def count_std_over_mean(df=None, cols=None, col_name=None, params=None):
        """Scaled coefficient of variation of the per-value counts within a group.

        Rows are first counted per full ``cols`` combination; then, within every
        ``cols[:-1]`` group, ``coefficient * std(counts) / mean(counts)`` is
        computed. Undefined ratios are replaced by -1.

        Parameters
        ----------
        params : dict
            ``params['col']`` is the column whose non-null values are counted;
            ``params['coefficient']`` is the multiplier (converted with ``int``).

        Returns
        -------
        DataFrame
            One column ``col_name`` aligned with the rows of ``df``.
        """
        keys = list(cols)
        group_keys = keys[:-1]
        per_value = df.groupby(by=keys)[params['col']].count()
        table = per_value.groupby(level=group_keys).agg(['mean', 'std']).reset_index()
        ratio = (int(params['coefficient']) * table['std']) / table['mean']
        table[col_name] = ratio.fillna(-1)
        result = Feature.Utils._broadcast(df, table, group_keys, col_name, 'float')
        gc.collect()
        return result

    @staticmethod
    def time_to_n_next(df=None, cols=None, col_name=None, params=None):
        """Difference between ``cols[-1]`` of the n-th next row of the group and this row.

        Parameters
        ----------
        cols : list of str
            ``cols[:-1]`` are the grouping columns, ``cols[-1]`` the time column.
        col_name : str, optional
            Name of the returned column; when ``None`` the time column's name
            is kept.
        params : dict
            ``params['n']`` is the look-ahead distance in rows and
            ``params['fillna']`` the value used when no such row exists. Both
            are converted with ``int``, so numeric strings are accepted.

        Returns
        -------
        DataFrame
            One integer column carrying the index of ``df``. A signed dtype is
            chosen automatically when the fill value or a difference is negative.
        """
        keys = list(cols[:-1])
        time_col = cols[-1]
        n = int(params['n'])
        fill = int(params['fillna'])
        delta = (df.groupby(by=keys)[time_col].shift(-n) - df[time_col]).fillna(fill)
        if col_name is not None:
            delta = delta.rename(col_name)
        result = delta.astype(Feature.Utils._set_type(delta, 'uint')).to_frame()
        gc.collect()
        return result

    @staticmethod
    def count_in_previous_n_time_unit(df=None, cols=None, col_name=None, params=None):
        """Count earlier rows of the same group that lie within ``n`` time units.

        Parameters
        ----------
        cols : list of str
            ``cols[:-1]`` are the grouping columns, ``cols[-1]`` the time column.
        params : dict
            ``params['n']`` is the window width (converted with ``int``).

        Returns
        -------
        DataFrame
            One column ``col_name`` with a fresh ``RangeIndex``.

        Notes
        -----
        The sliding window assumes the rows of ``df`` are ordered by the time
        column, ascending or descending. Groups are keyed by tuples of the raw
        values, so any hashable column type works and distinct groups cannot
        collide.
        """
        group_cols = list(cols[:-1])
        n = int(params['n'])
        # tolist() yields Python scalars: cheap hashing and no unsigned wrap-around.
        times = df[cols[-1]].tolist()
        if group_cols:
            keys = list(zip(*(df[c].tolist() for c in group_cols)))
        else:
            keys = [()] * len(times)

        in_window = defaultdict(int)
        counts = []
        bound = 0
        for cur, key in enumerate(keys):
            # abs() lets the same loop serve descending time order, which is what
            # count_in_next_n_time_unit feeds in.
            while bound < cur and abs(times[cur] - times[bound]) > n:
                in_window[keys[bound]] -= 1
                bound += 1
            counts.append(in_window[key])
            in_window[key] += 1

        narrow = Feature.Utils._set_type(counts, 'uint')
        result = pd.DataFrame({col_name: np.asarray(counts, dtype=narrow)})
        gc.collect()
        return result

    @staticmethod
    def count_in_next_n_time_unit(df=None, cols=None, col_name=None, params=None):
        """Count later rows of the same group that lie within ``n`` time units.

        Runs ``count_in_previous_n_time_unit`` on the rows in reverse order and
        flips the result back. Same parameters and return shape as that function.
        """
        backwards = Feature.count_in_previous_n_time_unit(df.iloc[::-1], cols, col_name, params)
        result = backwards.iloc[::-1].reset_index(drop=True)
        gc.collect()
        return result

    class Encoding:
        """Target encodings computed out-of-fold on the training rows.

        The public functions read ``params['trainLen']`` (number of leading
        training rows), ``params['splitCol']`` (fold column) and
        ``params['col']`` (label column). Unseen groups are encoded as -1.
        """

        @staticmethod
        def _public_call(func, df, cols, col_name, params):
            """Translate the public ``params`` keys and run ``_wrapper`` with ``func``."""
            return Feature.Encoding._wrapper(df, cols, col_name, {
                'train_len': params['trainLen'],
                'function': func,
                'split_col': params['splitCol'],
                'col': params['col'],
            })

        @staticmethod
        def woe(df=None, cols=None, col_name=None, params=None):
            """Weight-of-evidence encoding of the ``cols`` groups."""
            return Feature.Encoding._public_call(Feature.Encoding._woe, df, cols, col_name, params)

        @staticmethod
        def chi_square(df=None, cols=None, col_name=None, params=None):
            """Chi-square statistic of every ``cols`` group against the label."""
            return Feature.Encoding._public_call(Feature.Encoding._chi_square, df, cols, col_name, params)

        @staticmethod
        def mean(df=None, cols=None, col_name=None, params=None):
            """Mean of the label inside every ``cols`` group."""
            return Feature.Encoding._public_call(Feature.Encoding._mean, df, cols, col_name, params)

        @staticmethod
        def _wrapper(df=None, cols=None, col_name=None, params=None):
            """Encode the training rows out-of-fold and the test rows from all training rows.

            ``params`` keys: ``'train_len'``, ``'function'`` (one of the
            ``_woe`` / ``_chi_square`` / ``_mean`` style callables),
            ``'split_col'`` and ``'col'``. Returns one column ``col_name`` with
            a fresh ``RangeIndex``, training rows first.
            """
            train_len = int(params['train_len'])
            train, test = df.iloc[:train_len], df.iloc[train_len:]
            encoded_train = Feature.Encoding._train_wrapper(
                train, cols, params['col'], col_name, params['function'], params['split_col'])
            encoded_test = Feature.Encoding._testset_wrapper(
                train, test, cols, params['col'], col_name, params['function'])
            return pd.concat([encoded_train, encoded_test], ignore_index=True)

        @staticmethod
        def _train_wrapper(df, group_cols, label, col_name, func, split_col):
            """Out-of-fold encoding: every fold is encoded from all the other rows.

            The result has exactly one row per row of ``df``, in the same order,
            whatever the order of the fold values. Rows whose group is unseen
            outside their fold, or whose fold value is missing, get -1.
            """
            keys = list(group_cols)
            folds = df[split_col]
            encoded = np.full(len(df), -1.0)
            for fold in folds.dropna().unique():
                held_out = (folds == fold).to_numpy()
                dictionary = func(df=df[~held_out], group_cols=keys, label=label, col_name=col_name)
                # The dictionary has one row per group, so a left merge keeps the
                # held-out rows one-to-one and in their original order.
                looked_up = df.loc[held_out, keys].merge(dictionary, on=keys, how='left')[col_name]
                encoded[held_out] = looked_up.fillna(-1).to_numpy()
            result = pd.DataFrame(
                {col_name: encoded.astype(Feature.Utils._set_type(encoded, 'float'))})
            gc.collect()
            return result

        @staticmethod
        def _testset_wrapper(train, test, group_cols, label, col_name, func):
            """Encode ``test`` with a dictionary fitted on all of ``train``; unseen groups get -1."""
            keys = list(group_cols)
            dictionary = func(df=train, group_cols=keys, label=label, col_name=col_name)
            result = test[keys].merge(dictionary, on=keys, how='left')[[col_name]].fillna(-1)
            gc.collect()
            return result

        @staticmethod
        def _woe(df=None, group_cols=None, label=None, col_name=None, params=None):
            """Per-group weight of evidence with 0.5 added to both class counts.

            Returns a frame with the group columns plus ``col_name``. The label
            is summed, so it is expected to be 0/1. When ``df`` holds only one
            class the totals are zero and the result is inf/NaN.
            """
            keys = list(group_cols)
            table = df.groupby(by=keys)[label].agg(['count', 'sum']).reset_index()
            positive = df[label].sum()
            negative = df.shape[0] - positive
            event_rate = (table['sum'] + 0.5) / positive
            non_event_rate = (table['count'] - table['sum'] + 0.5) / negative
            woe = np.log(event_rate / non_event_rate)
            table[col_name] = woe.astype(Feature.Utils._set_type(woe, 'float'))
            result = table[keys + [col_name]]
            gc.collect()
            return result

        @staticmethod
        def _chi_square(df=None, group_cols=None, label=None, col_name=None, params=None):
            """Per-group chi-square statistic of the 2x2 table (in group?) x (label 1/0).

            Returns a frame with the group columns plus ``col_name``. Cells with
            a zero expected count produce inf/NaN.
            """
            keys = list(group_cols)
            total_count = df.shape[0]
            total_sum = df[label].sum()
            table = df.groupby(by=keys)[label].agg(['count', 'sum']).reset_index()
            # Observed counts: n11 in-group positives, n12 in-group negatives,
            # n21 / n22 the same for all rows outside the group.
            n11 = table['sum']
            n12 = table['count'] - n11
            n21 = total_sum - n11
            n22 = total_count - n11 - n12 - n21
            e11 = (n11 + n12) * (n11 + n21) / total_count
            e12 = (n11 + n12) * (n12 + n22) / total_count
            e21 = (n21 + n22) * (n11 + n21) / total_count
            e22 = (n21 + n22) * (n12 + n22) / total_count
            statistic = (n11 - e11) ** 2 / e11 + \
                        (n12 - e12) ** 2 / e12 + \
                        (n21 - e21) ** 2 / e21 + \
                        (n22 - e22) ** 2 / e22
            table[col_name] = statistic.astype(Feature.Utils._set_type(statistic, 'float'))
            result = table[keys + [col_name]]
            gc.collect()
            return result

        @staticmethod
        def _mean(df=None, group_cols=None, label=None, col_name=None, params=None):
            """Per-group mean of ``label``; returns the group columns plus ``col_name``."""
            keys = list(group_cols)
            table = df.groupby(by=keys)[label].mean().rename(col_name).reset_index()
            # Only the value column is down-cast; the keys keep their dtype so the
            # later merges still match.
            table[col_name] = table[col_name].astype(
                Feature.Utils._set_type(table[col_name], 'float'))
            gc.collect()
            return table

    class Kernels:
        """Element-wise transformations of existing columns."""

        @staticmethod
        def square(df=None, cols=None, col_name=None, params=None):
            """Square the selected column(s).

            Parameters
            ----------
            cols : label or list of labels
                A single column label or a list of them.
            col_name : str, optional
                When exactly one column is selected and ``col_name`` is given,
                the result column is renamed to it; otherwise the original
                column names are kept.

            Returns
            -------
            DataFrame
                Float frame carrying the index of ``df``. ``params`` is unused.
            """
            selected = list(cols) if pd.api.types.is_list_like(cols) else [cols]
            # Square in float64 so large integers cannot overflow, then down-cast.
            squared = df[selected].astype(np.float64) ** 2
            squared = squared.astype(Feature.Utils._set_type(squared, 'float'))
            if col_name is not None and len(selected) == 1:
                squared.columns = [col_name]
            gc.collect()
            return squared
        
# Registry mapping a feature name to the callable that computes it.
# Every callable takes (df, cols, col_name, params).
func_map = dict(
    # group statistics
    count=Feature.count,
    unique_count=Feature.unique_count,
    cumulative_count=Feature.cumulative_count,
    reverse_cumulative_count=Feature.reverse_cumulative_count,
    variance=Feature.variance,
    count_std_over_mean=Feature.count_std_over_mean,
    # time-based features
    time_to_n_next=Feature.time_to_n_next,
    count_in_previous_n_time_unit=Feature.count_in_previous_n_time_unit,
    count_in_next_n_time_unit=Feature.count_in_next_n_time_unit,
    # label encodings
    woe=Feature.Encoding.woe,
    chi_square=Feature.Encoding.chi_square,
    mean=Feature.Encoding.mean,
    # kernels
    square=Feature.Kernels.square,
)

In [17]:
# Manual smoke test for one feature function at a time.
# Feature names available in func_map (spelled as registered there):
# count
# unique_count
# cumulative_count
# reverse_cumulative_count
# variance
# count_std_over_mean
# time_to_n_next
# count_in_previous_n_time_unit
# count_in_next_n_time_unit
# woe
# chi_square
# mean

# NOTE(review): pandas and gc are already imported by the cell that defines
# Feature; these repeats only matter if this cell is run on its own.
import pandas as pd
import gc

gc.collect()

# Sample data is read from a.txt in the working directory; the call below uses
# its columns 'a' (group) and 't' (time).
PATH = './'
x = pd.read_csv(PATH + 'a.txt')
# print(x)


# time_to_n_next only reads params['n'] and params['fillna'] (both parsed with
# int(), hence the numeric strings); the remaining keys are ignored by it.
# NOTE(review): the Encoding functions read 'trainLen' / 'splitCol', not the
# 'train_len' / 'split_col' spellings used in this dict.
y_print = Feature.time_to_n_next(df=x, cols=['a','t'], col_name='ddd', params={'train_len':9,'split_col':'t','col':'l','coefficient':10,'n':'1','fillna':'222'})
print(y_print)


      t
0     1
1     1
2     1
3     5
4     1
5     1
6     1
7     5
8     1
9     1
10    5
11    6
12    1
13    1
14    2
15  222
16  222
17  222


In [1]:
# Scratch check: int() parses a signed numeric string.
# Presumably relevant to string-valued params such as 'n' / 'fillna', which the
# feature functions convert with int() — confirm before keeping this cell.
print(int('-2'))

-2


[1, 3, 5, 7, 9, [3, 4, 5, 6, 7]]