In [76]:
import pandas as pd
import numpy as np
from collections import defaultdict
import gc

class Feature:
    """Namespace of group-wise feature builders for a pandas DataFrame.

    Every public builder takes ``(df, group_cols, calc_col, col_name, params)``
    and returns a one-column DataFrame whose column is ``col_name``.  Builders
    that merge a per-group table back onto the rows return a fresh 0..n-1
    index; ``cummulative_count``, ``reverse_cummulative_count``,
    ``time_to_n_next`` and ``Kernels.square`` keep the index of ``df``.

    All functions are static: call them as ``Feature.count(df, ...)``.
    """

    class Utils:
        """Dtype selection and persistence helpers."""

        @staticmethod
        def _set_type(series, dtype):
            """Pick the narrowest numpy dtype of a given family that holds ``series``.

            Args:
                series: any array-like of numbers (list, ndarray, Series, DataFrame).
                    NaN entries are ignored; empty input yields the narrowest type.
                dtype: ``'uint'``, ``'int'`` or ``'float'``.

            Returns:
                A numpy scalar type such as ``np.uint8`` or ``np.float32``.

            Raises:
                ValueError: if ``dtype`` is not one of the three families.
            """
            values = np.asarray(series).ravel()
            if values.dtype.kind == 'f':
                values = values[~np.isnan(values)]
            if values.size:
                _min, _max = values.min(), values.max()
            else:
                _min = _max = 0

            # Negative data cannot live in an unsigned type without wrapping
            # around, so fall back to the signed family for it.
            if dtype == 'uint' and _min < 0:
                dtype = 'int'

            if dtype == 'uint':
                for candidate in (np.uint8, np.uint16, np.uint32):
                    if _max <= np.iinfo(candidate).max:
                        return candidate
                return np.uint64
            if dtype == 'int':
                for candidate in (np.int8, np.int16, np.int32):
                    limits = np.iinfo(candidate)
                    if limits.min <= _min and _max <= limits.max:
                        return candidate
                return np.int64
            if dtype == 'float':
                largest = max(abs(float(_min)), abs(float(_max)))
                if largest <= float(np.finfo(np.float32).max):
                    return np.float32
                return np.float64
            raise ValueError("dtype must be 'uint', 'int' or 'float', got %r" % (dtype,))

        @staticmethod
        def save(df=None, flg='both', train_len=0, url='./', name='default'):
            """Write ``df`` to feather file(s) named ``train__<name>.ftr`` / ``test__<name>.ftr``.

            Args:
                df: frame to persist.
                flg: ``'train'`` or ``'test'`` writes the whole frame to that one
                    file; any other value splits the frame at ``train_len`` and
                    writes both files.
                train_len: number of leading rows that belong to the train part.
                url: path prefix, concatenated as-is in front of the file name.
                name: feature name used inside the file name.
            """
            train_path = url + 'train__' + name + '.ftr'
            test_path = url + 'test__' + name + '.ftr'
            if flg == 'train':
                df.reset_index(drop=True).to_feather(train_path)
            elif flg == 'test':
                df.reset_index(drop=True).to_feather(test_path)
            else:
                df.iloc[:train_len].reset_index(drop=True).to_feather(train_path)
                df.iloc[train_len:].reset_index(drop=True).to_feather(test_path)

    @staticmethod
    def _attach(df, group_cols, table, col_name):
        """Left-merge a per-group ``table`` onto the rows of ``df``.

        Only the key columns of ``df`` take part in the merge, so an existing
        column called ``col_name`` in ``df`` cannot clash with the new one.
        Returns a one-column frame in the row order of ``df`` with a 0..n-1 index;
        rows whose key is absent from ``table`` get NaN.
        """
        keys = list(group_cols)
        merged = df[keys].merge(table[keys + [col_name]], on=keys, how='left')
        result = merged[[col_name]].copy()
        del merged
        gc.collect()
        return result

    @staticmethod
    def _grouped_feature(df, group_cols, calc_col, col_name, agg, kind, fill=None):
        """Aggregate ``calc_col`` per group with ``agg`` and broadcast it to every row.

        ``kind`` is the dtype family handed to ``Utils._set_type``; ``fill``
        (when given) replaces NaN in the per-group table before the merge.
        """
        keys = list(group_cols)
        table = (df[keys + [calc_col]]
                 .groupby(by=keys)[[calc_col]]
                 .agg(agg)
                 .rename(columns={calc_col: col_name})
                 .reset_index())
        if fill is not None:
            table[col_name] = table[col_name].fillna(fill)
        table[col_name] = table[col_name].astype(
            Feature.Utils._set_type(table[col_name], kind))
        return Feature._attach(df, keys, table, col_name)

    # calc_col: additional col to help count
    @staticmethod
    def count(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        """Number of non-null ``calc_col`` values in each ``group_cols`` group, per row."""
        return Feature._grouped_feature(df, group_cols, calc_col, col_name, 'count', 'uint')

    @staticmethod
    def unique_count(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        """Number of distinct ``calc_col`` values in each ``group_cols`` group, per row."""
        return Feature._grouped_feature(df, group_cols, calc_col, col_name, 'nunique', 'uint')

    @staticmethod
    def cummulative_count(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        """How many earlier rows (by position) share this row's group; 0 for the first."""
        keys = list(group_cols)
        counts = df[keys].groupby(keys).cumcount().rename(col_name)
        return counts.astype(Feature.Utils._set_type(counts, 'uint')).to_frame()

    @staticmethod
    def reverse_cummulative_count(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        """How many later rows (by position) share this row's group; 0 for the last.

        Uses ``cumcount(ascending=False)`` so ``df`` is neither re-sorted nor
        mutated and the result keeps the index of ``df``.
        """
        keys = list(group_cols)
        counts = df[keys].groupby(keys).cumcount(ascending=False).rename(col_name)
        return counts.astype(Feature.Utils._set_type(counts, 'uint')).to_frame()

    @staticmethod
    def variance(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        """Sample variance of ``calc_col`` per group; single-row groups get 0."""
        return Feature._grouped_feature(df, group_cols, calc_col, col_name, 'var', 'float', fill=0)

    # params['col']: additional col to help count
    # params['coefficient']: multiplier applied to the std before dividing by the mean
    @staticmethod
    def count_std_over_mean(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        """``coefficient * std / mean`` of the per-``calc_col`` counts inside each group.

        Rows are first counted per ``group_cols + [calc_col]`` (counting non-null
        ``params['col']``); the mean and std of those counts are then taken per
        ``group_cols``.  Groups where the ratio is undefined get -1.
        """
        keys = list(group_cols)
        inner_keys = keys + [calc_col]
        helper_col = params['col']
        counts = (df[inner_keys + [helper_col]]
                  .groupby(by=inner_keys)[helper_col]
                  .count()
                  .rename('count')
                  .reset_index())
        stats = counts.groupby(by=keys)['count'].agg(['mean', 'std']).reset_index()
        ratio = ((params['coefficient'] * stats['std']) / stats['mean']).fillna(-1)
        stats[col_name] = ratio.astype(Feature.Utils._set_type(ratio, 'float'))
        return Feature._attach(df, keys, stats, col_name)

    # params['n']: n, params['fillna']: fillna
    @staticmethod
    def time_to_n_next(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        """Difference between ``calc_col`` of the n-th next row of the same group and this row.

        Rows without an n-th successor receive ``params['fillna']``.  The column
        is named ``col_name`` (``calc_col`` when ``col_name`` is None).
        """
        n = params['n']
        fill = params['fillna']
        keys = list(group_cols)
        out_name = col_name if col_name is not None else calc_col
        shifted = df[keys + [calc_col]].groupby(by=keys)[calc_col].shift(-n)
        delta = (shifted - df[calc_col]).fillna(fill).rename(out_name)
        # 'uint' is only a request: _set_type switches to a signed type when the
        # fill value (e.g. -1) makes the data negative.
        return delta.astype(Feature.Utils._set_type(delta, 'uint')).to_frame()

    # params['n']: n
    @staticmethod
    def count_in_previous_n_time_unit(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        """For each row, count earlier rows of the same group within ``n`` units of ``calc_col``.

        "Earlier" means earlier by position.  ``calc_col`` must be monotonic
        along the rows (ascending or descending); the window is measured with an
        absolute difference so both directions work.  Returns a 0..n-1 indexed
        frame.
        """
        n = params['n']
        keys = list(group_cols)
        # One integer id per distinct key combination; avoids the overflow and
        # collisions that a hand-built decimal encoding of the keys can produce.
        group_ids = df.groupby(keys, sort=False).ngroup().values
        times = df[calc_col].values
        if times.dtype.kind == 'u':
            # Unsigned subtraction would wrap around instead of going negative.
            times = times.astype(np.int64)

        in_window = defaultdict(int)
        counts = []
        start = 0
        for cur in range(len(group_ids)):
            # Evict rows that fell out of the window ending at the current row.
            while start < cur and abs(times[cur] - times[start]) > n:
                in_window[group_ids[start]] -= 1
                start += 1
            counts.append(in_window[group_ids[cur]])
            in_window[group_ids[cur]] += 1

        out_type = Feature.Utils._set_type(counts, 'uint')
        result = pd.DataFrame({col_name: np.asarray(counts, dtype=out_type)})
        del group_ids, times, in_window, counts
        gc.collect()
        return result

    @staticmethod
    def count_in_next_n_time_unit(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        """For each row, count later rows of the same group within ``n`` units of ``calc_col``.

        Implemented by running the "previous" counter over the rows in reverse
        order and flipping the answer back.
        """
        reversed_counts = Feature.count_in_previous_n_time_unit(
            df.iloc[::-1], group_cols, calc_col, col_name, params)
        return reversed_counts.iloc[::-1].reset_index(drop=True)

    class Encoding:
        """Label-based encodings of ``group_cols``; ``calc_col`` is the label column.

        The public entry points expect ``params`` to be a dict with
        ``'train_len'`` (number of leading train rows in ``df``) and
        ``'split_col'`` (column whose values partition the train rows).
        """

        @staticmethod
        def woe(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
            """Weight-of-evidence encoding for train+test rows (see ``_wrapper``)."""
            return Feature.Encoding._wrapper(
                df, group_cols, calc_col, col_name,
                dict(params, function=Feature.Encoding._woe))

        @staticmethod
        def chi_square(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
            """Chi-square statistic encoding for train+test rows (see ``_wrapper``)."""
            return Feature.Encoding._wrapper(
                df, group_cols, calc_col, col_name,
                dict(params, function=Feature.Encoding._chi_square))

        @staticmethod
        def mean(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
            """Mean-label encoding for train+test rows (see ``_wrapper``)."""
            return Feature.Encoding._wrapper(
                df, group_cols, calc_col, col_name,
                dict(params, function=Feature.Encoding._mean))

        @staticmethod
        def _wrapper(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
            """Encode the first ``train_len`` rows split by split, the rest from all train rows.

            ``params`` needs ``'train_len'``, ``'split_col'`` and ``'function'``
            (one of the ``_woe`` / ``_chi_square`` / ``_mean`` table builders).
            Returns one column with a 0..n-1 index in the row order of ``df``.
            """
            train_len = params['train_len']
            train = df.iloc[:train_len]
            test = df.iloc[train_len:]
            train_part = Feature.Encoding._train_wrapper(
                train, group_cols, calc_col, col_name,
                params['function'], params['split_col'])
            test_part = Feature.Encoding._testset_wrapper(
                train, test, group_cols, calc_col, col_name, params['function'])
            return pd.concat([train_part, test_part], ignore_index=True)

        @staticmethod
        def _train_wrapper(df, group_cols, label, col_name, func, split_col):
            """Encode each ``split_col`` partition of ``df`` with a table built from that partition.

            Values are written back at the original row positions, so ``df`` does
            not have to be sorted by ``split_col``.  Rows with no encoding get -1.
            """
            # NOTE(review): each table is fitted on the very rows it is applied to,
            # which leaks the label into the feature.  If out-of-fold encoding was
            # intended, fit on the rows where split_col != value instead - confirm.
            keys = list(group_cols)
            encoded = np.full(len(df), np.nan, dtype=np.float64)
            split_values = df[split_col].values
            for value in pd.unique(df[split_col].dropna()):
                mask = split_values == value
                part = df[mask]
                dictionary = func(df=part, group_cols=keys, calc_col=label, col_name=col_name)
                merged = part[keys].merge(dictionary, on=keys, how='left')
                encoded[mask] = merged[col_name].values
            result = pd.DataFrame({col_name: encoded}).fillna(-1)
            result[col_name] = result[col_name].astype(
                Feature.Utils._set_type(result[col_name], 'float'))
            return result

        @staticmethod
        def _testset_wrapper(train, test, group_cols, label, col_name, func):
            """Encode ``test`` rows with a table built from all of ``train``; unseen keys get -1."""
            keys = list(group_cols)
            dictionary = func(df=train, group_cols=keys, calc_col=label, col_name=col_name)
            merged = test[keys].merge(dictionary, on=keys, how='left')
            return merged[[col_name]].fillna(-1)

        # Alias: ``_wrapper`` historically referred to this helper by the shorter name.
        _test_wrapper = _testset_wrapper

        # label = calc_col
        @staticmethod
        def _woe(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
            """Per-group weight of evidence: ``ln((pos_i / pos) / (neg_i / neg))``.

            Group counts are smoothed by 0.5.  Returns the key columns plus
            ``col_name``, one row per group.
            """
            keys = list(group_cols)
            group = (df[keys + [calc_col]]
                     .groupby(by=keys)[calc_col]
                     .agg(['count', 'sum'])
                     .reset_index())
            positive = df[calc_col].sum()
            negative = df[calc_col].count() - positive
            positive_share = (group['sum'] + 0.5) / positive
            negative_share = (group['count'] - group['sum'] + 0.5) / negative
            # The log must wrap the whole ratio, not only the numerator.
            woe = np.log(positive_share / negative_share)
            group[col_name] = woe.astype(Feature.Utils._set_type(woe, 'float'))
            return group[keys + [col_name]].copy()

        # label = calc_col
        @staticmethod
        def _chi_square(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
            """Per-group chi-square statistic of the 2x2 table (in group / not) x (label 1 / 0).

            Returns the key columns plus ``col_name``, one row per group.  A zero
            expected count yields NaN or inf for that group.
            """
            keys = list(group_cols)
            total_count = df[calc_col].count()
            total_sum = df[calc_col].sum()
            group = (df[keys + [calc_col]]
                     .groupby(by=keys)[calc_col]
                     .agg(['count', 'sum'])
                     .reset_index())
            # Observed cells: n11 = in group & positive, n12 = in group & negative,
            # n21 = outside & positive, n22 = outside & negative.
            n11 = group['sum']
            n12 = group['count'] - n11
            n21 = total_sum - n11
            n22 = total_count - n11 - n12 - n21
            # Expected cells under independence: row total * column total / N.
            e11 = (n11 + n12) * (n11 + n21) / total_count
            e12 = (n11 + n12) * (n12 + n22) / total_count
            e21 = (n21 + n22) * (n11 + n21) / total_count
            e22 = (n21 + n22) * (n12 + n22) / total_count
            chi = ((n11 - e11) ** 2 / e11 +
                   (n12 - e12) ** 2 / e12 +
                   (n21 - e21) ** 2 / e21 +
                   (n22 - e22) ** 2 / e22)
            group[col_name] = chi.astype(Feature.Utils._set_type(chi, 'float'))
            return group[keys + [col_name]].copy()

        @staticmethod
        def _mean(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
            """Per-group mean of the label.  Returns the key columns plus ``col_name``."""
            keys = list(group_cols)
            table = (df[keys + [calc_col]]
                     .groupby(by=keys)[calc_col]
                     .mean()
                     .rename(col_name)
                     .reset_index())
            table[col_name] = table[col_name].astype(
                Feature.Utils._set_type(table[col_name], 'float'))
            return table

    class Kernels:
        """Element-wise transforms of a single column."""

        @staticmethod
        def square(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
            """Square of ``calc_col`` as a float column named ``col_name``.

            The column keeps the name ``calc_col`` when ``col_name`` is None.
            Values are promoted to float before squaring so narrow integer
            columns cannot overflow.
            """
            out_name = col_name if col_name is not None else calc_col
            squared = (df[calc_col].astype(np.float64) ** 2).rename(out_name)
            return squared.astype(Feature.Utils._set_type(squared, 'float')).to_frame()

In [82]:
# Smoke run: load a small sample of the training log, derive time columns and
# print the distinct chi-square encodings of (app, channel) against the label.
PATH = '/home/kai/data/kaggle/talkingdata/data/'

x = pd.read_csv(PATH + 'train.csv', nrows=6000)

# Parse the timestamps once.  Deriving 'second' from the already-converted
# epoch-seconds column would make pandas read those integers as nanoseconds,
# which yields the same second for every row.
click_dt = pd.to_datetime(x['click_time'])
x['click_time'] = (click_dt.astype(np.int64) // 10 ** 9).astype(np.int32)

x['second'] = click_dt.dt.second.astype('int8')
print(x.columns.values)
y = Feature.Encoding._chi_square(x, ['app', 'channel'], 'is_attributed', 'a', [2])

# print(y.dtypes,type(y))
print(sorted(list(set(y['a']))))


['ip' 'app' 'device' 'os' 'channel' 'click_time' 'attributed_time'
 'is_attributed' 'second']
[0.0016697274, 0.0033400115, 0.0050108526, 0.0066822511, 0.0083542075, 0.010026721, 0.011699793, 0.013373424, 0.015047614, 0.016722361, 0.018397668, 0.020073537, 0.021749962, 0.02342695, 0.025104497, 0.026782606, 0.028461274, 0.030140504, 0.031820297, 0.033500649, 0.035181567, 0.036863044, 0.038545083, 0.040227689, 0.041910857, 0.043594588, 0.045278881, 0.04696374, 0.048649162, 0.050335146, 0.055396501, 0.05708475, 0.058773562, 0.063843407, 0.065534487, 0.067226134, 0.070611142, 0.072304495, 0.079083592, 0.084173903, 0.089269347, 0.092669167, 0.097773187, 0.10288236, 0.11311622, 0.11824092, 0.11995029, 0.1233708, 0.12508191, 0.12850587, 0.13021871, 0.13536073, 0.14737907, 0.16804826, 0.16977449, 0.2148627, 0.21834756, 0.22009088, 0.2288164, 0.27266812, 0.30447447, 0.30624726, 0.3115693, 0.32756832, 0.34183136, 0.58672237, 0.74622941, 2.6732893, 41.044495, 298.60037, 599.09985, 797.73273, 1797.

In [58]:
# Scratch check: reindexing against the reversed index flips the row order,
# so the head of y mirrors the tail of x.
y = x.reindex(x.index[::-1])
print(y.head(3), x.tail(3), sep='\n')


        ip  app  device  os  channel  click_time attributed_time  \
999  27849    6       1  13      459  1509984001             NaN   
998  77065    6       1  10      459  1509984001             NaN   
997  47456    3       1  13      137  1509984001             NaN   

     is_attributed  second  
999              0       1  
998              0       1  
997              0       1  
        ip  app  device  os  channel  click_time attributed_time  \
997  47456    3       1  13      137  1509984001             NaN   
998  77065    6       1  10      459  1509984001             NaN   
999  27849    6       1  13      459  1509984001             NaN   

     is_attributed  second  
997              0       1  
998              0       1  
999              0       1  


In [75]:
import pandas as pd

# Scratch frame built column by column; y is the same data without column 'a'.
x = pd.DataFrame({'a': [1, 3], 'b': [2, 4], 'c': [3, 5]})
y = x.drop(columns='a')


print(x)




# group = x.groupby(by=['a','b'])[['c']].agg(['count'])['c'].reset_index().rename(index=str, columns={'count': 'n11'})


   a  b  c
0  1  2  3
1  3  4  5
