In [59]:
import pandas as pd
import numpy as np
from collections import defaultdict
import gc

class Feature:
    def _set_type(series, dtype):
        _max, _min = max(series), min(series)
        if dtype == 'uint':
            if _max <= 255: return np.uint8
            elif _max <= 65535: return np.uint16
            elif _max <= 4294967295: return np.uint32
            else: return np.uint64
        elif dtype == 'int':
            if _min >= -128 and _max <= 127: return np.int8
            elif _min >=-32768 and _max <= 32767: return np.int16
            elif _min >= -2147483648 and _max <= 2147483647: return np.int32
            else: return np.int64
        elif dtype == 'float':
            if max(abs(_min), _max) <= 3.4028235e+38: return np.float32
            else: return np.float64
    
    def save(df=None, flg='both', train_len=0, url='./', name='default'):
        if flg == 'train':
            df.reset_index(drop=True).to_feather(url + 'train__' + name + '.ftr')
        elif flg == 'test':
            df.reset_index(drop=True).to_feather(url + 'test__' + name + '.ftr')
        else:
            df[:train_len].reset_index(drop=True).to_feather(url + 'train__' + name + '.ftr')
            df[train_len:].reset_index(drop=True).to_feather(url + 'test__' + name + '.ftr')
    
    # params[0]: additional col to help count
    def count(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        dtype = {x: df[x].dtype for x in group_cols if x in df.columns.values}
        d_cols = list(group_cols)
        d_cols.append(calc_col)
        count_result = df[d_cols].groupby(by=group_cols)[[params[0]]].count().rename(index=str, columns={calc_col: col_name}).reset_index()
        dtype[col_name] = Feature._set_type(count_result[col_name], 'uint')
        _df = df.merge(count_result.astype(dtype), on=group_cols, how='left')
        result = _df[[col_name]].copy()
        del _df, count_result, d_cols, dtype
        gc.collect()
        return result
    
    def unique_count(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        dtype = {x: df[x].dtype for x in group_cols if x in df.columns.values}
        d_cols = list(group_cols)
        d_cols.append(calc_col)
        count_result = df[d_cols].groupby(by=group_cols)[[calc_col]].nunique().rename(index=str, columns={calc_col: col_name}).reset_index()
        dtype[col_name] = Feature._set_type(count_result[col_name], 'uint')
        _df = df.merge(count_result.astype(dtype), on=group_cols, how='left')
        result = _df[[col_name]].copy()
        del _df, count_result, d_cols, dtype
        gc.collect()
        return result
    
    def cum_count(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        result = df[group_cols].groupby(group_cols).cumcount().rename(col_name).reset_index(drop=True)
        r = result.astype(Feature._set_type(result, 'uint'))
        r = r.to_frame()
        del result
        gc.collect()
        return r
    
    def variance(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        d_cols = list(group_cols)
        d_cols.append(calc_col)
        group = df[d_cols].groupby(by=group_cols)[[calc_col]].var().reset_index().rename(index=str, columns={calc_col: col_name}).fillna(0)
        dtype = {x: df[x].dtype for x in group_cols if x in df.columns.values}
        dtype[col_name] = Feature._set_type(group[col_name], 'float')
        _df = df.merge(group.astype(dtype), on=group_cols, how='left')
        r = _df[[col_name]].copy()
        del d_cols, dtype, _df, group
        gc.collect()
        return r
    
    def woe(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        pass
    
    # params[0]: additional col to help count
    def count_std_over_mean(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        d_cols = list(group_cols)
        d_cols.append(calc_col)
        d_cols1 = list(d_cols)
        d_cols1.append(params[0])
        group = df[d_cols1].groupby(by=d_cols)[[params[0]]].count().reset_index().rename(index=str, columns={params[0]: 'count'})
        result = group.groupby(by=group_cols)[['count']].agg(['mean','std'])['count'].reset_index()
        result[col_name] = ((params['coeff'] * result['std']) / result['mean']).fillna(-1)
        dtype = {x: df[x].dtype for x in group_cols if x in df.columns.values}
        dtype[col_name] = Feature._set_type(result[col_name], 'float')
        _df = df.merge(result.astype(dtype), on=group_cols, how='left')
        r = _df[[col_name]].copy()
        del d_cols, d_cols1, group, result, _df
        gc.collect()
        return r
    
    # params[0]: n, params[1]: fillna
    def time_to_n_next_click(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        n = params[0]
        m = params[1]
        d_cols = list(group_cols)
        d_cols.append(calc_col)
        result = (df[d_cols].groupby(by=group_cols)[calc_col].shift(-n) - df[calc_col]).fillna(m)
        result = result.astype(Feature._set_type(result, 'uint')).to_frame()
        del n, m, d_cols
        gc.collect()
        return result
    
    # params[0]: n
    def click_count_in_previous_n_time_unit(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        n = params[0]
        encodings = df[group_cols[0]].copy()
        if len(group_cols) > 1:
            for c in group_cols[1 : ]:
                encodings = encodings * (10 ** (int(np.log(df[c].max() + 1) / np.log(10)) + 1)) + df[c]
        encodings = encodings.values
        times = df[calc_col].values
        dict_count = defaultdict(int)
        result = []
        bound = 0
        for cur in range(len(encodings)):
            while times[cur] - times[bound] > n:
                dict_count[encodings[bound]] -= 1
                bound += 1
            result.append(dict_count[encodings[cur]])
            dict_count[encodings[cur]] += 1
        r = pd.DataFrame(result, columns=[col_name], dtype=Feature._set_type(result, 'uint'))
        del encodings, times, dict_count, result, bound, n
        gc.collect()
        return r
    
    def click_count_in_next_n_time_unit(df=None, group_cols=None, calc_col=None, col_name=None, params=None):
        r = Feature.click_count_in_previous_n_time_unit(df.sort_index(ascending=False), group_cols, calc_col, col_name, params)
        r = r.reindex(index=r.index[::-1]).reset_index(drop=True)
        gc.collect()
        return r

In [60]:
PATH = '/home/kai/data/kaggle/talkingdata/data/'

x = pd.read_csv(PATH + 'train.csv', nrows=1000)

x['click_time'] = (pd.to_datetime(x['click_time']).astype(np.int64) // 10 ** 9).astype(np.int32)

x['second'] = pd.to_datetime(x['click_time']).dt.second.astype('int8')
print(x.columns.values)
y = Feature.click_count_in_next_n_time_unit(x, ['app'], 'click_time', 'a', [2])

print(y.dtypes,type(y))
print(y)


['ip' 'app' 'device' 'os' 'channel' 'click_time' 'attributed_time'
 'is_attributed' 'second']
a    uint16
dtype: object <class 'pandas.core.frame.DataFrame'>
       a
0    460
1    459
2    458
3     39
4    457
5    456
6    455
7    454
8    453
9     70
10   452
11   451
12   450
13   449
14   448
15   447
16   446
17   445
18   444
19   443
20   442
21   441
22    38
23   440
24   439
25   438
26   437
27   436
28   435
29   434
..   ...
970    2
971    0
972    3
973    4
974    0
975    4
976    1
977    3
978    3
979    2
980    3
981    0
982    0
983    1
984    1
985    0
986    2
987    1
988    2
989    0
990    0
991    2
992    0
993    0
994    0
995    1
996    0
997    0
998    1
999    0

[1000 rows x 1 columns]


In [58]:
y = x.reindex(index=x.index[::-1])
print(y.head(3))
print(x.tail(3))


        ip  app  device  os  channel  click_time attributed_time  \
999  27849    6       1  13      459  1509984001             NaN   
998  77065    6       1  10      459  1509984001             NaN   
997  47456    3       1  13      137  1509984001             NaN   

     is_attributed  second  
999              0       1  
998              0       1  
997              0       1  
        ip  app  device  os  channel  click_time attributed_time  \
997  47456    3       1  13      137  1509984001             NaN   
998  77065    6       1  10      459  1509984001             NaN   
999  27849    6       1  13      459  1509984001             NaN   

     is_attributed  second  
997              0       1  
998              0       1  
999              0       1  
