In [1]:
def write_log(s, rewrite=False):
    """Write one line of text to ``log.txt`` in the current working directory.

    Args:
        s: Message string; a newline is appended before it is written.
        rewrite: If true, the file is opened in 'w' mode (existing content is
            discarded); otherwise it is opened in 'a' mode and the line is appended.
    """
    with open('log.txt', 'w' if rewrite else 'a') as log_file:
        log_file.write(s + '\n')

# read data

In [2]:
import pandas as pd

# Directory that holds the raw CSV files (absolute, machine-specific path).
PATH = '/home/kai/data/kaggle/talkingdata/data/'
# Row limit handed to pd.read_csv; None reads every row.
# For a quick trial run use a small value instead, e.g. nrows = 10
nrows = None
# Explicit compact dtypes for the integer columns read from the CSV files.
dtypes = {
    'ip':            'uint32',
    'app':           'uint16',
    'device':        'uint16',
    'os':            'uint16',
    'channel':       'uint16',
    'is_attributed': 'uint8',
    'click_id':      'uint32'
}
# reset_index() turns each file's row number into an 'index' column
# (a later cell sorts on that column).
train = pd.read_csv(PATH + 'train.csv', nrows=nrows, dtype=dtypes,
                    usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']).reset_index()
test = pd.read_csv(PATH + 'test_supplement.csv', nrows=nrows, dtype=dtypes,
                    usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id']).reset_index()
# Add the columns each frame was not read with, using constant placeholder
# values, and tag every row with the frame it came from.
train['click_id'] = 0
train['is_test'] = 0
test['is_test'] = 1
test['is_attributed'] = 2
print(train.head(3))
# rewrite=True: start a fresh log.txt for this run.
write_log('data reading', True)

   index     ip  app  device  os  channel           click_time  is_attributed  \
0      0  83230    3       1  13      379  2017-11-06 14:32:21              0   
1      1  17357    3       1  19      379  2017-11-06 14:33:34              0   
2      2  35810    3       1  13      379  2017-11-06 14:34:12              0   

   click_id  is_test  
0         0        0  
1         0        0  
2         0        0  


# data cleaning

## machine = 1000 * device + os

In [3]:
import numpy as np
import pytz

# set time zone to be Shanghai time and split click_time into day, hour and minute
def data_clean(df):
    """Add time-derived and device-derived feature columns to ``df`` in place.

    ``click_time`` is parsed, treated as UTC and converted to Asia/Shanghai
    time; all time features below are taken from the converted value.

    Columns written: click_time (tz-aware), day, hour, minute, minute10,
    hourminute, hourminute10, dayhourminute, dayhourminute10, machine,
    click_timestamp.

    Args:
        df: DataFrame with at least 'click_time' (tz-naive), 'device' and
            'os' columns.

    Returns:
        None. ``df`` is modified in place.
    """
    # pandas accepts time zone names directly, so no pytz object is needed here.
    df['click_time'] = pd.to_datetime(df['click_time']).dt.tz_localize('UTC').dt.tz_convert('Asia/Shanghai')
    df['day'] = df['click_time'].dt.day.astype('uint8')
    df['hour'] = df['click_time'].dt.hour.astype('uint8')
    df['minute'] = df['click_time'].dt.minute.astype('uint8')
    # Minute rounded down to the start of its 10-minute bucket (0, 10, ..., 50).
    df['minute10'] = df['minute'] // 10 * 10
    # Minutes since midnight, and minutes since the start of day 0 of the month.
    hour_in_minutes = df['hour'].astype('uint16') * 60
    df['hourminute'] = df['minute'].astype('uint16') + hour_in_minutes
    df['hourminute10'] = df['minute10'].astype('uint16') + hour_in_minutes
    day_in_minutes = df['day'].astype('uint32') * 60 * 24
    df['dayhourminute'] = df['hourminute'].astype('uint32') + day_in_minutes
    df['dayhourminute10'] = df['hourminute10'].astype('uint32') + day_in_minutes
    # Widen before multiplying: with uint16 inputs, 1000 * device is computed
    # in uint16 and silently wraps around for device > 65.
    df['machine'] = df['device'].astype('uint32') * 1000 + df['os'].astype('uint32')
    # Whole seconds since the Unix epoch, computed from a time difference so
    # the result does not depend on the datetime storage resolution.
    epoch = pd.Timestamp('1970-01-01', tz='UTC')
    df['click_timestamp'] = ((df['click_time'] - epoch) // pd.Timedelta(seconds=1)).astype(np.int32)

    
    
# Add the derived columns to both frames (data_clean works in place).
data_clean(train)
data_clean(test)
# Stack train on top of test; ignore_index gives the combined frame a fresh 0..n-1 index.
df = pd.concat([train, test], ignore_index=True) # concat train and test

# Column name -> dtype of the combined frame; the feature functions below use
# it to cast group-key columns before merging.
data_type = df.dtypes.to_dict()

label = 'is_attributed'
# Number of train rows: rows [0, train_len) of df are train, the rest are test.
train_len = train.shape[0]
# Output directory for every feather file written by this notebook
# (absolute, machine-specific path).
fdir = '/home/kai/data/kaggle/talkingdata/haoyandata1/'
print(df.head(3))
train.to_feather(fdir + 'train_cleaned.ftr')
test.to_feather(fdir + 'test_cleaned.ftr')
write_log('data cleaning')

   app  channel  click_id                click_time  click_timestamp  day  \
0    3      379         0 2017-11-06 22:32:21+08:00       1509978741    6   
1    3      379         0 2017-11-06 22:33:34+08:00       1509978814    6   
2    3      379         0 2017-11-06 22:34:12+08:00       1509978852    6   

   dayhourminute  dayhourminute10  device  hour  hourminute  hourminute10  \
0           9992             9990       1    22        1352          1350   
1           9993             9990       1    22        1353          1350   
2           9994             9990       1    22        1354          1350   

   index     ip  is_attributed  is_test  machine  minute  minute10  os  
0      0  83230              0        0     1013      32        30  13  
1      1  17357              0        0     1019      33        30  19  
2      2  35810              0        0     1013      34        30  13  


In [4]:
def save(df, col_name, train_len):
    """Split a feature frame by position and write both parts to feather files.

    The first ``train_len`` rows go to ``fdir + 'train__' + col_name + '.ftr'``
    and the remaining rows to ``fdir + 'test_supplement__' + col_name + '.ftr'``.
    Row order is taken as-is; only the index labels are replaced.

    Args:
        df: Feature DataFrame whose rows are ordered train first, then test.
        col_name: Feature name used to build the two file names.
        train_len: Number of leading rows that belong to the train part.
    """
    # reset_index returns a new frame; the previous code discarded the result,
    # so the index was never actually reset. Rebinding the local name leaves
    # the caller's frame untouched.
    df = df.reset_index(drop=True)
    df.iloc[:train_len].to_feather(fdir + 'train__' + col_name + '.ftr')
    df.iloc[train_len:].reset_index(drop=True).to_feather(fdir + 'test_supplement__' + col_name + '.ftr')

# count

## count the click number for each feature combination

In [4]:
import gc

# Here df is [train test_supp]
def count(df, cols, label, train_len):
    """Save, for every row, the number of rows sharing its ``cols`` combination.

    The count is the number of non-null ``label`` values in each group. The
    feature is written through ``save`` under the name
    ``'count_' + '_'.join(cols)``.

    Args:
        df: Combined frame (train rows first, then test_supplement rows).
        cols: List of column names that define the groups.
        label: Column whose non-null values are counted.
        train_len: Number of leading train rows, forwarded to ``save``.
    """
    col_name = 'count_' + '_'.join(cols)
    key_cols = list(cols)
    # No rename(index=str) here: it converted every group key to a string
    # only for astype() below to convert it back.
    count_result = (
        df[key_cols + [label]]
        .groupby(by=cols)[[label]]
        .count()
        .rename(columns={label: col_name})
        .reset_index()
    )
    # Cast the key columns to the dtypes recorded for df so the merge keys match.
    type_map = {c: data_type[c] for c in count_result.columns.values if c in data_type}
    # Only the key columns are needed on the left side of the merge.
    _df = df[key_cols].merge(count_result.astype(type_map), on=cols, how='left')
    save(_df[[col_name]], col_name, train_len)
    del _df, count_result
    gc.collect()

# Column combinations to count; each entry yields one 'count_<cols>' feature.
# The trailing '#' / '##' markers are kept as found; their meaning is not
# documented in this notebook.
patterns = [
    ['app','channel'],
    ['app','device','channel','day','hour'],##
    ['app','device','day','hour'],##
    ['app','os','channel','day','hour'],##
    ['ip','day'],
    ['ip'],#
    ['ip','app','device','channel','day'],##
    ['ip','app','device','day'],##
    ['ip','app','device','os','day','hour'],##
    ['ip','app','os','channel'],##
    ['ip','app','os','channel','day'],##
    ['ip','os'],
    ['app','day','hourminute'],
    ['device','os','day','hourminute10'],##
    ['ip','device','os','day','hourminute10']##
]


# Log the section name, then one line per finished pattern as a progress trace.
write_log('count')
for p in patterns:
    count(df, p, label, train_len)
    write_log(str(p))

# unique count

## group data by certain feature combination and count the number of different values of another feature

In [11]:
import gc

def unique_count(df, cols, train_len):
    """Save, for every row, the number of distinct ``cols[-1]`` values in its group.

    Groups are defined by ``cols[:-1]``. The feature is written through
    ``save`` under the name ``'nunique_' + '_'.join(cols)``.

    Args:
        df: Combined frame (train rows first, then test_supplement rows).
        cols: List of column names; all but the last are group keys, the last
            is the column whose distinct values are counted.
        train_len: Number of leading train rows, forwarded to ``save``.
    """
    col_name = 'nunique_' + '_'.join(cols)
    key_cols, target = cols[:-1], cols[-1]
    # No rename(index=str) here: it converted every group key to a string
    # only for astype() below to convert it back.
    count_result = (
        df[cols]
        .groupby(by=key_cols)[[target]]
        .nunique()
        .rename(columns={target: col_name})
        .reset_index()
    )
    # Cast the key columns to the dtypes recorded for df so the merge keys match.
    type_map = {c: data_type[c] for c in count_result.columns.values if c in data_type}
    # Only the key columns are needed on the left side of the merge.
    _df = df[key_cols].merge(count_result.astype(type_map), on=key_cols, how='left')
    # Preview of the computed feature (printed once per pattern).
    print(_df[[col_name]])
    save(_df[[col_name]], col_name, train_len)
    del _df, count_result
    gc.collect()
    
# Each pattern is [group keys..., counted column]: the last name is the column
# whose distinct values are counted within the groups formed by the others.
patterns = [
    ['day','ip','machine'],
    ['day','ip','os'],
    ['day','ip','device'],
    ['day','ip','app'],
    ['day','ip','channel'],
    ['machine','app'],
    ['machine','ip'],
    ['machine','channel'],
]

# Log the section name, then one line per finished pattern as a progress trace.
write_log('unique count')
for p in patterns:
    unique_count(df, p, train_len)
    write_log(str(p))

    nunique_ip_day
0                2
1                2
2                2
3                2
4                2
5                2
6                1
7                1
8                1
9                5
10               5
11               5
12               5
13               5
14               5
15               5
16               5
17               5
18               5
19               5
20               5


# cumulative count

## give each click its running order number within its feature-combination group, with rows sorted by [click_time, index, is_test]

In [12]:
import gc

def cum_count(df, cols, train_len):
    """Save each row's 0-based running position within its ``cols`` group.

    Positions follow the row order ``df`` has when this function is called,
    so the caller decides the ordering (e.g. by sorting on time first). The
    feature is written through ``save`` as ``'cumcount_' + '_'.join(cols)``.

    Args:
        df: Combined frame whose index labels are the original row positions
            (train rows first, then test_supplement rows), in any row order.
        cols: List of column names that define the groups.
        train_len: Number of leading train rows, forwarded to ``save``.
    """
    col_name = 'cumcount_' + '_'.join(cols)
    running = df[cols].groupby(cols).cumcount().rename(col_name).to_frame()
    # Put the rows back in index order before saving. save() splits train and
    # test by position, so dropping the index of a re-sorted frame (as was done
    # before) would attach values to the wrong rows whenever the sort actually
    # moved anything. time_to_n_next_click restores order the same way.
    result = running.sort_index()
    save(result, col_name, train_len)
    del result, running
    gc.collect()
    
# Column combinations for which a running count is produced.
patterns = [
    ['ip','app','device','os','day','hour'],
    ['ip','day'],
    ['app','device','os','day']
]

write_log('cummulative count')
# cum_count numbers rows in their current order, so order the frame by click
# time first (ties broken by 'index', then 'is_test').
df.sort_values(['click_time','index','is_test'], inplace=True)
for p in patterns:
    cum_count(df, p, train_len)
    write_log(str(p))
# Put df back into its original row order for the following cells.
df.sort_index(inplace=True)

# count ratio

## cols1 count / cols2 count

In [13]:
import gc

def _count(df, cols, label):
    """Return a one-column frame with each row's ``cols`` group size.

    The size is the number of non-null ``label`` values in the group. The
    returned frame has one row per row of ``df`` (same order, fresh 0..n-1
    index) and a single column named ``'count_ratio_' + '_'.join(cols)``.

    Args:
        df: Input frame.
        cols: List of column names that define the groups.
        label: Column whose non-null values are counted.
    """
    col_name = 'count_ratio_' + '_'.join(cols)
    key_cols = list(cols)
    # No rename(index=str) here: it converted every group key to a string
    # only for astype() below to convert it back.
    count_result = (
        df[key_cols + [label]]
        .groupby(by=cols)[[label]]
        .count()
        .rename(columns={label: col_name})
        .reset_index()
    )
    # Cast the key columns to the dtypes recorded for df so the merge keys match.
    type_map = {c: data_type[c] for c in count_result.columns.values if c in data_type}
    # Only the key columns are needed on the left side of the merge.
    _df = df[key_cols].merge(count_result.astype(type_map), on=cols, how='left')
    result = _df[[col_name]].copy()
    del _df, count_result
    gc.collect()
    return result

def count_ratio(df, cols1, cols2, label, train_len):
    """Save the ratio (group size by ``cols1``) / (group size by ``cols2``) per row.

    Both group sizes come from ``_count``. The feature is written through
    ``save`` as ``'count_ratio_' + '_'.join(cols1) + '_' + '_'.join(cols2)``.

    Args:
        df: Combined frame (train rows first, then test_supplement rows).
        cols1: Group-key columns for the numerator count.
        cols2: Group-key columns for the denominator count.
        label: Column whose non-null values are counted.
        train_len: Number of leading train rows, forwarded to ``save``.
    """
    col_name = 'count_ratio_' + '_'.join(cols1) + '_' + '_'.join(cols2)
    numerator = _count(df, cols1, label)
    denominator = _count(df, cols2, label)
    # Each helper frame holds a single column; divide them row by row.
    numerator[col_name] = numerator.iloc[:, 0] / denominator.iloc[:, 0] # or = round(x1 / x2, 4)
    save(numerator[[col_name]], col_name, train_len)
    del numerator, denominator
    gc.collect()
    
# Each entry yields one feature: count grouped by 'cols1' divided by count
# grouped by 'cols2'.
patterns = [
    {'cols1':['ip'], 'cols2':['machine']},
    {'cols1':['ip'], 'cols2':['channel']},
    {'cols1':['machine'], 'cols2':['ip']},
    {'cols1':['app'], 'cols2':['channel']},
    {'cols1':['channel'], 'cols2':['app']}
]

# Log the section name, then one line per finished pattern as a progress trace.
write_log('count ratio')
for p in patterns:
    count_ratio(df, p['cols1'], p['cols2'], label, train_len)
    write_log(str(p))

# cumulative count ratio

## cols cumcount / (cols count-1)

In [19]:
import gc

def _count(df, cols, label):
    """Return a one-column frame with each row's ``cols`` group size.

    The size is the number of non-null ``label`` values in the group. The
    returned frame has one row per row of ``df`` (same order, fresh 0..n-1
    index) and a single column named ``'count_ratio_' + '_'.join(cols)``.

    NOTE(review): a function with this name is also defined in the
    "count ratio" cell; whichever cell ran last is the one in effect.

    Args:
        df: Input frame.
        cols: List of column names that define the groups.
        label: Column whose non-null values are counted.
    """
    col_name = 'count_ratio_' + '_'.join(cols)
    key_cols = list(cols)
    # No rename(index=str) here: it converted every group key to a string
    # only for astype() below to convert it back.
    count_result = (
        df[key_cols + [label]]
        .groupby(by=cols)[[label]]
        .count()
        .rename(columns={label: col_name})
        .reset_index()
    )
    # Cast the key columns to the dtypes recorded for df so the merge keys match.
    type_map = {c: data_type[c] for c in count_result.columns.values if c in data_type}
    # Only the key columns are needed on the left side of the merge.
    _df = df[key_cols].merge(count_result.astype(type_map), on=cols, how='left')
    result = _df[[col_name]].copy()
    del _df, count_result
    gc.collect()
    return result

def _cum_count(df, cols):
    col_name = 'cumcount_ratio_' + '_'.join(cols)
    result = df[cols].groupby(cols).cumcount().rename(col_name).to_frame()
    return result.reset_index()[[col_name]]
    
def cum_count_ratio(df, cols, label, train_len):
    """Save running position / (group size - 1) for each row's ``cols`` group.

    The ratio is rounded to 4 decimals. Where the division gives NaN (a group
    with a single row yields 0 / 0) the value 1.1 is stored instead. The
    feature is written through ``save`` as ``'cumcount_ratio_' + '_'.join(cols)``.

    Args:
        df: Combined frame (train rows first, then test_supplement rows).
        cols: List of column names that define the groups.
        label: Column whose non-null values are counted for the group size.
        train_len: Number of leading train rows, forwarded to ``save``.
    """
    col_name = 'cumcount_ratio_' + '_'.join(cols)
    position = _cum_count(df, cols)
    group_size = _count(df, cols, label)
    # Both helper frames hold a single column; combine them row by row.
    ratio = position.iloc[:, 0] / (group_size.iloc[:, 0] - 1)
    position[col_name] = round(ratio, 4).fillna(1.1)
    save(position[[col_name]], col_name, train_len)
    del position, group_size
    gc.collect()
    
# Column combinations for which the cumulative-count ratio is produced.
patterns = [
    ['ip','day']
]

# Log the section name, then one line per finished pattern as a progress trace.
write_log('cumulative count ratio')
for p in patterns:
    cum_count_ratio(df, p, label, train_len)
    write_log(str(p))

0     0.000000
1     1.000000
2     0.000000
3     0.333333
4     0.666667
5     1.000000
6     0.000000
7     0.500000
8     1.000000
9     0.000000
10    0.333333
11    0.000000
12    0.500000
13    1.000000
14    0.000000
15    1.000000
16    0.000000
17    1.000000
18         NaN
19    0.666667
20    1.000000
dtype: float64


# Time to n next click and its filter

## time delta from current click to the next same feature combination click

In [22]:
import gc

def time_to_n_next_click(df, n, cols, time_col, train_len):
    """Save the time gap from each row to the n-th following row of its group.

    For every row the value is ``time_col`` of the row ``n`` positions later
    within the same ``cols`` group, minus the row's own ``time_col``, plus 1.
    Rows with no such later row get 999999. "Later" follows the row order of
    ``df`` at call time. Rows are put back in index order before saving.

    Args:
        df: Combined frame, ordered the way "next" should be understood.
        n: How many rows ahead (within the group) to look.
        cols: List of column names that define the groups.
        time_col: Numeric time column used for the difference.
        train_len: Number of leading train rows, forwarded to ``save``.

    Returns:
        The feature name,
        ``'time_to_n_next_click_' + str(n) + '_' + '_'.join(cols)``.
    """
    col_name = 'time_to_n_next_click_' + str(n) + '_' + '_'.join(cols)
    _df = df[list(cols) + [time_col]].copy()
    later_time = _df.groupby(cols)[time_col].shift(-n)
    gap = later_time - _df[time_col] + 1
    _df[col_name] = gap.fillna(999999).astype(int)
    out = _df[[col_name]].sort_index()
    save(out, col_name, train_len)
    del _df, out
    gc.collect()
    return col_name
    
def time_to_n_next_click_filter(name, train_len):
    """Bucket a saved time-to-next-click feature into a small integer code.

    Reads the train and test_supplement feather files of feature ``name``,
    stacks them, and derives ``'filter_' + name`` from the value ``v``:

        start at 2, subtract 1 if 2 < v < 1800, subtract 2 if v < 30

    which gives 2 for v >= 1800, 1 for 30 <= v < 1800, -1 for 2 < v < 30
    and 0 for v <= 2. The result is written through ``save``.

    Args:
        name: Name of the previously saved feature (also its column name).
        train_len: Number of leading train rows, forwarded to ``save``.
    """
    col_name = 'filter_' + name
    train_part = pd.read_feather(fdir + 'train__' + name + '.ftr')
    test_part = pd.read_feather(fdir + 'test_supplement__' + name + '.ftr')
    combined = pd.concat([train_part, test_part], ignore_index=True)
    gap = combined[name]
    mid_gap = (gap < 1800) & (gap > 2)
    short_gap = gap < 30
    combined[col_name] = 2
    combined[col_name] = combined[col_name] - mid_gap
    combined[col_name] = combined[col_name] - short_gap * 2
    save(combined[[col_name]], col_name, train_len)
    del combined, train_part, test_part
    gc.collect()
    
# Group keys for the time-to-next-click features.
patterns = [
    ['day','ip','app','device','os']
]

write_log('time to next')
# "Next click" is taken in the frame's current row order, so order the rows by
# click time first (ties broken by 'is_attributed', then 'click_id').
df.sort_values(['click_time','is_attributed','click_id'], inplace=True)
for p in patterns:
    # For n = 1 and n = 2: save the raw gap, then its bucketed 'filter_' version.
    time_to_n_next_click_filter(time_to_n_next_click(df, 1, p, 'click_timestamp', train_len), train_len)
    time_to_n_next_click_filter(time_to_n_next_click(df, 2, p, 'click_timestamp', train_len), train_len)
    write_log(str(p))
# Put df back into its original row order for the following cells.
df.sort_index(inplace=True)

# range count (same as unique count of certain time col group by feature combination)

In [23]:
import gc

def unique_count(df, cols, train_len):
    """Save, for every row, the number of distinct ``cols[-1]`` values in its group.

    Groups are defined by ``cols[:-1]``. The feature is written through
    ``save`` under the name ``'rang_count_' + '_'.join(cols)`` (the prefix is
    part of the output file name and is kept as is).

    NOTE(review): this redefines ``unique_count`` from the "unique count"
    cell; only the output name prefix differs and the preview print is absent.

    Args:
        df: Combined frame (train rows first, then test_supplement rows).
        cols: List of column names; all but the last are group keys, the last
            is the column whose distinct values are counted.
        train_len: Number of leading train rows, forwarded to ``save``.
    """
    col_name = 'rang_count_' + '_'.join(cols)
    key_cols, target = cols[:-1], cols[-1]
    # No rename(index=str) here: it converted every group key to a string
    # only for astype() below to convert it back.
    count_result = (
        df[cols]
        .groupby(by=key_cols)[[target]]
        .nunique()
        .rename(columns={target: col_name})
        .reset_index()
    )
    # Cast the key columns to the dtypes recorded for df so the merge keys match.
    type_map = {c: data_type[c] for c in count_result.columns.values if c in data_type}
    # Only the key columns are needed on the left side of the merge.
    _df = df[key_cols].merge(count_result.astype(type_map), on=key_cols, how='left')
    save(_df[[col_name]], col_name, train_len)
    del _df, count_result
    gc.collect()
    
# Each pattern is [group keys..., time column]: the number of distinct values
# of the last (time) column is counted within the groups formed by the others.
patterns = [
    ['ip','day'],
    ['ip','hour'],
    ['ip','dayhourminute'],
    ['ip','dayhourminute10'],
    ['app','os','channel','dayhourminute'],
    ['app','os','channel','dayhourminute10'],
    ['ip','channel','dayhourminute'],
    ['ip','channel','dayhourminute10'],
    ['ip','device','os','dayhourminute'],
    ['ip','device','os','dayhourminute10'],
]

# Log the section name, then one line per finished pattern as a progress trace.
write_log('range count')
for p in patterns:
    unique_count(df, p, train_len)
    write_log(str(p))

# variance (/(N-1))

## variance of the last column in each pattern, grouped by the preceding columns

In [24]:
def variance(df, cols, train_len):
    """Save, for every row, the variance of ``cols[-1]`` within its group.

    Groups are defined by ``cols[:-1]``. The variance comes from pandas'
    groupby ``var()``; groups for which it is NaN get 0, and the value is
    then cast to int. The feature is written through ``save`` as
    ``'variance_' + '_'.join(cols)``.

    Args:
        df: Combined frame (train rows first, then test_supplement rows).
        cols: List of column names; all but the last are group keys, the last
            is the column whose variance is taken.
        train_len: Number of leading train rows, forwarded to ``save``.
    """
    col_name = 'variance_' + '_'.join(cols)
    key_cols, target = cols[:-1], cols[-1]
    group = (
        df[cols]
        .groupby(by=key_cols)[[target]]
        .var()
        .reset_index()
        .rename(columns={target: col_name})
    )
    group[col_name] = group[col_name].fillna(0).astype(int)
    # Only the key columns are needed on the left side of the merge.
    _df = df[key_cols].merge(group, on=key_cols, how='left')
    save(_df[[col_name]], col_name, train_len)
    del _df, group
    gc.collect()
    
# Each pattern is [group keys..., value column] for the variance feature.
patterns = [
    ['ip','device','hour']
]

# Log the section name, then one line per finished pattern as a progress trace.
write_log('var')
for p in patterns:
    variance(df, p, train_len)
    write_log(str(p))

# common ip

## this part assumes that std(count_ip/day) / mean(count_ip/day) behaves differently when fraud occurs

In [5]:
import gc

def get_com_ip(df, col, train_len):
    """Flag the IPs that occur on every day of the data and save a feature for them.

    The day range runs from the smallest to the largest value of ``df[col]``;
    a boundary day with fewer than 1000 rows is left out of the range. An IP
    is "common" when it appears on each day of that range. The saved feature
    (file name ``com_ip``, written through ``save``) is ``ip + 1`` for rows
    whose IP is common and 0 otherwise.

    Args:
        df: Combined frame with an 'ip' column and the day column ``col``.
        col: Name of the integer day column.
        train_len: Number of leading train rows, forwarded to ``save``.

    Returns:
        Boolean Series aligned with ``df``: True where the row's IP is common.
    """
    # Plain Python ints: the bounds are adjusted and passed to range() below.
    fday = int(df[col].min())
    lday = int(df[col].max())
    # Use the requested column for the sparse-day check (it used to read
    # df.day regardless of ``col``).
    if (df[col] == fday).sum() < 1000:
        fday += 1
    if (df[col] == lday).sum() < 1000:
        lday -= 1

    name = 'com_ip'
    # Intersect the per-day IP sets one day at a time.
    com_set = None
    for d in range(fday, lday + 1):
        day_ips = set(df.loc[df[col] == d, 'ip'].unique())
        com_set = day_ips if com_set is None else com_set & day_ips
    if com_set is None:
        # Empty day range: no IP can be common.
        com_set = set()
    flt_ip = df.ip.isin(com_set)
    # Multiplying by the boolean mask zeroes the rows of non-common IPs;
    # the +1 keeps flagged rows non-zero.
    com_ip = ((df['ip'] + 1) * flt_ip).to_frame()
    print(com_ip)
    save(com_ip, name, train_len)

    del com_ip
    gc.collect()
    return flt_ip


def dump_com_ip_feature(df, flt_ip, threshold, label, train_len):
    """Save a feature marking common IPs whose daily click count is stable.

    Among the rows selected by ``flt_ip``, clicks are counted per (ip, day).
    An IP is flagged when ``100 * std / mean`` of its daily counts is at most
    ``threshold``. The saved feature (written through ``save`` under the name
    ``'com' + str(threshold) + '_ip'``) is ``ip + 1`` for rows of flagged IPs
    and 0 for all other rows.

    Args:
        df: Combined frame with 'ip', 'day' and ``label`` columns.
        flt_ip: Boolean mask over ``df`` selecting the rows to analyse.
        threshold: Upper bound on ``100 * std / mean`` of the daily counts.
        label: Column whose non-null values are counted per (ip, day).
        train_len: Number of leading train rows, forwarded to ``save``.
    """
    name = 'com' + str(threshold) + '_ip'
    cols = ['ip', 'day']
    com_df = df[flt_ip]
    daily = (
        com_df[cols + [label]]
        .groupby(by=cols)[[label]]
        .count()
        .reset_index()
        .rename(columns={label: 'count'})
    )
    result = daily.groupby('ip')['count'].agg(['mean', 'std']).reset_index()
    # A NaN ratio compares as False, so such IPs are not flagged.
    result['flg'] = (100 * result['std'] / result['mean']) <= threshold
    # Membership test instead of merging the whole frame and fillna(False)-ing
    # every column: rows of IPs absent from ``result`` are simply False.
    flagged = df['ip'].isin(result.loc[result['flg'], 'ip'])
    feature = ((df['ip'] + 1) * flagged).rename(name).to_frame().reset_index(drop=True)
    save(feature, name, train_len)

    del feature, result, daily
    gc.collect()

# Build the common-IP mask over the 'day' column, then save the stability
# feature with threshold 1 (compared against 100 * std / mean of daily counts).
write_log('common ip')
dump_com_ip_feature(df, get_com_ip(df, 'day', train_len), 1, label, train_len)
write_log(str(['ip', 'day']))

done
0
1
2
5
7
9
    ip
0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    3
10   3
11   3
12   3
13   3
14   3
15   3
16   3
17   3
18   3
19   3
20   3
21   0
22   0
23   0
24   0


# WOE (categorical feature encoding)

## for each training day, compute WOE from the other training days (e.g. use days 7 and 9 for day 8); for the test data, compute WOE from all training data

In [None]:
import gc
import numpy as np

def _woe(calc_df, map_df, cols, label, col_name):
    t_cols = list(cols)
    t_cols.append(label)
    group = calc_df[t_cols].groupby(by=cols)[[label]].agg(['count','sum'])[label].reset_index()
    positive = calc_df[label].sum()
    negative = calc_df.shape[0] - positive
#     group[col_name] = np.log((group['sum']+0.5) / positive) / ((group['count']-group['sum']+0.5) / negative)
    group[col_name] = np.log((group['sum'] / positive) / ((group['count']-group['sum']) / negative)) + 1
    t_cols[-1] = col_name
    type_map = {i: data_type[i] for i in group.columns.values if i in data_type.keys()}
    return map_df.merge(group[t_cols], on=cols, how='left')

def woe(train, test, cols, label):
    """Compute the WOE feature for ``cols`` and write it for train and test.

    Statistics are taken only from train rows whose 'hour' is between 12 and
    22 inclusive. For train, each day's rows receive values computed from the
    filtered rows of all *other* days; for test, values come from all filtered
    train rows. Missing values are stored as -1.

    Output files: ``fdir + 'train__woe_<cols>.ftr'`` and
    ``fdir + 'test_supplement__woe_<cols>.ftr'``.

    NOTE(review): the train pieces are concatenated in day order, so the saved
    rows line up with ``train`` only if ``train`` is already ordered by day -
    confirm.

    Args:
        train: Training frame with 'day', 'hour', ``cols`` and ``label``.
        test: Test frame with the ``cols`` columns.
        cols: List of group-key column names.
        label: Target column name.
    """
    col_name = 'woe_' + '_'.join(cols)
    fdf = train[(train['hour'] >= 12) & (train['hour'] <= 22)]
    # Plain Python ints for range().
    fday = int(train['day'].min())
    lday = int(train['day'].max())

    # Keep only the WOE column of each merged piece; carrying every train
    # column through concat / fillna is unnecessary for the single column
    # that is written.
    train_parts = [
        _woe(fdf[fdf.day != day], train[train.day == day], cols, label, col_name)[[col_name]]
        for day in range(fday, lday + 1)
    ]
    _df = pd.concat(train_parts).fillna(-1).reset_index(drop=True)
    _df.to_feather(fdir + 'train__' + col_name + '.ftr')
    del _df, train_parts
    gc.collect()

    _df = _woe(fdf, test, cols, label, col_name)[[col_name]].fillna(-1).reset_index(drop=True)
    _df.to_feather(fdir + 'test_supplement__' + col_name + '.ftr')
    del _df
    gc.collect()

# Column combinations to WOE-encode: the raw categorical columns alone and in
# combination, followed by combinations with the bucketed next-click feature.
patterns = [
    ['ip'],
    ['app'],
    ['device'],
    ['os'],
    ['channel'],
    ['ip','app'],
    ['ip','device'],
    ['ip','os'],
    ['ip','channel'],
    ['app','device'],
    ['app','os'],
    ['app','channel'],
    ['ip','app','device'],
    ['ip','app','os'],
    ['ip','app','channel'],
    ['ip','device','os'],
    ['ip','device','channel'],
    ['ip','os','channel'],
    ['app','device','os'],
    ['app','device','channel'],
    ['app','os','channel'],
    ['ip','app','device','os'],
    ['ip','app','device','channel'],
    ['ip','app','os','channel'],
    ['ip','device','os','channel'],
    ['app','device','os','channel'],
    ['ip','nextClickLeakDayFlt'],
    ['app','nextClickLeakDayFlt'],
    ['device','nextClickLeakDayFlt'],
    ['os','nextClickLeakDayFlt'],
    ['channel','nextClickLeakDayFlt'],
    ['ip','app','nextClickLeakDayFlt'],
    ['ip','device','nextClickLeakDayFlt'],
    ['ip','os','nextClickLeakDayFlt'],
    ['ip','channel','nextClickLeakDayFlt'],
    ['app','device','nextClickLeakDayFlt'],
    ['app','os','nextClickLeakDayFlt'],
    ['app','channel','nextClickLeakDayFlt'],
    ['device','os','nextClickLeakDayFlt'],
    ['device','channel','nextClickLeakDayFlt'],
    ['os','channel','nextClickLeakDayFlt']
]
# woe_train / woe_test are aliases, not copies: the column added below also
# appears on train / test.
woe_train = train
# Attach the bucketed time-to-next-click feature saved earlier (n = 1) as a
# column that can be used as a WOE group key.
woe_train['nextClickLeakDayFlt'] = pd.read_feather(fdir + 'train__filter_time_to_n_next_click_1_day_ip_app_device_os.ftr')
woe_test = test
woe_test['nextClickLeakDayFlt']=pd.read_feather\
        (fdir + 'test_supplement__filter_time_to_n_next_click_1_day_ip_app_device_os.ftr')

# Log the section name, then one line per finished pattern as a progress trace.
write_log('woe')
for p in patterns:
    woe(woe_train, woe_test, p, label)
    write_log(str(p))
print('done')

  # This is added back by InteractiveShellApp.init_path()


In [None]:
# Append a final marker line to log.txt.
write_log('done')

In [None]:
# Run a garbage-collection pass at the end of the notebook.
gc.collect()