In [72]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import KFold

In [73]:
# Base directory of the input data; file names are appended by plain string
# formatting, so the trailing slash is required.
path = '/home/kai/data/kaggle/talkingdata/data/'
# Read the cleaned training set into memory.
train = pd.read_csv('{}{}'.format(path, 'train_cleaned_final.csv'))

# Get K Fold and use last fold as example

In [74]:
# K = 18
# # kf = KFold(n_splits=K, shuffle = False)
# kf = KFold(n_splits=K, shuffle = True, random_state = 233)
# history_index = []
# train_index = []
# for h,t in kf.split(train):
#     history_index.append(h)
#     train_index.append(t)

# ### use last fold as example

# import sys
# print(sys.getsizeof(train)/ 1024 **3)


# #use last fold as example
# length_train = len(train)
# df_history = train.iloc[history_index[-1]].copy()
# df_train = train.iloc[train_index[-1]].copy()


# print(sys.getsizeof(df_train)/ 1024 **3)
# print(len(df_train))
# print(sys.getsizeof(df_history)/ 1024 **3)
# print(len(df_history))

# Use last 50 million rows as example

In [75]:
# The last `sample_length` rows become the frame that receives features;
# everything before them is kept as "history". Both halves are copied so that
# the full `train` frame can be released later.
length_train = len(train)
sample_length = 50 * 1000000
df_history, df_train = (
    train.iloc[: length_train - sample_length].copy(),
    train.iloc[length_train - sample_length :].copy(),
)

In [76]:
# Show which columns are available for feature generation.
df_train.columns

Index(['ip', 'app', 'device', 'os', 'channel', 'day', 'hour', 'timestamp',
       'minute', 'second', 'is_attributed', 'intesthh'],
      dtype='object')

In [77]:
# Columns that may take part in a composite group key.
feature_col = ['ip', 'app', 'device', 'os', 'channel',
               'day', 'hour', 'minute', 'second', 'intesthh']

# Reduced alternative:
# feature_col = ['ip', 'app', 'device', 'os', 'channel']

# orders[col] is a power of ten larger than the maximum of train[col]
# (assuming non-negative values). get_group multiplies the running key by it
# to make room for `col` when several columns are packed into one number.
orders = dict.fromkeys(feature_col)
for col in feature_col:
    orders[col] = 10 ** (1 + int(np.log(train[col].max() + 1) / np.log(10)))
def get_group(df, cols):
    """
    Pack the columns `cols` of `df` into a single numeric key per row.

    The key starts as a copy of the first column; for every further column the
    running key is multiplied by orders[col] and the column value is added.
    As long as every value of a column is smaller than its multiplier, each
    distinct combination of values yields a distinct key, e.g. ('ip', 'app')
    becomes ip * orders['app'] + app.

    NOTE(review): the key is built with ordinary Series arithmetic, so with
    fixed-width integer columns a long combination could exceed the dtype's
    range - confirm for the widest combination in use.
    """
    key = df[cols[0]].copy()
    for position in range(1, len(cols)):
        name = cols[position]
        scale = orders[name]
        key = key * scale + df[name]

    return key

import gc
# `train` has already been split into df_history / df_train (copies) and used
# to compute `orders`, so drop the full frame and reclaim its memory.
del train
gc.collect()

191

In [78]:
# Inspect the per-column multipliers used by get_group.
orders

{'app': 1000,
 'channel': 1000,
 'day': 100,
 'device': 10000,
 'hour': 100,
 'intesthh': 10,
 'ip': 1000000,
 'minute': 100,
 'os': 1000,
 'second': 100}

# count
plan 1. count from historical data  
plan 2. count from all data

In [79]:
def count(df_history, df_train, cols, target=None):
    """
    Purpose: add a new feature to training df: for every row of `df_train`,
    the number of rows in `df_history` that share the same combination of
    values in `cols`.

    Combinations that never occur in `df_history` get 0. `target` is accepted
    for signature compatibility with the other feature functions and is unused.
    """
    history_keys = get_group(df_history, cols)
    train_keys = get_group(df_train, cols)

    # key -> number of history rows carrying that key
    occurrences = history_keys.value_counts()

    counted = train_keys.map(occurrences)
    return counted.fillna(0)

# mean
mean P(target | feature combination)

purpose: add a new feature to training df. conditional probability P(replay (target) | feature combination (eg, artist_name_composer))
Get the conditional Probability only from historical data and apply to train data.

P(replay | X feature combination) = P(replay & X feature combination) / P(X feature combination)  
= (count(replay & X feature combination) / count(total)) / (count(X feature combination) / count(total))  
= count(replay & X feature combination) / count(X feature combination)  
= sum(target over rows with replay & X feature combination) / count(X feature combination)  # each replayed row contributes 1  
= sum(target over all rows with X feature combination) / count(X feature combination)  # since replay is 1 and not replay is 0, the extra rows add nothing  
= mean(target over rows with X feature combination)  

In [80]:
def scaller(num):
    """
    Return the smallest power of ten `sca` (1, 10, 100, ...) with num * sca >= 1.

    Values that are already >= 1 give 1. NaN also gives 1, because every
    comparison with NaN is False.

    Raises:
        ValueError: if `num` is zero or negative. No power of ten can lift
            such a value to 1, so the search loop would never terminate.
    """
    if num <= 0:
        raise ValueError('scaller() needs a positive number, got {!r}'.format(num))

    sca = 1
    while num * sca < 1:
        sca *= 10
    return sca

def mean(df_history, df_train, cols, target):
    """
    Purpose: add a new feature to training df: the mean of `target` over the
    rows of `df_history` that share the row's combination of values in `cols`,
    i.e. an estimate of P(target | feature combination) when target is 0/1.

    Combinations that never occur in `df_history` get -1.
    """
    # one packed key per history row, then key -> mean target
    history_keys = get_group(df_history, cols)
    target_rate = df_history[target].groupby(history_keys).mean()

    # Optional rescaling of the rates, currently disabled:
    #     m_min = target_rate[target_rate > 0].min()
    #     sca = scaller(m_min)
    #     target_rate *= sca

    # look the rate up for every train row
    train_keys = get_group(df_train, cols)
    looked_up = train_keys.map(target_rate)
    return looked_up.fillna(-1)

# reversemean
reverse mean P(feature combination | target)

In [81]:
def reversemean(df_history, df_train, cols, target):
    """
    Purpose: add a new feature to training df: the share of the positive
    (target == 1) rows of `df_history` that carry the row's combination of
    values in `cols`, i.e. an estimate of P(feature combination | target == 1).

    Combinations seen in `df_history` only with target == 0 get 0.
    Combinations with no entry in the lookup (not seen with target 0 or 1)
    get -1.
    """
    train_keys = get_group(df_train, cols)
    history_keys = get_group(df_history, cols)

    # split the history keys by outcome
    is_positive = df_history[target] == 1
    is_negative = df_history[target] == 0
    keys_pos = history_keys[is_positive]
    keys_neg = history_keys[is_negative]

    # keys that occur with target == 0 but never with target == 1
    only_negative = set(keys_neg.unique())
    only_negative.difference_update(set(keys_pos.unique()))

    # key -> fraction of all positive rows that have this key
    share_of_positives = keys_pos.groupby(keys_pos).count() / len(keys_pos)
    # key -> 0.0 for the negative-only keys
    zero_share = pd.Series(np.zeros(len(only_negative)), index=only_negative)

    lookup = pd.concat([share_of_positives, zero_share])
    return train_keys.map(lookup).fillna(-1)

# Time related
get pattern on train and test respectively

In [82]:
def time2nextclick(df_history, df_train, cols, target, timecol='timestamp'):
    """
    For every row of `df_train`, the time until the next row with the same
    combination of values in `cols`.

    Returns a list with one entry per row of `df_train`, in row order:
    `timecol` of the next row of the same group minus the row's own `timecol`,
    or -1 when the group has no later row. Rows that share a timestamp are
    ordered by their position in `df_train`.

    `df_history` and `target` are unused; they are kept so that all feature
    functions share one signature.
    """
    times = df_train[timecol].tolist()
    groups = get_group(df_train, cols).tolist()

    # Visit rows from latest to earliest. Reversing a stable ascending argsort
    # gives an exact mirror of the time order, ties included. Each value is
    # written straight to the row's own position, so the output stays aligned
    # with df_train. (Sorting descending and reversing the result list does not
    # restore the original order among rows with equal timestamps.)
    order = np.argsort(df_train[timecol].values, kind='mergesort')[::-1].tolist()

    result = [-1] * len(times)
    next_heard = {}
    for pos in order:
        g = groups[pos]
        t = times[pos]
        if g in next_heard:
            result[pos] = next_heard[g] - t
        next_heard[g] = t

    return result

def time2previousclick(df_history, df_train, cols, target, timecol='timestamp'):
    """
    For every row of `df_train`, the time since the previous row with the same
    combination of values in `cols`.

    Rows are walked in the order they appear in `df_train`. Returns a list
    with one entry per row: the row's `timecol` minus the `timecol` of the
    most recently visited row of the same group, or -1 for the first row of a
    group. `df_history` and `target` are unused.
    """
    keys = get_group(df_train, cols)

    most_recent = {}   # group key -> timecol of the latest row visited so far
    gaps = []
    for stamp, key in zip(df_train[timecol], keys):
        gap = stamp - most_recent[key] if key in most_recent else -1
        gaps.append(gap)
        most_recent[key] = stamp

    return gaps

def countfrompast(df_history, df_train, cols, target, timecol='timestamp'):
    """
    For every row of `df_train`, the number of earlier rows (in row order)
    with the same combination of values in `cols`.

    Returns a list with one entry per row; the first row of a group gets 0.
    `df_history`, `target` and `timecol` are unused.
    """
    seen_so_far = {}   # group key -> number of earlier rows of that group
    result = []
    for key in get_group(df_train, cols).values:
        earlier = seen_so_far.get(key, -1) + 1
        seen_so_far[key] = earlier
        result.append(earlier)

    return result

def countfromfuture(df_history, df_train, cols, target, timecol='timestamp'):
    """
    For every row of `df_train`, the number of later rows with the same
    combination of values in `cols`.

    Returns a list with one entry per row of `df_train`, in row order; the
    last row of a group gets 0. "Later" is decided by `timecol`, and rows that
    share a timestamp are ordered by their position in `df_train`.

    `df_history` and `target` are unused; they are kept so that all feature
    functions share one signature.
    """
    groups = get_group(df_train, cols).tolist()

    # Visit rows from latest to earliest. Reversing a stable ascending argsort
    # gives an exact mirror of the time order, ties included. Each value is
    # written straight to the row's own position, so the output stays aligned
    # with df_train. (Sorting descending and reversing the result list does not
    # restore the original order among rows with equal timestamps.)
    order = np.argsort(df_train[timecol].values, kind='mergesort')[::-1].tolist()

    result = [0] * len(groups)
    count = {}
    for pos in order:
        g = groups[pos]
        later = count.get(g, 0)
        result[pos] = later
        count[g] = later + 1

    return result

def lasttimediff(df_history, df_train, cols, target, timecol='timestamp'):
    """
    For every row of `df_train`, the difference between the `timecol` of the
    last row (in row order) of its group and the row's own `timecol`.

    Groups are the combinations of values in `cols`. Returns a Series aligned
    with `df_train`. `df_history` and `target` are unused.
    """
    keys = get_group(df_train, cols)
    stamps = df_train[timecol]

    # group key -> timecol of the group's final row
    final_stamp = stamps.groupby(keys).last()

    return keys.map(final_stamp) - stamps

# generate all cols

In [83]:
from itertools import combinations
import sys


# Columns combined for the count / mean style features.
combine_col = ['ip', 'app', 'device', 'os', 'channel', 'day', 'hour', 'intesthh']

# Columns combined for the time related features.
combine_col_time = ['ip', 'app', 'device', 'os', 'channel']



def col_name(cols, func=None):
    """
    Build a feature column name: the names in `cols` joined with '_',
    followed by '_' and the name of `func` when a function is given.
    """
    joined = '_'.join(cols)
    return joined if func is None else joined + '_' + func.__name__

counter = 0

exception_list = []

# Open the log once for the whole run. Reopening it with mode 'w' for every
# feature truncates the file each time, so only the last line would survive.
with open('feature_all.txt', 'w') as text_file:
    for num_col in [1,2,3,4,5]:
        # below for count and mean (disabled)
        #
        # for cols in combinations(combine_col, num_col):
        #     # for func in [count, mean]:
        #     for func in [reversemean]:
        #         feature_name = col_name(cols, func=func)
        #         counter += 1
        #         if func.__name__ == count.__name__:
        #             print('count function')
        #             df_train[feature_name] = func(df_train, df_train, cols, target='is_attributed')
        #         else:
        #             print('mean function')
        #             df_train[feature_name] = func(df_history, df_train, cols, target='is_attributed')
        #         all_str = 'all {}:   {}   \t\t\t size: {} G.'.format(counter, feature_name, sys.getsizeof(df_train)/ 1024 **3)
        #         print(all_str)
        #         text_file.write(all_str + '\n')

        # below for time related
        for cols in combinations(combine_col_time, num_col):
            # for func in [time2nextclick, time2previousclick, countfromfuture, countfrompast, lasttimediff]:
            for func in [lasttimediff]:
                feature_name = col_name(cols, func=func)
                counter += 1
                # The time related functions ignore their first argument
                # (df_history), so df_train is passed for both frames.
                df_train[feature_name] = func(df_train, df_train, cols, target='is_attributed')
                all_str = 'all {}:   {}   \t\t\t size: {} G.'.format(counter, feature_name, sys.getsizeof(df_train)/ 1024 **3)
                print(all_str)
                text_file.write(all_str + '\n')
                # flush so the log can be followed while the long run is in progress
                text_file.flush()


all 1:   ip_lasttimediff   			 size: 4.842877488583326 G.
all 2:   app_lasttimediff   			 size: 5.215406518429518 G.
all 3:   device_lasttimediff   			 size: 5.587935548275709 G.
all 4:   os_lasttimediff   			 size: 5.960464578121901 G.
all 5:   channel_lasttimediff   			 size: 6.332993607968092 G.
all 6:   ip_app_lasttimediff   			 size: 6.705522637814283 G.
all 7:   ip_device_lasttimediff   			 size: 7.078051667660475 G.
all 8:   ip_os_lasttimediff   			 size: 7.450580697506666 G.
all 9:   ip_channel_lasttimediff   			 size: 7.823109727352858 G.
all 10:   app_device_lasttimediff   			 size: 8.195638757199049 G.
all 11:   app_os_lasttimediff   			 size: 8.56816778704524 G.
all 12:   app_channel_lasttimediff   			 size: 8.940696816891432 G.
all 13:   device_os_lasttimediff   			 size: 9.313225846737623 G.
all 14:   device_channel_lasttimediff   			 size: 9.685754876583815 G.
all 15:   os_channel_lasttimediff   			 size: 10.058283906430006 G.
all 16:   ip_app_device_lasttimediff   			 s

# Saving Files

In [84]:
# Sanity check: number of rows in the frame that is about to be saved.
print(len(df_train))

50000000


In [85]:
# Write df_train (original columns plus the generated feature columns) to CSV
# without the index. NOTE(review): absolute, machine-specific output path.
df_train.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/train_lasttimediff_combine5_0409.csv', index=False)

In [30]:
# NOTE(review): this cell's execution count (30) is lower than that of the
# cells above, and its saved output shows an 'Unnamed: 0' column that
# df_train.columns does not list - the output looks stale; re-run to refresh.
df_history.head(10)

Unnamed: 0,ip,app,device,os,channel,day,hour,timestamp,minute,second,is_attributed,intesthh
0,83230,3,1,13,379,6,14,1509978741,32,21,0,1
1,17357,3,1,19,379,6,14,1509978814,33,34,0,1
2,35810,3,1,13,379,6,14,1509978852,34,12,0,1
3,45745,14,1,13,478,6,14,1509978892,34,52,0,1
4,161007,3,1,13,379,6,14,1509978908,35,8,0,1
5,18787,3,1,16,379,6,14,1509978986,36,26,0,1
6,103022,3,1,23,379,6,14,1509979064,37,44,0,1
7,114221,3,1,19,379,6,14,1509979079,37,59,0,1
8,165970,3,1,13,379,6,14,1509979090,38,10,0,1
9,74544,64,1,22,459,6,14,1509979103,38,23,0,1
