In [8]:
import pandas as pd
import numpy as np
import time
from itertools import combinations
import gc
import sys

In [2]:
# Load the cleaned train/test CSVs from a hard-coded local directory.
# NOTE(review): absolute, user-specific path -- consider a configurable data dir.
path = '/home/kai/data/kaggle/talkingdata/data/'
train = pd.read_csv(path + 'train_cleaned_final.csv')
test = pd.read_csv(path + 'test_cleaned_final.csv')

# Build K unshuffled folds over the row positions of `train`.
# NOTE(review): this import sits mid-notebook; it would normally live in the import cell.
from sklearn.model_selection import KFold
K = 3

# shuffle=False keeps the original row order, so every fold is a consecutive slice of `train`.
kf = KFold(n_splits=K, shuffle = False)
# history_index[i]: positions of the rows outside fold i (first array yielded by split).
history_index = []
# train_index[i]: positions of the rows inside fold i (second array yielded by split).
train_index = []
for h,t in kf.split(train):
    history_index.append(h)
    train_index.append(t)

In [13]:
# Sanity check on sizes. Only the value of the last expression of a cell is echoed,
# so the fold length on the first line is evaluated but not displayed.
len(train_index[2])
len(train)

184903890

In [4]:
def count(df_history, df_train, cols, target=None):
    """
    Frequency feature: for every row of df_train, the number of rows in
    df_history that carry the same encoded combination of `cols`.

    Parameters
    ----------
    df_history : frame whose rows are counted.
    df_train   : frame that receives the feature.
    cols       : column names that are combined into one key by get_group.
    target     : not used; present so all feature functions share one signature.

    Returns
    -------
    Series indexed like df_train; combinations absent from df_history get 0.
    """
    history_keys = get_group(df_history, cols)
    occurrences = history_keys.value_counts()

    train_keys = get_group(df_train, cols)
    looked_up = train_keys.map(occurrences)
    return looked_up.fillna(0)


def mean(df_history, df_train, cols, target):
    """
    Target-mean feature: for every row of df_train, the average of `target`
    over the df_history rows that share the same encoded combination of `cols`.

    Parameters
    ----------
    df_history : frame providing the `target` values that are averaged.
    df_train   : frame that receives the feature.
    cols       : column names that are combined into one key by get_group.
    target     : name of the column in df_history to average.

    Returns
    -------
    Series indexed like df_train; combinations absent from df_history get -1.
    """
    history_keys = get_group(df_history, cols)
    target_rate = df_history.groupby(history_keys)[target].mean()

    train_keys = get_group(df_train, cols)
    looked_up = train_keys.map(target_rate)
    return looked_up.fillna(-1)


def reversemean(df_history, df_train, cols, target):
    """
    Reverse target encoding: for every row of df_train, the fraction of the
    positive (target == 1) rows of df_history that fall into the same encoded
    combination of `cols`.

    Parameters
    ----------
    df_history : frame providing the labelled rows.
    df_train   : frame that receives the feature.
    cols       : column names that are combined into one key by get_group.
    target     : name of the 0/1 label column in df_history.

    Returns
    -------
    Series indexed like df_train:
      * key seen on positive rows          -> count among positives / number of positives
      * key seen only on target == 0 rows  -> 0
      * key not seen in either subset      -> -1
    """
    train_keys = get_group(df_train, cols)
    history_keys = get_group(df_history, cols)

    labels = df_history[target]
    pos_keys = history_keys[labels == 1]
    neg_keys = history_keys[labels == 0]

    # Keys that occur exclusively on negative rows get an explicit zero so they
    # are distinguishable from keys that were never observed (-1).
    neg_only = set(neg_keys.unique())
    neg_only.difference_update(set(pos_keys.unique()))

    share_of_positives = pos_keys.groupby(pos_keys).count() / len(pos_keys)
    zero_share = pd.Series(np.zeros(len(neg_only)), index=neg_only)
    lookup = pd.concat([share_of_positives, zero_share])

    return train_keys.map(lookup).fillna(-1)


def time2nextclick(df_history, df_train, cols, target, timecol='timestamp'):
    """
    Time until the next row with the same encoded combination of `cols`.

    Rows are visited from the latest `timecol` value to the earliest. Each
    value is written at the position of the row it belongs to, so the returned
    list lines up with df_train's row order even when df_train is not sorted by
    `timecol` or contains tied timestamps.

    Parameters
    ----------
    df_history : not used; present so all feature functions share one signature.
    df_train   : frame that receives the feature.
    cols       : column names that are combined into one key by get_group.
    target     : not used.
    timecol    : name of the time column.

    Returns
    -------
    list with one entry per row of df_train (in df_train order): the difference
    between the next timestamp of the same key and the row's own timestamp, or
    -1 when the key does not occur again.
    """
    # A stable ascending sort walked backwards gives a deterministic
    # latest-to-earliest order; rows with equal timestamps are visited in
    # reverse row order, mirroring the forward pass of time2previousclick.
    order = np.argsort(df_train[timecol].values, kind='mergesort')[::-1]
    df_reverse = df_train.iloc[order]
    group = get_group(df_reverse, cols)

    result = [-1] * len(df_train)
    next_heard = {}
    for pos, g, t in zip(order, group, df_reverse[timecol]):
        if g in next_heard:
            result[pos] = next_heard[g] - t
        next_heard[g] = t

    return result

def time2previousclick(df_history, df_train, cols, target, timecol='timestamp'):
    """
    Time elapsed since the previous row with the same encoded combination of `cols`.

    Rows are processed in df_train's existing order (no sorting is done here;
    presumably the caller passes rows ordered by `timecol` -- verify).

    Parameters
    ----------
    df_history : not used; present so all feature functions share one signature.
    df_train   : frame that receives the feature.
    cols       : column names that are combined into one key by get_group.
    target     : not used.
    timecol    : name of the time column.

    Returns
    -------
    list with one entry per row: current timestamp minus the timestamp of the
    previous row with the same key, or -1 for the first occurrence of a key.
    """
    keys = get_group(df_train, cols)

    previous_stamp = {}
    gaps = []
    for stamp, key in zip(df_train[timecol], keys):
        gaps.append(stamp - previous_stamp[key] if key in previous_stamp else -1)
        previous_stamp[key] = stamp

    return gaps

def countfrompast(df_history, df_train, cols, target, timecol='timestamp'):
    """
    Running count of earlier rows with the same encoded combination of `cols`.

    Rows are processed in df_train's existing order.

    Parameters
    ----------
    df_history : not used; present so all feature functions share one signature.
    df_train   : frame that receives the feature.
    cols       : column names that are combined into one key by get_group.
    target     : not used.
    timecol    : not used.

    Returns
    -------
    list with one entry per row: 0 for the first occurrence of a key, 1 for the
    second, and so on.
    """
    keys = get_group(df_train, cols)

    seen_so_far = {}
    running = []
    for key in keys.values:
        # -1 as the default makes the first occurrence evaluate to 0
        occurrence = seen_so_far.get(key, -1) + 1
        seen_so_far[key] = occurrence
        running.append(occurrence)

    return running

def countfromfuture(df_history, df_train, cols, target, timecol='timestamp'):
    """
    Number of later rows with the same encoded combination of `cols`.

    Rows are visited from the latest `timecol` value to the earliest. Each
    value is written at the position of the row it belongs to, so the returned
    list lines up with df_train's row order even when df_train is not sorted by
    `timecol` or contains tied timestamps.

    Parameters
    ----------
    df_history : not used; present so all feature functions share one signature.
    df_train   : frame that receives the feature.
    cols       : column names that are combined into one key by get_group.
    target     : not used.
    timecol    : name of the time column that defines "later".

    Returns
    -------
    list with one entry per row of df_train (in df_train order): how many rows
    with the same key come after it; 0 for the last occurrence of a key.
    """
    # A stable ascending sort walked backwards gives a deterministic
    # latest-to-earliest order; rows with equal timestamps are visited in
    # reverse row order.
    order = np.argsort(df_train[timecol].values, kind='mergesort')[::-1]
    # The key is computed row-wise, so it can be built once in the original
    # order and then reordered, without materialising a sorted copy of the frame.
    group = get_group(df_train, cols).values[order]

    result = [0] * len(df_train)
    count = {}
    for pos, g in zip(order, group):
        already_seen = count.get(g, 0)
        result[pos] = already_seen
        count[g] = already_seen + 1

    return result

def lasttimediff(df_history, df_train, cols, target, timecol='timestamp'):
    """
    Gap between each row's `timecol` value and the `timecol` value of the last
    row (in df_train order) that shares the same encoded combination of `cols`.

    Parameters
    ----------
    df_history : not used; present so all feature functions share one signature.
    df_train   : frame that receives the feature.
    cols       : column names that are combined into one key by get_group.
    target     : not used.
    timecol    : name of the time column.

    Returns
    -------
    Series indexed like df_train holding (last timestamp of the key) - (own timestamp).
    """
    keys = get_group(df_train, cols)

    final_stamp_per_key = df_train.groupby(keys)[timecol].last()
    final_stamp_per_row = keys.map(final_stamp_per_key)

    return final_stamp_per_row - df_train[timecol]

def col_name(cols, func=None):
    """
    Build a feature name from column names and, optionally, a feature function.

    Parameters
    ----------
    cols : iterable of column-name strings.
    func : optional callable; when given, its __name__ is appended as a suffix.

    Returns
    -------
    The column names joined by '_', followed by '_<func.__name__>' if func is given.
    """
    base = '_'.join(cols)
    if func is not None:
        return base + '_' + func.__name__
    return base

In [7]:
# orders[col] is a power of ten strictly greater than the largest value of `col`;
# get_group multiplies by it to make room for `col` when packing several columns
# into one integer key.
orders = {}
feature_col = ['ip', 'app',  'device', 'os', 'channel', 'day',  'hour','intesthh']

for col in feature_col:
    # Use the full train and test frames: they exist as soon as the data is loaded
    # (so this cell works on a fresh kernel) and together they contain every row
    # that any per-fold frame passed to get_group is built from.
    orders[col] = 10 ** (int(np.log(max(train[col].max(), test[col].max()) + 1) / np.log(10)) + 1)
def get_group(df, cols):
    """
    Pack several columns of `df` into a single numeric key per row.

    Starting from the first column, the running key is multiplied by
    orders[col] and the next column is added, for each remaining column in
    turn. The result is unique per combination as long as every orders[col]
    exceeds the values of that column (module-level `orders` is expected to
    guarantee this).

    Parameters
    ----------
    df   : frame containing every column named in `cols`.
    cols : non-empty sequence of column names; order matters.

    Returns
    -------
    Series indexed like df; for a single column it is a copy of that column.
    """
    leading, remaining = cols[0], cols[1:]

    encoded = df[leading].copy()
    for name in remaining:
        encoded = encoded * orders[name] + df[name]

    return encoded

In [None]:
# Generate stacking features fold by fold.
# For every fold and every feature function, one feature is produced for each
# combination of 1..5 columns from combine_col. Features are accumulated next to
# base_col and flushed to CSV every `entry_size` features, plus once more at the
# end of each function.
entry_size = 45
counter = 0
combine_col = ['ip', 'app',  'device', 'os', 'channel', 'day',  'hour','intesthh']
base_col = [ 'app',  'device', 'os', 'channel', 'hour']
for fold in range(K):
    df_train = train.iloc[train_index[fold]]
    df_history = train.iloc[history_index[fold]]
    # fold rows plus test rows: the population used for plain frequency counts
    df_all = pd.concat([df_train, test])
    for func in [count, mean, reversemean, time2nextclick, time2previousclick, countfromfuture, countfrompast, lasttimediff]:
        save_train = df_train[base_col].copy()
        save_test = test[base_col].copy()
        entry_pressed = 0   # index of the next CSV chunk for this (fold, func)
        entry_counter = 0   # features accumulated since the last flush
        for num_col in [1,2,3,4,5]:
            for cols in combinations(combine_col, num_col):
                feature_name = col_name(cols, func=func)
                if func.__name__ == count.__name__:
                    print('count function')
                    save_train[feature_name] = func(df_all, df_train, cols, target='is_attributed')
                    save_test[feature_name] = func(df_all, test, cols, target='is_attributed')

                elif func.__name__ in (mean.__name__, reversemean.__name__):
                    # Both encodings read the target, so they must be fitted on
                    # rows outside the fold (df_history) for the train features
                    # and on the labelled train frame for the test features.
                    # Previously reversemean fell through to the branch below
                    # and was fitted on the fold's own labels and on `test`.
                    print('mean function')
                    save_train[feature_name] = func(df_history, df_train, cols, target='is_attributed')
                    save_test[feature_name] = func(train, test, cols, target='is_attributed')

                else:
                    # time/order based features only look at the frame itself
                    print('time related function')
                    save_train[feature_name] = func(df_train, df_train, cols, target='is_attributed')
                    save_test[feature_name] = func(test, test, cols, target='is_attributed')

                entry_counter += 1
                counter += 1

                all_str = 'all {}: \t fold:{} \t {} \t size: {} \t length: {}'.format(counter, fold, feature_name, sys.getsizeof(save_train)/ 1024 **3, len(save_train))
                print(all_str)
                # append, so the file keeps the whole progress log instead of
                # being truncated to the most recent line on every feature
                with open('feature_all.txt', 'a') as text_file:
                    text_file.write(all_str + '\n')

                if entry_counter >= entry_size:
                    # flush the current chunk and start a fresh one
                    train_file_name = 'train_fold{}_{}_{}.csv'.format(fold, func.__name__, entry_pressed)
                    test_file_name = 'test_fold{}_{}_{}.csv'.format(fold, func.__name__, entry_pressed)
                    print('saving train -- {}'.format(train_file_name))
                    save_train.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/stacking/train/' + train_file_name, index=False)
                    print('saving test -- {}'.format(test_file_name))
                    save_test.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/stacking/test/' + test_file_name, index=False)
                    save_train = df_train[base_col].copy()
                    save_test = test[base_col].copy()
                    entry_pressed += 1
                    entry_counter = 0
                gc.collect()
        # write whatever is left over for this function
        print('saving file at end of function: {}'.format(func.__name__))
        train_file_name = 'train_fold{}_{}_{}.csv'.format(fold, func.__name__, entry_pressed)
        test_file_name = 'test_fold{}_{}_{}.csv'.format(fold, func.__name__, entry_pressed)
        save_train.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/stacking/train/' + train_file_name, index=False)
        save_test.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/stacking/test/' + test_file_name, index=False)


count function
all 1: 	 fold:0 	 ip_count 	 size: 3.214496470987797 	 length: 61634630
count function
all 2: 	 fold:0 	 app_count 	 size: 3.673710249364376 	 length: 61634630
count function
all 3: 	 fold:0 	 device_count 	 size: 4.132924027740955 	 length: 61634630
count function
all 4: 	 fold:0 	 os_count 	 size: 4.592137806117535 	 length: 61634630
count function
all 5: 	 fold:0 	 channel_count 	 size: 5.051351584494114 	 length: 61634630
count function
all 6: 	 fold:0 	 day_count 	 size: 5.510565362870693 	 length: 61634630
count function
all 7: 	 fold:0 	 hour_count 	 size: 5.9697791412472725 	 length: 61634630
count function
all 8: 	 fold:0 	 intesthh_count 	 size: 6.428992919623852 	 length: 61634630
count function
all 9: 	 fold:0 	 ip_app_count 	 size: 6.888206698000431 	 length: 61634630
count function
all 10: 	 fold:0 	 ip_device_count 	 size: 7.34742047637701 	 length: 61634630
count function
all 11: 	 fold:0 	 ip_os_count 	 size: 7.80663425475359 	 length: 61634630
count fun