In [1]:
import pandas as pd
import numpy as np
import time
from itertools import combinations
import gc
import sys

In [2]:
# Directory holding the cleaned train/test CSV exports.
path = '/home/kai/data/kaggle/talkingdata/data/'
train = pd.read_csv(path + 'train_cleaned_final.csv')
test = pd.read_csv(path + 'test_cleaned_final.csv')

###
from sklearn.model_selection import KFold
K = 3

# Unshuffled K-fold split over the rows of `train`. For each fold, KFold yields
# (training-part positions, held-out-part positions); here the training part is
# kept as "history" and the held-out part as the fold's "train" rows.
kf = KFold(n_splits=K, shuffle = False)
fold_pairs = list(kf.split(train))
history_index = [history_rows for history_rows, _ in fold_pairs]
train_index = [fold_rows for _, fold_rows in fold_pairs]

In [3]:
# Quick size check of the split. A notebook cell only echoes the value of its
# last expression, so the number shown below is len(train); the fold length on
# the first line is evaluated but not displayed.
len(train_index[2])
len(train)

184903890

In [4]:
def count(df_history, df_train, cols, target=None):
    """
    Frequency-encode a combination of columns.

    Every row of df_train receives the number of rows in df_history that share
    its combined key over `cols` (keys are built with get_group). Keys that do
    not occur in df_history get 0.

    `target` is accepted so all feature functions share one call signature; it
    is not used here.
    """
    history_keys = get_group(df_history, cols)
    frequency = history_keys.value_counts()

    train_keys = get_group(df_train, cols)
    return train_keys.map(frequency).fillna(0)

def countsort(df_history, df_train, cols, target=None):
    """
    Rank-encode a combination of columns by how often it occurs.

    Combined keys over `cols` (built with get_group) are counted in df_history
    and ordered by value_counts(); walking that ordering backwards, the keys
    are numbered 1, 2, ..., so the key listed first by value_counts() (the most
    frequent one) receives the largest rank. Each row of df_train gets the rank
    of its key, or -1 when the key does not occur in df_history.

    `target` is accepted for signature parity with the other feature functions
    and is not used.
    """
    train_keys = get_group(df_train, cols)
    history_keys = get_group(df_history, cols)

    by_frequency = history_keys.value_counts()
    # Reverse the value_counts() ordering and number the keys from 1 upwards.
    rank_map = pd.Series(
        np.arange(1, len(by_frequency) + 1, dtype='int64'),
        index=by_frequency.index[::-1],
    )

    return train_keys.map(rank_map).fillna(-1)

def runningcount(df_history, df_train, cols, target=None, start=None, end=None):
    """
    Running occurrence number of each combined key, in row order.

    Walks the combined keys of df_history over `cols` (built with get_group)
    from first row to last: the first time a key is seen it is marked 1, the
    second time 2, and so on. The result is returned as a Series with a fresh
    0..n-1 index, sliced positionally with `start:end`.

    NOTE(review): the previous body iterated over an undefined name (`group`)
    and used `defaultdict` without importing it, so it could never run. The
    only key series it built came from df_history, so that is what is walked
    here; presumably `start`/`end` select the df_train rows inside df_history.
    Confirm against the intended caller.

    `df_train` and `target` are accepted for signature parity with the other
    feature functions and are not used.
    """
    history_keys = get_group(df_history, cols)

    seen = {}
    running = []
    for key in history_keys:
        seen[key] = seen.get(key, 0) + 1
        running.append(seen[key])

    # Explicit integer dtype so an empty history still yields an integer Series.
    return pd.Series(np.array(running, dtype=np.int64)).iloc[start: end]


def mean(df_history, df_train, cols, target):
    """
    Target-encode a combination of columns.

    For every combined key over `cols` (built with get_group), the mean of
    df_history[target] over the history rows carrying that key is computed.
    Each row of df_train then receives the mean belonging to its own key, or -1
    when the key has no mean available from df_history.
    """
    history_keys = get_group(df_history, cols)
    target_rate = df_history[target].groupby(history_keys).mean()

    train_keys = get_group(df_train, cols)
    return train_keys.map(target_rate).fillna(-1)


def reversemean(df_history, df_train, cols, target):
    """
    Share of the positive history rows that carry each combined key.

    For every combined key over `cols` (built with get_group) this computes
    (history rows with that key and target == 1) / (all history rows with
    target == 1). Keys that appear in df_history only with target == 0 map to
    0.0. Each row of df_train receives the value for its own key, or -1 when
    the key is in neither of those two sets.
    """
    history_keys = get_group(df_history, cols)
    train_keys = get_group(df_train, cols)

    labels = df_history[target]
    positive_keys = history_keys[labels == 1]
    negative_keys = history_keys[labels == 0]

    # Fraction of all positive rows that fall on each key.
    positive_share = positive_keys.groupby(positive_keys).count() / len(positive_keys)

    # Keys seen with target == 0 but never with target == 1 get an explicit zero.
    negative_only = set(negative_keys.unique()) - set(positive_keys.unique())
    zero_share = pd.Series(np.zeros(len(negative_only)), index=negative_only)

    share_map = pd.concat([positive_share, zero_share])
    return train_keys.map(share_map).fillna(-1)


def time2nextclick(df_history, df_train, cols, target, timecol='timestamp'):
    """
    Gap between each row and the next row that has the same combined key.

    df_train is sorted by index in descending order and walked once, remembering
    for every key (built with get_group over `cols`) the `timecol` value of the
    row visited just before, i.e. the following row in ascending index order.
    The gap is that remembered value minus the row's own value; rows with no
    following occurrence get -1.

    Returns a plain list in ascending-index order. `df_history` and `target`
    are accepted for signature parity and are not used.
    """
    reversed_frame = df_train.sort_index(ascending=False)
    keys = get_group(reversed_frame, cols)
    stamps = reversed_frame[timecol]

    upcoming = {}
    gaps = []
    for key, stamp in zip(keys, stamps):
        gaps.append(upcoming[key] - stamp if key in upcoming else -1)
        upcoming[key] = stamp

    # Gaps were collected back-to-front; flip them into ascending-index order.
    return gaps[::-1]

def time2previousclick(df_history, df_train, cols, target, timecol='timestamp'):
    """
    Gap between each row and the previous row that has the same combined key.

    df_train is walked in its existing row order, remembering for every key
    (built with get_group over `cols`) the `timecol` value of its most recent
    row. The gap is the row's own value minus that remembered value; the first
    occurrence of a key gets -1.

    Returns a plain list aligned with df_train's rows. `df_history` and
    `target` are accepted for signature parity and are not used.
    """
    keys = get_group(df_train, cols)
    stamps = df_train[timecol]

    latest = {}
    gaps = []
    for stamp, key in zip(stamps, keys):
        gaps.append(stamp - latest[key] if key in latest else -1)
        latest[key] = stamp

    return gaps

def countfrompast(df_history, df_train, cols, target, timecol='timestamp'):
    """
    Number of earlier rows in df_train that share the row's combined key.

    Keys are built with get_group over `cols` and visited in df_train's row
    order; the first occurrence of a key yields 0, the second 1, and so on.

    Returns a plain list aligned with df_train's rows. `df_history`, `target`
    and `timecol` are accepted for signature parity and are not used.
    """
    keys = get_group(df_train, cols)

    occurrences = {}
    earlier_counts = []
    for key in keys.values:
        # A key not seen before starts at -1 + 1 == 0.
        so_far = occurrences.get(key, -1) + 1
        occurrences[key] = so_far
        earlier_counts.append(so_far)

    return earlier_counts

def countfromfuture(df_history, df_train, cols, target, timecol='timestamp'):
    """
    Number of later rows in df_train that share the row's combined key.

    df_train is sorted by index in descending order and walked once; for each
    key (built with get_group over `cols`) the number of rows already visited,
    i.e. rows that come later in ascending index order, is recorded. The last
    occurrence of a key yields 0.

    Returns a plain list in ascending-index order. `df_history`, `target` and
    `timecol` are accepted for signature parity and are not used.
    """
    reversed_frame = df_train.sort_index(ascending=False)
    keys = get_group(reversed_frame, cols)

    visited = {}
    later_counts = []
    for key in keys.values:
        already = visited.get(key, 0)
        later_counts.append(already)
        visited[key] = already + 1

    # Counts were collected back-to-front; flip them into ascending-index order.
    return later_counts[::-1]

def lasttimediff(df_history, df_train, cols, target, timecol='timestamp'):
    """
    Difference between the last `timecol` value of a row's key and the row's own.

    Rows of df_train are grouped by their combined key over `cols` (built with
    get_group); for each key the last `timecol` value in row order is taken,
    and every row receives that value minus its own `timecol`.

    Returns a Series aligned with df_train. `df_history` and `target` are
    accepted for signature parity and are not used.
    """
    keys = get_group(df_train, cols)
    stamps = df_train[timecol]

    final_stamp = stamps.groupby(keys).last()
    return keys.map(final_stamp) - stamps

def col_name(cols, func=None):
    """
    Build a feature column name from the source columns and the feature function.

    The column names are joined with underscores; when `func` is given, an
    underscore and the function's __name__ are appended.
    """
    joined = '_'.join(cols)
    return joined if func is None else '{}_{}'.format(joined, func.__name__)
    
    


In [None]:
# For each feature column, a power of ten strictly greater than the largest
# value seen in train or test. get_group multiplies by these factors so that
# appending a column's value to a combined key cannot collide with another
# combination.
orders = {}
feature_col = ['ip', 'app',  'device', 'os', 'channel', 'day',  'hour','intesthh']

for col in feature_col:
    largest = max(train[col].max(), test[col].max())
    n_digits = int(np.log(largest + 1) / np.log(10)) + 1
    orders[col] = 10 ** n_digits
def get_group(df, cols):
    """
    Encode a combination of columns as a single numeric key per row.

    Starts from a copy of the first column and, for every further column,
    multiplies the running key by that column's factor in `orders` and adds the
    column's value. Because each factor is larger than any value of its column,
    different value combinations produce different keys.
    """
    key = df[cols[0]].copy()
    for name in cols[1:]:
        shifted = key * orders[name]
        key = shifted + df[name]

    return key

In [None]:
# Number of engineered feature columns collected before a chunk is written to CSV.
entry_size = 45
# Running count of features computed across all functions and folds (for logging).
counter = 0
combine_col = ['ip', 'app',  'device', 'os', 'channel', 'day',  'hour','intesthh']
base_col = [ 'app',  'device', 'os', 'channel', 'hour']
base_col_train = [ 'app',  'device', 'os', 'channel', 'hour', 'is_attributed']

train_out_dir = '/home/kai/data/kaggle/talkingdata/wl/data/stacking/train/'
test_out_dir = '/home/kai/data/kaggle/talkingdata/wl/data/stacking/test/'

# Start this run with an empty progress log. Entries are appended below; opening
# the file in 'w' mode for every feature would keep only the most recent line.
with open('feature_all.txt', 'w'):
    pass

for func in [time2nextclick]:
# for func in [count, mean, time2nextclick, time2previousclick, countfromfuture, countfrompast, lasttimediff]:
    if func.__name__ == count.__name__:
        # Counting uses train and test together as the history.
        df_all = pd.concat([train, test])
    else:
        # Free the combined frame if an earlier function created it.
        try:
            del df_all
            gc.collect()
        except NameError:
            print('df_all does not exist')
            
    
    for fold in range(0,K):
        df_train = train.iloc[train_index[fold]]
        df_history = train.iloc[history_index[fold]]
        save_train = df_train[base_col_train].copy()
        save_test = test[base_col].copy()
        # entry_pressed: index of the next chunk file for this fold/function.
        # entry_counter: features accumulated in the current chunk.
        entry_pressed = 0
        entry_counter = 0
        
        for num_col in [1,2,3,4,5]:
            for cols in combinations(combine_col, num_col):
                feature_name = col_name(cols, func=func)
                if func.__name__ == count.__name__:
                    print('count function')
                    save_train[feature_name] = func(df_all, df_train, cols, target='is_attributed')
                    save_test[feature_name] = func(df_all, test, cols, target='is_attributed')
                    
                elif func.__name__ == mean.__name__:
                    # Fold rows are encoded from the other folds; test rows from all of train.
                    print('mean function')
                    save_train[feature_name] = func(df_history, df_train, cols, target='is_attributed')
                    save_test[feature_name] = func(train, test, cols, target='is_attributed')
                elif func.__name__ == reversemean.__name__:
                    print('reversemean function')
                    save_train[feature_name] = func(df_history, df_train, cols, target='is_attributed')
                    save_test[feature_name] = func(train, test, cols, target='is_attributed')
                    
                else:
                    # Remaining functions only look at the frame they are computed for.
                    print('time related function')
                    save_train[feature_name] = func(df_train, df_train, cols, target='is_attributed')
                    save_test[feature_name] = func(test, test, cols, target='is_attributed')
                    
                entry_counter += 1
                counter += 1
                
                all_str = 'all {}: \t fold:{} \t {} \t size: {} \t length: {}'.format(counter, fold, feature_name, sys.getsizeof(save_train)/ 1024 **3, len(save_train))
                print(all_str)
                with open('feature_all.txt', 'a') as text_file:
                    text_file.write(all_str + '\n')
                
                
                
                if entry_counter >= entry_size:
                    # Chunk is full: write it out and start again from the base columns.
                    train_file_name = 'train_fold{}_{}_{}.csv'.format(fold, func.__name__, entry_pressed)
                    test_file_name = 'test_fold{}_{}_{}.csv'.format(fold, func.__name__, entry_pressed)
                    print('saving train -- {}'.format(train_file_name))
                    save_train.to_csv(train_out_dir + train_file_name, index=False)
                    print('saving test -- {}'.format(test_file_name))
                    save_test.to_csv(test_out_dir + test_file_name, index=False)
                    save_train = df_train[base_col_train].copy()
                    save_test = test[base_col].copy()
                    entry_pressed += 1
                    entry_counter = 0
                gc.collect()
        # Write whatever is left in the last, partially filled chunk of this fold.
        print('saving file at end of function: {}'.format(func.__name__))
        train_file_name = 'train_fold{}_{}_{}.csv'.format(fold, func.__name__, entry_pressed)
        test_file_name = 'test_fold{}_{}_{}.csv'.format(fold, func.__name__, entry_pressed)
        save_train.to_csv(train_out_dir + train_file_name, index=False)
        save_test.to_csv(test_out_dir + test_file_name, index=False)


df_all does not exist
time related function
all 1: 	 fold:0 	 ip_time2nextclick 	 size: 3.673710249364376 	 length: 61634630
time related function
all 2: 	 fold:0 	 app_time2nextclick 	 size: 4.132924027740955 	 length: 61634630
time related function
all 3: 	 fold:0 	 device_time2nextclick 	 size: 4.592137806117535 	 length: 61634630
time related function
all 4: 	 fold:0 	 os_time2nextclick 	 size: 5.051351584494114 	 length: 61634630
time related function
all 5: 	 fold:0 	 channel_time2nextclick 	 size: 5.510565362870693 	 length: 61634630
time related function
all 6: 	 fold:0 	 day_time2nextclick 	 size: 5.9697791412472725 	 length: 61634630
time related function
all 7: 	 fold:0 	 hour_time2nextclick 	 size: 6.428992919623852 	 length: 61634630
time related function
all 8: 	 fold:0 	 intesthh_time2nextclick 	 size: 6.888206698000431 	 length: 61634630
time related function
all 9: 	 fold:0 	 ip_app_time2nextclick 	 size: 7.34742047637701 	 length: 61634630
time related function
all 10

all 72: 	 fold:0 	 app_hour_intesthh_time2nextclick 	 size: 15.613268487155437 	 length: 61634630
time related function
all 73: 	 fold:0 	 device_os_channel_time2nextclick 	 size: 16.072482265532017 	 length: 61634630
time related function
all 74: 	 fold:0 	 device_os_day_time2nextclick 	 size: 16.531696043908596 	 length: 61634630
time related function
all 75: 	 fold:0 	 device_os_hour_time2nextclick 	 size: 16.990909822285175 	 length: 61634630
time related function
all 76: 	 fold:0 	 device_os_intesthh_time2nextclick 	 size: 17.450123600661755 	 length: 61634630
time related function
all 77: 	 fold:0 	 device_channel_day_time2nextclick 	 size: 17.909337379038334 	 length: 61634630
time related function
all 78: 	 fold:0 	 device_channel_hour_time2nextclick 	 size: 18.368551157414913 	 length: 61634630
time related function
all 79: 	 fold:0 	 device_channel_intesthh_time2nextclick 	 size: 18.827764935791492 	 length: 61634630
time related function
all 80: 	 fold:0 	 device_day_hour_ti

all 138: 	 fold:0 	 app_os_channel_day_time2nextclick 	 size: 4.592137806117535 	 length: 61634630
time related function
all 139: 	 fold:0 	 app_os_channel_hour_time2nextclick 	 size: 5.051351584494114 	 length: 61634630
time related function
all 140: 	 fold:0 	 app_os_channel_intesthh_time2nextclick 	 size: 5.510565362870693 	 length: 61634630
time related function
all 141: 	 fold:0 	 app_os_day_hour_time2nextclick 	 size: 5.9697791412472725 	 length: 61634630
time related function
all 142: 	 fold:0 	 app_os_day_intesthh_time2nextclick 	 size: 6.428992919623852 	 length: 61634630
time related function
all 143: 	 fold:0 	 app_os_hour_intesthh_time2nextclick 	 size: 6.888206698000431 	 length: 61634630
time related function
all 144: 	 fold:0 	 app_channel_day_hour_time2nextclick 	 size: 7.34742047637701 	 length: 61634630
time related function
all 145: 	 fold:0 	 app_channel_day_intesthh_time2nextclick 	 size: 7.80663425475359 	 length: 61634630
time related function
all 146: 	 fold:0 	

all 202: 	 fold:0 	 app_device_os_day_intesthh_time2nextclick 	 size: 13.317199595272541 	 length: 61634630
time related function
all 203: 	 fold:0 	 app_device_os_hour_intesthh_time2nextclick 	 size: 13.77641337364912 	 length: 61634630
time related function
all 204: 	 fold:0 	 app_device_channel_day_hour_time2nextclick 	 size: 14.2356271520257 	 length: 61634630
time related function
all 205: 	 fold:0 	 app_device_channel_day_intesthh_time2nextclick 	 size: 14.694840930402279 	 length: 61634630
time related function
all 206: 	 fold:0 	 app_device_channel_hour_intesthh_time2nextclick 	 size: 15.154054708778858 	 length: 61634630
time related function
all 207: 	 fold:0 	 app_device_day_hour_intesthh_time2nextclick 	 size: 15.613268487155437 	 length: 61634630
time related function
all 208: 	 fold:0 	 app_os_channel_day_hour_time2nextclick 	 size: 16.072482265532017 	 length: 61634630
time related function
all 209: 	 fold:0 	 app_os_channel_day_intesthh_time2nextclick 	 size: 16.5316960

all 271: 	 fold:1 	 ip_channel_hour_time2nextclick 	 size: 6.888206698000431 	 length: 61634630
time related function
all 272: 	 fold:1 	 ip_channel_intesthh_time2nextclick 	 size: 7.34742047637701 	 length: 61634630
time related function
all 273: 	 fold:1 	 ip_day_hour_time2nextclick 	 size: 7.80663425475359 	 length: 61634630
time related function
all 274: 	 fold:1 	 ip_day_intesthh_time2nextclick 	 size: 8.265848033130169 	 length: 61634630
time related function
all 275: 	 fold:1 	 ip_hour_intesthh_time2nextclick 	 size: 8.725061811506748 	 length: 61634630
time related function
all 276: 	 fold:1 	 app_device_os_time2nextclick 	 size: 9.184275589883327 	 length: 61634630
time related function
all 277: 	 fold:1 	 app_device_channel_time2nextclick 	 size: 9.643489368259907 	 length: 61634630
time related function
all 278: 	 fold:1 	 app_device_day_time2nextclick 	 size: 10.102703146636486 	 length: 61634630
time related function
all 279: 	 fold:1 	 app_device_hour_time2nextclick 	 siz

all 339: 	 fold:1 	 ip_os_day_hour_time2nextclick 	 size: 17.450123600661755 	 length: 61634630
time related function
all 340: 	 fold:1 	 ip_os_day_intesthh_time2nextclick 	 size: 17.909337379038334 	 length: 61634630
time related function
all 341: 	 fold:1 	 ip_os_hour_intesthh_time2nextclick 	 size: 18.368551157414913 	 length: 61634630
time related function
all 342: 	 fold:1 	 ip_channel_day_hour_time2nextclick 	 size: 18.827764935791492 	 length: 61634630
time related function
all 343: 	 fold:1 	 ip_channel_day_intesthh_time2nextclick 	 size: 19.28697871416807 	 length: 61634630
time related function
all 344: 	 fold:1 	 ip_channel_hour_intesthh_time2nextclick 	 size: 19.74619249254465 	 length: 61634630
time related function
all 345: 	 fold:1 	 ip_day_hour_intesthh_time2nextclick 	 size: 20.20540627092123 	 length: 61634630
time related function
all 346: 	 fold:1 	 app_device_os_channel_time2nextclick 	 size: 20.66462004929781 	 length: 61634630
time related function
all 347: 	 fol

all 403: 	 fold:1 	 ip_device_os_channel_intesthh_time2nextclick 	 size: 5.510565362870693 	 length: 61634630
time related function
all 404: 	 fold:1 	 ip_device_os_day_hour_time2nextclick 	 size: 5.9697791412472725 	 length: 61634630
time related function
all 405: 	 fold:1 	 ip_device_os_day_intesthh_time2nextclick 	 size: 6.428992919623852 	 length: 61634630
time related function
all 406: 	 fold:1 	 ip_device_os_hour_intesthh_time2nextclick 	 size: 6.888206698000431 	 length: 61634630
time related function
all 407: 	 fold:1 	 ip_device_channel_day_hour_time2nextclick 	 size: 7.34742047637701 	 length: 61634630
time related function
all 408: 	 fold:1 	 ip_device_channel_day_intesthh_time2nextclick 	 size: 7.80663425475359 	 length: 61634630
time related function
all 409: 	 fold:1 	 ip_device_channel_hour_intesthh_time2nextclick 	 size: 8.265848033130169 	 length: 61634630
time related function
all 410: 	 fold:1 	 ip_device_day_hour_intesthh_time2nextclick 	 size: 8.725061811506748 	 l

KeyboardInterrupt: 