In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import KFold

In [2]:
path = '/home/kai/data/kaggle/talkingdata/data/'
train = pd.read_csv(path + 'train_cleaned_final.csv')
test = pd.read_csv(path + 'test_cleaned_final.csv')

In [22]:
### load feature cols
import json as js
featurefile = '/home/kai/data/kaggle/talkingdata/wl/data/features/feature_cols.json'
with open(featurefile, 'r') as myjs:
    added_features = js.load(myjs)

In [23]:
added_features

['ip_day_intesthh_count',
 'ip_day_hour_count',
 'ip_os_day_hour_count',
 'ip_app_day_hour_count',
 'ip_app_os_day_hour_count',
 'app_day_hour_count',
 'app_channel_count',
 'device_os_count',
 'app_os_channel_hour_count',
 'app_os_channel_mean',
 'app_os_channel_intesthh_mean',
 'ip_mean',
 'ip_app_mean',
 'ip_device_channel_mean',
 'ip_app_device_mean',
 'ip_app_intesthh_mean',
 'ip_channel_mean',
 'ip_os_mean',
 'ip_device_mean',
 'ip_app_device_channel_mean',
 'ip_app_device_channel_time2nextclick',
 'ip_app_device_os_time2nextclick',
 'app_device_os_channel_time2nextclick',
 'ip_app_device_os_channel_time2nextclick',
 'ip_time2nextclick',
 'ip_time2previousclick',
 'ip_app_time2previousclick',
 'ip_device_time2previousclick',
 'ip_channel_time2previousclick',
 'ip_app_device_time2previousclick',
 'ip_os_time2previousclick',
 'ip_app_device_os_channel_time2previousclick',
 'app_device_countfromfuture',
 'app_channel_countfromfuture',
 'ip_device_countfromfuture',
 'ip_countfromfutu

# use last 75million as example

In [5]:
length = 75 * 1000000
front = len(train) - length
df_history = train.iloc[:front].copy()
df_train = train.iloc[front:].copy()
# last 65m + test
df_all = pd.concat([df_train, test])  

import gc
del test
del df_train
gc.collect()

14

In [6]:
import sys
print(sys.getsizeof(df_all)/ 1024 **3)
print(len(df_all))

9.084314852952957
93790469


In [7]:
orders = {}
feature_col = ['ip', 
              'app', 
              'device', 
              'os', 
              'channel',
              'day',
              'hour',
              'minute',
              'second',
              'intesthh']

# feature_col = ['ip', 
#               'app', 
#               'device', 
#               'os', 
#               'channel']
for col in feature_col:
    orders[col] = 10 ** (int(np.log(max(df_history[col].max(),df_all[col].max() ) + 1) / np.log(10)) + 1)
def get_group(df, cols):
    """
    define an encoding method which can ganrantee the adding value will be unique.
    eg: artist_name_composer will be a combination of (artist_name,composer) and the encoding will reflect the unqiue combination of those two
    """
    group = df[cols[0]].copy()
    for col in cols[1:]:
        group = group * orders[col] + df[col]
        
    return group

import gc
# del train
gc.collect()

22

# count
plan 1. count from historical data  
plan 2. count from all data

In [8]:
def count(df_history, df_train, cols, target=None):
    """
    Purpose: add a new feature to training df.count the number of records for each feature combination (eg, artist_name_composer)) 
    """
    
    group = get_group(df_train, cols)
    group_all = get_group(df_history, cols)
    
    count_map = group_all.value_counts()
    
    return group.map(count_map).fillna(0)

# mean
mean P(target | feature combination)

purpose: add a new feature to training df. conditional probability P(replay (target) | feature combination (eg, artist_name_composer))
Get the conditional Probability only from historical data and apply to train data.

P(replay | X feature combination) = P( replay & X feature combination) / P (X feature combination)  
=(count(replay & X feature combination) / count(total)) / (count(X feature combination) / count(total))  
= count(replay & X feature combination) / count(X feature combination)  
= sum((replay & X feature combination)) / count(X feature combination)  
= sum((replay or not replayed & X feature combination)) / count(X feature combination)# since replay is 1, not replay is 0  
= sum( X feature combination) / count(X feature combination)  
= mean(X feature combination)  

In [9]:
def scaller(num):
    sca = 1
    while num * sca < 1:
        sca *= 10
    return sca

def mean(df_history, df_train, cols, target):
    """
    Purpose: add a new feature to training df. conditional probability P(replay (target) | feature combination (eg, artist_name_composer)) 
    """
  
    # encoding df's cols into a new series
    group = get_group(df_train, cols)
    # encoding df_history's cols into a new series
    group_history = get_group(df_history, cols)
    # get the conditional probability p(target| feature combination. eg, artist_name_composer) 
    mean_map = df_history.groupby(group_history)[target].mean()
    # mean_map: key - encoding, value - target mean
#     ### sca
#     m_min = mean_map[mean_map > 0].min()
#     sca = scaller(m_min)
#     mean_map *= sca
#     ###

    return group.map(mean_map).fillna(-1)

# reversemean
reverse mean P(feature combination | target)

In [10]:
def reversemean(df_history, df_train, cols, target):
    """
    Purpose: add a new feature to training df. conditional probability P(replay (target) | feature combination (eg, artist_name_composer)) 
    """
  
    # encoding df's cols into a new series
    group = get_group(df_train, cols)
    # encoding df_history's cols into a new series
    group_history = get_group(df_history, cols)
    # get the conditional probability p(target| feature combination. eg, artist_name_composer) 
    positive = group_history[df_history[target] == 1]
    negative = group_history[df_history[target] == 0]
    index_p = set(positive.unique())
    index_n = set(negative.unique())
    index_n.difference_update(index_p)
    map_reverse_p = positive.groupby(positive).count() / len(positive)
    map_reverse_n = pd.Series(np.zeros(len(index_n)), index=index_n)
    map_reverse = pd.concat([map_reverse_p, map_reverse_n])
    return group.map(map_reverse).fillna(-1)

# Time related

In [11]:
def time2nextclick(df_history, df_train, cols, target, timecol='timestamp'):
    
    result = []
    df_reverse = df_train.sort_index(ascending=False)
    df_reverse = df_train.sort_values([timecol], ascending=False)
    group = get_group(df_reverse,  cols)
    
    next_heard = {}
    for g, t in zip(group, df_reverse[timecol]):
        if g in next_heard:
            result.append(next_heard[g] - t)
        else:
            result.append(-1)
        next_heard[g] = t
    
    result.reverse()
    return result

def time2previousclick(df_history, df_train, cols, target, timecol='timestamp'):
    
    result = []
    group = get_group(df_train, cols)

    last_heard = {}
    for t, g in zip(df_train[timecol], group):
        if g in last_heard:
            result.append(t - last_heard[g])
        else:
            result.append(-1)
        last_heard[g] = t
        
    return result

def countfrompast(df_history, df_train, cols, target, timecol='timestamp'):
    
    group = get_group(df_train, cols)
    
    count = {}
    result = []
    for g in group.values:
        if g not in count:
            count[g] = 0
        else:
            count[g] += 1
        result.append(count[g])
        
    return result

def countfromfuture(df_history, df_train, cols, target, timecol='timestamp'):
    
    result = []
    df_reverse = df_train.sort_values([timecol], ascending=False)
    group = get_group(df_reverse,  cols)
    
    count = {}
    for g in group.values:
        if g in count:
            result.append(count[g])
            count[g] += 1 
        else:
            result.append(0)
            count[g] = 1
    
    result.reverse()
    return result

def lasttimediff(df_history, df_train, cols, target, timecol='timestamp'):
    
    group = get_group(df_train, cols)
        
    last_time = df_train.groupby(group)[timecol].last()
    
    return group.map(last_time) - df_train[timecol]

# generate all cols

In [12]:
from itertools import combinations


combine_col = ['ip', 
              'app', 
              'device', 
              'os', 
              'channel', 
              'day',
              'hour',
              'intesthh']

combine_col_time = ['ip', 
                    'app', 
                    'device', 
                    'os', 
                    'channel']


def col_name(cols, func=None):
    if func is None:
        return '_'.join(cols)
    else:
        return '_'.join(cols) + '_' + func.__name__

counter = 0



exception_list = []
for num_col in [1,2,3,4,5]:
    for cols in combinations(combine_col, num_col):
#         for func in [count, mean]:
        for func in [count, mean]:
            feature_name = col_name(cols, func=func)
            if feature_name in added_features:
                counter += 1
                if func.__name__ == count.__name__:
                    print('count function')
                    df_all[feature_name] = func(df_all, df_all, cols, target='is_attributed')
                else:
                    print('mean function')
                    # train
                    df_all[feature_name] = pd.concat([func(df_history, df_all.iloc[:length], cols, target='is_attributed'), 
                                                      func(train, df_all.iloc[length:], cols, target='is_attributed')])
#                     df_all.iloc[:length][feature_name] = func(df_history, df_all.iloc[:length], cols, target='is_attributed')
#                     # test
#                     df_all.iloc[length:][feature_name] = func(train, df_all.iloc[length:], cols, target='is_attributed')
                all_str = 'all {}:   {}   \t\t\t size: {} G.'.format(counter, feature_name, sys.getsizeof(df_all)/ 1024 **3)
                print(all_str)
                with open('feature_all.txt', 'w') as text_file:
                    text_file.write(all_str + '\n')
    # time related            
    for cols in combinations(combine_col_time, num_col):
        for func in [time2nextclick, time2previousclick, countfromfuture, countfrompast, lasttimediff]:
            feature_name = col_name(cols, func=func)
            if feature_name in added_features:
                print('time related function')
                counter += 1
                if func.__name__ == lasttimediff.__name__:
                    df_all[feature_name] = pd.concat([func(df_all.iloc[:length], df_all.iloc[:length], cols, target='is_attributed'), 
                                                  func(df_all.iloc[length:], df_all.iloc[length:], cols, target='is_attributed')])

                else:

                    df_all[feature_name] = func(df_all.iloc[:length], df_all.iloc[:length], cols, target='is_attributed') \
                                        + func(df_all.iloc[length:], df_all.iloc[length:], cols, target='is_attributed')
                all_str = 'all {}:   {}   \t\t\t size: {} G.'.format(counter, feature_name, sys.getsizeof(df_all)/ 1024 **3)
                print(all_str)
                with open('feature_all.txt', 'w') as text_file:
                    text_file.write(all_str + '\n')


mean function
all 1:   ip_mean   			 size: 9.783108301460743 G.
time related function
all 2:   ip_time2nextclick   			 size: 10.481901749968529 G.
time related function
all 3:   ip_time2previousclick   			 size: 11.180695198476315 G.
time related function
all 4:   ip_countfromfuture   			 size: 11.8794886469841 G.
time related function
all 5:   ip_countfrompast   			 size: 12.578282095491886 G.
time related function
all 6:   ip_lasttimediff   			 size: 13.277075543999672 G.
time related function
all 7:   app_countfrompast   			 size: 13.975868992507458 G.
time related function
all 8:   app_lasttimediff   			 size: 14.674662441015244 G.
time related function
all 9:   device_countfromfuture   			 size: 15.37345588952303 G.
time related function
all 10:   device_countfrompast   			 size: 16.072249338030815 G.
time related function
all 11:   device_lasttimediff   			 size: 16.7710427865386 G.
time related function
all 12:   os_lasttimediff   			 size: 17.469836235046387 G.
mean function
al

In [21]:
from itertools import combinations


combine_col = ['ip', 
              'app', 
              'device', 
              'os', 
              'channel', 
              'day',
              'hour',
              'intesthh']

combine_col_time = ['ip', 
                    'app', 
                    'device', 
                    'os', 
                    'channel']


def col_name(cols, func=None):
    if func is None:
        return '_'.join(cols)
    else:
        return '_'.join(cols) + '_' + func.__name__

counter = 0


final_list = []
exception_list = []
for num_col in [1,2,3,4,5]:

               
    # time related            
    for cols in combinations(combine_col_time, num_col):
        for func in [time2nextclick, time2previousclick, countfromfuture, countfrompast, lasttimediff]:
            feature_name = col_name(cols, func=func)
            if feature_name == 'app_device_os_channel_time2nextclick':
                print('start')
                df_all['app_device_os_channel_time2nextclick'] = func(df_all.iloc[:length], df_all.iloc[:length], cols, target='is_attributed') \
                                        + func(df_all.iloc[length:], df_all.iloc[length:], cols, target='is_attributed')
                print('done')
                

start
done


In [18]:
len(final_list)

51

In [13]:
feature_name

'ip_app_device_os_channel_lasttimediff'

# Saving Files

In [20]:
list(set(added_features) - set(df_all.columns))

['app_device_os_channel_time2nextclic']

In [16]:
df_all.columns

Index(['app', 'channel', 'day', 'device', 'hour', 'intesthh', 'ip',
       'is_attributed', 'minute', 'os', 'second', 'timestamp', 'ip_mean',
       'ip_time2nextclick', 'ip_time2previousclick', 'ip_countfromfuture',
       'ip_countfrompast', 'ip_lasttimediff', 'app_countfrompast',
       'app_lasttimediff', 'device_countfromfuture', 'device_countfrompast',
       'device_lasttimediff', 'os_lasttimediff', 'ip_app_mean',
       'ip_device_mean', 'ip_os_mean', 'ip_channel_mean', 'app_channel_count',
       'device_os_count', 'ip_app_time2previousclick', 'ip_app_lasttimediff',
       'ip_device_time2previousclick', 'ip_device_countfromfuture',
       'ip_device_countfrompast', 'ip_os_time2previousclick',
       'ip_os_lasttimediff', 'ip_channel_time2previousclick',
       'app_device_countfromfuture', 'app_channel_countfromfuture',
       'app_channel_countfrompast', 'ip_app_device_mean',
       'ip_app_intesthh_mean', 'ip_device_channel_mean', 'ip_day_hour_count',
       'ip_day_intesth

In [24]:
target='is_attributed'
train_col = added_features.copy()
train_col.append(target)
train = df_all.iloc[:length][train_col]
test = df_all.iloc[length:][added_features]
print('start saving')

start saving


In [25]:
train.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/train_cm0410time_last75m.csv', index=False)
print('training saving done!')
test.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/test_cm0410time_last75m.csv', index=False)
print('testing saving done!')

training saving done!
testing saving done!


In [None]:
#For float saving


# df_train.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/train_fold_last_in_12_mean_1float.csv', index=False, float_format='%.1f')
# print('training saving done!')
# test.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/test_fold_last_in_12_mean_1float.csv', index=False, float_format='%.1f')
# print('testing saving done!')

In [None]:
list(train.columns)

In [31]:
list(test.columns)

['ip_count',
 'ip_minute_count',
 'app_channel_count',
 'ip_app_count',
 'ip_app_hour_count',
 'ip_second_count',
 'ip_app_os_count',
 'ip_device_hour_count',
 'ip_os_hour_count',
 'ip_day_hour_count',
 'ip_channel_count',
 'app_channel_day_count',
 'ip_device_count',
 'ip_day_count',
 'ip_hour_count',
 'channel',
 'app',
 'os',
 'device',
 'second',
 'minute',
 'hour',
 'ip']

In [32]:
test.head(10)

Unnamed: 0,ip_count,ip_minute_count,app_channel_count,ip_app_count,ip_app_hour_count,ip_second_count,ip_app_os_count,ip_device_hour_count,ip_os_hour_count,ip_day_hour_count,...,ip_day_count,ip_hour_count,channel,app,os,device,second,minute,hour,ip
0,506,8,900035,88,9,8,3,47,3,34,...,91,47,107,9,3,1,0,0,4,5744
1,10721,231,1831868,1335,90,155,19,818,40,403,...,2083,821,466,9,3,1,0,0,4,119901
2,6855,144,1138028,456,23,107,65,588,251,229,...,2135,588,128,21,19,1,0,0,4,72287
3,5470,136,291728,263,30,96,58,560,150,239,...,1201,560,111,15,13,1,0,0,4,78477
4,660,8,893563,90,9,11,28,82,48,60,...,208,83,328,12,13,1,0,0,4,123080
5,1874,33,4191211,172,21,40,41,239,69,120,...,399,239,107,18,13,1,0,0,4,110769
6,1934,31,1005191,344,27,27,3,154,6,90,...,412,159,137,3,1,1,0,0,4,12540
7,2784,27,398008,29,3,46,7,251,32,93,...,558,253,153,27,19,1,0,0,4,88637
8,3041,51,4191211,246,29,47,2,260,1,106,...,608,261,107,18,10,1,0,0,4,14932
9,13714,267,115820,1692,104,241,10,890,2,539,...,2718,1036,424,12,53,1,0,0,4,123701
