In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
# Directory holding the cleaned TalkingData CSV files.
path = '/home/kai/data/kaggle/talkingdata/data/'

# Load the cleaned train and test sets (train first, then test).
train, test = (
    pd.read_csv(path + file_name)
    for file_name in ('train_cleaned_final.csv', 'test_cleaned_final.csv')
)

# Get the three training sets: day7, day8, day9

In [3]:
# hourdistri.csv is indexed by day label; the columns read below hold row
# offsets that delimit hour windows inside `train`.
df_hour = pd.read_csv(path + 'hourdistri.csv', index_col='Unnamed: 0')

# (start column, end column) pairs; each pair contributes a half-open range
# of row positions [start, end).
hour_windows = [
    ('4start', '6end0sec'),
    ('9start', '11end0sec'),
    ('13start', '15end0sec'),
]

# index[day] -> row positions of the three windows for that day, in order.
index = {}
for day in ['day7', 'day8', 'day9']:
    day_rows = []
    for start_col, end_col in hour_windows:
        day_rows.extend(range(df_hour.loc[day, start_col], df_hour.loc[day, end_col]))
    index[day] = day_rows
    


# Feature Cols

In [4]:
# Label column.
target = 'is_attributed'

# Names of the engineered columns, grouped by the function that produces
# them. Each name is "<key columns joined by '_'>_<function name>".
feature_count = [
    'ip_day_hour_count',
    'ip_os_day_hour_count',
    'ip_app_day_hour_count',
    'ip_app_os_day_hour_count',
    'app_day_hour_count',
    'ip_device_os_count',
    'ip_app_device_os_count',
]
feature_mean = [
    'ip_device_os_mean',
    'ip_app_device_os_mean',
    'ip_mean',
]
feature_reversemean = [
    'ip_device_os_reversemean',
    'ip_app_device_os_reversemean',
    'ip_reversemean',
]
feature_time2nextclick = [
    'ip_device_os_time2nextclick',
    'ip_app_device_os_time2nextclick',
    'ip_time2nextclick',
]
feature_time2previousclick = [
    'ip_device_os_time2previousclick',
    'ip_app_device_os_time2previousclick',
    'ip_time2previousclick',
]
feature_countfromfuture = [
    'ip_device_os_countfromfuture',
    'ip_app_device_os_countfromfuture',
    'ip_countfromfuture',
]
feature_countfrompast = [
    'ip_device_os_countfrompast',
    'ip_app_device_os_countfrompast',
    'ip_countfrompast',
]
feature_lasttimediff = [
    'ip_device_os_lasttimediff',
    'ip_app_device_os_lasttimediff',
    'ip_lasttimediff',
]

# Raw columns passed through unchanged.
feature_ori = ['app', 'channel', 'device', 'os', 'hour']

# Every engineered column, in the order the families are listed here.
added_feature = (
    feature_count
    + feature_mean
    + feature_reversemean
    + feature_time2nextclick
    + feature_time2previousclick
    + feature_countfromfuture
    + feature_countfrompast
    + feature_lasttimediff
)

# Model inputs: engineered columns followed by the raw ones.
feature_cols = added_feature + feature_ori

# Columns written for the training sets: model inputs plus the label.
train_cols = feature_cols + [target]

# Define functions

In [6]:
def count(df_history, df_train, cols, target=None):
    """
    Purpose: frequency feature. For every row of df_train, return how many rows
    of df_history carry the same combination of values in `cols`.

    df_history: frame the occurrences are counted in.
    df_train:   frame the feature is produced for.
    cols:       column names forming the key (combined with get_group).
    target:     unused here; accepted so all feature functions share one call shape.

    Returns a Series aligned with df_train; keys absent from df_history get 0.
    """
    occurrences = get_group(df_history, cols).value_counts()
    train_keys = get_group(df_train, cols)

    return train_keys.map(occurrences).fillna(0)

def countsort(df_history, df_train, cols, target=None):
    """
    Purpose: frequency-rank feature. Keys (combinations of `cols`) found in
    df_history are ranked by how often they occur: the last entry of
    value_counts() (the least frequent key) gets rank 1 and the most frequent
    key gets the highest rank. Each df_train row receives the rank of its key.

    target is unused; accepted so all feature functions share one call shape.

    Returns a Series aligned with df_train; keys absent from df_history get -1.
    """
    # value_counts() is most-frequent-first; flip it so position 0 is the rarest key.
    rarest_first = get_group(df_history, cols).value_counts().iloc[::-1]
    rank_by_key = pd.Series(range(1, len(rarest_first) + 1), index=rarest_first.index)

    train_keys = get_group(df_train, cols)
    return train_keys.map(rank_by_key).fillna(-1)



def mean(df_history, df_train, cols, target):
    """
    Purpose: target-mean feature. For every key (combination of `cols`) the mean
    of df_history[target] is computed, i.e. the observed P(target | key) when the
    target is 0/1, and mapped onto the rows of df_train.

    Returns a Series aligned with df_train; keys absent from df_history get -1.
    """
    history_keys = get_group(df_history, cols)
    rate_by_key = df_history[target].groupby(history_keys).mean()

    train_keys = get_group(df_train, cols)
    return train_keys.map(rate_by_key).fillna(-1)


def reversemean(df_history, df_train, cols, target):
    """
    Purpose: "reverse" conditional feature. For every key (combination of `cols`)
    compute the share of all positive rows (target == 1) in df_history that carry
    this key, i.e. the observed P(key | target == 1), and map it onto df_train.

    Keys seen in df_history only with target == 0 get 0.
    Keys absent from df_history (or seen only with other target values) get -1.

    Returns a Series aligned with df_train.
    """
    train_keys = get_group(df_train, cols)
    history_keys = get_group(df_history, cols)

    labels = df_history[target]
    positive_keys = history_keys[labels == 1]
    negative_keys = history_keys[labels == 0]

    # Share of the positive rows that falls on each key.
    positive_share = positive_keys.groupby(positive_keys).count() / len(positive_keys)

    # Keys that only ever occur with a negative label carry an explicit zero.
    never_positive = set(negative_keys.unique()) - set(positive_keys.unique())
    zero_share = pd.Series(np.zeros(len(never_positive)), index=never_positive)

    share_by_key = pd.concat([positive_share, zero_share])
    return train_keys.map(share_by_key).fillna(-1)


def time2nextclick(df_history, df_train, cols, target, timecol='timestamp'):
    """
    Purpose: for every row of df_train, the gap in `timecol` to the next row
    (higher index) that has the same key (combination of `cols`).

    df_history and target are unused; accepted so all feature functions share
    one call shape.

    Returns a plain list ordered by ascending df_train index; rows whose key
    does not occur again get -1. The list lines up with df_train's rows only
    when df_train's index is already in ascending order.
    """
    # Walk from the highest index down so the "next" occurrence is already known.
    newest_first = df_train.sort_index(ascending=False)
    keys = get_group(newest_first, cols)

    upcoming = {}  # key -> timecol value of the closest later row
    gaps = []
    for key, stamp in zip(keys, newest_first[timecol]):
        gaps.append(upcoming[key] - stamp if key in upcoming else -1)
        upcoming[key] = stamp

    # Back to ascending index order.
    return gaps[::-1]

def time2previousclick(df_history, df_train, cols, target, timecol='timestamp'):
    """
    Purpose: for every row of df_train (in its current row order), the gap in
    `timecol` to the closest earlier row with the same key (combination of `cols`).

    df_history and target are unused; accepted so all feature functions share
    one call shape.

    Returns a plain list, one entry per row; the first occurrence of a key gets -1.
    """
    keys = get_group(df_train, cols)

    previous = {}  # key -> timecol value of the most recent earlier row
    gaps = []
    for stamp, key in zip(df_train[timecol], keys):
        gaps.append(stamp - previous[key] if key in previous else -1)
        previous[key] = stamp

    return gaps

def countfrompast(df_history, df_train, cols, target, timecol='timestamp'):
    """
    Purpose: for every row of df_train (in its current row order), the number of
    earlier rows that share the same key (combination of `cols`).

    df_history, target and timecol are unused; accepted so all feature functions
    share one call shape.

    Returns a plain list, one entry per row; the first occurrence of a key gets 0.
    """
    keys = get_group(df_train, cols)

    seen = {}  # key -> number of earlier rows with that key
    earlier_rows = []
    for key in keys.values:
        seen[key] = seen.get(key, -1) + 1
        earlier_rows.append(seen[key])

    return earlier_rows

def countfromfuture(df_history, df_train, cols, target, timecol='timestamp'):
    """
    Purpose: for every row of df_train, the number of later rows (higher index)
    that share the same key (combination of `cols`).

    df_history, target and timecol are unused; accepted so all feature functions
    share one call shape.

    Returns a plain list ordered by ascending df_train index; the last occurrence
    of a key gets 0. The list lines up with df_train's rows only when df_train's
    index is already in ascending order.
    """
    # Walk from the highest index down, counting what has been passed so far.
    newest_first = df_train.sort_index(ascending=False)
    keys = get_group(newest_first, cols)

    ahead = {}  # key -> number of later rows with that key
    later_rows = []
    for key in keys.values:
        pending = ahead.get(key, 0)
        later_rows.append(pending)
        ahead[key] = pending + 1

    # Back to ascending index order.
    return later_rows[::-1]

def lasttimediff(df_history, df_train, cols, target, timecol='timestamp'):
    """
    Purpose: for every row of df_train, the difference between the `timecol`
    value of the last row sharing its key (combination of `cols`) and the row's
    own `timecol` value.

    df_history and target are unused; accepted so all feature functions share
    one call shape.

    Returns a Series aligned with df_train.
    """
    keys = get_group(df_train, cols)

    # Per key: timecol value of the last row in df_train's current row order.
    final_stamp = df_train[timecol].groupby(keys).last()

    return keys.map(final_stamp) - df_train[timecol]

def col_name(cols, func=None):
    """
    Build a feature column name: the entries of `cols` joined by '_', followed
    by '_<func.__name__>' when a function is given.
    """
    joined = '_'.join(cols)
    return joined if func is None else joined + '_' + func.__name__
    
    


In [7]:
# orders[col] is the power of ten used by get_group as the multiplier for `col`.
# It is derived from the largest value of the column across train and test.
orders = {}
feature_col = ['ip', 'app', 'device', 'os', 'channel', 'day', 'hour']

for col in feature_col:
    largest = max(train[col].max(), test[col].max())
    exponent = int(np.log(largest + 1) / np.log(10)) + 1
    orders[col] = 10 ** exponent
def get_group(df, cols):
    """
    Encode the combination of the columns in `cols` as a single numeric Series.

    Starting from a copy of the first column, every further column is folded in
    as `key * orders[col] + df[col]`, so the values are packed one after the
    other as decimal digit groups. `orders` is the module-level dict of
    per-column multipliers.
    eg: cols = ('ip', 'app') gives ip * orders['app'] + app.

    NOTE(review): distinct combinations map to distinct keys provided the values
    are non-negative integers smaller than orders[col] and the result fits the
    column dtype - confirm for long column combinations.
    """
    encoded = df[cols[0]].copy()
    for extra in cols[1:]:
        encoded = encoded * orders[extra] + df[extra]

    return encoded

# Feature engineering on Train

In [9]:
import gc
import sys
from itertools import combinations

# Columns that may be combined into a grouping key.
combine_col = ['ip',
               'app',
               'device',
               'os',
               'channel',
               'day',
               'hour']

# Build one feature file per training day. For each day:
#   * df_train   - the rows of that day's hour windows (index[day]),
#   * df_history - every other training row (used by mean / reversemean),
#   * df_all     - train + test (used by count), released once counting is done.
for day in ['day7', 'day8', 'day9']:
    counter = 0
    df_train = train.iloc[index[day]].copy()
    print('got train data')

    # Complement of index[day] as a boolean mask. This avoids materialising two
    # Python sets over the whole training index; only mean/reversemean read
    # df_history and neither depends on its row order.
    history_mask = np.ones(len(train), dtype=bool)
    history_mask[index[day]] = False
    df_history = train[history_mask]
    del history_mask
    print('got historical data')

    ###########################################################################
    for func in [count, mean, reversemean, time2nextclick, time2previousclick,
                 countfromfuture, countfrompast, lasttimediff]:
        if func is count:
            df_all = pd.concat([train, test])
        else:
            # Release the concatenated frame once the count features are done.
            try:
                del df_all
            except NameError:
                print('df_all does not exist')
            else:
                gc.collect()

        for num_col in [1, 2, 3, 4, 5]:
            for cols in combinations(combine_col, num_col):
                feature_name = col_name(cols, func=func)
                # Only the features listed in added_feature are computed.
                if feature_name not in added_feature:
                    continue
                counter += 1

                # Pick the frame the statistic is computed on.
                if func is count:
                    print('count function')
                    df_source = df_all
                elif func is mean:
                    print('mean function')
                    df_source = df_history
                elif func is reversemean:
                    print('reverse mean function')
                    df_source = df_history
                else:
                    print('time related function')
                    df_source = df_train
                df_train[feature_name] = func(df_source, df_train, cols, target='is_attributed')

                all_str = 'all {}:   {}   \t\t\t size: {} G.'.format(counter, feature_name, sys.getsizeof(df_train)/ 1024 **3)
                print(all_str)
                # Append: opening with 'w' here would wipe the log on every feature.
                with open('feature_all.txt', 'a') as text_file:
                    text_file.write(all_str + '\n')

    save_file_name = '{}_features_addreversemean_ip.csv'.format(day)
    save_file_path = '/home/kai/data/kaggle/talkingdata/wl/data/equalhour/' + save_file_name
    df_train = df_train[train_cols]
    df_train.to_csv(save_file_path, index=False)
    print(save_file_path)

got train data
got historical data
count function
all 1:   ip_device_os_count   			 size: 2.0376134142279625 G.
count function
all 2:   ip_day_hour_count   			 size: 2.183157227933407 G.
count function
all 3:   app_day_hour_count   			 size: 2.328701041638851 G.
count function
all 4:   ip_app_device_os_count   			 size: 2.4742448553442955 G.
count function
all 5:   ip_app_day_hour_count   			 size: 2.61978866904974 G.
count function
all 6:   ip_os_day_hour_count   			 size: 2.765332482755184 G.
count function
all 7:   ip_app_os_day_hour_count   			 size: 2.9108762964606285 G.
df_all does not exist
mean function
all 8:   ip_mean   			 size: 3.056420110166073 G.
mean function
all 9:   ip_device_os_mean   			 size: 3.201963923871517 G.
mean function
all 10:   ip_app_device_os_mean   			 size: 3.3475077375769615 G.
df_all does not exist
reverse mean function
all 11:   ip_reversemean   			 size: 3.493051551282406 G.
reverse mean function
all 12:   ip_device_os_reversemean   			 size: 3.6385

# Feature Engineering on Test

In [10]:
import gc
import sys
from itertools import combinations

# Columns that may be combined into a grouping key.
combine_col = ['ip',
               'app',
               'device',
               'os',
               'channel',
               'day',
               'hour']

# Build the feature file for the test set:
#   * df_train   - the test rows the features are added to,
#   * df_history - the whole training set (used by mean / reversemean),
#   * df_all     - train + test (used by count), released once counting is done.
counter = 0
df_train = test.copy()
print('got train data')
df_history = train.copy()
print('got historical data')

###########################################################################
for func in [count, mean, reversemean, time2nextclick, time2previousclick,
             countfromfuture, countfrompast, lasttimediff]:
    if func is count:
        df_all = pd.concat([train, test])
    else:
        # Release the concatenated frame once the count features are done.
        try:
            del df_all
        except NameError:
            print('df_all does not exist')
        else:
            gc.collect()

    for num_col in [1, 2, 3, 4, 5]:
        for cols in combinations(combine_col, num_col):
            feature_name = col_name(cols, func=func)
            # Only the features listed in added_feature are computed.
            if feature_name not in added_feature:
                continue
            counter += 1

            # Pick the frame the statistic is computed on.
            if func is count:
                print('count function')
                df_source = df_all
            elif func is mean:
                print('mean function')
                df_source = df_history
            elif func is reversemean:
                print('reverse mean function')
                df_source = df_history
            else:
                print('time related function')
                df_source = df_train
            df_train[feature_name] = func(df_source, df_train, cols, target='is_attributed')

            all_str = 'all {}:   {}   \t\t\t size: {} G.'.format(counter, feature_name, sys.getsizeof(df_train)/ 1024 **3)
            print(all_str)
            # Append: opening with 'w' here would wipe the log on every feature.
            with open('feature_all.txt', 'a') as text_file:
                text_file.write(all_str + '\n')

save_file_name = '{}_features_addreversemean_ip.csv'.format('test')
save_file_path = '/home/kai/data/kaggle/talkingdata/wl/data/equalhour/' + save_file_name
df_train = df_train[feature_cols]
df_train.to_csv(save_file_path, index=False)
print(save_file_path)

got train data
got historical data
count function
all 1:   ip_device_os_count   			 size: 1.679998941719532 G.
count function
all 2:   ip_day_hour_count   			 size: 1.8199988454580307 G.
count function
all 3:   app_day_hour_count   			 size: 1.9599987491965294 G.
count function
all 4:   ip_app_device_os_count   			 size: 2.099998652935028 G.
count function
all 5:   ip_app_day_hour_count   			 size: 2.2399985566735268 G.
count function
all 6:   ip_os_day_hour_count   			 size: 2.3799984604120255 G.
count function
all 7:   ip_app_os_day_hour_count   			 size: 2.519998364150524 G.
df_all does not exist
mean function
all 8:   ip_mean   			 size: 2.659998267889023 G.
mean function
all 9:   ip_device_os_mean   			 size: 2.7999981716275215 G.
mean function
all 10:   ip_app_device_os_mean   			 size: 2.93999807536602 G.
df_all does not exist
mean function
all 11:   ip_reversemean   			 size: 3.079997979104519 G.
mean function
all 12:   ip_device_os_reversemean   			 size: 3.2199978828430176 G.

In [None]:
from lightfm import LightFM