In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import KFold

In [3]:
path = '/home/kai/data/kaggle/talkingdata/data/'
train = pd.read_csv(path + 'train_cleaned_final.csv')
test = pd.read_csv(path + 'test_cleaned_final.csv')

# Get K Fold

In [4]:
# Split the rows of `train` into K shuffled folds with a fixed seed.
# (Unshuffled alternative: KFold(n_splits=K, shuffle=False).)
K = 12
kf = KFold(n_splits=K, shuffle=True, random_state=233)

# kf.split yields one pair of index arrays per fold. The first element of every
# pair is collected as "history" rows, the second as "train" rows.
history_index, train_index = (list(part) for part in zip(*kf.split(train)))
    
### use last fold as example

In [5]:
import sys
# Size reported by sys.getsizeof for the full training frame, in units of 1024**3 bytes.
print(sys.getsizeof(train) / (1 << 30))

15.154054783284664


# use last fold as example

In [6]:
# Materialise independent copies of the last fold: its "history" rows and its "train" rows.
df_history, df_train = (
    train.iloc[rows].copy()
    for rows in (history_index[-1], train_index[-1])
)

In [7]:
import sys
# Size reported by sys.getsizeof for the last-fold frame (units of 1024**3 bytes), then its row count.
print(sys.getsizeof(df_train) / (1 << 30))
print(df_train.shape[0])

1.377641312777996
15408657


In [8]:
# Display the column labels of the last-fold frame.
df_train.columns

Index(['ip', 'app', 'device', 'os', 'channel', 'day', 'hour', 'timestamp',
       'minute', 'second', 'is_attributed'],
      dtype='object')

In [9]:
# Columns whose values can be packed into combined group keys by get_group.
feature_col = ['ip', 'app', 'device', 'os', 'channel',
               'day', 'hour', 'minute', 'second']
# (A shorter variant uses only: 'ip', 'app', 'device', 'os', 'channel'.)

# orders[col] is a power of ten larger than every value of `col` found in train
# and test: the width of the decimal "slot" reserved for that column in a key.
orders = {
    col: 10 ** (int(np.log(max(train[col].max(), test[col].max()) + 1) / np.log(10)) + 1)
    for col in feature_col
}
def get_group(df, cols):
    """
    Pack the columns `cols` of `df` into a single key per row.

    The key starts as a copy of the first column; every further column is
    appended as a decimal field: key = key * orders[col] + df[col].
    Two rows get the same key exactly when they agree on all of `cols`,
    provided every value of a column lies in [0, orders[col]).

    df:   DataFrame holding the columns named in `cols`.
    cols: sequence of column names; all but the first must be keys of `orders`.
    Returns a Series with one key per row of `df` (df itself is not modified).

    NOTE(review): nothing here checks that the combined key fits the integer
    dtype of the columns -- confirm for wide combinations.
    """
    key = df[cols[0]].copy()
    for position in range(1, len(cols)):
        name = cols[position]
        key = key * orders[name] + df[name]

    return key

import gc
# `train` stays in memory (the del below is disabled): later cells still read it.
# del train
# Run a garbage-collection pass; as the last expression, its return value is displayed.
gc.collect()

60

In [10]:
# Display the per-column multipliers used by get_group.
orders

{'app': 1000,
 'channel': 1000,
 'day': 100,
 'device': 10000,
 'hour': 100,
 'ip': 1000000,
 'minute': 100,
 'os': 1000,
 'second': 100}

# count
plan 1. count from historical data  
plan 2. count from all data

In [11]:
def count(df_history, df_train, cols, target=None):
    """
    Purpose: add a new feature to training df. For every row of df_train, the number of
    rows in df_history that carry the same combination of values in `cols`.

    df_history: frame the occurrences are counted in.
    df_train:   frame that receives the feature.
    cols:       column names forming the combination (encoded with get_group).
    target:     unused; present so all encoders share one signature.
    Returns a Series aligned with df_train; combinations absent from df_history get 0.
    """
    occurrences = get_group(df_history, cols).value_counts()
    return get_group(df_train, cols).map(occurrences).fillna(0)

# mean
mean P(target | feature combination)

Purpose: add a new feature to the training df — the conditional probability P(replay (target) | feature combination (e.g. artist_name_composer)); in this notebook the target is `is_attributed`.  
The conditional probability is estimated from the historical data only and then applied to the train data.

Writing X for the feature combination:

P(replay | X) = P(replay & X) / P(X)  
= (count(replay & X) / count(total)) / (count(X) / count(total))  
= count(replay & X) / count(X)  
= sum(target over records with X and replay) / count(X)  
= sum(target over all records with X, replayed or not) / count(X) # since replay is 1 and not replay is 0, the not-replayed records add nothing to the sum  
= sum(target over records with X) / count(X)  
= mean(target over records with X)  

In [12]:
def scaller(num):
    """
    Return the smallest power of ten `sca` (1, 10, 100, ...) with num * sca >= 1.

    num: a positive number, e.g. 0.003 -> 1000, 0.5 -> 10, 1 or larger -> 1.
    Returns an int power of ten. NaN falls through the comparisons and yields 1.
    Raises ValueError for zero or negative input: no power of ten can lift such
    a value to 1, so the search loop below would never terminate.
    """
    if num <= 0:
        raise ValueError('scaller expects a positive number, got {!r}'.format(num))
    sca = 1
    while num * sca < 1:
        sca *= 10
    return sca

def mean(df_history, df_train, cols, target):
    """
    Purpose: add a new feature to training df. For every row of df_train, the mean of
    `target` over the df_history rows sharing the same combination of values in `cols`,
    i.e. an estimate of P(target | feature combination) for a 0/1 target.

    df_history: frame the per-combination means are computed from.
    df_train:   frame that receives the feature.
    cols:       column names forming the combination (encoded with get_group).
    target:     name of the target column in df_history.
    Returns a Series aligned with df_train; combinations absent from df_history get -1.
    """
    # One encoded key per row, for the reference frame and for the frame being scored.
    history_keys = get_group(df_history, cols)
    train_keys = get_group(df_train, cols)

    # key -> mean target among the history rows with that key.
    # (A rescaling of these means by a power of ten, via scaller, is disabled.)
    rate_by_key = df_history[target].groupby(history_keys).mean()

    return train_keys.map(rate_by_key).fillna(-1)

# reversemean
reverse mean P(feature combination | target)

In [13]:
def reversemean(df_history, df_train, cols, target):
    """
    Purpose: add a new feature to training df. For each feature combination, the share of
    the positive (target == 1) rows of df_history that carry that combination, i.e. an
    estimate of P(feature combination | target = 1).

    df_history: frame the shares are computed from.
    df_train:   frame that receives the feature.
    cols:       column names forming the combination (encoded with get_group).
    target:     name of the 0/1 target column in df_history.
    Returns a Series aligned with df_train holding, per row:
      * count(combination & target == 1) / count(target == 1) when the combination
        has at least one positive row in df_history,
      * 0 when it appears in df_history only with target == 0,
      * -1 when it does not appear among the target 0/1 rows of df_history.
    """
    # One encoded key per row, for the frame being scored and for the reference frame.
    train_keys = get_group(df_train, cols)
    history_keys = get_group(df_history, cols)

    labels = df_history[target]
    positive = history_keys[labels == 1]
    negative = history_keys[labels == 0]

    # key -> fraction of all positive rows that have this key.
    share_of_positives = positive.value_counts() / len(positive)

    # Keys seen with target == 0 but never with target == 1 get an explicit 0.
    negative_only = pd.Index(negative.unique()).difference(share_of_positives.index)
    zero_share = pd.Series(np.zeros(len(negative_only)), index=negative_only)

    lookup = pd.concat([share_of_positives, zero_share])
    return train_keys.map(lookup).fillna(-1)

# generate all cols

In [14]:
from itertools import combinations


# Columns that are combined, 1 to 3 at a time, into encoded features.
# 'day' is currently left out (its entry is commented out).
combine_col = ['ip', 
              'app', 
              'device', 
              'os', 
              'channel', 
#               'day',
              'hour',
              'minute',
              'second']

# combine_col = ['ip', 
#               'app', 
#               'device', 
#               'os', 
#               'channel']

def col_name(cols, func=None):
    """
    Build the feature name for a column combination.

    cols: iterable of column-name strings, joined with '_'.
    func: optional encoder function; when given, its __name__ is appended after
          another '_' (e.g. ('ip', 'app') with count -> 'ip_app_count').
    Returns the resulting name as a string.
    """
    joined = '_'.join(cols)
    if func is None:
        return joined
    return joined + '_' + func.__name__

counter = 0

# Feature names listed here are skipped; the list is empty, so nothing is skipped.
exception_list = []

# Open the progress log once for the whole run: opening it in 'w' mode per feature
# would truncate it each time and leave only the last entry in feature_all.txt.
with open('feature_all.txt', 'w') as text_file:
    # Every combination of 1, 2 and 3 columns taken from combine_col.
    for num_col in [1, 2, 3]:
        for cols in combinations(combine_col, num_col):
            # Other encoders defined in this notebook: count, mean.
            for func in [reversemean]:
                counter += 1
                feature_name = col_name(cols, func=func)
                if feature_name in exception_list:
                    continue
                # Statistics are taken from the full `train` frame ("plan 2").
                # NOTE(review): df_train is a slice of train, so each training row's own
                # label is part of the statistic it receives. Pass df_history instead of
                # train for out-of-fold statistics -- confirm which one is intended.
                df_train[feature_name] = func(train, df_train, cols, target='is_attributed')
                test[feature_name] = func(train, test, cols, target='is_attributed')
                gc.collect()
                train_str = 'train {}:   {}   \t\t\t size: {} G.'.format(counter, feature_name, sys.getsizeof(df_train)/ 1024 **3)
                test_str = 'test {}:   {}   \t\t\t size: {} G.'.format(counter, feature_name, sys.getsizeof(test)/ 1024 **3)
                print(train_str)
                print(test_str)
                print('------')
                text_file.write(train_str + '\n')
                text_file.write(test_str + '\n')
                text_file.write('------' + '\n')
                # Flush so the log is up to date even if the run is interrupted.
                text_file.flush()

train 1:   ip_reversemean   			 size: 1.4924447536468506 G.
test 1:   ip_reversemean   			 size: 1.5399990379810333 G.
------
train 2:   app_reversemean   			 size: 1.607248194515705 G.
test 2:   app_reversemean   			 size: 1.679998941719532 G.
------
train 3:   device_reversemean   			 size: 1.7220516353845596 G.
test 3:   device_reversemean   			 size: 1.8199988454580307 G.
------
train 4:   os_reversemean   			 size: 1.8368550762534142 G.
test 4:   os_reversemean   			 size: 1.9599987491965294 G.
------
train 5:   channel_reversemean   			 size: 1.9516585171222687 G.
test 5:   channel_reversemean   			 size: 2.099998652935028 G.
------
train 6:   hour_reversemean   			 size: 2.066461957991123 G.
test 6:   hour_reversemean   			 size: 2.2399985566735268 G.
------
train 7:   minute_reversemean   			 size: 2.1812653988599777 G.
test 7:   minute_reversemean   			 size: 2.3799984604120255 G.
------
train 8:   second_reversemean   			 size: 2.2960688397288322 G.
test 8:   second_reverseme

KeyboardInterrupt: 

# multi-processing version

In [14]:
# import multiprocessing


# ####################
# from itertools import combinations


# combine_col = ['ip', 
#               'app', 
#               'device', 
#               'os', 
#               'channel', 
#               'day',
#               'hour',
#               'minute',
#               'second']

# func_pool = [count, mean]

# target = 'is_attributed'

# def col_name(cols, func=None):
#     if func is None:
#         return '_'.join(cols)
#     else:
#         return '_'.join(cols) + '_' + func.__name__
# ###################################




# def _process_each_col(kwargs_list):
#     print('thread start !!!!!!!')
#     result_dict = {}
    
#     for kwargs in kwargs_list:
# #         df_train = kwargs.get('df_train')
# #         df_history = kwargs.get('df_history')
# #         target = kwargs.get('target')
# #         df_train = df_train
# #         df_history = df_history
# #         target = target
#         cols = kwargs.get('cols')
#         func = kwargs.get('func')
#         mode = kwargs.get('mode')
#         feature_name = col_name(cols, func=func)
#         print(feature_name)
#         if mode.lower() == 'train_history':
#             result_dict[feature_name] = func(df_history, df_train, cols, target=target)
#         elif mode.lower() == 'train_all':
#             result_dict[feature_name] = func(train, df_train, cols, target=target)
#         elif mode.lower() == 'test_history':
#             result_dict[feature_name] = func(df_history, test, cols, target=target)
#         elif mode.lower() == 'test_all':
#             result_dict[feature_name] = func(train, test, cols, target=target)
#         else:
#             print('known mode !!!!')
#     return result_dict


# # def build_kwargs(df_history, df_train, target, combine_col, func_pool, comb_total=3):
# #     kwargs_pool = []
# #     for num_col in range(1, comb_total + 1):
# #         for cols in combinations(combine_col, num_col):
# #             for func in func_pool:
# #                 kwargs = {}
# #                 kwargs['df_history'] = df_history
# #                 kwargs['df_train'] = df_train
# #                 kwargs['target'] = target
# #                 kwargs['cols'] = cols
# #                 kwargs['func'] = func
# #                 kwargs_pool.append(kwargs)
# #     return kwargs_pool

# def build_kwargs( combine_col, func_pool, mode='train',comb_total=3):
#     kwargs_pool = []
#     for num_col in range(1, comb_total + 1):
#         for cols in combinations(combine_col, num_col):
#             for func in func_pool:
#                 kwargs = {}
#                 kwargs['cols'] = cols
#                 kwargs['func'] = func
#                 kwargs['mode'] = mode
#                 kwargs_pool.append(kwargs)
#     return kwargs_pool

# def multiprocessing_features(df_history, df_train, target, combine_col, func_pool, comb_total=3, workers=3, mode='train_history'):
#     kwargs_pool = build_kwargs(combine_col, func_pool, mode, comb_total=3)
#     print('build kwargs done!')
#     pool = multiprocessing.Pool(processes=workers)
#     result = pool.map(_process_each_col, [kwargs for kwargs in np.array_split(kwargs_pool, workers)])
#     pool.close()
#     print('feature processing done!!!!!!!!!!!!!!')
#     for each_thread in result:
#         for key in each_thread:
#             df_train[key] = each_thread[key]
#     return df_train


# df_train = multiprocessing_features(df_history, df_train, target, combine_col, func_pool, comb_total=3, workers=3, mode='train_history')

build kwargs done!
thread start !!!!!!!
thread start !!!!!!!
thread start !!!!!!!
ip_count
app_channel_minute_count
hour_second_count
hour_second_mean
app_channel_minute_mean
ip_mean
minute_second_count
app_channel_second_count
minute_second_mean
app_channel_second_mean
app_count
ip_app_device_count
app_mean
app_day_hour_count
device_count
app_day_hour_mean
device_mean
app_day_minute_count
ip_app_device_mean
os_count
app_day_minute_mean
os_mean
channel_count
app_day_second_count
channel_mean
app_day_second_mean
ip_app_os_count
day_count
app_hour_minute_count
day_mean
app_hour_minute_mean
hour_count
hour_mean
app_hour_second_count
ip_app_os_mean
minute_count
app_hour_second_mean
minute_mean
second_count
app_minute_second_count
second_mean
app_minute_second_mean
ip_app_count
device_os_channel_count
ip_app_channel_count
device_os_channel_mean
ip_app_mean
device_os_day_count
device_os_day_mean
ip_app_channel_mean
device_os_hour_count
ip_device_count
device_os_hour_mean
ip_device_mean
devic

Process ForkPoolWorker-1:
Traceback (most recent call last):
  File "/home/kai/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/kai/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/kai/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/kai/anaconda3/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "<ipython-input-14-c973b4268b5b>", line 49, in _process_each_col
    result_dict[feature_name] = func(df_history, df_train, cols, target=target)
  File "<ipython-input-10-4fbe1539f76c>", line 7, in count
    group_all = get_group(df_history, cols)
  File "<ipython-input-8-6613b36fca9d>", line 18, in get_group
    group = df[cols[0]].copy()
  File "/home/kai/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py", line 3432, in copy
  

KeyboardInterrupt: 

Process ForkPoolWorker-2:
Traceback (most recent call last):
  File "/home/kai/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/kai/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/kai/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/kai/anaconda3/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "<ipython-input-14-c973b4268b5b>", line 49, in _process_each_col
    result_dict[feature_name] = func(df_history, df_train, cols, target=target)
  File "<ipython-input-10-4fbe1539f76c>", line 9, in count
    count_map = group_all.value_counts()
  File "/home/kai/anaconda3/lib/python3.6/site-packages/pandas/core/base.py", line 938, in value_counts
    normalize=normalize, bins=bins, dropna=dropna)
  File "/home/kai/anaconda3/lib/python3.6/si

# Saving Files

In [14]:
# Write both feature frames to CSV (without the index column), reporting each one when done.
for frame, csv_path, done_msg in (
    (df_train, '/home/kai/data/kaggle/talkingdata/wl/data/features/train_fold_last_in_12_reversemean.csv', 'training saving done!'),
    (test, '/home/kai/data/kaggle/talkingdata/wl/data/features/test_fold_last_in_12_reversemean.csv', 'testing saving done!'),
):
    frame.to_csv(csv_path, index=False)
    print(done_msg)

training saving done!
testing saving done!


In [14]:
#For float saving


# df_train.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/train_fold_last_in_12_mean_1float.csv', index=False, float_format='%.1f')
# print('training saving done!')
# test.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/features/test_fold_last_in_12_mean_1float.csv', index=False, float_format='%.1f')
# print('testing saving done!')

training saving done!
testing saving done!
