In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF

def _click_count_in_next_n_hour(feature_series, id_series, timestamp_series, n_hour):
    
    def _apply_in_each_group(x):
        result = []
        for i in range(x.shape[0]):
            row = x.iloc[i]
            filter_s = sum((x['time'] - row['time']).between(0, n_hour))
            result.append([row['id'], filter_s])
        return pd.DataFrame(result, columns=['id', 'feature'])
    
    df = pd.concat([id_series, feature_series, timestamp_series], axis=1, keys=['id', 'feature', 'time'])
    output_series = df.groupby('feature').apply(_apply_in_each_group)
    return pd.Series(data=output_series['feature'].values, index=output_series['id'])


def _foward_click_time_delta(feature_series, id_series, timestamp_series):
    
    def _apply_in_each_group(x):
        result = []
        for i in range(x.shape[0]-1):
            row = x.iloc[i]
            next_row = x.iloc[i+1]
            result.append([row['id'], next_row['time'] - row['time']])
        result.append([x.iloc[-1]['id'], -1])
        return pd.DataFrame(result, columns=['id', 'feature'])
    
    df = pd.concat([id_series, feature_series, timestamp_series], axis=1, keys=['id','feature', 'time'])
    output_series = df.groupby('feature').apply(_apply_in_each_group)
    return pd.Series(data=output_series['feature'].values, index=output_series['id'])


def _backward_click_time_delta(feature_series, id_series, timestamp_series):
    
    def _apply_in_each_group(x):
        result = []
        for i in range(x.shape[0]-1, 0, -1):
            row = x.iloc[i]
            next_row = x.iloc[i-1]
            result.append([row['id'], row['time'] - next_row['time']])
        result.append([x.iloc[0]['id'], -1])
        return pd.DataFrame(result, columns=['id', 'feature'])
    
    df = pd.concat([id_series, feature_series, timestamp_series], axis=1, keys=['id', 'feature', 'time'])
    output_series = df.groupby('feature').apply(_apply_in_each_group)
    return pd.Series(data=output_series['feature'].values, index=output_series['id'])


def _past_click_mean(feature_series, id_series, label_series):
    
    def _apply_in_each_group(x):
        result = []
        for i in range(1, x.shape[0]+1):
            result.append([x.iloc[i-1]['id'], x.iloc[ : i]['y'].mean()])
        return pd.DataFrame(result, columns=['id', 'feature'])
    
    df = pd.concat([id_series, feature_series, label_series], axis=1, keys=['id', 'feature', 'y'])
    output_series = df.groupby('feature').apply(_apply_in_each_group)
    return pd.Series(data=output_series['feature'].values, index=output_series['id'])


def _lda_nmf_lsa_from_sklearn(key_col_series, value_col_series, n_components, random_state=None):
    """Embed each key by the bag of values it co-occurs with, using LDA, NMF and LSA.

    Every distinct key becomes one "document" made of the string form of all
    values seen on its rows. The documents are turned into a count matrix, and
    three decompositions are fitted on that matrix.

    Parameters
    ----------
    key_col_series : pandas.Series
        Key per row; one document is built per distinct key.
    value_col_series : pandas.Series
        Value per row, paired with ``key_col_series`` by position.
    n_components : int
        Number of components requested from each decomposition.
    random_state : int or None, default None
        Forwarded to the three estimators. None keeps the previous unseeded
        behaviour; pass an integer for repeatable results.

    Returns
    -------
    tuple of pandas.Series
        ``(lda, nmf, lsa)``; each is ``key_col_series`` mapped to the vector
        that the corresponding model produced for the row's key.
    """
    documents = {}
    for key, value in zip(key_col_series, value_col_series):
        documents.setdefault(key, []).append(str(value))
    key_list = list(documents.keys())
    sentences = [' '.join(documents[key]) for key in key_list]

    # Tokens are whole whitespace-separated values. CountVectorizer's default
    # token pattern only keeps tokens of two or more characters, which would
    # silently discard single-character values such as the ids "0".."9".
    matrix = CountVectorizer(analyzer=str.split).fit_transform(sentences)

    lda_model = LatentDirichletAllocation(n_components=n_components, random_state=random_state)
    nmf_model = NMF(n_components=n_components, random_state=random_state)
    lsa_model = TruncatedSVD(n_components=n_components, random_state=random_state)

    # Row i of each transformed matrix belongs to key_list[i].
    lda_dict = dict(zip(key_list, lda_model.fit_transform(matrix)))
    nmf_dict = dict(zip(key_list, nmf_model.fit_transform(matrix)))
    lsa_dict = dict(zip(key_list, lsa_model.fit_transform(matrix)))

    return key_col_series.map(lda_dict), key_col_series.map(nmf_dict), key_col_series.map(lsa_dict)

In [5]:
import gc
import pandas as pd
import numpy as np
from itertools import combinations
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

class FeatureEngineering:
    """Adds per-group click features to a DataFrame for combinations of feature columns.

    The frame passed as ``train`` is modified in place: ``add_features`` writes
    the new columns straight into it.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame that receives the generated columns.
    time_col : str
        Name of the timestamp column.
    feature_cols : list of str
        Columns to combine; must hold non-negative numbers for the combination
        encoding to be unambiguous.
    label_col : str
        Name of the label column.
    time_related : bool
        Stored on the instance; not read by any method of this class.
    ngram : iterable of int
        Sizes of the column combinations to generate.
    """

    def __init__(self, train=None, time_col=None, feature_cols=None, label_col=None, time_related=True,
                 ngram=(1, 2, 3, 4, 5)):
        self.train = train
        self.time_col = time_col
        self.feature_cols = feature_cols
        self.label_col = label_col
        self.time_related = time_related
        # Keep a private list: a list default would be one object shared by
        # every instance created without an explicit ``ngram``.
        self.ngram = list(ngram)
        self.columns = self._get_ngram_col_combination(self.ngram)

    def _get_ngram_col_combination(self, ngram):
        """Return every combination of ``feature_cols`` for each size in ``ngram``, as lists."""
        # ``feature_cols`` defaults to None; treat that as "no columns".
        feature_cols = self.feature_cols if self.feature_cols is not None else []
        return [list(combo) for size in ngram for combo in combinations(feature_cols, size)]

    def _col_combination_encoding(self, col):
        """Fold several numeric columns into one key by decimal concatenation.

        Each further column is appended in a fixed-width decimal slot that is
        wide enough for that column's maximum, so equal keys mean equal value
        tuples (for non-negative integer-like values).
        """
        encoding = self.train[col[0]].copy()
        for c in col[1:]:
            # Digits needed for the largest value, counted on the integer itself
            # instead of through a floating-point log ratio.
            width = len(str(int(self.train[c].max())))
            encoding = encoding * (10 ** width) + self.train[c]
        return encoding

    @staticmethod
    def _free(data):
        """Drop the local reference to ``data`` and run a garbage collection.

        Only this function's own reference is released; the object is reclaimed
        once the caller no longer holds it either.
        """
        del data
        gc.collect()

    def _add_one_feature(self, func, col_name, nargin, *args):
        """Store ``func(*args)`` in ``train[col_name]``.

        ``nargin`` is the number of arguments the caller believes ``func`` takes;
        on a mismatch an error line is printed and nothing is added.
        """
        if nargin != len(args):
            print('ERROR!!!!!!!!!!!')
            return
        self.train[col_name] = func(*args)

    def _add_embedding_features(self, func, col_names, nargin, *args):
        """Store the three results of ``func(*args)`` under ``col_names['lda'|'nmf'|'lsa']``.

        ``nargin`` is checked exactly as in ``_add_one_feature``.
        """
        if nargin != len(args):
            print('ERROR!!!!!!!!!!!')
            return
        lda, nmf, lsa = func(*args)
        self.train[col_names['lda']] = lda
        self.train[col_names['nmf']] = nmf
        self.train[col_names['lsa']] = lsa

    def add_features(self):
        """Generate all features for every column combination in ``self.columns``.

        For each combination this adds two windowed click counts, the forward
        and backward time deltas and the running label mean. For pairs it also
        adds LDA/NMF/LSA embeddings in both key/value directions.
        """
        id_series = self.train.index.to_series()
        # (column-name suffix, window length in timestamp units)
        count_windows = (('1', 3600), ('6', 3600 * 6))

        for c in self.columns:
            print(c)
            feature_encoding = self._col_combination_encoding(c)
            col_name = '_'.join(c)
            timestamps = self.train[self.time_col]

            for suffix, window in count_windows:
                self._add_one_feature(
                    _click_count_in_next_n_hour,
                    col_name + _click_count_in_next_n_hour.__name__ + suffix,
                    4, feature_encoding, id_series, timestamps, window)

            for delta_func in (_foward_click_time_delta, _backward_click_time_delta):
                self._add_one_feature(
                    delta_func, col_name + delta_func.__name__,
                    3, feature_encoding, id_series, timestamps)

            self._add_one_feature(
                _past_click_mean, col_name + _past_click_mean.__name__,
                3, feature_encoding, id_series, self.train[self.label_col])

            if len(c) == 2:
                # Embed in both directions: first column as key with the second
                # as value, then the other way round.
                for key_col, value_col in ((c[0], c[1]), (c[1], c[0])):
                    prefix = key_col + '_' + value_col
                    embedding_names = {'lda': prefix + '_lda', 'nmf': prefix + '_nmf', 'lsa': prefix + '_lsa'}
                    self._add_embedding_features(
                        _lda_nmf_lsa_from_sklearn, embedding_names,
                        3, self.train[key_col], self.train[value_col], 2)

            FeatureEngineering._free(feature_encoding)

In [6]:
import pandas as pd
# Roles of the columns in the sample file.
time_col = 'timestamp'
label_col = 'is_attributed'
feature_cols = ['ip', 'app', 'device', 'os', 'channel']

# NOTE(review): absolute, machine-specific location; adjust before running elsewhere.
PATH = "/home/kai/data/kaggle/talkingdata/data/"

# Take the first 1000 rows of the file, then order them by time.
train = (
    pd.read_csv(PATH + 'train_sample_cleaned.csv')
    .iloc[:1000]
    .sort_values(by='timestamp')
)

# Generate features for column pairs only.
f = FeatureEngineering(
    train=train,
    time_col=time_col,
    feature_cols=feature_cols,
    label_col=label_col,
    ngram=[2],
)
f.add_features()

print(f.train.columns.values)
print('done')

['ip', 'app']
[array([ 0.74966252,  0.25033748]) array([ 0.74943143,  0.25056857])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.74945612,  0.25054388]) array([ 0.5,  0.5])
 array([ 0.25157543,  0.74842457]) array([ 0.25050373,  0.74949627])
 array([ 0.250159,  0.749841]) array([ 0.25116861,  0.74883139])
 array([ 0.5,  0.5]) array([ 0.25053089,  0.74946911]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.250159,  0.749841])
 array([ 0.74992054,  0.25007946]) array([ 0.25053089,  0.74946911])
 array([ 0.250159,  0.749841]) array([ 0.250159,  0.749841])
 array([ 0.74911249,  0.25088751]) array([ 0.250159,  0.749841])
 array([ 0.25053089,  0.74946911]) array([ 0.5,  0.5])
 array([ 0.25053089,  0.74946911]) array([ 0.5,  0.5])
 array([ 0.74992054,  0.25007946]) array([ 0.5,  0.5])
 array([ 0.250159,  0.749841]) array([ 0.250159,  0.749841])
 array([ 0.4907282,  0.5092718]) array([ 0.5,  0.5])
 array([ 0.25008795,  0.74991205]) array([ 0.74

[array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) array([ 0.5,  0.5]) array([ 0.5,  0.5])
 array([ 0.5,  0.5]) arr

KeyboardInterrupt: 

In [5]:
# Report how many columns the frame held by `f` has, then list their names.
# NOTE(review): this cell reads `f` from kernel state left by an earlier run.
print(len(f.train.columns.values))
print(f.train.columns.values)

229
['ip' 'app' 'device' 'os' 'channel' 'year' 'month' 'week' 'day' 'hour'
 'timestamp' 'minute' 'second' 'is_attributed'
 'ip_click_count_in_next_n_hour1' 'ip_click_count_in_next_n_hour6'
 'ip_foward_click_time_delta' 'ip_backward_click_time_delta'
 'ip_past_click_mean' 'app_click_count_in_next_n_hour1'
 'app_click_count_in_next_n_hour6' 'app_foward_click_time_delta'
 'app_backward_click_time_delta' 'app_past_click_mean'
 'device_click_count_in_next_n_hour1' 'device_click_count_in_next_n_hour6'
 'device_foward_click_time_delta' 'device_backward_click_time_delta'
 'device_past_click_mean' 'os_click_count_in_next_n_hour1'
 'os_click_count_in_next_n_hour6' 'os_foward_click_time_delta'
 'os_backward_click_time_delta' 'os_past_click_mean'
 'channel_click_count_in_next_n_hour1'
 'channel_click_count_in_next_n_hour6' 'channel_foward_click_time_delta'
 'channel_backward_click_time_delta' 'channel_past_click_mean'
 'ip_app_click_count_in_next_n_hour1' 'ip_app_click_count_in_next_n_hour6'
 'ip_

In [6]:
# Print the 'app_ip_nmf' column of the frame held by `f`.
print(f.train['app_ip_nmf'])

441     [0.000316237167758, 0.00283855333816]
228         [0.0691725441664, 0.361480556039]
945                      [3.76585103318, 0.0]
87         [0.095117212628, 0.00492884473223]
357        [0.095117212628, 0.00492884473223]
36       [7.91451942348e-05, 0.0268720869215]
62                   [0.0, 0.000436330187376]
834                                [0.0, 0.0]
958         [0.0520615787326, 0.200767823048]
846                      [0.0, 3.40817598179]
576                                [0.0, 0.0]
165                  [0.0, 0.000436330187376]
439       [0.00134376018015, 0.0156284117958]
679         [0.0691725441664, 0.361480556039]
536         [0.0691725441664, 0.361480556039]
989        [0.095117212628, 0.00492884473223]
333                      [0.0, 3.40817598179]
994                     [0.0, 0.315855621313]
865       [0.00134376018015, 0.0156284117958]
96                       [0.0, 3.40817598179]
819                      [0.0, 3.40817598179]
403        [0.0198963457829, 0.013