In [None]:
# Directory holding the competition data files (judging by the path, the Kaggle
# TalkingData dataset).
# NOTE(review): absolute, machine-specific path; nothing in the visible cells reads it.
PATH = "/home/kai/data/kaggle/talkingdata/data/"

In [21]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF

def _click_count_in_next_n_hour(id_series, feature_series, timestamp_series, n_hour):
    
    def _apply_in_each_group(x):
        result = []
        for i in range(x.shape[0]):
            row = x.iloc[i]
            filter_s = sum((x['time'] - row['time']).between(0, n_hour))
            result.append([row['id'], filter_s])
        return pd.DataFrame(result, columns=['id', 'feature'])
    
    df = pd.concat([id_series, feature_series, timestamp_series], axis=1, keys=['id', 'feature', 'time'])
    output_series = df.groupby('feature').apply(_apply_in_each_group)
    return pd.Series(data=output_series['feature'].values, index=output_series['id'])


def _foward_click_time_delta(id_series, feature_series, timestamp_series):
    
    def _apply_in_each_group(x):
        result = []
        for i in range(x.shape[0]-1):
            row = x.iloc[i]
            next_row = x.iloc[i+1]
            result.append([row['id'], next_row['time'] - row['time']])
        result.append([x.iloc[-1]['id'], -1])
        return pd.DataFrame(result, columns=['id', 'feature'])
    
    df = pd.concat([id_series, feature_series, timestamp_series], axis=1, keys=['id','feature', 'time'])
    output_series = df.groupby('feature').apply(_apply_in_each_group)
    return pd.Series(data=output_series['feature'].values, index=output_series['id'])


def _backward_click_time_delta(id_series, feature_series, timestamp_series):
    
    def _apply_in_each_group(x):
        result = []
        for i in range(x.shape[0]-1, 0, -1):
            row = x.iloc[i]
            next_row = x.iloc[i-1]
            result.append([row['id'], row['time'] - next_row['time']])
        result.append([x.iloc[0]['id'], -1])
        return pd.DataFrame(result, columns=['id', 'feature'])
    
    df = pd.concat([id_series, feature_series, timestamp_series], axis=1, keys=['id', 'feature', 'time'])
    output_series = df.groupby('feature').apply(_apply_in_each_group)
    return pd.Series(data=output_series['feature'].values, index=output_series['id'])


def _past_click_mean(id_series, feature_series, timestamp_series, label_series):
    
    def _apply_in_each_group(x):
        result = []
        for i in range(1, x.shape[0]+1):
            result.append([x.iloc[i-1]['id'], x.iloc[ : i]['y'].mean()])
        return pd.DataFrame(result, columns=['id', 'feature'])
    
    df = pd.concat([id_series, feature_series, timestamp_series, label_series], axis=1, keys=['id', 'feature', 'time', 'y'])
    output_series = df.groupby('feature').apply(_apply_in_each_group)
    return pd.Series(data=output_series['feature'].values, index=output_series['id'])


def _lda_nmf_lsa_from_sklearn(key_col_series, value_col_series, n_components, random_state=None):
    """Embed each key by decomposing the bag of values it co-occurs with.

    Every distinct key becomes one "document" made of the (stringified) values
    found on its rows.  The key x value count matrix is then factorised three
    ways: LDA, NMF and truncated SVD (LSA).

    Args:
        key_col_series: Series whose distinct values are the documents.
        value_col_series: Series whose values are the words; paired with
            ``key_col_series`` by position.
        n_components: Size of each embedding vector.
        random_state: Optional seed forwarded to all three estimators so runs
            are reproducible.  The default ``None`` keeps the estimators'
            unseeded behaviour.

    Returns:
        tuple: Three Series (LDA, NMF, LSA) aligned with ``key_col_series``;
        each element is the ``n_components``-long vector of that row's key.
    """
    values_by_key = {}
    for key, value in zip(key_col_series, value_col_series):
        values_by_key.setdefault(key, []).append(str(value))
    key_list = list(values_by_key)
    sentences = [' '.join(values_by_key[key]) for key in key_list]
    # The default token pattern only keeps tokens of two or more characters,
    # which would silently discard single-character values such as "1".."9".
    matrix = CountVectorizer(token_pattern=r'(?u)\b\w+\b').fit_transform(sentences)

    def _embed(model):
        # Matrix rows follow key_list, so zipping pairs each key with its vector.
        return dict(zip(key_list, model.fit_transform(matrix)))

    lda_dict = _embed(LatentDirichletAllocation(n_components=n_components, random_state=random_state))
    nmf_dict = _embed(NMF(n_components=n_components, random_state=random_state))
    lsa_dict = _embed(TruncatedSVD(n_components=n_components, random_state=random_state))

    return key_col_series.map(lda_dict), key_col_series.map(nmf_dict), key_col_series.map(lsa_dict)

In [24]:
# Scratch check: run the LDA/NMF/LSA helper on a small CSV.
# NOTE(review): 'test.txt' is resolved against the kernel's working directory and
# must provide columns 'a' (keys) and 'b' (values), both of which are read below.
x = pd.read_csv('test.txt')
print()
# `group` is the 3-tuple (LDA, NMF, LSA) of Series returned by the helper,
# requested here with 2 components.
group = _lda_nmf_lsa_from_sklearn(x['a'], x['b'], 2)
# print(group)







In [None]:
import pandas as pd
import numpy as np
from itertools import combinations

class FeatureEngineering:
    def __init__(self, train=None, test=None, feature_cols=None, label_cols=None, time_related=True, ngram=[1,2,3,4,5]):
        self.train = train
        self.test = test
        self.feature_cols = feature_cols
        self.label_cols = label_cols
        self.time_related = time_related
        self.ngram = ngram
        self.columns = self._get_ngram_col_combination(ngram)
        
    def _get_ngram_col_combination(self, ngram):
        column = []
        for e in ngram:
            tmp = combinations(self.feature_cols, e)
            column.extend([list(x) for x in tmp])
        return column
    
    def _col_combination_encoding(self, col):
        encoding = self.train[col].copy()
        if len(col) > 1:
            for c in col[1 : ]:
                encoding = encoding * (10 ** (int(np.log(self.train[col].max() + 1) / np.log(10)) + 1)) + self.train[c]
        return encoding
    
    def _add_one_feature(self, func, nargin, *args):
        """Compute one feature with ``func(*args)`` and store it as a column of ``self.df``.

        Args:
            func: Callable producing the new column.
            nargin: Number of positional arguments ``func`` is expected to get.
            *args: Arguments forwarded to ``func``.

        Returns:
            None.  When ``nargin`` does not match ``len(args)`` an error line is
            printed and nothing is computed.
        """
        # Guard against a mismatched argument count; reported via print, not an exception.
        if nargin != len(args):
            print('ERROR!!!!!!!!!!!')
            return
        new_col = func(*args)
        # NOTE(review): the column label is always None here, so every call writes
        # to the same None-labelled column; this looks like an unfinished placeholder.
        new_col_name = None
        # NOTE(review): self.df is not assigned in the visible __init__; confirm where
        # it is created, otherwise this line raises AttributeError.
        self.df[new_col_name] = new_col