In [1]:
%load_ext autoreload
%autoreload 2
from glob import glob
import gc
import os
import sys
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from func.utils import get_numeric_features, get_categorical_features, read_pkl_gzip, to_pkl_gzip, parallel_load_data, get_filename
from func.time_utils import date_add_days, date_add_times
from func.ml_utils import save_feature, get_cnt_feature, get_dummie_feature, get_label_feature
from func.parallel_utils import get_parallel_arg_list
from kaggle_utils import reduce_mem_usage, move_feature
from joblib import delayed, Parallel
import gensim
from gensim import corpora
from itertools import combinations, chain

In [2]:
# Names of the key columns; none of these is used as an input feature below.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, 'ProductCD']

# A feature file is loaded only when its path contains one of these substrings.
path_keywords = (COLUMN_DT, COLUMN_ID, COLUMN_TARGET, 'fill__cnt', 'bin_')

train_paths = [
    path for path in glob('../feature/eda_base/*_train.gz')
    if any(keyword in path for keyword in path_keywords)
]
test_paths = [
    path for path in glob('../feature/eda_base/*_test.gz')
    if any(keyword in path for keyword in path_keywords)
]

df_train = parallel_load_data(train_paths)
df_test = parallel_load_data(test_paths)

# Train rows come first, test rows follow; later cells split on len(df_train).
data = reduce_mem_usage(pd.concat([df_train, df_test], axis=0))

Memory usage of dataframe is 785.85 MB
Memory usage after optimization is: 380.89 MB
Decreased by 51.5%


In [3]:
# Candidate feature columns, in sorted order.
use_cols = sorted(col for col in df_train.columns if col not in COLUMNS_IGNORE)

# Every group below is a subset of the count columns; filtering an already
# sorted list keeps it sorted, so no further sorting is required.
cnt_cols = [col for col in use_cols if 'cnt' in col]

# Card count columns, leaving out any name that contains '4' or '6'.
cols_cnt_card = [
    col for col in cnt_cols
    if 'card' in col and '4' not in col and '6' not in col
]
cols_cnt_addr = [col for col in cnt_cols if 'addr' in col]
cols_cnt_C = [col for col in cnt_cols if 'C' in col]
cols_cnt_V = [col for col in cnt_cols if 'V' in col]
cols_cnt_Amt = [
    col for col in cnt_cols
    if 'TransactionAmt' in col or 'cents' in col
]

In [4]:
# Columns from which the LDA "documents" are assembled.
lda_base_cols = cols_cnt_card + ['fill__cnt_addr1', 'fill__cnt_C1', 'fill__cnt_C13', 'fill__cnt_V283']

# All combinations of 3, 4, 5 and 6 columns, listed in that order of size.
list_combi = list(
    chain.from_iterable(combinations(lda_base_cols, size) for size in range(3, 7))
)

In [None]:
#========================================================================
# Presetting
#========================================================================
prefix = '611'               # tag at the start of every saved feature file name
num_topics = 5               # number of topics of each LDA model
n_jobs = 24                  # worker count handed to joblib.Parallel
trn_len = df_train.shape[0]  # rows [0, trn_len) of `data` are the train rows

#========================================================================
# Make Corpus
#========================================================================
def make_text(df, cols_lda):
    """Turn every row of ``df[cols_lda]`` into a list of string tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns named in ``cols_lda``.
    cols_lda : list of str
        Columns whose values form the tokens of one row, in this order.

    Returns
    -------
    list of list of str
        One inner list per row holding ``str(value)`` for each selected column.
    """
    return [[str(value) for value in row] for row in df[cols_lda].values]


def parallel_lda(df, combis, random_state=0):
    """Fit one LDA model per column combination and save its topic weights.

    For each combination in ``combis`` every row of ``df`` becomes a document
    whose tokens are the stringified values of the selected columns (see
    ``make_text``). An LDA model is trained on all rows, the per-row topic
    weights are collected into a matrix, and every topic column is split at
    ``trn_len`` and written with ``to_pkl_gzip`` under ``../feature/create/``.

    Relies on the notebook-level names ``num_topics``, ``prefix`` and
    ``trn_len`` as well as ``make_text`` and ``to_pkl_gzip``.

    Parameters
    ----------
    df : pandas.DataFrame
        Train rows followed by test rows; must contain every column named in
        ``combis``.
    combis : iterable of tuple of str
        Column combinations; one LDA model is fitted for each of them.
    random_state : int or None, default 0
        Seed forwarded to ``LdaModel`` so that repeated runs on the same
        corpus yield the same features. Pass ``None`` to get the previous
        unseeded behaviour.

    Returns
    -------
    None
        The results are written to disk only.
    """

    def get_lda_topic(num_topics, model, corpus):
        """Return a (len(corpus), num_topics) matrix of per-document topic weights."""
        tmp_mx = np.zeros((len(corpus), num_topics))
        for idx, sample in tqdm(enumerate(corpus), total=len(corpus)):
            # Topics that model[sample] does not return keep the initial 0.
            # NOTE(review): gensim omits topics below the model's
            # minimum_probability here -- confirm this truncation is intended.
            for t_no, val in model[sample]:
                tmp_mx[idx, t_no] = val
        return tmp_mx

    for cols_lda in tqdm(combis):

        cols_lda = list(cols_lda)
        # Shortened column names keep the output file names compact.
        cols = [c.replace('fill__', 'f_').replace('cnt_', 'c_') for c in cols_lda]
        fname = '-'.join(cols)

        #========================================================================
        # Make Corpus
        #========================================================================
        texts = make_text(df, cols_lda)
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]

        #========================================================================
        # LDA
        #========================================================================
        lda = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
            random_state=random_state,
        )

        topic_mx = get_lda_topic(num_topics, lda, corpus)

        # One feature file per topic and per split (train / test).
        for col_no in tqdm(range(num_topics)):
            train_feature = topic_mx[:trn_len, col_no]
            test_feature = topic_mx[trn_len:, col_no]
            to_pkl_gzip(path=f'../feature/create/{prefix}__lda_topic{num_topics}_no{col_no}_{fname}_train', obj=train_feature)
            to_pkl_gzip(path=f'../feature/create/{prefix}__lda_topic{num_topics}_no{col_no}_{fname}_test', obj=test_feature)
            
            
# Only the 48-wide slices number 4 through 8 of list_combi are processed here.
for no in range(4, 9):
    start, stop = 48 * no, 48 * (no + 1)
    arg_list = get_parallel_arg_list(48, list_combi[start:stop])
    print(len(arg_list[0]))

    jobs = []
    for arg in arg_list:
        # Hand each worker only the columns its combinations refer to.
        needed_cols = list(set(chain.from_iterable(arg)))
        jobs.append(delayed(parallel_lda)(data[needed_cols], arg))
    Parallel(n_jobs)(jobs)

0


'f_c_card1-f_c_card2-f_c_card3'