In [None]:
!python3 -m pip install --user h5py numpy pandas six tensorflow

In [2]:
import h5py
import numpy as np
import pandas as pd
from six.moves import cPickle

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [3]:
path = 'origin/'

train_file_list = [
    'train.chunk.01'
    , 'train.chunk.02', 'train.chunk.03',
    'train.chunk.04', 'train.chunk.05', 'train.chunk.06',
    'train.chunk.07', 'train.chunk.08', 'train.chunk.09'
]

dev_file_list = [
    'dev.chunk.01'
]

test_file_list = [
    'test.chunk.01',
    'test.chunk.02'
]

In [4]:
def get_cate_maxsize(range_cate):
    
    b_cate_max = 57
    m_cate_max = 552
    s_cate_max = 3190
    d_cate_max = 404
    
    if (range_cate == 'bcateid'):
        cate_size = b_cate_max
    elif (range_cate == 'mcateid'):
        cate_size = m_cate_max
        
    # s and d have zero(originally -1)
    elif (range_cate == 'scateid'):
        cate_size = s_cate_max + 1
    elif (range_cate == 'dcateid'):
        cate_size = d_cate_max + 1
    else :
        print("wrong id...")
        return
    
    return cate_size
    

In [5]:
def get_size(data_attr):
    
    file_list = []
    if data_attr == 'train':
        file_list = train_file_list
    elif data_attr == 'dev':
        file_list = dev_file_list
    elif data_attr == 'test':
        file_list = test_file_list   
    
    total_size = 0
    for file in file_list:
        with h5py.File(path+file, 'r') as read:
            total_size += read[data_attr]['pid'].size
    
    return total_size
        

In [6]:
def make_dataset(data_attr, col_name, max_len=32, num_words=100000, train_val_ratio=0.8, seed_num=17):
    
    col_list = ['product', 'maker', 'brand', 'model']
    cate_list= ['bcateid', 'mcateid', 'scateid', 'dcateid']
    
    file_list = []
    if data_attr == 'train':
        file_list = train_file_list
    elif data_attr == 'dev':
        train_val_ratio = 1
        file_list = dev_file_list
    elif data_attr == 'test':
        train_val_ratio = 1
        file_list = test_file_list
    else:
        print('Wrong data attr...')
        return
    
    total_file_list = train_file_list + dev_file_list + test_file_list
    
    total_size = get_size(data_attr)
    train_size = int(total_size * train_val_ratio)
    val_size = total_size - train_size
    
    
    print("Total size : {}, train size : {}, val_size : {}".format(total_size, train_size, val_size))

    # make img_feat
    if col_name == 'img_feat':
        !free
        gen_file_name = 'X_'+ str(data_attr) +'_' + str(col_name) + '.h5py'
        print("Make {} dataset : {}, Generated file : {}".format(data_attr, col_name, gen_file_name))    
        
        with h5py.File(path+gen_file_name, 'w') as write:            
            write.create_dataset(data_attr, (train_size, max_len), dtype='f4')
            write.create_dataset('val', (val_size, max_len), dtype='f4')
            print("train dataset shape : {}".format(write[data_attr].shape))
            print("val dataset shape : {}".format(write['val'].shape))
            
            start_idx = 0
            tr_idx = 0
            val_idx = 0
            end_idx = 0            
            for file in file_list:
                with h5py.File(path+file, 'r') as read:
                    print("current file : {}".format(file))
                    read_data = np.array(read[data_attr][col_name])
                    # np.random.seed(seed_num)
                    # np.random.shuffle(read_data)
                    print("image_feat's shape of cur file : {}".format(read_data.shape))
                    
                    file_size = len(read_data)
                    
                    tmp_train_size = int(file_size * train_val_ratio)
                    tmp_val_size = file_size - tmp_train_size
                     
                    end_idx += file_size
                    
                    print("start idx: {}, end idx : {}".format(start_idx, end_idx))
                    print("train : target {} / {}".format(tr_idx, tr_idx+tmp_train_size))
                    write[data_attr][tr_idx:tr_idx+tmp_train_size] = read_data[:tmp_train_size]
                    print("val : target {} / {}".format(val_idx, val_idx+tmp_val_size))
                    write['val'][val_idx:val_idx+tmp_val_size] = read_data[tmp_train_size:]
                    start_idx = end_idx
                    tr_idx += tmp_train_size 
                    val_idx += tmp_val_size
                    !free

            print("Finally, saving dataset complete ! {}, shape : {}, {}".format(gen_file_name, write[data_attr].shape, write['val'].shape))
    
    # make price feature
    elif col_name == 'price':
        !free
        gen_file_name = 'X_'+ str(data_attr) +'_' + str(col_name) + '.h5py'
        print("Make {} dataset : {}, Generated file : {}".format(data_attr, col_name, gen_file_name))    
        
        with h5py.File(path+gen_file_name, 'w') as write:            
            write.create_dataset(data_attr, (train_size, ), dtype='i4')
            write.create_dataset('val', (val_size, ), dtype='i4')
            print("train dataset shape : {}".format(write[data_attr].shape))
            print("val dataset shape : {}".format(write['val'].shape))
            
            start_idx = 0
            tr_idx = 0
            val_idx = 0
            end_idx = 0            
            for file in file_list:
                with h5py.File(path+file, 'r') as read:
                    print("current file : {}".format(file))
                    read_data = np.array(read[data_attr][col_name])
                    # np.random.seed(seed_num)
                    # np.random.shuffle(read_data)
                    print("image_feat's shape of cur file : {}".format(read_data.shape))
                    
                    file_size = len(read_data)
                    
                    tmp_train_size = int(file_size * train_val_ratio)
                    tmp_val_size = file_size - tmp_train_size
                     
                    end_idx += file_size
                    
                    print("start idx: {}, end idx : {}".format(start_idx, end_idx))
                    print("train : target {} / {}".format(tr_idx, tr_idx+tmp_train_size))
                    write[data_attr][tr_idx:tr_idx+tmp_train_size] = read_data[:tmp_train_size]
                    print("val : target {} / {}".format(val_idx, val_idx+tmp_val_size))
                    write['val'][val_idx:val_idx+tmp_val_size] = read_data[tmp_train_size:]
                    start_idx = end_idx
                    tr_idx += tmp_train_size 
                    val_idx += tmp_val_size
                    !free

            print("Finally, saving dataset complete ! {}, shape : {}, {}".format(gen_file_name, write[data_attr].shape, write['val'].shape))

    
    # make y data about b, m, s, d category
    elif col_name in cate_list :
        !free
        gen_file_name = 'y_'+ str(data_attr) + '_' + str(col_name) + '.h5py'
        print("Make {} dataset : {}, Generated file : {}".format(data_attr, col_name, gen_file_name))    
        with h5py.File(path+gen_file_name, 'w') as write:
            write.create_dataset(data_attr, (train_size, get_cate_maxsize(col_name)), dtype='?')
            write.create_dataset('val', (val_size, get_cate_maxsize(col_name)), dtype='?')
            print("train dataset shape : {}".format(write[data_attr].shape))
            print("val dataset shape : {}".format(write['val'].shape))
            
            start_idx = 0
            tr_idx = 0
            val_idx = 0
            end_idx = 0  
            for file in file_list:
                with h5py.File(path+file, 'r') as read:
                    print("current file : {}".format(file))
                    read_data = np.array(read[data_attr][col_name])
                    # np.random.seed(seed_num)
                    # np.random.shuffle(read_data)
                    print("cate's shape of cur file (before categorical): {}".format(read_data.shape))
                    
                    file_size = len(read_data)
                    tmp_train_size = int(file_size * train_val_ratio)
                    tmp_val_size = file_size - tmp_train_size
                    end_idx += file_size
                    
                    print("start idx: {}, end idx : {}".format(start_idx, end_idx))                    
                    
                    if col_name in ('bcateid', 'mcateid'):
                        to_cate = to_categorical(read_data, num_classes = get_cate_maxsize(col_name) + 1) # include zero
                        print("shape (after categorical): {}".format(to_cate.shape))
                        write[data_attr][tr_idx:tr_idx+tmp_train_size] = to_cate[:tmp_train_size, 1:].astype('?') # delete zero columns
                        write['val'][val_idx:val_idx+tmp_val_size] = to_cate[tmp_train_size:, 1:].astype('?')
                        
                    elif col_name in ('scateid', 'dcateid'):
                        read_data[read_data == -1] = 0 # replace -1 to 0
                        to_cate = to_categorical(read_data, num_classes = get_cate_maxsize(col_name))
                        write[data_attr][tr_idx:tr_idx+tmp_train_size] = to_cate[:tmp_train_size].astype('?')
                        write['val'][val_idx:val_idx+tmp_val_size] = to_cate[tmp_train_size:].astype('?')
                        
                    print("train : target {} / {}".format(tr_idx, tr_idx+tmp_train_size))
                    print("val : target {} / {}".format(val_idx, val_idx+tmp_val_size))
                    start_idx = end_idx
                    tr_idx += tmp_train_size 
                    val_idx += tmp_val_size
                    !free
                    
            print("Finally, saving dataset complete ! {}, shape : {}, {}".format(gen_file_name, write[data_attr].shape, write['val'].shape))
            

    
    # make input data ( product, maker, brand, model )
    elif col_name in col_list:     
        !free
        gen_file_name = 'X_' + str(data_attr) + '_' + str(col_name) + '.h5py'
        print("Make {} dataset : {}, Generated file : {}".format(data_attr, col_name, gen_file_name))    
   
        with h5py.File(path+gen_file_name, 'w') as write:
            write.create_dataset(data_attr, (train_size, max_len), dtype='i4')
            write.create_dataset('val', (val_size, max_len), dtype='i4')
            print("train dataset shape : {}".format(write[data_attr].shape))
            print("val dataset shape : {}".format(write['val'].shape))        

            tok_data = Tokenizer(num_words=num_words)
            total_list = []

            print("Start fit on text... about {}".format(col_name))
            
            for file in total_file_list:
                with h5py.File(path+file, 'r') as read:
                    read_data = np.array(read[file.split('.')[0]][col_name]) # read file 
                    # np.random.seed(seed_num)
                    # np.random.shuffle(read_data)                    
                    print("current file : {}".format(file))
                    data_list = [ s.decode('utf-8') for s in read_data ]
                    if (file.split('.')[0] == data_attr):
                        print("{} file is stored in total_list...".format(file))
                        total_list += data_list
                    
                    tok_data.fit_on_texts(data_list)

                    !free

            print("fit on text Done...")
            print("Start to make sequence...")
            tokenized = tok_data.texts_to_sequences(total_list)
            !free
            print("padding...")
            X_tr_data = pad_sequences(tokenized, maxlen=max_len)
            print("Shape of saving dataset : {}".format(X_tr_data.shape))
            !free

            print("Save dataset...")
            
            # train data / val data
            chunk_size = 1000000
            file_cnt = int(total_size / chunk_size)
            if total_size % chunk_size != 0:
                file_cnt += 1
            st = 0
            ed = 0
            
            for i in range(file_cnt):
                if i == file_cnt - 1 :
                    
                    tr_st  = i * int(chunk_size * train_val_ratio)
                    val_st = i * int(chunk_size * round(1 - train_val_ratio, 1))
                    div = int((total_size % chunk_size) * train_val_ratio)
                    st = i * chunk_size
                    ed = (i+1) * chunk_size                    
         
                    print("train : target {} / {}, origin {} / {}".format(tr_st, train_size, st, st+div))
                    print("val : target {} / {}, origin {} / {}".format(val_st, val_size, st+div, total_size))
                    
                    write[data_attr][tr_st:train_size] = X_tr_data[st:st+div]
                    write['val'][val_st:val_size] = X_tr_data[st+div:total_size]
                    
                    break
                else:
                    ed += chunk_size
                    div = int(chunk_size * train_val_ratio)
                    
                    tr_st = int(st * train_val_ratio)
                    tr_ed = int(ed * train_val_ratio)
                    val_st = int(st * round(1-train_val_ratio, 1))
                    val_ed = int(ed * round(1-train_val_ratio, 1))
                    
                    print("train : target {} / {}, origin {} / {}".format(tr_st, tr_ed, st, st+div))
                    print("val : target {} / {}, origin {} / {}".format(val_st, val_ed, st+div, ed))
                    
                    write[data_attr][tr_st:tr_ed] = X_tr_data[st:st+div]
                    write['val'][val_st:val_ed] = X_tr_data[st+div:ed]
                    st = ed
                    
            print("Finally, saving dataset complete ! : {}".format(gen_file_name))

        del tok_data
        del total_list
        del tokenized
        del X_tr_data
    
    # make concate string
    elif col_name == 'concate':
        !free
        gen_file_name = 'X_' + str(data_attr) + '_' + str(col_name) + '.h5py'
        print("Make {} dataset : {}, Generated file : {}".format(data_attr, col_name, gen_file_name))    
   
        with h5py.File(path+gen_file_name, 'w') as write:
            write.create_dataset(data_attr, (train_size, max_len), dtype='i4')
            write.create_dataset('val', (val_size, max_len), dtype='i4')
            print("train dataset shape : {}".format(write[data_attr].shape))
            print("val dataset shape : {}".format(write['val'].shape))        

            tok_data = Tokenizer(num_words=num_words)
            total_list = []

            print("Start fit on text... about {}".format(col_name))
            
            for file in total_file_list:
                with h5py.File(path+file, 'r') as read:
                    print("current file : {}".format(file))
                    print("target col : {}".format('product'))
                    target = pd.Series(read[file.split('.')[0]]['product'][:], dtype='O').str.decode('utf-8') #product
                    print("current col : {}".format('brand'))
                    other = pd.Series(read[file.split('.')[0]]['brand'][:], dtype='O').str.decode('utf-8') # read file
                    target = target.str.cat(other, sep=' ')
                    print("concate data shape : {}".format(target.shape))    
                    # read_data = np.array(read[file.split('.')[0]][col_name]) # read file 
                    # np.random.seed(seed_num)
                    # np.random.shuffle(read_data)                    
                    data_list = [ s for s in target ]
                    
                    if (file.split('.')[0] == data_attr):
                        print("{} file is stored in total_list...".format(file))
                        total_list += data_list
                    
                    tok_data.fit_on_texts(data_list)

                    !free

            print("fit on text Done...")
            print("Start to make sequence...")
            tokenized = tok_data.texts_to_sequences(total_list)
            !free
            print("padding...")
            X_tr_data = pad_sequences(tokenized, maxlen=max_len)
            print("Shape of saving dataset : {}".format(X_tr_data.shape))
            !free

            print("Save dataset...")
            
            # train data / val data
            chunk_size = 1000000
            file_cnt = int(total_size / chunk_size)
            if total_size % chunk_size != 0:
                file_cnt += 1
            st = 0
            ed = 0
            
            for i in range(file_cnt):
                if i == file_cnt - 1 :
                    
                    tr_st  = i * int(chunk_size * train_val_ratio)
                    val_st = i * int(chunk_size * round(1 - train_val_ratio, 1))
                    div = int((total_size % chunk_size) * train_val_ratio)
                    st = i * chunk_size
                    ed = (i+1) * chunk_size                    
         
                    print("train : target {} / {}, origin {} / {}".format(tr_st, train_size, st, st+div))
                    print("val : target {} / {}, origin {} / {}".format(val_st, val_size, st+div, total_size))
                    
                    write[data_attr][tr_st:train_size] = X_tr_data[st:st+div]
                    write['val'][val_st:val_size] = X_tr_data[st+div:total_size]
                    
                    break
                else:
                    ed += chunk_size
                    div = int(chunk_size * train_val_ratio)
                    
                    tr_st = int(st * train_val_ratio)
                    tr_ed = int(ed * train_val_ratio)
                    val_st = int(st * round(1-train_val_ratio, 1))
                    val_ed = int(ed * round(1-train_val_ratio, 1))
                    
                    print("train : target {} / {}, origin {} / {}".format(tr_st, tr_ed, st, st+div))
                    print("val : target {} / {}, origin {} / {}".format(val_st, val_ed, st+div, ed))
                    
                    write[data_attr][tr_st:tr_ed] = X_tr_data[st:st+div]
                    write['val'][val_st:val_ed] = X_tr_data[st+div:ed]
                    st = ed
                    
            print("Finally, saving dataset complete ! : {}".format(gen_file_name))

        del tok_data
        del total_list
        del tokenized
        del X_tr_data
        
    elif not col_name in col_list+cate_list :
        print("Wrong col name...")
        return

In [7]:
!free

              total        used        free      shared  buff/cache   available
Mem:      206357056     7687752    96778140       10876   101891164   196835908
Swap:             0           0           0


In [37]:
make_dataset('train', col_name='product', max_len=32, num_words=100000)
make_dataset('train', col_name='maker', max_len=32, num_words=100000)
make_dataset('train', col_name='brand', max_len=32, num_words=100000)
make_dataset('train', col_name='model', max_len=32, num_words=100000)
make_dataset('train', col_name='concate', max_len=32, num_words=100000)

# make_dataset('train', col_name='price')
# make_dataset('train', col_name='img_feat', max_len=2048)

# make_dataset('train', col_name='bcateid')
# make_dataset('train', col_name='mcateid')
# make_dataset('train', col_name='scateid')
# make_dataset('train', col_name='dcateid')


Total size : 8134818, train size : 6507854, val_size : 1626964
              total        used        free      shared  buff/cache   available
Mem:      206357056     2129712    83370120       10872   120857224   202378484
Swap:             0           0           0
Make train dataset : concate, Generated file : X_train_concate.h5py
train dataset shape : (6507854, 32)
val dataset shape : (1626964, 32)
Start fit on text... about concate
current file : train.chunk.01
target col : product
current col : brand
concate data shape : (1000000,)
train.chunk.01 file is stored in total_list...
              total        used        free      shared  buff/cache   available
Mem:      206357056     2801968    82697896       10872   120857192   201706220
Swap:             0           0           0
current file : train.chunk.02
target col : product
current col : brand
concate data shape : (1000000,)
train.chunk.02 file is stored in total_list...
              total        used        free      shared 

In [9]:
# make_dataset('dev', col_name='product', max_len=32, num_words=100000)
# make_dataset('dev', col_name='maker', max_len=32, num_words=100000)
# make_dataset('dev', col_name='brand', max_len=32, num_words=100000)
# make_dataset('dev', col_name='model', max_len=32, num_words=100000)

make_dataset('dev', col_name='concate', max_len=32, num_words=200000)

# make_dataset('dev', col_name='img_feat', max_len=2048)

Total size : 507783, train size : 507783, val_size : 0
              total        used        free      shared  buff/cache   available
Mem:      206357056     7046740    89396296       10876   109914020   197461476
Swap:             0           0           0
Make dev dataset : concate, Generated file : X_dev_concate.h5py
train dataset shape : (507783, 32)
val dataset shape : (0, 32)
Start fit on text... about concate
current file : train.chunk.01
target col : product
current col : brand
concate data shape : (1000000,)
              total        used        free      shared  buff/cache   available
Mem:      206357056     7638740    88804516       10876   109913800   196869476
Swap:             0           0           0
current file : train.chunk.02
target col : product
current col : brand
concate data shape : (1000000,)
              total        used        free      shared  buff/cache   available
Mem:      206357056     8019600    88423712       10876   109913744   196488612
Swap:    

In [None]:
# make_dataset('test', col_name='product', max_len=32, num_words=100000)
# make_dataset('test', col_name='maker', max_len=32, num_words=100000)
# make_dataset('test', col_name='brand', max_len=32, num_words=100000)
# make_dataset('test', col_name='model', max_len=32, num_words=100000)

# make_dataset('test', col_name='price')
# make_dataset('test', col_name='img_feat', max_len=2048)


In [None]:
!freedd

In [6]:
h = h5py.File('origin/train.chunk.01', 'r')

In [None]:
h['train']['product'][1].decode('utf-8')

In [None]:
h = h5py.File('y_train_scateid.h5py', 'r')

In [None]:
h['train']

In [None]:
aa = np.array(h['train'])

In [None]:
ss = np.argmax(aa, axis=1)

In [None]:
len(ss[ss == 0])

In [None]:
!free

In [34]:
s1 = pd.Series(h['train']['product'][:], dtype='O').str.decode('utf-8')
# s2 = pd.Series(h['train']['maker'][:10], dtype='O').str.decode('utf-8')
# s3 = pd.Series(h['train']['brand'][:10], dtype='O').str.decode('utf-8')
# s4 = pd.Series(h['train']['model'][:10], dtype='O').str.decode('utf-8')


In [28]:
data

[0                      직소퍼즐 - 1000조각 바다거북의 여행 (PL1275)
 1    [모리케이스]아이폰6S/6S+ tree farm101 - 다이어리케이스[바보사랑][...
 2                              크리비아 기모 3부 속바지 GLG4314P
 3        [하프클럽/잭앤질]남성 솔리드 절개라인 포인트 포켓 팬츠 31133PT002_NA
 4                          코드프리혈당시험지50매/코드프리시험지/최장유효기간
 5                    아트박스 POOM/낭만창고  idk385-시원한 맥주 캬하~
 6                    데버스 뉴 캠핑 BBQ 글러브 DVC E1209N 캠핑 등산
 7                          엘르스포츠 여성 비키니2PCS ETFLB06NVY
 8             [패션플러스][GEOX][GEOX] 제옥스 GH-405 블랙펄  클러치백
 9                          [아트박스 POOM/꾸밈] iz099-우럭아왜우럭
 dtype: object]

In [14]:
s3 = s1.str.cat(s2, sep=' ')

In [23]:
s2

0    상품상세설명 참조
1    MORY|해당없음
2             
3       ㈜크리스패션
4           기타
5             
6           기타
7           기타
8          제옥스
9           꾸밈
dtype: object

In [22]:
s3[:10]

0    퍼즐라이프
1     바보사랑
2     크리비아
3      잭앤질
4         
5         
6         
7    엘르스포츠
8      제옥스
9       꾸밈
dtype: object

In [24]:
s4[:10]

0                           퍼즐라이프 직소퍼즐 바다거북의 여행
1    아이폰6S/6S+ tree farm101 - 다이어리케이스|아이폰6S/6S+
2                       크리비아 기모 3부 속바지 GLG4314P
3     [잭앤질] 남성 솔리드 절개라인 포인트 포켓 팬츠 31133PT002_NA
4                              SD코드프리혈당시험지[50매]
5             아트박스 POOM/낭만창고  idk385-시원한 맥주 캬하~
6                                            기타
7                                    ETFLB06NVY
8                                              
9                           인테리어액자-iz099-우럭아왜우럭
dtype: object