## 1. Install Packages

In [1]:
# !pip install pandas
# !pip install sentencepiece
# !pip install hgtk
# !pip install gluonnlp

# !git clone https://github.com/SKTBrain/KoBERT.git
# !pip install -r KoBERT/requirements.txt
# !pip install KoBERT/.

## 2. Data-Preprocess

In [2]:
# BASE_PATH = '/content/drive/My Drive/googledrive/'
# TEMP = BASE_PATH + '/' + 'KsponSpeech_01' + '/' + 'KsponSpeech_0001' + '/' + 'KsponSpeech_000001' + '.txt'
# print(TEMP)

# with open(TEMP, 'r', encoding='ms949') as f:
#    r = f.read()
#    print(r)

In [3]:
def bracket_filter(sentence):
    """Reduce ``(A)/(B)`` dual transcriptions to their first alternative.

    The scan works with a single keep/skip switch that starts in the "keep"
    state:

    * every ``'('`` is discarded and leaves the switch untouched;
    * every ``')'`` is discarded and flips the switch;
    * any other character is emitted only while the switch is in "keep".

    For ``'(70%)/(seventy percent)'`` this yields ``'70%'``: the first
    closing bracket turns output off, so the ``/`` and the second bracketed
    part are dropped, and the second closing bracket turns output back on.

    Args:
        sentence: Raw transcript text.

    Returns:
        The text with brackets removed and every second alternative dropped.
    """
    kept = []
    keeping = True

    for ch in sentence:
        if ch == '(':
            # Opening brackets never reach the output and never change state.
            continue
        if ch == ')':
            # Closing brackets alternate between "skip" and "keep".
            keeping = not keeping
            continue
        if keeping:
            kept.append(ch)

    return ''.join(kept)

In [4]:
# test = 'o/ 근데 (70%)/(칠십 퍼센트)가 커 보이긴 하는데 (200)/(이백) 벌다 (140)/(백 사십) 벌면 빠+ 빡셀걸? b/'
# print(bracket_filter(test))

In [5]:
def special_filter(sentence):
    """Strip noise tags and special symbols from a transcript.

    Rules, applied character by character:

    * A letter from ``o n u b l`` that is immediately followed by ``'/'`` is
      treated as a noise tag and dropped; for the laughter tag ``l/`` a
      ``'^'`` is emitted in its place.
    * ``'+'`` is replaced by ``','``.
    * Every character in the drop list below is removed. This covers the
      ``'/'`` left over from a noise tag and any ``'^'`` already present in
      the input.

    Finally the result is stripped and each run of two or more whitespace
    characters is collapsed into a single space.

    Args:
        sentence: Transcript text, normally the output of ``bracket_filter``.

    Returns:
        The cleaned text.
    """
    import re

    sentence_marks = ('.', '?', ',', '!')
    noise_letters = ('o', 'n', 'u', 'b', 'l')
    dropped = ('/', '+', '*', '-', '@', '$', '^', '&', '[', ']', '=', ':', ';')

    pieces = []
    last_pos = len(sentence) - 1
    for pos, ch in enumerate(sentence):
        # Noise tags such as "o/" or "n/": a noise letter directly before '/'.
        is_noise_tag = (
            ch not in sentence_marks
            and pos < last_pos
            and ch in noise_letters
            and sentence[pos + 1] == '/'
        )
        if is_noise_tag:
            # The laughter tag "l/" is represented by '^' in the output.
            if ch == 'l':
                pieces.append('^')
            continue

        if ch == '+':
            pieces.append(',')
        if ch not in dropped:
            pieces.append(ch)

    joined = ''.join(pieces).strip()
    return re.sub(r'\s\s+', ' ', joined)

In [6]:
# test = 'o/ 근데 (70%)/(칠십 퍼센트)가 커 보이긴 하는데 (200)/(이백) 벌다 (140)/(백 사십) 벌면 빠+ 빡셀걸? b/'
# print(special_filter(test))

In [7]:
def sentence_filter(raw_sentence):
    """Clean a raw transcript line.

    Applies ``bracket_filter`` first and then ``special_filter`` to its
    result.

    Args:
        raw_sentence: Transcript text as read from the corpus file.

    Returns:
        The text after both filters have been applied.
    """
    without_brackets = bracket_filter(raw_sentence)
    return special_filter(without_brackets)

In [8]:
# test = 'o/ 근데 (70%)/(칠십 퍼센트)가 커 보이긴 하는데 (200)/(이백) 벌다 (140)/(백 사십) 벌면 빠+ 빡셀걸? l/'
# print(sentence_filter(test))

In [9]:
# with open(TEMP, 'r', encoding='ms949') as f:
#    r = f.read()
#    print(sentence_filter(r))

## 3. Create Character labels

In [10]:
import os
from collections import Counter

import pandas as pd

# BASE_PATH = '/content/drive/My Drive/googledrive/'
BASE_PATH = '../'

# Character frequencies over all filtered transcripts. A Counter updates in
# constant time per character; the previous list-based bookkeeping searched
# the whole vocabulary for every character read.
char_counter = Counter()

temp = os.listdir(BASE_PATH)
# Keep directories only: a stray file whose name contains 'KsponSpeech_0'
# (for example an archive) would make os.listdir() fail further down.
f1_names = [t for t in temp if 'KsponSpeech_0' in t and os.path.isdir(BASE_PATH + t)]
# f1_names = ['KsponSpeech_01'] ### test
print(f1_names)

for f1 in f1_names:
    print(f'----- {f1} started... ')
    f1_dir = BASE_PATH + f1 + '/'
    for f2 in os.listdir(f1_dir):
        f2_dir = f1_dir + f2 + '/'
        if 'KsponSpeech_' not in f2 or not os.path.isdir(f2_dir):
            continue
        print(f'---------- {f2} started... ')
        for fn in os.listdir(f2_dir):
            # Only transcripts are counted; generated '*_label_*' files are skipped.
            if ('.txt' not in fn) or ('_label_' in fn):
                continue
            with open(f2_dir + fn, 'r', encoding='ms949') as f:
                # Only the first line of each transcript file is used.
                sentence = sentence_filter(f.readline())
            char_counter.update(sentence)

print('\n\n')
print('----- ⚡ Create Label list/freq Ended !!!')

if not char_counter:
    # Previously this surfaced as an opaque "not enough values to unpack".
    raise ValueError(f'No transcript characters found under {BASE_PATH!r}; check BASE_PATH.')

# Most frequent first; ties are broken by character in descending order,
# which is exactly what sorting (freq, char) pairs in reverse gives.
ranked = sorted(((freq, ch) for ch, freq in char_counter.items()), reverse=True)
label_freq = tuple(freq for freq, _ in ranked)
label_list = tuple(ch for _, ch in ranked)

# Ids 0-2 are reserved for the special tokens; real characters start at 3.
label = {'id': [0, 1, 2], 'char': ['_', '<s>', '</s>'], 'freq': [0, 0, 0]}
for idx, (ch, freq) in enumerate(zip(label_list, label_freq)):
    label['id'].append(idx+3)
    label['char'].append(ch)
    label['freq'].append(freq)

label_df = pd.DataFrame(label)
label_df.to_csv(BASE_PATH + 'aihub_labels.csv', encoding='ms949', index=False)

print('\n\n')
print('----- ⚡ Create aihub_labels Ended !!!')
print(label_df)

--------- KsponSpeech_0137 started... 
---------- KsponSpeech_0138 started... 
---------- KsponSpeech_0139 started... 
---------- KsponSpeech_0140 started... 
---------- KsponSpeech_0141 started... 
---------- KsponSpeech_0142 started... 
---------- KsponSpeech_0143 started... 
---------- KsponSpeech_0144 started... 
---------- KsponSpeech_0145 started... 
---------- KsponSpeech_0146 started... 
---------- KsponSpeech_0147 started... 
---------- KsponSpeech_0148 started... 
---------- KsponSpeech_0149 started... 
---------- KsponSpeech_0150 started... 
---------- KsponSpeech_0151 started... 
---------- KsponSpeech_0152 started... 
---------- KsponSpeech_0153 started... 
---------- KsponSpeech_0154 started... 
---------- KsponSpeech_0155 started... 
---------- KsponSpeech_0156 started... 
---------- KsponSpeech_0157 started... 
---------- KsponSpeech_0158 started... 
---------- KsponSpeech_0159 started... 
---------- KsponSpeech_0160 started... 
---------- KsponSpeech_0161 started... 
-

## 4. Create target text

In [11]:
import pandas as pd

def load_label(file_path):
    """Load the character label table and build both lookup directions.

    The CSV is read with the ``ms949`` encoding and must provide the columns
    ``id`` and ``char``. Any other column (such as ``freq``) is ignored, so a
    label file without it loads as well.

    Args:
        file_path: Path of the label CSV.

    Returns:
        A ``(char2id, id2char)`` tuple of dictionaries. If a character or id
        occurs more than once, the last row wins.
    """
    ch_labels = pd.read_csv(file_path, encoding='ms949')
    ids = ch_labels['id']
    chars = ch_labels['char']

    # dict(zip(...)) avoids the manual loop and no longer shadows builtin `id`.
    char2id = dict(zip(chars, ids))
    id2char = dict(zip(ids, chars))
    return char2id, id2char

In [12]:
def sentence_to_target(sentence, char2id):
    """Encode a sentence as space-separated label ids.

    Args:
        sentence: Text to encode, one label per character.
        char2id: Mapping from character to label id.

    Returns:
        The ids joined by single spaces; an empty string for empty input.

    Raises:
        KeyError: If a character is missing from ``char2id``.
    """
    return ' '.join(str(char2id[ch]) for ch in sentence)

In [13]:
def target_to_sentence(target, id2char):
    """Decode a whitespace-separated id string back into text.

    Args:
        target: Label ids separated by whitespace.
        id2char: Mapping from integer label id to character.

    Returns:
        The characters for each id, concatenated in order.

    Raises:
        KeyError: If an id is missing from ``id2char``.
        ValueError: If a token cannot be parsed as an integer.
    """
    return ''.join(id2char[int(token)] for token in target.split())

In [14]:
# file_path = '/content/aihub_labels.csv'
# char2id, id2char = load_label(file_path)

# test = '오늘 뭐 먹지?'

# a = sentence_to_target(test, char2id)
# print(a)

# b = target_to_sentence(a, id2char)
# print(b)

In [15]:
import pandas as pd
import os

'''
BASE_PATH = '/content/drive/My Drive/googledrive/'

temp = os.listdir(BASE_PATH)
# f1_names = [t for t in temp if 'KsponSpeech_0' in t]
f1_names = ['KsponSpeech_01'] ### test
print(f1_names)
'''

# Write one 'KsponSpeech_label_XXXXXX.txt' next to every transcript, holding
# the filtered first line encoded as label ids. BASE_PATH and f1_names are
# taken from the label-creation cell.
char2id, id2char = load_label(BASE_PATH + 'aihub_labels.csv')
total_fn = 0  # number of transcripts processed; used later for the split size

for f1 in f1_names:
    print(f'----- {f1} started... ')
    f1_dir = BASE_PATH + f1 + '/'
    for f2 in os.listdir(f1_dir):
        if 'KsponSpeech_' not in f2:
            continue
        print(f'---------- {f2} started... ')
        f2_dir = f1_dir + f2 + '/'
        for fn in os.listdir(f2_dir):
            # Skip anything that is not a transcript, including label files
            # written by a previous run of this cell.
            is_transcript = ('.txt' in fn) and ('_label_' not in fn)
            if not is_transcript:
                continue
            total_fn += 1
            with open(f2_dir + fn, 'r', encoding='ms949') as src:
                sentence = sentence_filter(src.readline())
            # The label file reuses the first six characters after the first '_'.
            label_path = f2_dir + 'KsponSpeech_label_' + fn.split('_')[1][:6] + '.txt'
            with open(label_path, 'w', encoding='ms949') as dst:
                target = sentence_to_target(sentence, char2id)
                dst.write(target)

print('\n\n')
print(f'total_fn: {total_fn}')
print('----- ⚡Create Target Text Ended !!! ')

.. 
---------- KsponSpeech_0128 started... 
---------- KsponSpeech_0129 started... 
---------- KsponSpeech_0130 started... 
---------- KsponSpeech_0131 started... 
---------- KsponSpeech_0132 started... 
---------- KsponSpeech_0133 started... 
---------- KsponSpeech_0134 started... 
---------- KsponSpeech_0135 started... 
---------- KsponSpeech_0136 started... 
---------- KsponSpeech_0137 started... 
---------- KsponSpeech_0138 started... 
---------- KsponSpeech_0139 started... 
---------- KsponSpeech_0140 started... 
---------- KsponSpeech_0141 started... 
---------- KsponSpeech_0142 started... 
---------- KsponSpeech_0143 started... 
---------- KsponSpeech_0144 started... 
---------- KsponSpeech_0145 started... 
---------- KsponSpeech_0146 started... 
---------- KsponSpeech_0147 started... 
---------- KsponSpeech_0148 started... 
---------- KsponSpeech_0149 started... 
---------- KsponSpeech_0150 started... 
---------- KsponSpeech_0151 started... 
---------- KsponSpeech_0152 started.

## 5. Create data list

In [16]:
import pandas as pd

# The label table is read once; both names refer to the same frame.
df = pd.read_csv(BASE_PATH + 'aihub_labels.csv', encoding='ms949')
aihub_labels = df

# The table is written with the three special tokens first and then the
# characters in descending frequency, so the frequency-1 characters are the
# last rows. Their first row index is len(df) minus their count. The earlier
# "+ 1" skipped the first of them.
start1 = len(df) - len(df[df.freq == 1])

# total_fn is computed by the target-text cell above and must be available
# in the kernel before this cell runs.
train_num = int(total_fn * 0.98)
test_num = total_fn - train_num

train_data_list = {'audio': [], 'label': []}
test_data_list = {'audio': [], 'label': []}

# Characters that occur exactly once in the corpus.
rare_labels = aihub_labels['char'][start1:]

In [17]:
import os

# Parallel lists: audio_paths[i] is a '.pcm' file and target_paths[i] is the
# matching label file, both relative to BASE_PATH.
audio_paths = []
target_paths = []

'''
temp = os.listdir(BASE_PATH)
# f1_names = [t for t in temp if 'KsponSpeech_0' in t]
f1_names = ['KsponSpeech_01'] ### test
print(f1_names)
'''

for f1 in f1_names:
    print(f'----- {f1} started... ')
    for f2 in os.listdir(BASE_PATH + f1 + '/'):
        if 'KsponSpeech_' not in f2:
            continue
        print(f'---------- {f2} started... ')
        rel_dir = f1 + '/' + f2 + '/'
        for fn in os.listdir(BASE_PATH + rel_dir):
            if '.pcm' not in fn:
                continue
            audio_paths.append(rel_dir + fn)
            # Same naming rule as the cell that wrote the label files.
            target_paths.append(rel_dir + 'KsponSpeech_label_' + fn.split('_')[1][:6] + '.txt')

print('\n\n')
print('----- ⚡ Create audio/target path Ended !!!')

27 started... 
---------- KsponSpeech_0128 started... 
---------- KsponSpeech_0129 started... 
---------- KsponSpeech_0130 started... 
---------- KsponSpeech_0131 started... 
---------- KsponSpeech_0132 started... 
---------- KsponSpeech_0133 started... 
---------- KsponSpeech_0134 started... 
---------- KsponSpeech_0135 started... 
---------- KsponSpeech_0136 started... 
---------- KsponSpeech_0137 started... 
---------- KsponSpeech_0138 started... 
---------- KsponSpeech_0139 started... 
---------- KsponSpeech_0140 started... 
---------- KsponSpeech_0141 started... 
---------- KsponSpeech_0142 started... 
---------- KsponSpeech_0143 started... 
---------- KsponSpeech_0144 started... 
---------- KsponSpeech_0145 started... 
---------- KsponSpeech_0146 started... 
---------- KsponSpeech_0147 started... 
---------- KsponSpeech_0148 started... 
---------- KsponSpeech_0149 started... 
---------- KsponSpeech_0150 started... 
---------- KsponSpeech_0151 started... 
---------- KsponSpeech_01

In [18]:
import random

# Fixed seed so that Restart & Run All reproduces the same train/test split.
SHUFFLE_SEED = 42

# os.listdir() returns entries in arbitrary order, so the pairs are sorted
# first; otherwise the seeded shuffle would still depend on directory order.
data_paths = sorted(zip(audio_paths, target_paths))
# A private Random instance leaves the global random state untouched.
random.Random(SHUFFLE_SEED).shuffle(data_paths)

if data_paths:
    audio_paths, target_paths = zip(*data_paths)
else:
    # zip(*[]) yields nothing to unpack; keep two empty sequences instead.
    audio_paths, target_paths = (), ()

In [19]:
from tqdm import trange

# BASE_PATH = '/content/drive/My Drive/googledrive/'
# Assign every (audio, label) pair to train or test.
#  * Once the train set holds train_num items, everything else goes to test.
#  * Before that, an item goes to test when its transcript line contains any
#    character from rare_labels, and to train otherwise.
# NOTE(review): the check reads the raw first line of the transcript, not the
# sentence_filter() output that the label frequencies were computed from --
# confirm that this is intended.
train_full = False
train_dict = {'audio': [], 'label': []}
test_dict = {'audio': [], 'label': []}

for idx in trange(len(audio_paths)):
    audio, target = audio_paths[idx], target_paths[idx]
    if len(train_dict['audio']) == train_num:
        train_full = True

    if train_full:
        bucket = test_dict
    else:
        # The transcript sits next to the audio file with a '.txt' extension.
        with open(BASE_PATH + audio.split('.')[0]+'.txt', encoding='ms949') as f:
            sentence = f.readline()
        rare_in = any(rare in sentence for rare in rare_labels)
        bucket = test_dict if rare_in else train_dict

    bucket['audio'].append(audio)
    bucket['label'].append(target)

print('\n\n')
print('----- ⚡ Create train/test dict Ended !!!')

100%|██████████| 622545/622545 [08:38<00:00, 1200.99it/s]


----- ⚡ Create train/test dict Ended !!!



In [20]:
# Persist both splits as CSV files next to the corpus (test first, then train).
test_df = pd.DataFrame(test_dict)
train_df = pd.DataFrame(train_dict)

for frame, file_name in ((test_df, 'test_list.csv'), (train_df, 'train_list.csv')):
    frame.to_csv(BASE_PATH + file_name, encoding='ms949', index=False)

In [21]:
# Final banner: blank lines followed by the completion message.
print('\n\n', '----- 👍👍👍 All Preprocess Ended !!!', sep='\n')




----- 👍👍👍 All Preprocess Ended !!!
