In [None]:
# Install the Python packages this notebook's environment is expected to have.
# NOTE(review): versions are not pinned, and sentencepiece / hgtk / gluonnlp /
# KoBERT are not referenced by any cell visible in this notebook — confirm
# they are still needed.
!pip install pandas
!pip install sentencepiece
!pip install hgtk
!pip install gluonnlp

# Fetch the KoBERT sources, then install its requirements and the package itself.
!git clone https://github.com/SKTBrain/KoBERT.git
!pip install -r KoBERT/requirements.txt
!pip install KoBERT/.

## 1. Base Function

## 2. Data-Preprocess

In [None]:
import os

# Dataset root on the mounted drive. The trailing slash is kept because other
# cells build paths with plain string concatenation (BASE_PATH + f1 + '/').
BASE_PATH = '/content/drive/My Drive/googledrive/'

# One sample transcript, used to eyeball the raw annotation format.
# os.path.join is used because BASE_PATH already ends with '/', so the former
# `BASE_PATH + '/' + ...` produced a doubled slash in the path.
TEMP = os.path.join(BASE_PATH, 'KsponSpeech_01', 'KsponSpeech_0001', 'KsponSpeech_000001.txt')
print(TEMP)

# Transcripts are decoded as ms949 throughout this notebook.
with open(TEMP, 'r', encoding='ms949') as f:
    r = f.read()
    print(r)

In [None]:
def bracket_filter(sentence):
    """Strip parentheses and keep only every other parenthesised segment.

    Every '(' and ')' character is removed. Output starts in a "keeping"
    state and each ')' flips that state, so for a pair written as
    ``(A)/(B)`` the text ``A`` is kept while ``/`` and ``B`` are dropped.
    Text outside parentheses is kept whenever the state is "keeping".

    Args:
        sentence: raw transcript string.

    Returns:
        The filtered string.
    """
    kept_chars = []
    keeping = True

    for symbol in sentence:
        if symbol == '(':
            # Opening parentheses never reach the output and never change state.
            continue
        if symbol == ')':
            # A closing parenthesis flips between keeping and dropping.
            keeping = not keeping
            continue
        if keeping:
            kept_chars.append(symbol)

    return ''.join(kept_chars)

In [None]:
# Demo: run bracket_filter on a sample line that contains paired parentheses
# such as "(70%)/(칠십 퍼센트)"; only the first segment of each pair is printed.
test = 'o/ 근데 (70%)/(칠십 퍼센트)가 커 보이긴 하는데 (200)/(이백) 벌다 (140)/(백 사십) 벌면 빠+ 빡셀걸? b/'
print(bracket_filter(test))

In [None]:
def special_filter(sentence):
    """Remove annotation symbols from a transcript string.

    Processing, character by character:
      * a letter from {o, n, u, b, l} that is immediately followed by '/'
        is dropped (the 'o/', 'n/', ... tags);
      * '+' is replaced by ',';
      * any of / * - @ $ ^ & [ ] = : ; is dropped;
      * everything else is kept.
    The result is stripped and each run of two or more whitespace
    characters is collapsed into one space.

    Args:
        sentence: transcript string (parentheses are not touched here).

    Returns:
        The cleaned string.
    """
    import re

    noise_tags = ('o', 'n', 'u', 'b', 'l')
    dropped = ('/', '+', '*', '-', '@', '$', '^', '&', '[', ']', '=', ':', ';')

    pieces = []
    last_pos = len(sentence) - 1
    for pos, symbol in enumerate(sentence):
        # Handle 'o/', 'n/', etc.: skip the tag letter; the '/' itself is
        # removed below because it is in the dropped set.
        if pos < last_pos and symbol in noise_tags and sentence[pos + 1] == '/':
            continue
        # (A disabled variant of this function mapped the 'l' tag to an
        # explicit "(웃으며)" marker instead of dropping it.)
        if symbol == '+':
            pieces.append(',')
        elif symbol not in dropped:
            pieces.append(symbol)

    cleaned = ''.join(pieces).strip()
    return re.sub(r'\s\s+', ' ', cleaned)

In [None]:
# Demo: run special_filter alone on the sample line. Parentheses are not in
# its drop list, so they remain in the printed result.
test = 'o/ 근데 (70%)/(칠십 퍼센트)가 커 보이긴 하는데 (200)/(이백) 벌다 (140)/(백 사십) 벌면 빠+ 빡셀걸? b/'
print(special_filter(test))

In [None]:
def sentence_filter(raw_sentence):
    """Clean a raw transcript line.

    Applies bracket_filter first (resolving parenthesised pairs) and then
    special_filter (removing annotation symbols) to its result.

    Args:
        raw_sentence: transcript text as read from a transcript file.

    Returns:
        The cleaned sentence string.
    """
    without_brackets = bracket_filter(raw_sentence)
    return special_filter(without_brackets)

In [None]:
# Demo: full cleaning pipeline (bracket_filter followed by special_filter)
# on the sample line.
test = 'o/ 근데 (70%)/(칠십 퍼센트)가 커 보이긴 하는데 (200)/(이백) 벌다 (140)/(백 사십) 벌면 빠+ 빡셀걸? b/'
print(sentence_filter(test))

In [None]:
# Demo: apply the full cleaning pipeline to the sample transcript file (TEMP).
with open(TEMP, 'r', encoding='ms949') as f:
   r = f.read()
   print(sentence_filter(r))

## 3. Create Character labels

In [None]:
import os
from collections import Counter

import pandas as pd

BASE_PATH = '/content/drive/My Drive/googledrive/'

# Restricted to one top-level folder for testing. To process every folder use:
#   f1_names = [t for t in os.listdir(BASE_PATH) if 'KsponSpeech_0' in t]
f1_names = ['KsponSpeech_01']
print(f1_names)

# Occurrence count of every character found in the cleaned transcripts.
char_counter = Counter()

for f1 in f1_names:
    print(f'----- {f1} started... ')
    f1_dir = os.path.join(BASE_PATH, f1)
    for f2 in os.listdir(f1_dir):
        print(f'---------- {f2} started... ')
        f2_dir = os.path.join(f1_dir, f2)
        for fn in os.listdir(f2_dir):
            # Only transcript files are counted. The generated
            # 'KsponSpeech_label_*.txt' files also end in .txt and live in the
            # same folders, so they must be skipped or a re-run of this cell
            # would count the digits of the id sequences as characters.
            if '.txt' not in fn or 'KsponSpeech_label_' in fn:
                continue
            with open(os.path.join(f2_dir, fn), 'r', encoding='ms949') as f:
                sentence = sentence_filter(f.readline())
            char_counter.update(sentence)

# Most frequent first; ties are ordered by character, descending — the same
# order the previous sorted(zip(freq, char), reverse=True) produced.
ranked = sorted(char_counter.items(), key=lambda item: (item[1], item[0]), reverse=True)
label_list = tuple(ch for ch, _ in ranked)
label_freq = tuple(freq for _, freq in ranked)

# ids 0-2 are reserved for the special tokens; real characters start at id 3.
label = {'id': [0, 1, 2], 'char': ['_', '<s>', '</s>'], 'freq': [0, 0, 0]}
for idx, (ch, freq) in enumerate(ranked):
    label['id'].append(idx + 3)
    label['char'].append(ch)
    label['freq'].append(freq)

label_df = pd.DataFrame(label)
label_df.to_csv('aihub_labels.csv', encoding='ms949', index=False)
print(label_df)

## 4. Create target text

In [None]:
import pandas as pd

def load_label(file_path):
    """Load the character label table and build both lookup directions.

    The CSV is read with ms949 encoding and must provide the columns
    'id', 'char' and 'freq' (the 'freq' values are read but not used).

    Args:
        file_path: path of the label CSV.

    Returns:
        (char2id, id2char): a dict mapping each character to its id, and a
        dict mapping each id back to its character.
    """
    table = pd.read_csv(file_path, encoding='ms949')
    char2id, id2char = {}, {}

    rows = zip(table['id'], table['char'], table['freq'])
    for label_id, label_char, _freq in rows:
        char2id[label_char] = label_id
        id2char[label_id] = label_char

    return char2id, id2char

In [None]:
def sentence_to_target(sentence, char2id):
    """Encode a sentence as a space-separated string of character ids.

    Args:
        sentence: text to encode, one id per character.
        char2id: mapping from character to id.

    Returns:
        The ids joined by single spaces ('' for an empty sentence).

    Raises:
        KeyError: if a character of the sentence is missing from char2id.
    """
    return ' '.join(str(char2id[symbol]) for symbol in sentence)

In [None]:
def target_to_sentence(target, id2char):
    """Decode a whitespace-separated id string back into text.

    Args:
        target: string of integer ids separated by whitespace.
        id2char: mapping from integer id to character.

    Returns:
        The decoded sentence ('' when target holds no ids).

    Raises:
        KeyError: if an id is missing from id2char.
        ValueError: if a token is not an integer literal.
    """
    return ''.join(id2char[int(token)] for token in target.split())

In [None]:
# Round-trip check: encode a sentence to ids, then decode it back.
# NOTE(review): this cell reads '/content/aihub_labels.csv', while the other
# cells write and read the relative 'aihub_labels.csv'; they refer to the same
# file only if the working directory is /content — confirm.
file_path = '/content/aihub_labels.csv'
char2id, id2char = load_label(file_path)

# sentence_to_target raises KeyError if any character below is not in the label file.
test = '오늘 뭐 먹지?'

a = sentence_to_target(test, char2id)
print(a)

b = target_to_sentence(a, id2char)
print(b)

In [None]:
import os

import pandas as pd

BASE_PATH = '/content/drive/My Drive/googledrive/'

# Restricted to one top-level folder for testing. To process every folder use:
#   f1_names = [t for t in os.listdir(BASE_PATH) if 'KsponSpeech_0' in t]
f1_names = ['KsponSpeech_01']
print(f1_names)

char2id, id2char = load_label('aihub_labels.csv')
# Number of transcript files processed; the train/test split size is derived from it.
total_fn = 0

for f1 in f1_names:
    print(f'----- {f1} started... ')
    f1_dir = os.path.join(BASE_PATH, f1)
    for f2 in os.listdir(f1_dir):
        print(f'---------- {f2} started... ')
        f2_dir = os.path.join(f1_dir, f2)
        for fn in os.listdir(f2_dir):
            # Skip the label files this cell itself writes: they also end in
            # .txt, so on a re-run they would be counted in total_fn and
            # re-encoded as if they were transcripts.
            if '.txt' not in fn or 'KsponSpeech_label_' in fn:
                continue
            total_fn += 1
            with open(os.path.join(f2_dir, fn), 'r', encoding='ms949') as f:
                sentence = sentence_filter(f.readline())
            # Encode before opening the output so that a failure (e.g. a
            # character missing from the label table) does not leave an
            # empty label file behind.
            target = sentence_to_target(sentence, char2id)
            # 'KsponSpeech_000001.txt' -> 'KsponSpeech_label_000001.txt'
            label_fn = 'KsponSpeech_label_' + fn.split('_')[1][:6] + '.txt'
            with open(os.path.join(f2_dir, label_fn), 'w', encoding='ms949') as f:
                f.write(target)
print('----- ended!!! ')
print(total_fn)

## 5. Create data list

In [None]:
import pandas as pd

# Label table written by the character-label cell (columns: id, char, freq).
aihub_labels = pd.read_csv('aihub_labels.csv', encoding='ms949')
df = aihub_labels  # same table; the name is kept because this cell used to define it

# Rare characters are those seen exactly once. They are selected by mask
# rather than by position: the former slice start `len(df) - n_rare + 1`
# was off by one and skipped the first freq == 1 row.
rare_mask = aihub_labels['freq'] == 1
rare_labels = aihub_labels['char'][rare_mask]
start1 = len(aihub_labels) - int(rare_mask.sum())

# total_fn is computed by the target-text cell above (number of transcripts).
train_num = int(total_fn * 0.98)
test_num = total_fn - train_num

train_data_list = {'audio': [], 'label': []}
test_data_list = {'audio': [], 'label': []}

In [None]:
import os

# Parallel lists: audio_paths[i] and target_paths[i] belong to the same utterance.
audio_paths = []
target_paths = []

# Restricted to one top-level folder for testing. To process every folder use:
#   f1_names = [t for t in os.listdir(BASE_PATH) if 'KsponSpeech_0' in t]
f1_names = ['KsponSpeech_01']
print(f1_names)

for f1 in f1_names:
    print(f'----- {f1} started... ')
    f2_names = sorted(os.listdir(BASE_PATH + f1 + '/'))
    for f2 in f2_names:
        print(f'---------- {f2} started... ')
        f_names = sorted(os.listdir(BASE_PATH + f1 + '/' + f2 + '/'))
        present = set(f_names)
        for fn in f_names:
            if '.pcm' not in fn:
                continue
            name_parts = fn.split('_')
            if len(name_parts) < 2:
                # Not named like 'KsponSpeech_000001.pcm'; no label name can be derived.
                continue
            # Derive the label file name from the audio file name (same rule
            # the target-text cell uses when writing label files). The two
            # lists used to be filled independently in os.listdir order, so
            # zipping them could pair an audio file with another utterance's
            # label whenever the listing order differed or a file was missing.
            label_fn = 'KsponSpeech_label_' + name_parts[1][:6] + '.txt'
            if label_fn not in present:
                continue
            audio_paths.append(f1 + '/' + f2 + '/' + fn)
            target_paths.append(f1 + '/' + f2 + '/' + label_fn)

print('----- ended!!!')

In [None]:
import random

# Fixed seed so the train/test split is the same on every run. A private
# Random instance is used so the global random state is left untouched.
SHUFFLE_SEED = 42

# Shuffle audio/label pairs together so each audio stays with its label.
data_paths = list(zip(audio_paths, target_paths))
random.Random(SHUFFLE_SEED).shuffle(data_paths)

if data_paths:
    audio_paths, target_paths = zip(*data_paths)
else:
    # zip(*[]) yields nothing to unpack; keep two empty sequences instead of raising.
    audio_paths, target_paths = (), ()

In [None]:
import os

from tqdm import trange

path = '/content/drive/My Drive/googledrive/'
train_full = False
train_dict = {'audio': [], 'label': []}
test_dict = {'audio': [], 'label': []}

print('started...')
for idx in trange(len(audio_paths)):
    audio = audio_paths[idx]
    target = target_paths[idx]

    # Once the training list has reached train_num entries, everything that
    # remains goes to the test list.
    if len(train_dict['audio']) == train_num:
        train_full = True

    rare_in = False
    if not train_full:
        # The transcript sits next to the audio file with a .txt extension.
        # splitext only strips the final extension; the former
        # split('.')[0] cut the path at the first '.' anywhere in it.
        transcript_path = os.path.splitext(path + audio)[0] + '.txt'
        with open(transcript_path, encoding='ms949') as f:
            sentence = f.readline()
        # Utterances containing a rare character are routed to the test list.
        rare_in = any(rare in sentence for rare in rare_labels)

    bucket = test_dict if (train_full or rare_in) else train_dict
    bucket['audio'].append(audio)
    bucket['label'].append(target)

print('\n\n Ended!!!')

In [None]:
# Turn the collected audio/label path lists into tables.
train_df = pd.DataFrame(train_dict)
test_df = pd.DataFrame(test_dict)

# Write the test list first, then the train list, as ms949 CSV without the index column.
for frame, csv_name in ((test_df, 'test_list.csv'), (train_df, 'train_list.csv')):
    frame.to_csv(csv_name, encoding='ms949', index=False)

## To-Do List
- 전체적으로 실행 후 오류 수정 필요
- special_filter() 함수 수정 필요: 한숨, 침묵 등 처리 방법에 대한 논의 필요
- 테스트를 위해 3개 셀에서 `f1_names = ['KsponSpeech_01']`로 고정해 둠: 전체 데이터로 실행하려면 각 셀의 주석 처리된 `f1_names` 줄로 되돌려야 함