In [None]:
!pip install pandas
!pip install sentencepiece
!pip install hgtk
!pip install gluonnlp

!git clone https://github.com/SKTBrain/KoBERT.git
!pip install -r KoBERT/requirements.txt
!pip install KoBERT/.

## 1. Base Functions

In [2]:
def file_num_padding(file_num, width=6):
    """Return ``file_num`` as a zero-padded decimal string.

    Args:
        file_num: File index; expected to be a non-negative integer.
        width: Minimum length of the result. Defaults to 6, which matches
            file names such as ``KsponSpeech_000001``.

    Returns:
        ``str(file_num)`` left-padded with ``'0'`` up to ``width`` characters.
        A value that already needs ``width`` or more characters is returned
        without padding.
    """
    # str.zfill replaces the hand-written if/elif ladder and keeps a leading
    # sign (if any) in front of the padding.
    return str(file_num).zfill(width)

def folder_1_padding(folder_num):
    """Format a first-level folder index as a path segment.

    Values below 10 get a single leading ``'0'``; the result always ends
    with ``'/'`` (for example ``1 -> '01/'`` and ``12 -> '12/'``).
    """
    prefix = '0' if folder_num < 10 else ''
    return prefix + str(folder_num) + '/'

def folder_2_padding(folder_num):
    """Format a second-level folder index as a path segment.

    The number is prefixed with zeros according to its magnitude (three for
    values below 10, two below 100, one below 1000, none otherwise) and the
    result always ends with ``'/'``, e.g. ``1 -> '0001/'``.
    """
    digits = str(folder_num)
    # Thresholds are checked in ascending order; the first one that the value
    # falls under decides how many zeros are prepended.
    for upper_bound, zeros in ((10, '000'), (100, '00'), (1000, '0')):
        if folder_num < upper_bound:
            return zeros + digits + '/'
    return digits + '/'

In [3]:
def get_path(path, fname, folder_1_num, folder_2_num, file_num, format):
    """Assemble the full path of one corpus file.

    The result is ``path`` followed by three ``fname``-prefixed components
    (first-level folder, second-level folder, file number) and finally the
    ``format`` suffix, e.g.
    ``<path>KsponSpeech_01/KsponSpeech_0001/KsponSpeech_000001.txt``.

    Args:
        path: Base directory string; it is concatenated as-is, so it should
            end with a path separator.
        fname: Prefix shared by the folder and file names.
        folder_1_num: First-level folder index.
        folder_2_num: Second-level folder index.
        file_num: File index.
        format: File extension including the dot (the parameter name shadows
            the built-in ``format`` inside this function).

    Returns:
        The concatenated path string.
    """
    segments = (
        path,
        fname, folder_1_padding(folder_1_num),
        fname, folder_2_padding(folder_2_num),
        fname, file_num_padding(file_num),
        format,
    )
    return ''.join(segments)

In [4]:
# Sample inputs for get_path: base directory, shared name prefix and the
# indices of the first folder / sub-folder / file.
BASE_PATH = '/content/drive/My Drive/googledrive/'
fname = 'KsponSpeech_'
folder_1_num = 1
folder_2_num = 1
file_num = 1
# NOTE: this module-level name shadows the built-in format() in the notebook.
format = '.txt'

# TEMP is the sample transcript path that the preprocessing demo cells open.
TEMP = get_path(BASE_PATH, fname, folder_1_num, folder_2_num, file_num, format)
print(TEMP)

/content/drive/My Drive/googledrive/KsponSpeech_01/KsponSpeech_0001/KsponSpeech_000001.txt


## 2. Data Preprocessing

In [5]:
def bracket_filter(sentence):
    """Drop every parenthesised span from ``sentence``.

    Characters from an opening ``'('`` up to and including the next ``')'``
    are removed. A ``')'`` that appears outside any span is removed as well,
    and an unclosed ``'('`` discards the rest of the string.

    Args:
        sentence: Text to filter.

    Returns:
        The text with the parenthesised parts removed.
    """
    kept = []
    inside = False

    for ch in sentence:
        if inside:
            # Everything is skipped until the span is closed.
            if ch == ')':
                inside = False
        elif ch == '(':
            inside = True
        elif ch != ')':
            kept.append(ch)

    return ''.join(kept)

In [6]:
# Read the sample transcript (ms949-encoded) and print it after bracket_filter
# has removed the parenthesised spans.
with open(TEMP, 'r', encoding='ms949') as f:
   r = f.read()
   print(bracket_filter(r))

아/ 몬 소리야, 그건 또. b/



In [7]:
def special_filter(sentence):
    """Strip noise tags and special symbols, then tidy the whitespace.

    Three things happen, in this order:

    1. A noise letter (``o``, ``n``, ``u``, ``b``, ``l``) that is directly
       followed by ``'/'`` is dropped (tags such as ``o/`` or ``n/``).
    2. Every character from the symbol list below is dropped.
    3. The result is stripped and each run of two or more whitespace
       characters is replaced by a single space.

    Args:
        sentence: Text to filter.

    Returns:
        The cleaned text.
    """
    import re

    noise_letters = ('o', 'n', 'u', 'b', 'l')
    dropped_symbols = ('/', '+', '*', '-', '@', '$', '^', '&', '[', ']', '=', ':', ';')

    last_index = len(sentence) - 1
    kept = []
    for pos, ch in enumerate(sentence):
        # Handle noise tags such as o/ and n/: the letter is skipped here and
        # the slash itself is removed as a dropped symbol.
        is_noise_tag = (
            pos < last_index
            and ch in noise_letters
            and sentence[pos + 1] == '/'
        )
        if is_noise_tag or ch in dropped_symbols:
            continue
        kept.append(ch)

    return re.sub(r'\s\s+', ' ', ''.join(kept).strip())

In [8]:
# Read the sample transcript (ms949-encoded) and print it after special_filter
# has removed noise tags and special symbols.
with open(TEMP, 'r', encoding='ms949') as f:
   r = f.read()
   print(special_filter(r))

아 몬 소리야, 그건 또.


In [9]:
def sentence_filter(raw_sentence):
    """Clean a raw transcript line.

    Parenthesised spans are removed first (``bracket_filter``); the result is
    then passed through ``special_filter``.

    Args:
        raw_sentence: Unprocessed transcript text.

    Returns:
        The filtered text.
    """
    without_brackets = bracket_filter(raw_sentence)
    return special_filter(without_brackets)

In [10]:
# Read the sample transcript (ms949-encoded) and print it after the combined
# sentence_filter (bracket_filter followed by special_filter).
with open(TEMP, 'r', encoding='ms949') as f:
   r = f.read()
   print(sentence_filter(r))

아 몬 소리야, 그건 또.


## 3. Create Character labels

In [None]:
from collections import Counter

import pandas as pd
from tqdm import trange

# Corpus location and file naming.
BASE_PATH = '/content/drive/My Drive/googledrive/'
fname = 'KsponSpeech_'
format = '.txt'

# Extent of the scan: first-level folders, sub-folders per folder and files
# per sub-folder (all indices start at 1).
total_f1 = 1
total_f2 = 8
total_fn = 7200

# Character -> number of occurrences. A Counter gives constant-time updates;
# the previous parallel lists needed a linear `in` / `.index()` scan for every
# character that was read.
char_counts = Counter()

print('started...')
for f1 in trange(1, total_f1+1):
    for f2 in trange(1, total_f2+1):
        for fn in trange(1, total_fn+1):
            with open(get_path(BASE_PATH, fname, f1, f2, fn, format), 'r', encoding='ms949') as f:
                # Only the first line is read and it is counted unfiltered
                # (a trailing newline, if present, is counted as well).
                # NOTE(review): sentence_filter is not applied here - confirm
                # that labelling the raw text is intended.
                sentence = f.readline()
            char_counts.update(sentence)

# Rank by frequency, highest first; ties are broken by the character itself in
# descending order (the result of sorting (freq, char) pairs in reverse).
ranked = sorted(((freq, ch) for ch, freq in char_counts.items()), reverse=True)
label_freq = tuple(freq for freq, _ in ranked)
label_list = tuple(ch for _, ch in ranked)

# The id of a character is its rank in the frequency ordering.
label = {
    'id': list(range(len(ranked))),
    'char': list(label_list),
    'freq': list(label_freq),
}

# dictionary to csv
label_df = pd.DataFrame(label)
label_df.to_csv('aihub_labels.csv', encoding='ms949', index=False)
print(label_df)

## 4. Create target text

In [13]:
import pandas as pd

def load_label(file_path):
    """Load the character-label table and build both lookup directions.

    Args:
        file_path: Path to a cp949-encoded CSV file that has at least the
            columns ``id`` and ``char``. Other columns (such as ``freq``) are
            ignored.

    Returns:
        A tuple ``(char2id, id2char)`` where ``char2id`` maps each character
        string to its integer id and ``id2char`` is the inverse mapping.
    """
    # Force the char column to str: without it, a column that contains only
    # digits is parsed as integers and lookups by character string then fail.
    ch_labels = pd.read_csv(file_path, encoding='cp949', dtype={'char': str})

    char2id = {}
    id2char = {}
    for label_id, char in zip(ch_labels['id'], ch_labels['char']):
        # Plain Python ints keep the mappings independent of pandas/numpy types.
        label_id = int(label_id)
        char2id[char] = label_id
        id2char[label_id] = char
    return char2id, id2char

In [14]:
def sentence_to_target(sentence, char2id):
    """Encode ``sentence`` as a space-separated string of label ids.

    Args:
        sentence: Text to encode, processed character by character.
        char2id: Mapping from character to id.

    Returns:
        The ids joined by single spaces; an empty string for empty input.

    Raises:
        KeyError: If a character of ``sentence`` is missing from ``char2id``.
    """
    return ' '.join(str(char2id[ch]) for ch in sentence)

In [15]:
def target_to_sentence(target, id2char):
    """Decode a whitespace-separated id string back into text.

    Args:
        target: String of integer ids separated by whitespace.
        id2char: Mapping from integer id to character.

    Returns:
        The characters for the ids, concatenated in order; an empty string
        when ``target`` contains no ids.

    Raises:
        KeyError: If an id is missing from ``id2char``.
        ValueError: If a token cannot be converted with ``int``.
    """
    return ''.join(id2char[int(token)] for token in target.split())

In [None]:
# Round-trip a sample sentence through the label mapping as a sanity check.
# The path used to be an empty string, which makes the CSV read fail; it now
# points at the label file written by the "Create Character labels" cell.
# NOTE(review): every character of the sample sentence must be present in that
# label file, otherwise sentence_to_target raises KeyError.
file_path = 'aihub_labels.csv'
char2id, id2char = load_label(file_path)

test = '인공지능 사관학교 화이팅!'
a = sentence_to_target(test, char2id)
print(a)
b = target_to_sentence(a, id2char)
print(b)

In [None]:
import os

import pandas as pd
from tqdm import trange

# Corpus location and naming; new_fname is the prefix of the generated label
# files (and, through get_path, of the folders they are written to).
BASE_PATH = '/content/drive/My Drive/googledrive/'
fname = 'KsponSpeech_'
format = '.txt'
new_fname = 'KsponSpeech_label_'

# Extent of the conversion: first-level folders, sub-folders per folder and
# files per sub-folder (all indices start at 1).
total_f1 = 1
total_f2 = 8
total_fn = 7200
# NOTE(review): no cell in this notebook writes 'test_label.csv' (the label
# cell writes 'aihub_labels.csv') - confirm which label file is intended.
char2id, id2char = load_label('test_label.csv')

print('started...')
for f1 in trange(1, total_f1+1):
    for f2 in trange(1, total_f2+1):
        for fn in trange(1, total_fn+1):
            with open(get_path(BASE_PATH, fname, f1, f2, fn, format), 'r', encoding='ms949') as f:
                # Only the first line is encoded, unfiltered.
                sentence = f.readline()

            # Encode before touching the output file so that a failed lookup
            # does not leave an empty label file behind.
            target = sentence_to_target(sentence, char2id)

            # get_path puts new_fname into the folder names as well, so the
            # output directories differ from the corpus directories; create
            # them on demand instead of failing on a missing folder.
            target_path = get_path(BASE_PATH, new_fname, f1, f2, fn, format)
            os.makedirs(os.path.dirname(target_path), exist_ok=True)
            with open(target_path, 'w', encoding='ms949') as f:
                f.write(target)

## 5. Create data list

In [None]:
import pandas as pd

# Sizes of the train/test split: 98% of the files for training, the rest for
# testing.
# NOTE(review): total_fn is the per-sub-folder file count used by the loops in
# the earlier cells - confirm whether the split should cover the whole corpus.
total_fn = 7200
# Uses total_fn; the previous expression referenced the undefined name
# total_num and raised NameError.
train_num = int(total_fn * 0.98)
test_num = total_fn - train_num

