In [1]:
!pip install pandas
!pip install sentencepiece
!pip install hgtk
!pip install gluonnlp

!git clone https://github.com/SKTBrain/KoBERT.git
!pip install -r KoBERT/requirements.txt
!pip install KoBERT/.

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 3.4MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.91
Collecting hgtk
  Downloading https://files.pythonhosted.org/packages/79/04/04758ed8c086fb1d9a5a267f90239533d33dbc1646ac32f8bf80e38b0ec7/hgtk-0.1.3.tar.gz
Building wheels for collected packages: hgtk
  Building wheel for hgtk (setup.py) ... [?25l[?25hdone
  Created wheel for hgtk: filename=hgtk-0.1.3-py2.py3-none-any.whl size=6688 sha256=789333700c5c2810735f1c46b044d572589d69b4c9eaf2e6ebae4103f598d5e6
  Stored in directory: /root/.cache/pip/wheels/73/72/06/6065a57fe68264f35d7e52e37f56831eb3e9ec75656880de20
Successfully built hgtk
Installing collected packages: hgtk
Successfully installed hgtk-0.1.3
Collecting gluonnlp
[?25

## 1. Base Function

In [2]:
def file_num_padding(file_num):
    if file_num < 10:
        return '00000' + str(file_num)
    elif file_num < 100:
        return '0000' + str(file_num)
    elif file_num < 1000:
        return '000' + str(file_num)
    elif file_num < 10000:
        return '00' + str(file_num)
    elif file_num < 100000:
        return '0' + str(file_num)
    else:
        return str(file_num)

def folder_1_padding(folder_num):
    if folder_num < 10:
        return '0' + str(folder_num) + '/'
    else:
        return str(folder_num) + '/'

def folder_2_padding(folder_num):
    if folder_num < 10:
        return '000' + str(folder_num) + '/'
    elif folder_num < 100:
        return '00' + str(folder_num) + '/'
    elif folder_num < 1000:
        return '0' + str(folder_num) + '/'
    else:
        return str(folder_num) + '/'

In [3]:
def get_path(path, fname, folder_1_num, folder_2_num, file_num, format):
    folder_1_num = folder_1_padding(folder_1_num)
    folder_2_num = folder_2_padding(folder_2_num)
    file_num = file_num_padding(file_num)
    return path + fname + folder_1_num + fname + folder_2_num + fname + file_num + format

In [4]:
BASE_PATH = '/content/drive/My Drive/googledrive/'
fname = 'KsponSpeech_'
folder_1_num = 1
folder_2_num = 1
file_num = 1
format = '.txt'

TEMP = get_path(BASE_PATH, fname, folder_1_num, folder_2_num, file_num, format)
print(TEMP)

/content/drive/My Drive/googledrive/KsponSpeech_01/KsponSpeech_0001/KsponSpeech_000001.txt


## 2. Data-Preprocess

In [5]:
def bracket_filter(sentence):
    new_sentence = ''
    flag = False

    for ch in sentence:
        if ch == '(' and flag == False:
            flag = True
            continue
        if ch == ')' and flag == True:
            flag = False
            continue
        if ch != ')' and flag == False:
            new_sentence += ch
            
    return new_sentence

In [6]:
with open(TEMP, 'r', encoding='ms949') as f:
   r = f.read()
   print(bracket_filter(r))

아/ 몬 소리야, 그건 또. b/



In [7]:
def special_filter(sentence):
    SENTENCE_MARK = ['.', '?', ',', '!']
    NOISE = ['o', 'n', 'u', 'b', 'l']
    EXCEPT = ['/', '+', '*', '-', '@', '$', '^', '&', '[', ']', '=', ':', ';']

    import re
    
    new_sentence = ''
    for idx, ch in enumerate(sentence):
        if ch not in SENTENCE_MARK:
            # o/, n/ 등을 처리
            if idx + 1 < len(sentence) and ch in NOISE and sentence[idx+1] == '/':
                continue
        if ch not in EXCEPT:
            new_sentence += ch
    pattern = re.compile(r'\s\s+')
    new_sentence = re.sub(pattern, ' ', new_sentence.strip())
    return new_sentence

In [8]:
with open(TEMP, 'r', encoding='ms949') as f:
   r = f.read()
   print(special_filter(r))

아 몬 소리야, 그건 또.


In [9]:
def sentence_filter(raw_sentence):
    return special_filter(bracket_filter(raw_sentence))

In [10]:
with open(TEMP, 'r', encoding='ms949') as f:
   r = f.read()
   print(sentence_filter(r))

아 몬 소리야, 그건 또.


## 3. Create Character labels

In [11]:
import pandas as pd
from tqdm import trange

BASE_PATH = '/content/drive/My Drive/googledrive/'
fname = 'KsponSpeech_'
format = '.txt'

total_f1 = 1
total_f2 = 2
total_fn = 2000

label_list = []
label_freq = []

now1 = 1
now2 = 1

print('started...')
for f1 in trange(1, total_f1+1):
    for f2 in trange(now1, now1+100):
        if f2 > total_f2:
            break
        for fn in trange(now2, now2+1000):
            if fn > total_fn:
                break
            with open(get_path(BASE_PATH, fname, f1, f2, fn, format), 'r', encoding='ms949') as f:
                sentence = f.readline()
            for ch in sentence:
                if ch not in label_list:
                    label_list.append(ch)
                    label_freq.append(1)
                else:
                    label_freq[label_list.index(ch)] += 1
        now2 += 1000
    now1 += 100
    

# sort together Using zip
label_freq, label_list = zip(*sorted(zip(label_freq, label_list), reverse=True))
label = {'id': [0, 1, 2], 'char': ['_', '<s>', '</s>'], 'freq': [0, 0, 0]}
for idx, (ch, freq) in enumerate(zip(label_list, label_freq)):
    label['id'].append(idx)
    label['char'].append(ch)
    label['freq'].append(freq)

# dictionary to csv
label_df = pd.DataFrame(label)
label_df.to_csv('aihub_labels.csv', encoding='ms949', index=False)
print(label_df)

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A

  0%|          | 0/1000 [00:00<?, ?it/s][A[A

started...




  0%|          | 2/1000 [00:00<02:19,  7.16it/s][A[A

  0%|          | 3/1000 [00:00<02:27,  6.74it/s][A[A

  0%|          | 4/1000 [00:00<02:58,  5.58it/s][A[A

  0%|          | 5/1000 [00:01<05:08,  3.22it/s][A[A

  1%|          | 6/1000 [00:01<06:36,  2.51it/s][A[A

  1%|          | 7/1000 [00:02<07:52,  2.10it/s][A[A

  1%|          | 8/1000 [00:03<08:01,  2.06it/s][A[A

  1%|          | 9/1000 [00:03<07:55,  2.09it/s][A[A

  1%|          | 10/1000 [00:04<08:21,  1.98it/s][A[A

  1%|          | 11/1000 [00:04<09:37,  1.71it/s][A[A

  1%|          | 12/1000 [00:05<08:20,  1.97it/s][A[A

  1%|▏         | 13/1000 [00:05<07:48,  2.11it/s][A[A

  1%|▏         | 14/1000 [00:05<06:32,  2.51it/s][A[A

  2%|▏         | 15/1000 [00:06<05:48,  2.82it/s][A[A

  2%|▏         | 16/1000 [00:06<04:53,  3.35it/s][A[A

  2%|▏         | 17/1000 [00:06<04:32,  3.61it/s][A[A

  2%|▏         | 18/1000 [00:06<04:50,  3.39it/s][A[A

  2%|▏         | 19/1000 [00:07<04:21

        id  char   freq
0        0     _      0
1        1   <s>      0
2        2  </s>      0
3        0        20256
4        1     /   3022
...    ...   ...    ...
1023  1020     겪      1
1024  1021     겄      1
1025  1022     객      1
1026  1023     X      1
1027  1024     F      1

[1028 rows x 3 columns]





## 4. Create target text

In [12]:
import pandas as pd

def load_label(file_path):
    char2id = {}
    id2char = {}
    ch_labels = pd.read_csv(file_path, encoding='cp949')
    id_list = ch_labels['id']
    char_list = ch_labels['char']
    freq_list = ch_labels['freq']

    for (id, char, freq) in zip(id_list, char_list, freq_list):
        char2id[char] = id
        id2char[id] = char
    return char2id, id2char

In [13]:
def sentence_to_target(sentence, char2id):
    target = ''
    for ch in sentence:
        target += (str(char2id[ch]) + ' ')
    return target[:-1]

In [14]:
def target_to_sentence(target, id2char):
    sentence = ''
    targets = target.split()

    for n in targets:
        sentence += id2char[int(n)]
    return sentence

In [15]:
file_path = '/content/aihub_labels.csv'
char2id, id2char = load_label(file_path)

test = '인공지능 사관학교 화이팅!'
a = sentence_to_target(test, char2id)
print(a)
b = target_to_sentence(a, id2char)
print(b)

67 162 13 462 0 42 212 72 121 0 142 5 490 665
인공지능 사관학교 화이팅!


In [33]:
import pandas as pd
from tqdm import trange

BASE_PATH = '/content/drive/My Drive/googledrive/'
fname = 'KsponSpeech_'
format = '.txt'
new_fname = 'KsponSpeech_label_'

total_f1 = 1
total_f2 = 2
total_fn = 2000
char2id, id2char = load_label('aihub_labels.csv')

now1 = 1
now2 = 1

print('started...')
for f1 in trange(1, total_f1+1):
    for f2 in trange(now1, now1+100):
        if f2 > total_f2:
            break
        for fn in trange(now2, now2+1000):
            if fn > total_fn:
                break
            with open(get_path(BASE_PATH, fname, f1, f2, fn, format), 'r', encoding='ms949') as f:
                sentence = f.readline()

            with open(get_path(BASE_PATH, new_fname, f1, f2, fn, format), 'w', encoding='ms949') as f:
                target = sentence_to_target(sentence, char2id)
                f.write(target)
        now2 += 1000
    now1 += 100












  0%|          | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A











  0%|          | 0/100 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A












  0%|          | 0/1000 [00:00<?, ?it/s]
  0%|          | 0/100 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

started...





FileNotFoundError: ignored

## 5. Create data list

In [None]:
import pandas as pd

total_fn = 7200
train_num = int(total_num * 0.98)
test_num = total_fn - train_num

train_data_list = {'audio': [], 'label': []}
test_data_list = {'audio': [], 'label': []}
aihub_labels = pd.read_csv('test_labels.csv', encoding='cd949')
rare_labels = aihub_labels['char'][2037:]

In [None]:
from tqdm import trange

fname = 'KsponSpeech_'
target_fname = 'KsponSpeech_label_'

audio_paths = []
target_paths = []

for fn in trange(1, total_fn):
    audio_paths.append(fname + file_num_padding(fn) + '.pcm')
    target_paths.appedn(target_fname + file_num_padding(fn) + '.txt')

In [None]:
import random

data_paths = list(zip(audio_paths, target_paths))
random.shuffle(data_paths)
audio_paths, target_paths = zip(*data_paths)

In [None]:
from tqdm import trange

path = '/content/drive/My Drive/googledrive/'
train_full = False
train_dict = {}
test_dict = {}

print('started...')
for idx in trange(length = len(audio_paths)):
    audio = audio_paths[idx]
    target = target_paths[idx]
    if len(train_dict['audio']) == train_num:
        train_full = True
    if train_full:
        test_dict['audio'].append(audio)
        test_dict['label'].append(label)
    else:
        rare_in = False
        sentence = None
        with open((path+audio).split('.')[0]+'.txt') as f:
            sentence = f.readline()

        for rare in rare_labels:
            if rare in sentence:
                rare_in = True
                break
        if rare_in:
            test_dict['audio'].append(audio)
            test_dict['label'].append(label)
        else:
            train_dict['audio'].append(audio)
            train_dict['label'].append(label)
            
print('Ended!!!')

In [None]:
test_df = pd.DataFrame(test_dict)
train_df = pd.DataFrame(train_dict)

test_df.to_csv('test_list.csv', encoding='cp949', index=False)
train_df.to_csv('train_list.csv', encoding='cp949', index=False)