## Prerequisites

In [None]:
# Install pandas, sentencepiece, hgtk and gluonnlp into the runtime with pip.
# NOTE(review): no versions are pinned, so a re-run may pull different releases.
!pip install pandas
!pip install sentencepiece
!pip install hgtk
!pip install gluonnlp

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 1.1MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.91
Collecting hgtk
  Downloading https://files.pythonhosted.org/packages/79/04/04758ed8c086fb1d9a5a267f90239533d33dbc1646ac32f8bf80e38b0ec7/hgtk-0.1.3.tar.gz
Building wheels for collected packages: hgtk
  Building wheel for hgtk (setup.py) ... [?25l[?25hdone
  Created wheel for hgtk: filename=hgtk-0.1.3-py2.py3-none-any.whl size=6688 sha256=90829c1abb57e4fece62d374291acbe8ac91b59ffeca4a3e6a61dcfc0a4d9072
  Stored in directory: /root/.cache/pip/wheels/73/72/06/6065a57fe68264f35d7e52e37f56831eb3e9ec75656880de20
Successfully built hgtk
Installing collected packages: hgtk
Successfully installed hgtk-0.1.3
Collecting gluonnlp
[?25

## Base Function

In [None]:
def file_num_padding(file_num, width=6):
    """Return ``file_num`` as a zero-padded string.

    Args:
        file_num: File index to format; converted with ``str()``.
        width: Minimum length of the result. Defaults to 6, the padding
            the original if/elif ladder produced.

    Returns:
        ``str(file_num)`` left-padded with '0' up to ``width`` characters.
        Values that are already ``width`` characters or longer are returned
        unpadded. A leading sign is kept in front of the zeros
        (``-5`` -> ``'-00005'``).
    """
    # str.zfill replaces the hand-written range checks and is sign-aware.
    return str(file_num).zfill(width)

In [None]:
def get_path(path, fname, file_num, format):
    """Concatenate the four pieces into one path string.

    Args:
        path: Leading directory part; it is joined as-is, so it must already
            end with a separator if one is wanted.
        fname: File name prefix.
        file_num: File number, already formatted as a string.
        format: File extension including the dot.

    Returns:
        ``path + fname + file_num + format``.
    """
    # Build the result left to right, one piece at a time.
    with_prefix = path + fname
    with_number = with_prefix + file_num
    return with_number + format

In [None]:
# Demo: print the transcript path built for file number 1.
BASE_PATH = './content/drive/My Drive/googledrive/'
fname, file_num, format = 'KsponSpeech_', 1, '.txt'

print(
    get_path(
        path=BASE_PATH,
        fname=fname,
        file_num=file_num_padding(file_num),
        format=format,
    )
)

./content/drive/My Drive/googledrive/KsponSpeech_000001.txt


## preprocess

### bracket_filter()

In [1]:
import os
import re

In [2]:
# Sample transcript lines containing "(A)/(B)" pairs; they are passed to bracket_filter() below.
test1 = "o/ 근데 (70%)/(칠십 퍼센트)가 커 보이긴 하는데 (200)/(이백) 벌다 (140)/(백 사십) 벌면 빡셀걸? b/"
test2 = "근데 (3학년)/(삼 학년) 때 까지는 국가장학금 바+ 받으면서 다녔던 건가?"

In [3]:
def bracket_filter(sentence):
    """Reduce every "(A)/(B)" pair in ``sentence`` to just ``A``.

    The transcripts write (orthographic transcription)/(phonetic
    transcription); this keeps the orthographic part, i.e. the first
    bracket.

    The scan is a simple toggle: each ')' switches "skipping" on or off,
    '(' is never copied, and every other character is copied only while
    skipping is off. So the first ')' of a pair starts dropping text and the
    second ')' stops it.

    NOTE(review): a bracket group that is not followed by "/(...)" also flips
    the toggle, so the text after it is dropped until the next ')'.

    Args:
        sentence: Transcript text to filter.

    Returns:
        The filtered text as a new string.
    """
    kept = []
    skipping = False

    for ch in sentence:
        if ch == ')':
            # Every closing bracket flips between copying and skipping.
            skipping = not skipping
        elif ch != '(' and not skipping:
            kept.append(ch)
    return "".join(kept)

In [4]:
print(bracket_filter(test1))
print(bracket_filter(test2))

o/ 근데 70%가 커 보이긴 하는데 200 벌다 140 벌면 빡셀걸? b/
근데 3학년 때 까지는 국가장학금 바+ 받으면서 다녔던 건가?


### special_filter()

In [34]:
# Sample inputs passed to special_filter() below: they contain "o/", "b/" and "l/" tags, a '+' marker and sentence punctuation.
test3 = "o/ 근데 70%가 커 보이긴 하는데 200 벌다 140 벌면 빡셀걸? b/"
test4 = "근데 3학년 때 까지는 국가장학금 바+ 받으면서 다녔던 건가?"
test5 = "그런 거 하자. 설 명절. l/"

In [36]:
# Stripping every special character outright would be a problem for characters
# such as '#' and '%', so those were checked and the problematic ones are meant
# to be replaced by their pronunciation.
# NOTE(review): this function performs no such replacement; '#' and '%' are
# simply passed through unchanged.

def special_filter(sentence):
    """Remove noise tags and unwanted special characters from a transcript line.

    Tags have the form ``<letter>/``:
      * ``o/`` (other speakers' voices, written at the start of the sentence),
        ``n/`` (ambient noise), ``u/`` (pronunciation that cannot be made out
        even from context) and ``b/`` (breathing) are dropped.
      * ``l/`` (laughter) is replaced by '(웃으며)'.

    In addition '+' is replaced by ',', every character in the exclusion list
    is dropped, the result is stripped, and each run of two or more
    whitespace characters is collapsed into a single space.

    The laughter replacement applies only to ``l`` directly followed by '/',
    the same rule used for the noise tags; any other ``l`` is copied through.

    Args:
        sentence: Transcript text to clean.

    Returns:
        The cleaned text as a new string.
    """
    # Letters that mark noise when directly followed by '/'.
    NOISE = ('o', 'n', 'u', 'b')
    # Characters that are always removed.
    EXCEPT = ('/', '*', '-', '@', '$', '^', '&', '[', ']', '=', ':', ';')

    kept = []
    last_idx = len(sentence) - 1
    for idx, ch in enumerate(sentence):
        # True when this character is the letter of a "<letter>/" tag.
        is_tag = idx < last_idx and sentence[idx + 1] == '/'

        if is_tag and ch in NOISE:
            continue
        if is_tag and ch == 'l':
            kept.append('(웃으며)')
        elif ch == '+':
            kept.append(',')
        elif ch not in EXCEPT:
            kept.append(ch)

    # Removing tags leaves doubled spaces behind; squeeze them to one.
    return re.sub(r'\s\s+', ' ', "".join(kept).strip())

In [37]:
print(special_filter(test3))
print(special_filter(test4))
print(special_filter(test5))

근데 70%가 커 보이긴 하는데 200 벌다 140 벌면 빡셀걸?
근데 3학년 때 까지는 국가장학금 바, 받으면서 다녔던 건가?
그런 거 하자. 설 명절. (웃으며)


In [30]:
import wave

def pcm2wav(pcm_file, wav_file, channels=1, bit_depth=16, sampling_rate=16000):
    """Wrap the raw bytes of a .pcm file in a WAV container.

    Args:
        pcm_file: Path of the .pcm file to read.
        wav_file: Path of the .wav file to write.
        channels: Channel count written to the WAV header.
        bit_depth: Bits per sample; must be a multiple of 8.
        sampling_rate: Frame rate written to the WAV header.

    Raises:
        ValueError: If ``bit_depth`` is not a multiple of 8.
    """
    # Check if the options are valid.
    if bit_depth % 8 != 0:
        raise ValueError("bit_depth "+str(bit_depth)+" must be a multiple of 8.")

    # Read the whole .pcm file as bytes; the handle is released before writing.
    with open(pcm_file, 'rb') as pcm_in:
        pcm_data = pcm_in.read()

    # The context manager closes the writer even if a setter or writeframes
    # raises, so the output handle is never leaked.
    with wave.open(wav_file, 'wb') as wav_out:
        wav_out.setnchannels(channels)
        wav_out.setsampwidth(bit_depth // 8)
        wav_out.setframerate(sampling_rate)
        wav_out.writeframes(pcm_data)

# Convert one sample recording: 1 channel, 16-bit samples, 16000 Hz.
pcm2wav(
    pcm_file='/content/drive/My Drive/googledrive/feature_test/KsponSpeech_586827.pcm',
    wav_file='/content/drive/My Drive/googledrive/feature_test/KsponSpeech_586827.wav',
    channels=1,
    bit_depth=16,
    sampling_rate=16000,
)