# Korean preprocessing
- A record of ways to preprocess Korean sentences.

In [None]:
import re
import hanja

## By function (1)

Simple way to remove special characters -- just define them.

In [None]:
# Characters to strip from sentences, written as a comma/space-separated list.
# The cells below pass this whole string to str.maketrans, so every character
# in it -- including the "," and " " separators themselves -- is mapped to a space.
# NOTE(review): `\“` is not a recognized escape sequence, so a literal backslash
# is also part of this string.
removal_list =  "‘, ’, ◇, ‘, ”,  ’, ', ·, \“, ·, △, ●,  , ■, (, ), \", >>, `, /, -,∼,=,ㆍ<,>, .,?, !,【,】, …, ◆,%"

In [None]:
# Build the translation table once (it does not depend on the sentence):
# every character listed in removal_list is mapped to a single space.
removal_table = str.maketrans(removal_list, ' ' * len(removal_list))

sen_list = []
for sen in sentences:
    sen_clean = sen.translate(removal_table)
    # Collapse the whitespace runs left behind by the replaced characters.
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    sen_clean = re.sub(r'\s+', ' ', sen_clean)
    sen_list.append(sen_clean)

Default form: remove the predefined special characters and collapse multiple spaces into one.

In [None]:
# Build the translation table once (it does not depend on the sentence):
# every character listed in removal_list is mapped to a single space.
removal_table = str.maketrans(removal_list, ' ' * len(removal_list))

sen_list = []
for sen in sentences:
    sen_clean = sen.translate(removal_table)
    # Collapse the whitespace runs left behind by the replaced characters.
    sen_clean = re.sub(r'\s+', ' ', sen_clean)
    # Wrap the sentence in start/end markers.
    sen_clean = '<s ' + sen_clean + ' /s>'
    # Keep exactly one space between each marker and the sentence text.
    sen_clean = re.sub(r'\s+/s>', ' /s>', sen_clean)
    sen_clean = re.sub(r'<s\s+', '<s ', sen_clean)
    sen_list.append(sen_clean)

If you need to mark sentence boundaries with the `<s` (start of sentence) and `/s>` (end of sentence) markers, use the code above.

## By function (2)

In [None]:
# E-mail address: local part, "@", domain, and a final 2-4 letter label.
EMAIL_PATTERN = re.compile(r'''(([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)(\.[a-zA-Z]{2,4}))''', re.VERBOSE)
# URL containing "://", optionally preceded by an ftp/http/https scheme.
# Raw string so that `\(` and `\)` reach the regex engine unchanged instead of
# being invalid escape sequences in a normal string literal.
URL_PATTERN = re.compile(r"(ftp|http|https)?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", re.VERBOSE)
# One or more consecutive space characters.
MULTIPLE_SPACES = re.compile(' +', re.UNICODE)

Define what to delete in advance.

In [None]:
def cleansing_other(sentence: str = None) -> str:
    """Blank out e-mail addresses and URLs, then tidy the leftover spacing.

    Each match of EMAIL_PATTERN and URL_PATTERN is replaced by one space,
    runs of spaces are collapsed via MULTIPLE_SPACES, and every literal
    ", )" fragment is deleted.

    Args:
        sentence: Text to clean.

    Returns:
        The cleaned text.
    """
    without_email = EMAIL_PATTERN.sub(' ', sentence)
    without_url = URL_PATTERN.sub(' ', without_email)
    single_spaced = MULTIPLE_SPACES.sub(' ', without_url)
    # NOTE(review): ", )" is presumably what remains when an address or URL
    # inside parentheses has been blanked out -- confirm.
    return single_spaced.replace(", )", "")

Remove e-mail addresses and URLs, and collapse repeated spaces, using the patterns defined above.

In [None]:
# Unicode ranges treated as Chinese characters (hanja) by cleansing_chinese.
# Kept as a raw string: the `\uXXXX` escapes are interpreted by the `re` module.
_HANJA_RANGES = r"\u2E80-\u2FD5\u3190-\u319f\u3400-\u4DBF\u4E00-\u9FCC\uF900-\uFAAD"
# A parenthesised run consisting only of hanja, e.g. "(大韓民國)".
_PAREN_HANJA = re.compile(r"\([" + _HANJA_RANGES + r"]+\)")
# Any single hanja character.
_ANY_HANJA = re.compile("[" + _HANJA_RANGES + "]")


def cleansing_chinese(sentence: str = None) -> str:
    """Remove or transliterate Chinese characters (hanja) in a sentence.

    Parenthesised hanja-only groups are deleted together with their
    parentheses, without inserting a space. If any hanja remain afterwards,
    the sentence is passed to ``hanja.translate`` in 'substitution' mode.

    Args:
        sentence: Text to clean.

    Returns:
        The text with parenthesised hanja removed and remaining hanja
        translated by the ``hanja`` module.
    """
    sentence = _PAREN_HANJA.sub("", sentence)

    # Only call into the hanja module when there is something left to convert.
    if _ANY_HANJA.search(sentence) is not None:
        sentence = hanja.translate(sentence, 'substitution')

    return sentence

Remove Chinese characters (hanja). Hanja enclosed in parentheses mostly duplicate the adjacent Korean text, so they are removed together with the parentheses, without inserting a space. Any other Chinese characters are converted to Korean using the `hanja` module (e.g. 軍 -> 군).

In [None]:
def cleansing_special(sentence: str = None) -> str:
    """Keep only Korean syllables, ASCII letters, digits and single spaces.

    Periods, commas, quotation marks, '!' and '?' are deleted outright; every
    other character outside 가-힣, 0-9, a-z, A-Z and whitespace is replaced by
    a space. Whitespace runs are then collapsed and the result is stripped.

    Args:
        sentence: Text to clean.

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    # Deleted without a replacement so the surrounding text is joined.
    sentence = re.sub("[.,'\"’‘”“!?]", "", sentence)
    # Everything else that is not Korean/alphanumeric/whitespace becomes a space.
    sentence = re.sub(r"[^가-힣0-9a-zA-Z\s]", " ", sentence)
    # Several adjacent special characters leave several spaces; collapse them.
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    sentence = re.sub(r"\s+", " ", sentence)
    sentence = sentence.strip()

    return sentence

Remove special characters, leaving only Korean, English letters, and numbers. Several adjacent special characters can leave multiple consecutive spaces; these are collapsed into one by the `re.sub` call on `\s+`.

In [None]:
def cleansing_numbers(sentence: str = None) -> str:
    """Replace every number in the sentence with a single ``NUM`` token.

    Each run of digits becomes ``NUM``, whitespace directly after a ``NUM``
    is removed (so a following word is joined to the token), and consecutive
    ``NUM`` tokens are merged into one.

    Args:
        sentence: Text to process.

    Returns:
        The text with numbers replaced by ``NUM``.
    """
    sentence = re.sub(r'[0-9]+', 'NUM', sentence)
    # Drop whitespace after a token so "NUM NUM" becomes "NUMNUM".
    sentence = re.sub(r'NUM\s+', "NUM", sentence)
    # Merge repeated tokens. This must be a group, not the character class
    # '[NUM]+', which would also rewrite any run of the letters N, U or M
    # (e.g. "USA" -> "NUMSA").
    sentence = re.sub(r'(?:NUM)+', "NUM", sentence)

    return sentence

If necessary, replace numbers with a `NUM` token as well.

In [None]:
def preprocess_sent(sentence: str = None) -> str:
    """Run the full cleaning pipeline on one sentence.

    Applies, in order: cleansing_other, cleansing_chinese, cleansing_special
    and cleansing_numbers, then collapses any remaining whitespace runs into
    a single space.

    Args:
        sentence: Raw input sentence.

    Returns:
        The preprocessed sentence.
    """
    sent_clean = cleansing_other(sentence)
    sent_clean = cleansing_chinese(sent_clean)
    sent_clean = cleansing_special(sent_clean)
    sent_clean = cleansing_numbers(sent_clean)
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    sent_clean = re.sub(r'\s+', ' ', sent_clean)

    return sent_clean

Run all preprocessing steps in order.

### Example of preprocessing

In [None]:
# Preprocessing example
# NOTE(review): {SOURCE_SENTENCE} looks like a placeholder for the input
# sentences -- confirm what should be substituted here.
original_sents = {SOURCE_SENTENCE}
new_sents = [preprocess_sent(sent) for sent in original_sents]

# Show each original sentence next to its preprocessed version.
for before, after in zip(original_sents, new_sents):
    print("----------")
    print("● 기존: ", before)
    print("● 변경: ", after)

## By Mecab
- The Mecab module provides high-quality tokenization and Korean preprocessing.
- However, Mecab is difficult to install in a Jupyter Notebook / Anaconda environment.

In [None]:
# Shell commands for installing konlpy and Mecab, kept inside a string literal
# so that running this cell does not execute them.
'''
!sudo apt-get install python-dev; pip install konlpy
!sudo apt-get install curl
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)
'''

In [None]:
def tokenizer(paragraph):
    """Collect the nouns of every sentence in ``paragraph`` using Mecab.

    Nouns of length 1 are discarded; the remaining nouns of all sentences
    are returned in one flat list, in the order they were extracted.

    NOTE(review): ``Mecab`` is not imported in the visible code; presumably
    it is ``konlpy.tag.Mecab`` -- confirm.

    Args:
        paragraph: Iterable of sentences.

    Returns:
        A list with all extracted nouns longer than one character.
    """
    tagger = Mecab()
    collected = []

    for sentence in paragraph:
        # Mecab's noun extraction doubles as the preprocessing step here.
        collected.extend(
            noun for noun in tagger.nouns(sentence) if len(noun) > 1
        )

    return collected