In [3]:
from KoreanDataCleaning import KoreanDataCleaning
from KoreanTokenization import KoreanTokenization
from KoreanStopwordsRemoval import KoreanStopwordsRemoval
from KoreanLemmatization import KoreanLemmatization
from KoreanEncoding import KoreanEncoding
from KoreanNormalization import KoreanNormalization

In [4]:
def main(korean_text: str = "이것은 한국어 테스트 문장 입니다.") -> None:
    """Run the Korean preprocessing pipeline on one sentence, printing each stage.

    The text flows through six project helpers in order: data cleaning,
    tokenization, stopword removal, lemmatization, encoding and
    normalization. The result of every stage is printed with a
    ``<ClassName> => <value>`` label so the stages can be compared by eye.

    Args:
        korean_text: Sentence to process. Defaults to the original demo
            sentence, so calling ``main()`` with no arguments behaves as
            before.
    """
    # Step 1: data cleaning. The cleaner methods are called for their effect
    # on the cleaner object; the result is read back with get_text().
    # NOTE(review): special characters are removed *before* HTML tags. If
    # remove_special_characters() strips '<' / '>', remove_html_tags() may no
    # longer find anything to match -- confirm the intended order.
    cleaner = KoreanDataCleaning(korean_text)
    cleaner.remove_special_characters()
    cleaner.remove_html_tags()
    cleaner.to_lowercase()
    cleaned_text = cleaner.get_text()
    print(f"KoreanDataCleaning => {cleaned_text}")

    # Step 2: tokenization of the cleaned text.
    tokens = KoreanTokenization(cleaned_text).tokenize()
    print(f"KoreanTokenization => {tokens}")

    # Step 3: stopword removal on the token list.
    filtered_tokens = KoreanStopwordsRemoval(tokens).remove_stopwords()
    print(f"KoreanStopwordsRemoval => {filtered_tokens}")

    # Step 4: lemmatization of the remaining tokens.
    lemmatized_tokens = KoreanLemmatization(filtered_tokens).apply_lemmatization()
    print(f"KoreanLemmatization => {lemmatized_tokens}")

    # Step 5: encoding of the lemmatized tokens.
    encoded_tokens = KoreanEncoding(lemmatized_tokens).encode()
    print(f"KoreanEncoding => {encoded_tokens}")

    # Step 6: normalization of the encoded tokens.
    # NOTE(review): max_length=10 is presumably the target padded length, but
    # the recorded run of this notebook printed a 6x1 tensor -- confirm what
    # input shape KoreanNormalization.normalize() expects.
    normalized_data = KoreanNormalization(encoded_tokens).normalize(max_length=10)
    print(f"KoreanNormalization => {normalized_data}")

# Run the demo pipeline only when this file is executed as a script
# (or as a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

KoreanDataCleaning => 이것은 한국어 테스트 문장 입니다.
KoreanTokenization => ['이', '것', '은', '한국어', '테스트', '문장', '입니다', '.']
KoreanStopwordsRemoval => ['것', '한국어', '테스트', '문장', '입니다', '.']
KoreanLemmatization => ['것', '한국어', '테스트', '문장', '입니다', '.']
KoreanEncoding => [3, 6, 5, 4, 2, 7]
 tensor_sequences is [tensor(3), tensor(6), tensor(5), tensor(4), tensor(2), tensor(7)]
 shape of tensor 0 is torch.Size([])
 shape of tensor 1 is torch.Size([])
 shape of tensor 2 is torch.Size([])
 shape of tensor 3 is torch.Size([])
 shape of tensor 4 is torch.Size([])
 shape of tensor 5 is torch.Size([])
 padded_data is tensor([[3],
        [6],
        [5],
        [4],
        [2],
        [7]])
KoreanNormalization => tensor([[3],
        [6],
        [5],
        [4],
        [2],
        [7]])
