In [60]:
import json
import os
import unicodedata
import sys
import pandas as pd
import matplotlib.pyplot as plt
import re
sys.path.append("..")
from scripts._mecab import Mecab
from typing import Dict, List, Tuple


In [78]:
from tokenizer import (
    # CharTokenizer,
    # JamoTokenizer,
    MeCabSentencePieceTokenizer_orig,
    MeCabSentencePieceTokenizer_fixed,
    MeCabSentencePieceTokenizer,
    MeCabWordPieceTokenizer,
    # MeCabTokenizer,
    MeCabTokenizer_orig,
    MeCabTokenizer_fixed,
    MeCabTokenizer_all,
    # MeCabSentencePieceTokenizer_kortok,
    # MeCabTokenizer_kortok,
    SentencePieceTokenizer,
    WordPieceTokenizer,
    Vocab,
    # WordTokenizer,
)

# 1. 토큰화 후 저장

In [112]:
def get_tokenizer(tokenizer_name: str, resource_dir: str, token_type, tokenizer_type: str , decomposition_type: str, space_symbol: str, dummy_letter: str, nfd: bool, grammatical_symbol: list = ["", ""], skip_special_tokens: bool = False, lexical_grammatical: bool = False):   # for LG
    tokenizer_dir = os.path.join(resource_dir, tokenizer_name)

    if tokenizer_name.startswith("sp-"):
        tokenizer = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))

    elif tokenizer_name.startswith("mecab_"):

        sp = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))

        if "orig" in tokenizer_name:
            mecab = MeCabTokenizer_orig(tokenizer_type=tokenizer_type, decomposition_type=decomposition_type, space_symbol=space_symbol, dummy_letter=dummy_letter)
            tokenizer = MeCabSentencePieceTokenizer_orig(mecab, sp, use_fixed=False) # mecab_sp_orig.py

        elif "fixed" in tokenizer_name:
            mecab = MeCabTokenizer_fixed(tokenizer_type=tokenizer_type, decomposition_type=decomposition_type, space_symbol=space_symbol, dummy_letter=dummy_letter)
            tokenizer = MeCabSentencePieceTokenizer_fixed(mecab, sp, use_fixed=True) # mecab_fixed.py


    # elif tokenizer_name.startswith("eojeol") or tokenizer_name.startswith("morpheme"):
    elif tokenizer_name.startswith("eojeol") or tokenizer_name.startswith("morpheme") or tokenizer_name.startswith("LG"):   # LG도 처리할 수 있도록
        wp = WordPieceTokenizer(os.path.join(tokenizer_dir, "bert_tokenizer.json"), skip_special_tokens=False)
        # mecab = MeCabTokenizer_all(token_type=token_type, tokenizer_type=tokenizer_type, decomposition_type=decomposition_type, space_symbol=space_symbol, dummy_letter=dummy_letter, nfd=nfd, grammatical_symbol=grammatical_symbol)
        mecab = MeCabTokenizer_all(token_type=token_type, tokenizer_type=tokenizer_type, decomposition_type=decomposition_type, space_symbol=space_symbol, dummy_letter=dummy_letter, nfd=nfd, grammatical_symbol=grammatical_symbol, lexical_grammatical=lexical_grammatical)   # for LG
        tokenizer = MeCabWordPieceTokenizer(mecab=mecab, wp=wp) # mecab_wp.py


    else:
        raise ValueError("Wrong tokenizer name.")

    return tokenizer

In [113]:
def get_tokenized_result(tokenizer, string, nfd: bool = True):
    # if nfd == True:
    #     string = str_to_nfd(string)

    tokenized = tokenizer.tokenize(string)
    # print(" ".join(tokenized))
    
    return " ".join(tokenized)

In [118]:

# 64k
tokenizer_eojeol_composed_F_64k = get_tokenizer(tokenizer_name = "eojeol_mecab_fixed_composed_grammatical_symbol_F_wp-64k",
resource_dir = "./resources/v6_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "eojeol",
decomposition_type= "composed",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_eojeol_decomposed_pure_F_64k = get_tokenizer(tokenizer_name = "eojeol_mecab_fixed_decomposed_pure_grammatical_symbol_F_wp-64k",
resource_dir = "./resources/v6_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "eojeol",
decomposition_type= "decomposed_pure",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_composed_F_64k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_composed_grammatical_symbol_F_wp-64k",
resource_dir = "./resources/v6_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "composed",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_decomposed_lexical_F_64k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_decomposed_lexical_grammatical_symbol_F_wp-64k",
resource_dir = "./resources/v6_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "decomposed_lexical",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_decomposed_pure_F_64k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_decomposed_pure_grammatical_symbol_F_wp-64k",
resource_dir = "./resources/v6_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "decomposed_pure",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)


# 32k
tokenizer_eojeol_composed_F_32k = get_tokenizer(tokenizer_name = "eojeol_mecab_fixed_composed_grammatical_symbol_F_wp-32k",
resource_dir = "./resources/v7_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "eojeol",
decomposition_type= "composed",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_eojeol_decomposed_pure_F_32k = get_tokenizer(tokenizer_name = "eojeol_mecab_fixed_decomposed_pure_grammatical_symbol_F_wp-32k",
resource_dir = "./resources/v7_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "eojeol",
decomposition_type= "decomposed_pure",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_composed_F_32k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_composed_grammatical_symbol_F_wp-32k",
resource_dir = "./resources/v7_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "composed",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_decomposed_lexical_F_32k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_decomposed_lexical_grammatical_symbol_F_wp-32k",
resource_dir = "./resources/v7_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "decomposed_lexical",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_decomposed_pure_F_32k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_decomposed_pure_grammatical_symbol_F_wp-32k",
resource_dir = "./resources/v7_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "decomposed_pure",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)



In [119]:
def show_tokenizations(string):
    # grammatical symbol F
    eojeol_composed_F_64k = string + '\t' + 'eojeol_composed_F_64k' + '\t' + get_tokenized_result(tokenizer_eojeol_composed_F_64k, string)
    eojeol_pure_F_64k = string + '\t' + 'eojeol_pure_F_64k' + '\t' + get_tokenized_result(tokenizer_eojeol_decomposed_pure_F_64k, string)
    fixed_composed_F_64k = string + '\t' + 'fixed_composed_F_64k' + '\t' + get_tokenized_result(tokenizer_fixed_composed_F_64k, string)
    fixed_lexical_F_64k = string + '\t' + 'fixed_lexical_F_64k' + '\t' + get_tokenized_result(tokenizer_fixed_decomposed_lexical_F_64k, string)
    fixed_pure_F_64k = string + '\t' + 'fixed_pure_F_64k' + '\t' + get_tokenized_result(tokenizer_fixed_decomposed_pure_F_64k, string)

    eojeol_composed_F_32k = string + '\t' + 'eojeol_composed_F_32k' + '\t' + get_tokenized_result(tokenizer_eojeol_composed_F_32k, string)
    eojeol_pure_F_32k = string + '\t' + 'eojeol_pure_F_32k' + '\t' + get_tokenized_result(tokenizer_eojeol_decomposed_pure_F_32k, string)
    fixed_composed_F_32k = string + '\t' + 'fixed_composed_F_32k' + '\t' + get_tokenized_result(tokenizer_fixed_composed_F_32k, string)
    fixed_lexical_F_32k = string + '\t' + 'fixed_lexical_F_32k' + '\t' + get_tokenized_result(tokenizer_fixed_decomposed_lexical_F_32k, string)
    fixed_pure_F_32k = string + '\t' + 'fixed_pure_F_32k' + '\t' + get_tokenized_result(tokenizer_fixed_decomposed_pure_F_32k, string)
    
    return "\n".join(['\n'+eojeol_composed_F_64k , eojeol_pure_F_64k , fixed_composed_F_64k, fixed_lexical_F_64k, fixed_pure_F_64k,
    eojeol_composed_F_32k , eojeol_pure_F_32k , fixed_composed_F_32k, fixed_lexical_F_32k, fixed_pure_F_32k])

In [120]:
show_tokenizations('갑자기 더워彭肽꿿뜛땭뜎حمق')

'\n갑자기 더워彭肽꿿뜛땭뜎حمق\teojeol_composed_F_64k\t[CLS] 갑자기 [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\teojeol_pure_F_64k\t[CLS] 갑자기 [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_composed_F_64k\t[CLS] 갑자기 덥 어 [UNK] [UNK] [UNK] [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_lexical_F_64k\t[CLS] 갑자기 덥 어 [UNK] [UNK] 꿰 ##ᆶ ##ᄄ ##ᅲ ##ᆶ ##ᄄ ##ᅣ ##ᆬ ##ᄄ ##ᅲ ##ᆩ [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_pure_F_64k\t[CLS] 갑자기 덥 어 [UNK] [UNK] 꿰 ##ᆶ ##ᄄ ##ᅲ ##ᆶ ##ᄄ ##ᅣ ##ᆬ ##ᄄ ##ᅲ ##ᆩ [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\teojeol_composed_F_32k\t[CLS] 갑자기 [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\teojeol_pure_F_32k\t[CLS] 갑자기 [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_composed_F_32k\t[CLS] 갑자기 덥 어 [UNK] [UNK] [UNK] [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_lexical_F_32k\t[CLS] 갑자기 덥 어 [UNK] [UNK] 꿰 ##ᆶ ##ᄄ ##ᅲ ##ᆶ ##ᄄ ##ᅣ ##ᆬ ##ᄄ ##ᅲ ##ᆩ [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_pure_F_32\t[CLS] 갑자기 덥 어 [UNK] [UNK] 꿰 ##ᆶ ##ᄄ ##ᅲ ##ᆶ ##ᄄ ##ᅣ ##ᆬ ##ᄄ ##ᅲ ##ᆩ [UNK] [SEP]'

In [121]:
# 토큰화 및 저장
def analysis(task, corpus_list, corpus_name_list, sent2=True):
    for corpus, corpus_name in zip(corpus_list, corpus_name_list):
        with open('dataset_analysis/' + task+'_'+corpus_name+'.tsv', 'w', encoding='utf-8') as f:
            f.write('source'+'\t'+'tokenizer'+'\t'+'tokenize_result')
            if sent2:
                for sent1, sent2 in zip(corpus[0], corpus[1]):
                    f.write(show_tokenizations(string=sent1))
                    f.write(show_tokenizations(string=sent2))
            else:
                for sent1 in (corpus):
                    f.write(show_tokenizations(string=sent1))

In [122]:
# cola
def load_data(file_path: str) -> Tuple[List[str]]:
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentence
    3. label
    """
    sentences: List[str] = []

    # for test set
    if file_path == "./dataset/nlu_tasks/cola/NIKL_CoLA_in_domain_test_with_answer.tsv":
        with open(file_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f.readlines()[1:]):
                splitted = line.strip().split("\t")
                sentences.append(splitted[2])

    else:
        with open(file_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f.readlines()[1:]):
                splitted = line.strip().split("\t")
                sentences.append(splitted[3])

    return sentences

task = 'cola'
train = load_data('./dataset/nlu_tasks/cola/NIKL_CoLA_in_domain_train.tsv')
dev = load_data('./dataset/nlu_tasks/cola/NIKL_CoLA_in_domain_dev.tsv')
test = load_data('./dataset/nlu_tasks/cola/NIKL_CoLA_in_domain_test_with_answer.tsv')

corpus_list = [train, dev, test]
corpus_name_list = ['train', 'dev', 'test']

analysis(task, corpus_list, corpus_name_list, sent2=False)

In [123]:
# nsmc

def load_data(file_path: str) -> Tuple[List[str], List[int]]:
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentence
    3. label
    """
    sentences: List[str] = []
    labels: List[int] = []

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f.readlines()[1:]):
            splitted = line.strip().split("\t")
            if len(splitted) != 2:
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue
            sentences.append(splitted[0])
            
#             labels.append(label_to_index[splitted[1]])

    return sentences


task = 'nsmc'
train = load_data('./dataset/nlu_tasks/nsmc/ratings_train.tsv')
dev = load_data('./dataset/nlu_tasks/nsmc/ratings_dev.tsv')
test = load_data('./dataset/nlu_tasks/nsmc/ratings_test.tsv')

corpus_list = [train, dev, test]
corpus_name_list = ['train', 'dev', 'test']

analysis(task, corpus_list, corpus_name_list, sent2=False)

In [124]:
# hsd

def load_data(file_path: str) -> Tuple[List[str], List[int]]:
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentence_a
    2. sentence_b
    3. label
    """
    sentence = []
    # sentence_bs = []
    labels = []

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f.readlines()[1:]):
            splitted = line.strip().split("\t")
            if len(splitted) != 4:
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue

            sentence.append(splitted[0])
                # sentence_bs.append(splitted[1])
            # sentence_as.append(splitted[0])
            # sentence_bs.append(splitted[1])

#             labels.append(label_to_index[splitted[3]])

    return sentence

task = 'hsd'
train = load_data('./dataset/nlu_tasks/hsd/train.tsv')
dev = load_data('./dataset/nlu_tasks/hsd/dev.tsv')

corpus_list = [train, dev]
corpus_name_list = ['train', 'dev']

analysis(task, corpus_list, corpus_name_list, sent2=False)

In [125]:
# paws
def load_data(file_path: str) -> Tuple[List[str], List[str]]:
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentence_a
    2. sentence_b
    3. label
    """
    sentence_as = []
    sentence_bs = []

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f.readlines()[1:]):
            splitted = line.strip().split("\t")
            if len(splitted) != 4:
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue
            if splitted[1] == "" or splitted[2] == "":
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue
            # 문장이 "NS"로만 표기된 라인 제외
            if splitted[1] == "NS" or splitted[2] == "NS":
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue
            sentence_as.append(splitted[1])
            sentence_bs.append(splitted[2])
        
    return sentence_as, sentence_bs

task = 'paws'
train = load_data('./dataset/nlu_tasks/paws/translated_train.tsv')
dev = load_data('./dataset/nlu_tasks/paws/dev_2k.tsv')
test = load_data('./dataset/nlu_tasks/paws/test_2k.tsv')

corpus_list = [train, dev, test]
corpus_name_list = ['train', 'dev', 'test']

analysis(task, corpus_list, corpus_name_list, sent2=True)

In [138]:
# KLUE-dp
dp_train = pd.read_csv('./KLUE-baseline/data/klue_benchmark/klue-dp-v1.1/klue-dp-v1.1_train.tsv',sep='delimiter', header=None)
dp_dev = pd.read_csv('./KLUE-baseline/data/klue_benchmark/klue-dp-v1.1/klue-dp-v1.1_dev.tsv', sep='delimiter', header=None)
dp_train.columns = ['text']
dp_dev.columns = ['text']
df2_train = dp_train['text'].str.contains('## klue-dp')
df2_dev = dp_dev['text'].str.contains('## klue-dp')
df2_train = dp_train[df2_train]
df2_dev = dp_dev[df2_dev]
df2_train['text'] = df2_train['text'].apply(lambda x: re.sub('## klue-dp-v1_train_.*\t', '', x))
df2_dev['text'] = df2_dev['text'].apply(lambda x: re.sub('## klue-dp-v1_dev_.*\t', '', x))
df2_train.to_csv('dp_orig_train.tsv', index=False, sep='\t')
df2_dev.to_csv('dp_orig_dev.tsv', index=False, sep='\t')

def load_data(file_path: str) -> Tuple[List[str]]:
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentenced
    3. label
    """
    sentences: List[str] = []
    #labels: List[int] = []

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f.readlines()[1:]):
            splitted = line.strip()
            # if len(splitted) != 2:
            #     #print(f"[ERROR] {repr(line)}, line {i}")
            #     continue
            sentences.append(splitted)
     #       labels.append(label_to_index[splitted[1]])

    return sentences

task = 'dp'
train = load_data('dp_orig_train.tsv')
dev = load_data('dp_orig_dev.tsv')

corpus_list = [train, dev]
corpus_name_list = ['train', 'dev']

analysis(task, corpus_list, corpus_name_list, sent2=False)

  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_train['text'] = df2_train['text'].apply(lambda x: re.sub('## klue-dp-v1_train_.*\t', '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_dev['text'] = df2_dev['text'].apply(lambda x: re.sub('## klue-dp-v1_dev_.*\t', '', x))


In [140]:
# KLUE-nli
# json to tsv
import json
import pandas as pd
with open('./KLUE-baseline/data/klue_benchmark/klue-nli-v1.1/klue-nli-v1.1_train.json', 'r') as f:
    nli_train = json.load(f)
with open('./KLUE-baseline/data/klue_benchmark/klue-nli-v1.1/klue-nli-v1.1_dev.json') as f2:
    nli_dev = json.load(f2)


nli_train = pd.DataFrame(nli_train)
nli_train = nli_train[['premise', 'hypothesis']]
nli_train.to_csv('nli_orig_train.tsv', index=False, sep='\t')
nli_dev = pd.DataFrame(nli_dev)
nli_dev = nli_dev[['premise', 'hypothesis']]
nli_dev.to_csv('nli_orig_dev.tsv', index=False, sep='\t')

def load_data(file_path: str) -> Tuple[List[str], List[str]]:
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentence_a
    2. sentence_b
    3. label
    """
    sentence_as = []
    sentence_bs = []

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f.readlines()[1:]):
            splitted = line.strip().split("\t")
            if splitted[0] == "" or splitted[1] == "":
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue
            sentence_as.append(splitted[0])
            sentence_bs.append(splitted[1])
        
    return sentence_as, sentence_bs

task = 'nli'
train = load_data('nli_orig_train.tsv')
dev = load_data('nli_orig_dev.tsv')

corpus_list = [train, dev]
corpus_name_list = ['train', 'dev']

analysis(task, corpus_list, corpus_name_list, sent2=True)


# 2. OOV rate, ## rate 분석

In [215]:
# [UNK]의 개수 / 문장의 길이 * 100
from typing import List

# 문장당 oov rate (OR)
def getOOVRatePerSentence(sentence):
    # [CLS], [SEP] 제거
    sentence = sentence.split()

    OOV_rate = sentence.count('[UNK]') / len(sentence) * 100   
    
    return OOV_rate

# count of all OOV tokens (OC)
def getCountofAllOOV(sentence):
    cnt = 0
    cnt += len(re.findall('[UNK]', sentence))
    
    return cnt

# [CLS, SEP] 제거
def removeCS(sentence):
    sentence = sentence.replace('[CLS]', '')
    sentence = sentence.replace('[SEP]', '')
    
    return sentence

# [UNK]수 /전체 토큰 수
def getOOVdividedbyAllTokens(corpus):
    corpus['OC'] = corpus['sentence'].apply(lambda x: getCountofAllOOV(x))
    corpus['sentence'].apply(lambda x: removeCS(x))
    corpus['token_count'] = corpus['sentence'].apply(lambda x: len(x))
    OOV_count = corpus['OC'].sum()
    token_count = corpus['token_count'].sum()
    
    return OOV_count, token_count, OOV_count/token_count*100



# "##"" 세기
# 문장당 oov rate (SR)
def getShopRatePerSentence(sentence):

    OOV_rate = sentence.count('##') / len(sentence) * 100   
    
    return OOV_rate

# count of all ## tokens (SC)
def getCountofAllShop(sentence):
    cnt = 0
    cnt += len(re.findall('##', sentence))
    
    return cnt

# [CLS, SEP] 제거
def removeCS(sentence):
    sentence = sentence.replace('[CLS]', '')
    sentence = sentence.replace('[SEP]', '')
    
    return sentence

# ##수 /전체 토큰 수
def getOOVdividedbyAllTokens(corpus):
    corpus['SC'] = corpus['sentence'].apply(lambda x: getCountofAllShop(x))
    corpus['sentence'].apply(lambda x: removeCS(x))
    corpus['token_count'] = corpus['sentence'].apply(lambda x: len(x))
    Shop_count = corpus['SC'].sum()
    token_count = corpus['token_count'].sum()
    
    return Shop_count, token_count, Shop_count/token_count*100


In [216]:
cola_train = pd.read_csv('dataset_analysis/cola_train.tsv', sep='\t')
cola_dev = pd.read_csv('dataset_analysis/cola_dev.tsv', sep='\t')
cola_test = pd.read_csv('dataset_analysis/cola_test.tsv', sep='\t')
cola = pd.concat([cola_train, cola_dev, cola_test])
cola['tokenize_result'] = cola['tokenize_result'].apply(lambda x: removeCS(x))
cola['source_len'] = cola['source'].apply(lambda x: len(x))
cola['tokenized_len'] = cola['tokenize_result'].apply(lambda x: len(x))

cola['OOV_per_sent'] = cola['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
cola['OOV_count'] = cola['tokenize_result'].apply(lambda x: getCountofAllOOV(x))

cola['##_per_sent'] = cola['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
cola['##_count'] = cola['tokenize_result'].apply(lambda x: getCountofAllShop(x))


In [193]:
cola

Unnamed: 0,source,tokenizer,tokenize_result,source_len,tokenized_len,OOV_per_sent,OOV_count,##_per_sent,##_count
0,높은 달이 떴다.,eojeol_composed_F_64k,높은 달이 떴다.,9,11,0.0,0,0.000000,0
1,높은 달이 떴다.,eojeol_pure_F_64k,높은 달이 떴다.,9,21,0.0,0,0.000000,0
2,높은 달이 떴다.,fixed_composed_F_64k,높 은 달 이 뜨 었 다 .,9,17,0.0,0,0.000000,0
3,높은 달이 떴다.,fixed_lexical_F_64k,높 은 달 이 뜨 었 다 .,9,22,0.0,0,0.000000,0
4,높은 달이 떴다.,fixed_pure_F_64k,높 은 달 이 뜨 었 다 .,9,28,0.0,0,0.000000,0
...,...,...,...,...,...,...,...,...,...
10595,나는 할아버지가 제일 무서우시다.,eojeol_composed_F_32k,나는 할아버지 ##가 제일 무서 ##우 ##시 ##다.,18,32,0.0,0,12.500000,4
10596,나는 할아버지가 제일 무서우시다.,eojeol_pure_F_32k,나는 할아버지가 제일 무서 ##우 ##시다.,18,43,0.0,0,4.651163,2
10597,나는 할아버지가 제일 무서우시다.,fixed_composed_F_32k,나 는 할아버지 가 제일 무서 ##우 시 다 .,18,28,0.0,0,3.571429,1
10598,나는 할아버지가 제일 무서우시다.,fixed_lexical_F_32k,나 는 할아버지 가 제일 무서 ##우 시 다 .,18,40,0.0,0,2.500000,1


In [217]:
cola.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean'],
    'OOV_per_sent': ['mean'],
    'OOV_count': ['sum'],
    '##_per_sent': ['mean'],
    '##_count': ['sum']
})

Unnamed: 0_level_0,source_len,tokenized_len,OOV_per_sent,OOV_count,##_per_sent,##_count
Unnamed: 0_level_1,mean,mean,mean,sum,mean,sum
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
eojeol_composed_F_32k,19.115748,31.454434,0.032876,189,10.523272,62072
eojeol_composed_F_64k,19.115748,28.359246,0.036712,189,8.088648,43505
eojeol_pure_F_32k,19.115748,49.622583,0.035132,189,5.560999,50988
eojeol_pure_F_64k,19.115748,47.775672,0.037975,189,4.520664,39909
fixed_composed_F_32k,19.115748,30.536341,0.01988,216,1.689564,10372
fixed_composed_F_64k,19.115748,29.31407,0.020558,216,0.508337,3040
fixed_lexical_F_32k,19.115748,42.783563,0.020423,216,0.87146,7377
fixed_lexical_F_64k,19.115748,41.893365,0.020611,216,0.239431,2037
fixed_pure_F_32,19.115748,50.271394,0.020423,216,0.7405,7258
fixed_pure_F_64k,19.115748,49.392698,0.020611,216,0.202259,1987


In [218]:
nsmc_train = pd.read_csv('dataset_analysis/nsmc_train.tsv', sep='\t')
nsmc_dev = pd.read_csv('dataset_analysis/nsmc_dev.tsv', sep='\t')
nsmc_test = pd.read_csv('dataset_analysis/nsmc_test.tsv', sep='\t')
nsmc = pd.concat([nsmc_train, nsmc_dev, nsmc_test])
nsmc['tokenize_result'] = nsmc['tokenize_result'].apply(lambda x: removeCS(x))
nsmc['source_len'] = nsmc['source'].apply(lambda x: len(x))
nsmc['tokenized_len'] = nsmc['tokenize_result'].apply(lambda x: len(x))

nsmc['OOV_per_sent'] = nsmc['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
nsmc['OOV_count'] = nsmc['tokenize_result'].apply(lambda x: getCountofAllOOV(x))

nsmc['##_per_sent'] = nsmc['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
nsmc['##_count'] = nsmc['tokenize_result'].apply(lambda x: getCountofAllShop(x))


In [220]:
nsmc.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean'],
    'OOV_per_sent': ['mean'],
    'OOV_count': ['sum'],
    '##_per_sent': ['mean'],
    '##_count': ['sum']
})

Unnamed: 0_level_0,source_len,tokenized_len,OOV_per_sent,OOV_count,##_per_sent,##_count
Unnamed: 0_level_1,mean,mean,mean,sum,mean,sum
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
eojeol_composed_F_32k,35.25899,67.63031,0.346157,8989,14.242295,2027121
eojeol_composed_F_64k,35.25899,61.456978,0.356818,8989,12.307936,1615582
eojeol_pure_F_32k,35.25899,100.643131,0.455792,9813,8.840555,1817988
eojeol_pure_F_64k,35.25899,96.386365,0.464452,9813,7.755301,1534215
fixed_composed_F_32k,35.25899,56.690843,0.127833,10730,2.627349,317082
fixed_composed_F_64k,35.25899,54.285001,0.1332,10730,1.325125,156699
fixed_lexical_F_32k,35.25899,82.471494,0.122378,10448,1.480511,245851
fixed_lexical_F_64k,35.25899,80.988145,0.126059,10448,0.89772,146965
fixed_pure_F_32,35.25899,92.037466,0.122186,10430,1.351018,240872
fixed_pure_F_64k,35.25899,90.602929,0.125915,10430,0.826526,145240


In [221]:
paws_train = pd.read_csv('dataset_analysis/paws_train.tsv', sep='\t')
paws_dev = pd.read_csv('dataset_analysis/paws_dev.tsv', sep='\t')
paws_test = pd.read_csv('dataset_analysis/paws_test.tsv', sep='\t')
paws = pd.concat([paws_train, paws_dev, paws_test])

paws['tokenize_result'] = paws['tokenize_result'].apply(str)
paws['tokenize_result'] = paws['tokenize_result'].apply(lambda x: removeCS(x))
paws['source_len'] = paws['source'].apply(lambda x: len(x))
paws['tokenized_len'] = paws['tokenize_result'].apply(lambda x: len(x))
paws['OOV_per_sent'] = paws['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
paws['OOV_count'] = paws['tokenize_result'].apply(lambda x: getCountofAllOOV(x))

paws['##_per_sent'] = paws['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
paws['##_count'] = paws['tokenize_result'].apply(lambda x: getCountofAllShop(x))


paws.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean'],
    'OOV_per_sent': ['mean'],
    'OOV_count': ['sum'],
    '##_per_sent': ['mean'],
    '##_count': ['sum']
})

Unnamed: 0_level_0,source_len,tokenized_len,OOV_per_sent,OOV_count,##_per_sent,##_count
Unnamed: 0_level_1,mean,mean,mean,sum,mean,sum
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
eojeol_composed_F_32k,69.43389,123.754225,0.332959,52211,13.44936,1856386
eojeol_composed_F_64k,69.43389,110.640632,0.378464,52211,11.111521,1392650
eojeol_pure_F_32k,69.43389,161.549397,0.063106,30829,9.001736,1622116
eojeol_pure_F_64k,69.43389,153.039033,0.068572,30829,7.712552,1321164
fixed_composed_F_32k,69.43389,110.192734,0.282816,52346,6.317381,846546
fixed_composed_F_64k,69.43389,100.792288,0.307464,52346,4.134036,514118
fixed_lexical_F_32k,69.43389,135.968008,0.296267,52346,4.323391,684479
fixed_lexical_F_64k,69.43389,129.764123,0.311179,52346,3.099956,465091
fixed_pure_F_32,69.43389,150.212727,0.049922,30904,4.037302,698147
fixed_pure_F_64k,69.43389,143.956357,0.052145,30904,2.902907,476903


In [222]:
hsd_train = pd.read_csv('dataset_analysis/hsd_train.tsv', sep='\t')
hsd_dev = pd.read_csv('dataset_analysis/hsd_dev.tsv', sep='\t')
hsd = pd.concat([hsd_train, hsd_dev])

hsd['tokenize_result'] = hsd['tokenize_result'].apply(str)
hsd['tokenize_result'] = hsd['tokenize_result'].apply(lambda x: removeCS(x))
hsd['source_len'] = hsd['source'].apply(lambda x: len(x))
hsd['tokenized_len'] = hsd['tokenize_result'].apply(lambda x: len(x))
hsd['OOV_per_sent'] = hsd['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
hsd['OOV_count'] = hsd['tokenize_result'].apply(lambda x: getCountofAllOOV(x))

hsd['##_per_sent'] = hsd['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
hsd['##_count'] = hsd['tokenize_result'].apply(lambda x: getCountofAllShop(x))


hsd.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean'],
    'OOV_per_sent': ['mean'],
    'OOV_count': ['sum'],
    '##_per_sent': ['mean'],
    '##_count': ['sum']
})

Unnamed: 0_level_0,source_len,tokenized_len,OOV_per_sent,OOV_count,##_per_sent,##_count
Unnamed: 0_level_1,mean,mean,mean,sum,mean,sum
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
eojeol_composed_F_32k,38.718298,77.993546,0.270432,803,15.587404,104113
eojeol_composed_F_64k,38.718298,70.834349,0.292663,803,13.781274,84146
eojeol_pure_F_32k,38.718298,114.948369,0.276332,794,9.805903,95085
eojeol_pure_F_64k,38.718298,109.92076,0.296624,794,8.724854,81063
fixed_composed_F_32k,38.718298,63.71292,0.210914,1031,3.394665,18142
fixed_composed_F_64k,38.718298,60.107685,0.222857,1031,1.607559,8087
fixed_lexical_F_32k,38.718298,91.906179,0.215083,1028,1.827213,13682
fixed_lexical_F_64k,38.718298,89.587068,0.223857,1028,0.997295,7214
fixed_pure_F_32,38.718298,102.566989,0.214752,1025,1.645531,13433
fixed_pure_F_64k,38.718298,100.297717,0.223262,1025,0.898466,7104


In [224]:
dp_train = pd.read_csv('dataset_analysis/dp_train.tsv', sep='\t')
dp_dev = pd.read_csv('dataset_analysis/dp_dev.tsv', sep='\t')
dp = pd.concat([dp_train, dp_dev])

dp['tokenize_result'] = dp['tokenize_result'].apply(str)
dp['tokenize_result'] = dp['tokenize_result'].apply(lambda x: removeCS(x))
dp['source_len'] = dp['source'].apply(lambda x: len(x))
dp['tokenized_len'] = dp['tokenize_result'].apply(lambda x: len(x))
dp['OOV_per_sent'] = dp['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
dp['OOV_count'] = dp['tokenize_result'].apply(lambda x: getCountofAllOOV(x))

dp['##_per_sent'] = dp['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
dp['##_count'] = dp['tokenize_result'].apply(lambda x: getCountofAllShop(x))


dp.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean'],
    'OOV_per_sent': ['mean'],
    'OOV_count': ['sum'],
    '##_per_sent': ['mean'],
    '##_count': ['sum']
})

Unnamed: 0_level_0,source_len,tokenized_len,OOV_per_sent,OOV_count,##_per_sent,##_count
Unnamed: 0_level_1,mean,mean,mean,sum,mean,sum
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
eojeol_composed_F_32k,48.684417,80.909833,0.24062,2881,12.104248,121449
eojeol_composed_F_64k,48.684417,71.796333,0.276842,2881,9.435488,84995
eojeol_pure_F_32k,48.684417,126.51825,0.259793,2881,6.472703,101114
eojeol_pure_F_64k,48.684417,120.8305,0.285277,2881,5.216587,78363
fixed_composed_F_32k,48.684417,72.965417,0.185928,3346,2.118451,21074
fixed_composed_F_64k,48.684417,69.509167,0.194508,3346,0.742135,7249
fixed_lexical_F_32k,48.684417,109.196667,0.190371,3346,0.945381,13891
fixed_lexical_F_64k,48.684417,107.169167,0.195732,3346,0.395972,5781
fixed_pure_F_32,48.684417,121.9195,0.190566,3346,0.829199,13536
fixed_pure_F_64k,48.684417,119.96475,0.195764,3346,0.352165,5717


In [225]:
nli_train = pd.read_csv('dataset_analysis/nli_train.tsv', sep='\t')
nli_dev = pd.read_csv('dataset_analysis/nli_dev.tsv', sep='\t')
nli = pd.concat([nli_train, nli_dev])

nli['tokenize_result'] = nli['tokenize_result'].apply(str)
nli['tokenize_result'] = nli['tokenize_result'].apply(lambda x: removeCS(x))
nli['source_len'] = nli['source'].apply(lambda x: len(x))
nli['tokenized_len'] = nli['tokenize_result'].apply(lambda x: len(x))
nli['OOV_per_sent'] = nli['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
nli['OOV_count'] = nli['tokenize_result'].apply(lambda x: getCountofAllOOV(x))

nli['##_per_sent'] = nli['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
nli['##_count'] = nli['tokenize_result'].apply(lambda x: getCountofAllShop(x))


nli.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean'],
    'OOV_per_sent': ['mean'],
    'OOV_count': ['sum'],
    '##_per_sent': ['mean'],
    '##_count': ['sum']
})

Unnamed: 0_level_0,source_len,tokenized_len,OOV_per_sent,OOV_count,##_per_sent,##_count
Unnamed: 0_level_1,mean,mean,mean,sum,mean,sum
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
eojeol_composed_F_32k,35.146314,58.309951,0.003902,151,11.634948,395046
eojeol_composed_F_64k,35.146314,51.842721,0.004501,151,8.980216,274333
eojeol_pure_F_32k,35.146314,93.31029,0.0041,151,6.063119,327231
eojeol_pure_F_64k,35.146314,89.269537,0.0046,151,4.866531,251809
fixed_composed_F_32k,35.146314,53.292289,0.00323,196,2.181656,73156
fixed_composed_F_64k,35.146314,50.789199,0.003463,196,0.824155,26435
fixed_lexical_F_32k,35.146314,80.524662,0.003309,196,1.003572,49285
fixed_lexical_F_64k,35.146314,79.003929,0.003484,196,0.431059,20900
fixed_pure_F_32,35.146314,90.281431,0.003309,196,0.877533,48027
fixed_pure_F_64k,35.146314,88.820273,0.003484,196,0.383633,20754


In [252]:
# size
print(len(cola)/10,
len(nsmc)/10,
len(paws)/10,
len(hsd)/10,
len(dp)/10,
len(nli)/10
)

# save results
cola.to_csv('dataset_analysis/cola_result.tsv', sep='\t', index=False)
nsmc.to_csv('dataset_analysis/nsmc_result.tsv', sep='\t', index=False)
paws.to_csv('dataset_analysis/paws_result.tsv', sep='\t', index=False)
hsd.to_csv('dataset_analysis/hsd_result.tsv', sep='\t', index=False)
dp.to_csv('dataset_analysis/dp_result.tsv', sep='\t', index=False)
nli.to_csv('dataset_analysis/nli_result.tsv', sep='\t', index=False)

17996.0 199992.0 106132.0 8367.0 12000.0 55996.0
