In [1]:
import json
import os
import unicodedata
import sys
import pandas as pd
import matplotlib.pyplot as plt
import re
sys.path.append("..")
from scripts._mecab import Mecab
from typing import Dict, List, Tuple


In [2]:
from tokenizer import (
    # CharTokenizer,
    # JamoTokenizer,
    MeCabSentencePieceTokenizer_orig,
    MeCabSentencePieceTokenizer_fixed,
    MeCabSentencePieceTokenizer,
    MeCabWordPieceTokenizer,
    # MeCabTokenizer,
    MeCabTokenizer_orig,
    MeCabTokenizer_fixed,
    MeCabTokenizer_all,
    # MeCabSentencePieceTokenizer_kortok,
    # MeCabTokenizer_kortok,
    SentencePieceTokenizer,
    WordPieceTokenizer,
    Vocab,
    # WordTokenizer,
)

# 1. 토큰화 후 저장

In [112]:
def get_tokenizer(tokenizer_name: str, resource_dir: str, token_type, tokenizer_type: str , decomposition_type: str, space_symbol: str, dummy_letter: str, nfd: bool, grammatical_symbol: list = ["", ""], skip_special_tokens: bool = False, lexical_grammatical: bool = False):   # for LG
    tokenizer_dir = os.path.join(resource_dir, tokenizer_name)

    if tokenizer_name.startswith("sp-"):
        tokenizer = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))

    elif tokenizer_name.startswith("mecab_"):

        sp = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))

        if "orig" in tokenizer_name:
            mecab = MeCabTokenizer_orig(tokenizer_type=tokenizer_type, decomposition_type=decomposition_type, space_symbol=space_symbol, dummy_letter=dummy_letter)
            tokenizer = MeCabSentencePieceTokenizer_orig(mecab, sp, use_fixed=False) # mecab_sp_orig.py

        elif "fixed" in tokenizer_name:
            mecab = MeCabTokenizer_fixed(tokenizer_type=tokenizer_type, decomposition_type=decomposition_type, space_symbol=space_symbol, dummy_letter=dummy_letter)
            tokenizer = MeCabSentencePieceTokenizer_fixed(mecab, sp, use_fixed=True) # mecab_fixed.py


    # elif tokenizer_name.startswith("eojeol") or tokenizer_name.startswith("morpheme"):
    elif tokenizer_name.startswith("eojeol") or tokenizer_name.startswith("morpheme") or tokenizer_name.startswith("LG"):   # LG도 처리할 수 있도록
        wp = WordPieceTokenizer(os.path.join(tokenizer_dir, "bert_tokenizer.json"), skip_special_tokens=False)
        # mecab = MeCabTokenizer_all(token_type=token_type, tokenizer_type=tokenizer_type, decomposition_type=decomposition_type, space_symbol=space_symbol, dummy_letter=dummy_letter, nfd=nfd, grammatical_symbol=grammatical_symbol)
        mecab = MeCabTokenizer_all(token_type=token_type, tokenizer_type=tokenizer_type, decomposition_type=decomposition_type, space_symbol=space_symbol, dummy_letter=dummy_letter, nfd=nfd, grammatical_symbol=grammatical_symbol, lexical_grammatical=lexical_grammatical)   # for LG
        tokenizer = MeCabWordPieceTokenizer(mecab=mecab, wp=wp) # mecab_wp.py


    else:
        raise ValueError("Wrong tokenizer name.")

    return tokenizer

In [113]:
def get_tokenized_result(tokenizer, string, nfd: bool = True):
    # if nfd == True:
    #     string = str_to_nfd(string)

    tokenized = tokenizer.tokenize(string)
    # print(" ".join(tokenized))
    
    return " ".join(tokenized)

In [118]:

# 64k
tokenizer_eojeol_composed_F_64k = get_tokenizer(tokenizer_name = "eojeol_mecab_fixed_composed_grammatical_symbol_F_wp-64k",
resource_dir = "./resources/v6_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "eojeol",
decomposition_type= "composed",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_eojeol_decomposed_pure_F_64k = get_tokenizer(tokenizer_name = "eojeol_mecab_fixed_decomposed_pure_grammatical_symbol_F_wp-64k",
resource_dir = "./resources/v6_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "eojeol",
decomposition_type= "decomposed_pure",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_composed_F_64k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_composed_grammatical_symbol_F_wp-64k",
resource_dir = "./resources/v6_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "composed",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_decomposed_lexical_F_64k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_decomposed_lexical_grammatical_symbol_F_wp-64k",
resource_dir = "./resources/v6_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "decomposed_lexical",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_decomposed_pure_F_64k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_decomposed_pure_grammatical_symbol_F_wp-64k",
resource_dir = "./resources/v6_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "decomposed_pure",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)


# 32k
tokenizer_eojeol_composed_F_32k = get_tokenizer(tokenizer_name = "eojeol_mecab_fixed_composed_grammatical_symbol_F_wp-32k",
resource_dir = "./resources/v7_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "eojeol",
decomposition_type= "composed",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_eojeol_decomposed_pure_F_32k = get_tokenizer(tokenizer_name = "eojeol_mecab_fixed_decomposed_pure_grammatical_symbol_F_wp-32k",
resource_dir = "./resources/v7_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "eojeol",
decomposition_type= "decomposed_pure",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_composed_F_32k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_composed_grammatical_symbol_F_wp-32k",
resource_dir = "./resources/v7_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "composed",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_decomposed_lexical_F_32k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_decomposed_lexical_grammatical_symbol_F_wp-32k",
resource_dir = "./resources/v7_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "decomposed_lexical",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_decomposed_pure_F_32k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_decomposed_pure_grammatical_symbol_F_wp-32k",
resource_dir = "./resources/v7_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "decomposed_pure",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)



In [119]:
def show_tokenizations(string):
    # grammatical symbol F
    eojeol_composed_F_64k = string + '\t' + 'eojeol_composed_F_64k' + '\t' + get_tokenized_result(tokenizer_eojeol_composed_F_64k, string)
    eojeol_pure_F_64k = string + '\t' + 'eojeol_pure_F_64k' + '\t' + get_tokenized_result(tokenizer_eojeol_decomposed_pure_F_64k, string)
    fixed_composed_F_64k = string + '\t' + 'fixed_composed_F_64k' + '\t' + get_tokenized_result(tokenizer_fixed_composed_F_64k, string)
    fixed_lexical_F_64k = string + '\t' + 'fixed_lexical_F_64k' + '\t' + get_tokenized_result(tokenizer_fixed_decomposed_lexical_F_64k, string)
    fixed_pure_F_64k = string + '\t' + 'fixed_pure_F_64k' + '\t' + get_tokenized_result(tokenizer_fixed_decomposed_pure_F_64k, string)

    eojeol_composed_F_32k = string + '\t' + 'eojeol_composed_F_32k' + '\t' + get_tokenized_result(tokenizer_eojeol_composed_F_32k, string)
    eojeol_pure_F_32k = string + '\t' + 'eojeol_pure_F_32k' + '\t' + get_tokenized_result(tokenizer_eojeol_decomposed_pure_F_32k, string)
    fixed_composed_F_32k = string + '\t' + 'fixed_composed_F_32k' + '\t' + get_tokenized_result(tokenizer_fixed_composed_F_32k, string)
    fixed_lexical_F_32k = string + '\t' + 'fixed_lexical_F_32k' + '\t' + get_tokenized_result(tokenizer_fixed_decomposed_lexical_F_32k, string)
    fixed_pure_F_32k = string + '\t' + 'fixed_pure_F_32k' + '\t' + get_tokenized_result(tokenizer_fixed_decomposed_pure_F_32k, string)
    
    return "\n".join(['\n'+eojeol_composed_F_64k , eojeol_pure_F_64k , fixed_composed_F_64k, fixed_lexical_F_64k, fixed_pure_F_64k,
    eojeol_composed_F_32k , eojeol_pure_F_32k , fixed_composed_F_32k, fixed_lexical_F_32k, fixed_pure_F_32k])

In [120]:
show_tokenizations('갑자기 더워彭肽꿿뜛땭뜎حمق')

'\n갑자기 더워彭肽꿿뜛땭뜎حمق\teojeol_composed_F_64k\t[CLS] 갑자기 [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\teojeol_pure_F_64k\t[CLS] 갑자기 [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_composed_F_64k\t[CLS] 갑자기 덥 어 [UNK] [UNK] [UNK] [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_lexical_F_64k\t[CLS] 갑자기 덥 어 [UNK] [UNK] 꿰 ##ᆶ ##ᄄ ##ᅲ ##ᆶ ##ᄄ ##ᅣ ##ᆬ ##ᄄ ##ᅲ ##ᆩ [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_pure_F_64k\t[CLS] 갑자기 덥 어 [UNK] [UNK] 꿰 ##ᆶ ##ᄄ ##ᅲ ##ᆶ ##ᄄ ##ᅣ ##ᆬ ##ᄄ ##ᅲ ##ᆩ [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\teojeol_composed_F_32k\t[CLS] 갑자기 [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\teojeol_pure_F_32k\t[CLS] 갑자기 [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_composed_F_32k\t[CLS] 갑자기 덥 어 [UNK] [UNK] [UNK] [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_lexical_F_32k\t[CLS] 갑자기 덥 어 [UNK] [UNK] 꿰 ##ᆶ ##ᄄ ##ᅲ ##ᆶ ##ᄄ ##ᅣ ##ᆬ ##ᄄ ##ᅲ ##ᆩ [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_pure_F_32\t[CLS] 갑자기 덥 어 [UNK] [UNK] 꿰 ##ᆶ ##ᄄ ##ᅲ ##ᆶ ##ᄄ ##ᅣ ##ᆬ ##ᄄ ##ᅲ ##ᆩ [UNK] [SEP]'

In [121]:
# 토큰화 및 저장
def analysis(task, corpus_list, corpus_name_list, sent2=True):
    for corpus, corpus_name in zip(corpus_list, corpus_name_list):
        with open('dataset_analysis/' + task+'_'+corpus_name+'.tsv', 'w', encoding='utf-8') as f:
            f.write('source'+'\t'+'tokenizer'+'\t'+'tokenize_result')
            if sent2:
                for sent1, sent2 in zip(corpus[0], corpus[1]):
                    f.write(show_tokenizations(string=sent1))
                    f.write(show_tokenizations(string=sent2))
            else:
                for sent1 in (corpus):
                    f.write(show_tokenizations(string=sent1))

In [122]:
# cola
def load_data(file_path: str) -> Tuple[List[str]]:
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentence
    3. label
    """
    sentences: List[str] = []

    # for test set
    if file_path == "./dataset/nlu_tasks/cola/NIKL_CoLA_in_domain_test_with_answer.tsv":
        with open(file_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f.readlines()[1:]):
                splitted = line.strip().split("\t")
                sentences.append(splitted[2])

    else:
        with open(file_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f.readlines()[1:]):
                splitted = line.strip().split("\t")
                sentences.append(splitted[3])

    return sentences

task = 'cola'
train = load_data('./dataset/nlu_tasks/cola/NIKL_CoLA_in_domain_train.tsv')
dev = load_data('./dataset/nlu_tasks/cola/NIKL_CoLA_in_domain_dev.tsv')
test = load_data('./dataset/nlu_tasks/cola/NIKL_CoLA_in_domain_test_with_answer.tsv')

corpus_list = [train, dev, test]
corpus_name_list = ['train', 'dev', 'test']

analysis(task, corpus_list, corpus_name_list, sent2=False)

In [123]:
# nsmc

def load_data(file_path: str) -> Tuple[List[str], List[int]]:
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentence
    3. label
    """
    sentences: List[str] = []
    labels: List[int] = []

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f.readlines()[1:]):
            splitted = line.strip().split("\t")
            if len(splitted) != 2:
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue
            sentences.append(splitted[0])
            
#             labels.append(label_to_index[splitted[1]])

    return sentences


task = 'nsmc'
train = load_data('./dataset/nlu_tasks/nsmc/ratings_train.tsv')
dev = load_data('./dataset/nlu_tasks/nsmc/ratings_dev.tsv')
test = load_data('./dataset/nlu_tasks/nsmc/ratings_test.tsv')

corpus_list = [train, dev, test]
corpus_name_list = ['train', 'dev', 'test']

analysis(task, corpus_list, corpus_name_list, sent2=False)

In [124]:
# hsd

def load_data(file_path: str) -> Tuple[List[str], List[int]]:
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentence_a
    2. sentence_b
    3. label
    """
    sentence = []
    # sentence_bs = []
    labels = []

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f.readlines()[1:]):
            splitted = line.strip().split("\t")
            if len(splitted) != 4:
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue

            sentence.append(splitted[0])
                # sentence_bs.append(splitted[1])
            # sentence_as.append(splitted[0])
            # sentence_bs.append(splitted[1])

#             labels.append(label_to_index[splitted[3]])

    return sentence

task = 'hsd'
train = load_data('./dataset/nlu_tasks/hsd/train.tsv')
dev = load_data('./dataset/nlu_tasks/hsd/dev.tsv')

corpus_list = [train, dev]
corpus_name_list = ['train', 'dev']

analysis(task, corpus_list, corpus_name_list, sent2=False)

In [125]:
# paws
def load_data(file_path: str) -> Tuple[List[str], List[str]]:
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentence_a
    2. sentence_b
    3. label
    """
    sentence_as = []
    sentence_bs = []

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f.readlines()[1:]):
            splitted = line.strip().split("\t")
            if len(splitted) != 4:
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue
            if splitted[1] == "" or splitted[2] == "":
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue
            # 문장이 "NS"로만 표기된 라인 제외
            if splitted[1] == "NS" or splitted[2] == "NS":
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue
            sentence_as.append(splitted[1])
            sentence_bs.append(splitted[2])
        
    return sentence_as, sentence_bs

task = 'paws'
train = load_data('./dataset/nlu_tasks/paws/translated_train.tsv')
dev = load_data('./dataset/nlu_tasks/paws/dev_2k.tsv')
test = load_data('./dataset/nlu_tasks/paws/test_2k.tsv')

corpus_list = [train, dev, test]
corpus_name_list = ['train', 'dev', 'test']

analysis(task, corpus_list, corpus_name_list, sent2=True)

In [138]:
# KLUE-dp
dp_train = pd.read_csv('./KLUE-baseline/data/klue_benchmark/klue-dp-v1.1/klue-dp-v1.1_train.tsv',sep='delimiter', header=None)
dp_dev = pd.read_csv('./KLUE-baseline/data/klue_benchmark/klue-dp-v1.1/klue-dp-v1.1_dev.tsv', sep='delimiter', header=None)
dp_train.columns = ['text']
dp_dev.columns = ['text']
df2_train = dp_train['text'].str.contains('## klue-dp')
df2_dev = dp_dev['text'].str.contains('## klue-dp')
df2_train = dp_train[df2_train]
df2_dev = dp_dev[df2_dev]
df2_train['text'] = df2_train['text'].apply(lambda x: re.sub('## klue-dp-v1_train_.*\t', '', x))
df2_dev['text'] = df2_dev['text'].apply(lambda x: re.sub('## klue-dp-v1_dev_.*\t', '', x))
df2_train.to_csv('dp_orig_train.tsv', index=False, sep='\t')
df2_dev.to_csv('dp_orig_dev.tsv', index=False, sep='\t')

def load_data(file_path: str) -> Tuple[List[str]]:
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentenced
    3. label
    """
    sentences: List[str] = []
    #labels: List[int] = []

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f.readlines()[1:]):
            splitted = line.strip()
            # if len(splitted) != 2:
            #     #print(f"[ERROR] {repr(line)}, line {i}")
            #     continue
            sentences.append(splitted)
     #       labels.append(label_to_index[splitted[1]])

    return sentences

task = 'dp'
train = load_data('dp_orig_train.tsv')
dev = load_data('dp_orig_dev.tsv')

corpus_list = [train, dev]
corpus_name_list = ['train', 'dev']

analysis(task, corpus_list, corpus_name_list, sent2=False)

  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_train['text'] = df2_train['text'].apply(lambda x: re.sub('## klue-dp-v1_train_.*\t', '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_dev['text'] = df2_dev['text'].apply(lambda x: re.sub('## klue-dp-v1_dev_.*\t', '', x))


In [140]:
# KLUE-nli
# json to tsv
import json
import pandas as pd
with open('./KLUE-baseline/data/klue_benchmark/klue-nli-v1.1/klue-nli-v1.1_train.json', 'r') as f:
    nli_train = json.load(f)
with open('./KLUE-baseline/data/klue_benchmark/klue-nli-v1.1/klue-nli-v1.1_dev.json') as f2:
    nli_dev = json.load(f2)


nli_train = pd.DataFrame(nli_train)
nli_train = nli_train[['premise', 'hypothesis']]
nli_train.to_csv('nli_orig_train.tsv', index=False, sep='\t')
nli_dev = pd.DataFrame(nli_dev)
nli_dev = nli_dev[['premise', 'hypothesis']]
nli_dev.to_csv('nli_orig_dev.tsv', index=False, sep='\t')

def load_data(file_path: str) -> Tuple[List[str], List[str]]:
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentence_a
    2. sentence_b
    3. label
    """
    sentence_as = []
    sentence_bs = []

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f.readlines()[1:]):
            splitted = line.strip().split("\t")
            if splitted[0] == "" or splitted[1] == "":
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue
            sentence_as.append(splitted[0])
            sentence_bs.append(splitted[1])
        
    return sentence_as, sentence_bs

task = 'nli'
train = load_data('nli_orig_train.tsv')
dev = load_data('nli_orig_dev.tsv')

corpus_list = [train, dev]
corpus_name_list = ['train', 'dev']

analysis(task, corpus_list, corpus_name_list, sent2=True)


# 2. OOV rate, ## rate 분석

In [7]:
# [UNK]의 개수 / 문장의 길이 * 100
from typing import List

# 문장당 oov rate (OR)
def getOOVRatePerSentence(sentence):
    # [CLS], [SEP] 제거
    sentence = sentence.split()

    OOV_rate = sentence.count('[UNK]') / len(sentence) * 100   
    
    return OOV_rate

# count of all OOV tokens (OC)
def getCountofAllOOV(sentence):
    cnt = 0
    cnt += len(re.findall('[UNK]', sentence))
    
    return cnt

# [CLS, SEP] 제거
def removeCS(sentence):
    sentence = sentence.replace('[CLS]', '')
    sentence = sentence.replace('[SEP]', '')
    
    return sentence

# [UNK]수 /전체 토큰 수
def getOOVdividedbyAllTokens(corpus):
    corpus['OC'] = corpus['sentence'].apply(lambda x: getCountofAllOOV(x))
    corpus['sentence'].apply(lambda x: removeCS(x))
    corpus['token_count'] = corpus['sentence'].apply(lambda x: len(x))
    OOV_count = corpus['OC'].sum()
    token_count = corpus['token_count'].sum()
    
    return OOV_count, token_count, OOV_count/token_count*100



# "##"" 세기
# 문장당 oov rate (SR)
def getShopRatePerSentence(sentence):

    OOV_rate = sentence.count('##') / len(sentence) * 100   
    
    return OOV_rate

# count of all ## tokens (SC)
def getCountofAllShop(sentence):
    cnt = 0
    cnt += len(re.findall('##', sentence))
    
    return cnt

# [CLS, SEP] 제거
def removeCS(sentence):
    sentence = sentence.replace('[CLS]', '')
    sentence = sentence.replace('[SEP]', '')
    
    return sentence

# ##수 /전체 토큰 수
def getOOVdividedbyAllTokens(corpus):
    corpus['SC'] = corpus['sentence'].apply(lambda x: getCountofAllShop(x))
    corpus['sentence'].apply(lambda x: removeCS(x))
    corpus['token_count'] = corpus['sentence'].apply(lambda x: len(x))
    Shop_count = corpus['SC'].sum()
    token_count = corpus['token_count'].sum()
    
    return Shop_count, token_count, Shop_count/token_count*100


In [19]:
import pandas as pd
import re

cola_train = pd.read_csv('dataset_analysis/tokenized/cola_train.tsv', sep='\t')
cola_dev = pd.read_csv('dataset_analysis/tokenized/cola_dev.tsv', sep='\t')
cola_test = pd.read_csv('dataset_analysis/tokenized/cola_test.tsv', sep='\t')
cola = pd.concat([cola_train, cola_dev, cola_test])
cola['tokenize_result'] = cola['tokenize_result'].apply(lambda x: removeCS(x))
cola['source_len'] = cola['source'].apply(lambda x: len(x.split()))
cola['tokenized_len'] = cola['tokenize_result'].apply(lambda x: len(x.split()))

cola['OOV_per_tokenized_sent'] = cola['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
cola['OOV_count'] = cola['tokenize_result'].apply(lambda x: getCountofAllOOV(x))
cola['OOV_per_source_sent'] = cola['OOV_count'] / cola['source_len'] * 100
cola['##_per_tokenized_sent'] = cola['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
cola['##_count'] = cola['tokenize_result'].apply(lambda x: getCountofAllShop(x))
cola['##_per_source_sent'] = cola['##_count'] / cola['source_len'] * 100


In [21]:
cola[cola['tokenized_len'] > 128].groupby('tokenizer').count()

Unnamed: 0_level_0,source,tokenize_result,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
tokenizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [22]:
cola.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean'],
    'OOV_per_tokenized_sent': ['mean'],
    'OOV_count': ['sum'],
    'OOV_per_source_sent': ['mean'],
    '##_per_tokenized_sent': ['mean'],
    '##_count': ['sum'],
    '##_per_source_sent': ['mean', 'size'], 
})

Unnamed: 0_level_0,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent,##_per_source_sent
Unnamed: 0_level_1,mean,mean,mean,sum,mean,mean,sum,mean,size
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
eojeol_composed_F_32k,5.011058,8.460269,0.032876,189,0.162933,10.523272,62072,72.816467,17996
eojeol_composed_F_64k,5.011058,7.42854,0.036712,189,0.162933,8.088648,43505,51.650842,17996
eojeol_pure_F_32k,5.011058,7.844354,0.035132,189,0.162933,5.560999,50988,60.109946,17996
eojeol_pure_F_64k,5.011058,7.228717,0.037975,189,0.162933,4.520664,39909,47.571706,17996
fixed_composed_F_32k,5.011058,12.380196,0.01988,216,0.181257,1.689564,10372,12.154478,17996
fixed_composed_F_64k,5.011058,11.972772,0.020558,216,0.181257,0.508337,3040,3.629573,17996
fixed_lexical_F_32k,5.011058,12.21377,0.020423,216,0.181257,0.87146,7377,8.710197,17996
fixed_lexical_F_64k,5.011058,11.917037,0.020611,216,0.181257,0.239431,2037,2.420483,17996
fixed_pure_F_32,5.011058,12.207157,0.020423,216,0.181257,0.7405,7258,8.578487,17996
fixed_pure_F_64k,5.011058,11.914259,0.020611,216,0.181257,0.202259,1987,2.356606,17996


In [23]:
nsmc_train = pd.read_csv('dataset_analysis/tokenized/nsmc_train.tsv', sep='\t')
nsmc_dev = pd.read_csv('dataset_analysis/tokenized/nsmc_dev.tsv', sep='\t')
nsmc_test = pd.read_csv('dataset_analysis/tokenized/nsmc_test.tsv', sep='\t')
nsmc = pd.concat([nsmc_train, nsmc_dev, nsmc_test])
nsmc['tokenize_result'] = nsmc['tokenize_result'].apply(lambda x: removeCS(x))
nsmc['source_len'] = nsmc['source'].apply(lambda x: len(x.split()))
nsmc['tokenized_len'] = nsmc['tokenize_result'].apply(lambda x: len(x.split()))

nsmc['OOV_per_tokenized_sent'] = nsmc['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
nsmc['OOV_count'] = nsmc['tokenize_result'].apply(lambda x: getCountofAllOOV(x))
nsmc['OOV_per_source_sent'] = nsmc['OOV_count'] / nsmc['source_len'] * 100
nsmc['##_per_tokenized_sent'] = nsmc['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
nsmc['##_count'] = nsmc['tokenize_result'].apply(lambda x: getCountofAllShop(x))
nsmc['##_per_source_sent'] = nsmc['##_count'] / nsmc['source_len'] * 100


In [24]:
nsmc.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean'],
    'OOV_per_tokenized_sent': ['mean'],
    'OOV_count': ['sum'],
    'OOV_per_source_sent': ['mean'],
    '##_per_tokenized_sent': ['mean'],
    '##_count': ['sum'],
    '##_per_source_sent': ['mean', 'size'], 
})

Unnamed: 0_level_0,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent,##_per_source_sent
Unnamed: 0_level_1,mean,mean,mean,sum,mean,mean,sum,mean,size
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
eojeol_composed_F_32k,7.591344,17.727344,0.346157,8989,1.392632,14.242295,2027121,186.080295,199992
eojeol_composed_F_64k,7.591344,15.669567,0.356818,8989,1.392632,12.307936,1615582,154.626038,199992
eojeol_pure_F_32k,7.591344,16.681637,0.455792,9813,1.738694,8.840555,1817988,167.590889,199992
eojeol_pure_F_64k,7.591344,15.262716,0.464452,9813,1.738694,7.755301,1534215,144.851687,199992
fixed_composed_F_32k,7.591344,21.892581,0.127833,10730,1.883802,2.627349,317082,32.287948,199992
fixed_composed_F_64k,7.591344,21.090634,0.1332,10730,1.883802,1.325125,156699,17.575249,199992
fixed_lexical_F_32k,7.591344,21.536411,0.122378,10448,1.797754,1.480511,245851,26.269544,199992
fixed_lexical_F_64k,7.591344,21.041962,0.126059,10448,1.797754,0.89772,146965,17.113886,199992
fixed_pure_F_32,7.591344,21.511515,0.122186,10430,1.795306,1.351018,240872,25.793359,199992
fixed_pure_F_64k,7.591344,21.033336,0.125915,10430,1.795306,0.826526,145240,16.91989,199992


In [38]:
nsmc[nsmc['tokenized_len'] > 128]

Unnamed: 0,source,tokenizer,tokenize_result,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
166967,재미없다ㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠ...,fixed_composed_F_32k,재미없 다 ㅠ ##ㅠ ㅠ ##ㅠ ㅠ ##ㅠ ㅠ ##ㅠ ㅠ ##ㅠ ㅠ ##ㅠ ㅠ #...,1,138,0.0,0,0.0,18.077803,79,7900.0
306632,"""""""너무 충격적입니다. 어떻게 이런걸 영화라고 내놓나요? 보는내내 물음표만 가득했...",fixed_composed_F_64k,""""" ##"" ##"" ##"" ##"" ##"" 너무 충격 적 이 ᄇ니다 . 어떻게 이런...",22,141,0.0,0,0.0,15.845824,74,336.363636
306633,"""""""너무 충격적입니다. 어떻게 이런걸 영화라고 내놓나요? 보는내내 물음표만 가득했...",fixed_lexical_F_64k,""""" ##"" ##"" ##"" ##"" ##"" 너무 충격 적 이 ᄇ니다 ...",22,138,0.0,0,0.0,13.320826,71,322.727273
306634,"""""""너무 충격적입니다. 어떻게 이런걸 영화라고 내놓나요? 보는내내 물음표만 가득했...",fixed_pure_F_64k,""""" ##"" ##"" ##"" ##"" ##"" 너무 충격 적 이 ᄇ니...",22,138,0.0,0,0.0,12.701252,71,322.727273
306637,"""""""너무 충격적입니다. 어떻게 이런걸 영화라고 내놓나요? 보는내내 물음표만 가득했...",fixed_composed_F_32k,""""" ##"" ##"" ##"" ##"" ##"" 너무 충격 적 이 ᄇ니다 . 어떻게 이런...",22,147,0.0,0,0.0,16.494845,80,363.636364
306638,"""""""너무 충격적입니다. 어떻게 이런걸 영화라고 내놓나요? 보는내내 물음표만 가득했...",fixed_lexical_F_32k,""""" ##"" ##"" ##"" ##"" ##"" 너무 충격 적 이 ᄇ니다 ...",22,142,0.0,0,0.0,13.761468,75,340.909091
306639,"""""""너무 충격적입니다. 어떻게 이런걸 영화라고 내놓나요? 보는내내 물음표만 가득했...",fixed_pure_F_32,""""" ##"" ##"" ##"" ##"" ##"" 너무 충격 적 이 ᄇ니...",22,142,0.0,0,0.0,13.134851,75,340.909091
607922,"""""""이말년만화로 이 드라마를 설명할수 있다.드라마에 출연한 배우들 """"""""미친놈아...",fixed_composed_F_64k,""""" ##"" ##"" ##"" ##"" ##"" 이말년 만화 로 이 드라마 를 설명 하 ...",23,130,0.0,0,0.0,12.182741,48,208.695652
607923,"""""""이말년만화로 이 드라마를 설명할수 있다.드라마에 출연한 배우들 """"""""미친놈아...",fixed_lexical_F_64k,""""" ##"" ##"" ##"" ##"" ##"" 이말년 만화 로 이 드...",23,130,0.0,0,0.0,10.084034,48,208.695652
607924,"""""""이말년만화로 이 드라마를 설명할수 있다.드라마에 출연한 배우들 """"""""미친놈아...",fixed_pure_F_64k,""""" ##"" ##"" ##"" ##"" ##"" 이말년 만화 로 이 ᄃ...",23,130,0.0,0,0.0,9.561753,48,208.695652


In [27]:
paws_train = pd.read_csv('dataset_analysis/tokenized/paws_train.tsv', sep='\t')
paws_dev = pd.read_csv('dataset_analysis/tokenized/paws_dev.tsv', sep='\t')
paws_test = pd.read_csv('dataset_analysis/tokenized/paws_test.tsv', sep='\t')
paws = pd.concat([paws_train, paws_dev, paws_test])

paws['tokenize_result'] = paws['tokenize_result'].apply(str)
paws['tokenize_result'] = paws['tokenize_result'].apply(lambda x: removeCS(x))
paws['source_len'] = paws['source'].apply(lambda x: len(x.split()))
paws['tokenized_len'] = paws['tokenize_result'].apply(lambda x: len(x.split()))
paws['OOV_per_tokenized_sent'] = paws['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
paws['OOV_count'] = paws['tokenize_result'].apply(lambda x: getCountofAllOOV(x))
paws['OOV_per_source_sent'] = paws['OOV_count'] / paws['source_len'] * 100
paws['##_per_tokenized_sent'] = paws['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
paws['##_count'] = paws['tokenize_result'].apply(lambda x: getCountofAllShop(x))
paws['##_per_source_sent'] = paws['##_count'] / paws['source_len'] * 100

paws.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean'],
    'OOV_per_tokenized_sent': ['mean'],
    'OOV_count': ['sum'],
    'OOV_per_source_sent': ['mean'],
    '##_per_tokenized_sent': ['mean'],
    '##_count': ['sum'],
    '##_per_source_sent': ['mean', 'size'], 
})

Unnamed: 0_level_0,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent,##_per_source_sent
Unnamed: 0_level_1,mean,mean,mean,sum,mean,mean,sum,mean,size
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
eojeol_composed_F_32k,14.261856,31.760239,0.332959,52211,3.706431,13.44936,1856386,125.130972,106089
eojeol_composed_F_64k,14.261856,27.389041,0.378464,52211,3.706431,11.111521,1392650,94.064866,106089
eojeol_pure_F_32k,14.261856,29.551999,0.063106,30829,2.05507,9.001736,1622116,109.734915,106089
eojeol_pure_F_64k,14.261856,26.715211,0.068572,30829,2.05507,7.712552,1321164,89.547106,106089
fixed_composed_F_32k,14.261856,35.557918,0.282816,52346,3.715277,6.317381,846546,57.006188,106089
fixed_composed_F_64k,14.261856,32.424436,0.307464,52346,3.715277,4.134036,514118,34.930494,106089
fixed_lexical_F_32k,14.261856,34.030267,0.296267,52346,3.715277,4.323391,684479,46.16072,106089
fixed_lexical_F_64k,14.261856,31.962305,0.311179,52346,3.715277,3.099956,465091,31.684449,106089
fixed_pure_F_32,14.261856,34.159102,0.049922,30904,2.059366,4.037302,698147,47.287916,106089
fixed_pure_F_64k,14.261856,32.073646,0.052145,30904,2.059366,2.902907,476903,32.651603,106089


In [29]:
paws[paws['tokenized_len'] > 128].groupby('tokenizer').count()

Unnamed: 0_level_0,source,tokenize_result,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
tokenizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [30]:
hsd_train = pd.read_csv('dataset_analysis/tokenized/hsd_train.tsv', sep='\t')
hsd_dev = pd.read_csv('dataset_analysis/tokenized/hsd_dev.tsv', sep='\t')
hsd = pd.concat([hsd_train, hsd_dev])

hsd['tokenize_result'] = hsd['tokenize_result'].apply(str)
hsd['tokenize_result'] = hsd['tokenize_result'].apply(lambda x: removeCS(x))
hsd['source_len'] = hsd['source'].apply(lambda x: len(x.split()))
hsd['tokenized_len'] = hsd['tokenize_result'].apply(lambda x: len(x.split()))
hsd['OOV_per_tokenized_sent'] = hsd['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
hsd['OOV_count'] = hsd['tokenize_result'].apply(lambda x: getCountofAllOOV(x))
hsd['OOV_per_source_sent'] = hsd['OOV_count'] / hsd['source_len'] * 100
hsd['##_per_tokenized_sent'] = hsd['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
hsd['##_count'] = hsd['tokenize_result'].apply(lambda x: getCountofAllShop(x))
hsd['##_per_source_sent'] = hsd['##_count'] / hsd['source_len'] * 100

hsd.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean'],
    'OOV_per_tokenized_sent': ['mean'],
    'OOV_count': ['sum'],
    'OOV_per_source_sent': ['mean'],
    '##_per_tokenized_sent': ['mean'],
    '##_count': ['sum'],
    '##_per_source_sent': ['mean', 'size'], 
})

Unnamed: 0_level_0,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent,##_per_source_sent
Unnamed: 0_level_1,mean,mean,mean,sum,mean,mean,sum,mean,size
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
eojeol_composed_F_32k,8.406,20.849289,0.270432,803,1.595408,15.587404,104113,169.389726,8367
eojeol_composed_F_64k,8.406,18.46289,0.292663,803,1.595408,13.781274,84146,139.137183,8367
eojeol_pure_F_32k,8.406,19.770288,0.276332,794,1.589262,9.805903,95085,155.923334,8367
eojeol_pure_F_64k,8.406,18.094419,0.296624,794,1.589262,8.724854,81063,134.37966,8367
fixed_composed_F_32k,8.406,24.718179,0.210914,1031,1.95171,3.394665,18142,30.930292,8367
fixed_composed_F_64k,8.406,23.516434,0.222857,1031,1.95171,1.607559,8087,14.188006,8367
fixed_lexical_F_32k,8.406,24.185132,0.215083,1028,1.941751,1.827213,13682,23.700267,8367
fixed_lexical_F_64k,8.406,23.412095,0.223857,1028,1.941751,0.997295,7214,12.934445,8367
fixed_pure_F_32,8.406,24.155372,0.214752,1025,1.935775,1.645531,13433,23.295903,8367
fixed_pure_F_64k,8.406,23.398948,0.223262,1025,1.935775,0.898466,7104,12.729737,8367


In [31]:
hsd[hsd['tokenized_len'] > 128].groupby('tokenizer').count()

Unnamed: 0_level_0,source,tokenize_result,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
tokenizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [32]:
dp_train = pd.read_csv('dataset_analysis/tokenized/dp_train.tsv', sep='\t')
dp_dev = pd.read_csv('dataset_analysis/tokenized/dp_dev.tsv', sep='\t')
dp = pd.concat([dp_train, dp_dev])

dp['tokenize_result'] = dp['tokenize_result'].apply(str)
dp['tokenize_result'] = dp['tokenize_result'].apply(lambda x: removeCS(x))
dp['source_len'] = dp['source'].apply(lambda x: len(x.split()))
dp['tokenized_len'] = dp['tokenize_result'].apply(lambda x: len(x.split()))
dp['OOV_per_tokenized_sent'] = dp['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
dp['OOV_count'] = dp['tokenize_result'].apply(lambda x: getCountofAllOOV(x))
dp['OOV_per_source_sent'] = dp['OOV_count'] / dp['source_len'] * 100
dp['##_per_tokenized_sent'] = dp['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
dp['##_count'] = dp['tokenize_result'].apply(lambda x: getCountofAllShop(x))
dp['##_per_source_sent'] = dp['##_count'] / dp['source_len'] * 100

dp.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean'],
    'OOV_per_tokenized_sent': ['mean'],
    'OOV_count': ['sum'],
    'OOV_per_source_sent': ['mean'],
    '##_per_tokenized_sent': ['mean'],
    '##_count': ['sum'],
    '##_per_source_sent': ['mean', 'size'], 
})

Unnamed: 0_level_0,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent,##_per_source_sent
Unnamed: 0_level_1,mean,mean,mean,sum,mean,mean,sum,mean,size
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
eojeol_composed_F_32k,11.415583,21.536333,0.24062,2881,1.451239,12.104248,121449,91.453097,12000
eojeol_composed_F_64k,11.415583,18.4985,0.276842,2881,1.451239,9.435488,84995,64.573298,12000
eojeol_pure_F_32k,11.415583,19.84175,0.259793,2881,1.451239,6.472703,101114,76.673673,12000
eojeol_pure_F_64k,11.415583,17.945833,0.285277,2881,1.451239,5.216587,78363,59.605604,12000
fixed_composed_F_32k,11.415583,27.81375,0.185928,3346,1.665835,2.118451,21074,14.80853,12000
fixed_composed_F_64k,11.415583,26.661667,0.194508,3346,1.665835,0.742135,7249,5.042086,12000
fixed_lexical_F_32k,11.415583,27.215167,0.190371,3346,1.665835,0.945381,13891,9.798267,12000
fixed_lexical_F_64k,11.415583,26.539333,0.195732,3346,1.665835,0.395972,5781,4.088706,12000
fixed_pure_F_32,11.415583,27.185583,0.190566,3346,1.665835,0.829199,13536,9.538848,12000
fixed_pure_F_64k,11.415583,26.534,0.195764,3346,1.665835,0.352165,5717,4.038697,12000


In [33]:
dp[dp['tokenized_len'] > 128].groupby('tokenizer').count()

Unnamed: 0_level_0,source,tokenize_result,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
tokenizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [34]:
nli_train = pd.read_csv('dataset_analysis/tokenized/nli_train.tsv', sep='\t')
nli_dev = pd.read_csv('dataset_analysis/tokenized/nli_dev.tsv', sep='\t')
nli = pd.concat([nli_train, nli_dev])

nli['tokenize_result'] = nli['tokenize_result'].apply(str)
nli['tokenize_result'] = nli['tokenize_result'].apply(lambda x: removeCS(x))
nli['source_len'] = nli['source'].apply(lambda x: len(x.split()))
nli['tokenized_len'] = nli['tokenize_result'].apply(lambda x: len(x.split()))
nli['OOV_per_tokenized_sent'] = nli['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
nli['OOV_count'] = nli['tokenize_result'].apply(lambda x: getCountofAllOOV(x))
nli['OOV_per_source_sent'] = nli['OOV_count'] / nli['source_len'] * 100
nli['##_per_tokenized_sent'] = nli['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
nli['##_count'] = nli['tokenize_result'].apply(lambda x: getCountofAllShop(x))
nli['##_per_source_sent'] = nli['##_count'] / nli['source_len'] * 100

nli.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean'],
    'OOV_per_tokenized_sent': ['mean'],
    'OOV_count': ['sum'],
    'OOV_per_source_sent': ['mean'],
    '##_per_tokenized_sent': ['mean'],
    '##_count': ['sum'],
    '##_per_source_sent': ['mean', 'size'], 
})

Unnamed: 0_level_0,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent,##_per_source_sent
Unnamed: 0_level_1,mean,mean,mean,sum,mean,mean,sum,mean,size
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
eojeol_composed_F_32k,8.277216,15.332113,0.003902,151,0.02578,11.634948,395046,90.25258,55996
eojeol_composed_F_64k,8.277216,13.17637,0.004501,151,0.02578,8.980216,274333,63.527354,55996
eojeol_pure_F_32k,8.277216,14.121044,0.0041,151,0.02578,6.063119,327231,75.100799,55996
eojeol_pure_F_64k,8.277216,12.774127,0.0046,151,0.02578,4.866531,251809,58.397642,55996
fixed_composed_F_32k,8.277216,20.357936,0.00323,196,0.03105,2.181656,73156,16.203469,55996
fixed_composed_F_64k,8.277216,19.523573,0.003463,196,0.03105,0.824155,26435,5.897201,55996
fixed_lexical_F_32k,8.277216,19.931638,0.003309,196,0.03105,1.003572,49285,11.089676,55996
fixed_lexical_F_64k,8.277216,19.424727,0.003484,196,0.03105,0.431059,20900,4.699381,55996
fixed_pure_F_32,8.277216,19.909172,0.003309,196,0.03105,0.877533,48027,10.805589,55996
fixed_pure_F_64k,8.277216,19.422119,0.003484,196,0.03105,0.383633,20754,4.670615,55996


In [35]:
nli[nli['tokenized_len'] > 128].groupby('tokenizer').count()

Unnamed: 0_level_0,source,tokenize_result,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
tokenizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [36]:
# size
print(len(cola)/10,
len(nsmc)/10,
len(paws)/10,
len(hsd)/10,
len(dp)/10,
len(nli)/10
)

# save results
cola.to_csv('dataset_analysis/results/cola_result.tsv', sep='\t', index=False)
nsmc.to_csv('dataset_analysis/results/nsmc_result.tsv', sep='\t', index=False)
paws.to_csv('dataset_analysis/results/paws_result.tsv', sep='\t', index=False)
hsd.to_csv('dataset_analysis/results/hsd_result.tsv', sep='\t', index=False)
dp.to_csv('dataset_analysis/results/dp_result.tsv', sep='\t', index=False)
nli.to_csv('dataset_analysis/results/nli_result.tsv', sep='\t', index=False)

17996.0 199992.0 106132.0 8367.0 12000.0 55996.0
