In [1]:
import json
import os
import unicodedata
import sys
import pandas as pd
import matplotlib.pyplot as plt
import re
sys.path.append("..")
from scripts._mecab import Mecab
from typing import Dict, List, Tuple


In [2]:
from tokenizer import (
    # CharTokenizer,
    # JamoTokenizer,
    MeCabSentencePieceTokenizer_fixed,
    MeCabSentencePieceTokenizer,
    MeCabWordPieceTokenizer,
    # MeCabTokenizer,
    MeCabTokenizer_fixed,
    MeCabTokenizer_all,
    # MeCabSentencePieceTokenizer_kortok,
    # MeCabTokenizer_kortok,
    SentencePieceTokenizer,
    WordPieceTokenizer,
    Vocab,
    # WordTokenizer,
)

# 1. 토큰화 후 저장

In [3]:
def get_tokenizer(tokenizer_name: str, resource_dir: str, token_type, tokenizer_type: str , decomposition_type: str, space_symbol: str, dummy_letter: str, nfd: bool, grammatical_symbol: list = ["", ""], skip_special_tokens: bool = False, lexical_grammatical: bool = False):   # for LG
    tokenizer_dir = os.path.join(resource_dir, tokenizer_name)

    if tokenizer_name.startswith("sp-"):
        tokenizer = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))

    elif tokenizer_name.startswith("mecab_"):

        sp = SentencePieceTokenizer(os.path.join(tokenizer_dir, "tok.model"))

        if "orig" in tokenizer_name:
            mecab = MeCabTokenizer_orig(tokenizer_type=tokenizer_type, decomposition_type=decomposition_type, space_symbol=space_symbol, dummy_letter=dummy_letter)
            tokenizer = MeCabSentencePieceTokenizer_orig(mecab, sp, use_fixed=False) # mecab_sp_orig.py

        elif "fixed" in tokenizer_name:
            mecab = MeCabTokenizer_fixed(tokenizer_type=tokenizer_type, decomposition_type=decomposition_type, space_symbol=space_symbol, dummy_letter=dummy_letter)
            tokenizer = MeCabSentencePieceTokenizer_fixed(mecab, sp, use_fixed=True) # mecab_fixed.py


    # elif tokenizer_name.startswith("eojeol") or tokenizer_name.startswith("morpheme"):
    elif tokenizer_name.startswith("eojeol") or tokenizer_name.startswith("morpheme") or tokenizer_name.startswith("LG"):   # LG도 처리할 수 있도록
        wp = WordPieceTokenizer(os.path.join(tokenizer_dir, "bert_tokenizer.json"), skip_special_tokens=False)
        # mecab = MeCabTokenizer_all(token_type=token_type, tokenizer_type=tokenizer_type, decomposition_type=decomposition_type, space_symbol=space_symbol, dummy_letter=dummy_letter, nfd=nfd, grammatical_symbol=grammatical_symbol)
        mecab = MeCabTokenizer_all(token_type=token_type, tokenizer_type=tokenizer_type, decomposition_type=decomposition_type, space_symbol=space_symbol, dummy_letter=dummy_letter, nfd=nfd, grammatical_symbol=grammatical_symbol, lexical_grammatical=lexical_grammatical)   # for LG
        tokenizer = MeCabWordPieceTokenizer(mecab=mecab, wp=wp) # mecab_wp.py


    else:
        raise ValueError("Wrong tokenizer name.")

    return tokenizer

In [4]:
def get_tokenized_result(tokenizer, string, nfd: bool = True):
    # if nfd == True:
    #     string = str_to_nfd(string)

    tokenized = tokenizer.tokenize(string)
    # print(" ".join(tokenized))
    
    return " ".join(tokenized)

In [8]:

# 64k
tokenizer_eojeol_composed_F_64k = get_tokenizer(tokenizer_name = "eojeol_mecab_fixed_composed_grammatical_symbol_F_wp-64k",
resource_dir = "../resources/v6_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "eojeol",
decomposition_type= "composed",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_eojeol_decomposed_pure_F_64k = get_tokenizer(tokenizer_name = "eojeol_mecab_fixed_decomposed_pure_grammatical_symbol_F_wp-64k",
resource_dir = "../resources/v6_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "eojeol",
decomposition_type= "decomposed_pure",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_composed_F_64k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_composed_grammatical_symbol_F_wp-64k",
resource_dir = "../resources/v6_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "composed",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_decomposed_lexical_F_64k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_decomposed_lexical_grammatical_symbol_F_wp-64k",
resource_dir = "../resources/v6_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "decomposed_lexical",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_decomposed_pure_F_64k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_decomposed_pure_grammatical_symbol_F_wp-64k",
resource_dir = "../resources/v6_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "decomposed_pure",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)


# 32k
tokenizer_eojeol_composed_F_32k = get_tokenizer(tokenizer_name = "eojeol_mecab_fixed_composed_grammatical_symbol_F_wp-32k",
resource_dir = "../resources/v7_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "eojeol",
decomposition_type= "composed",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_eojeol_decomposed_pure_F_32k = get_tokenizer(tokenizer_name = "eojeol_mecab_fixed_decomposed_pure_grammatical_symbol_F_wp-32k",
resource_dir = "../resources/v7_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "eojeol",
decomposition_type= "decomposed_pure",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_composed_F_32k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_composed_grammatical_symbol_F_wp-32k",
resource_dir = "../resources/v7_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "composed",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_decomposed_lexical_F_32k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_decomposed_lexical_grammatical_symbol_F_wp-32k",
resource_dir = "../resources/v7_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "decomposed_lexical",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)

tokenizer_fixed_decomposed_pure_F_32k = get_tokenizer(tokenizer_name = "morpheme_mecab_fixed_decomposed_pure_grammatical_symbol_F_wp-32k",
resource_dir = "../resources/v7_without_dummy_letter_grammatical_symbol_F",
tokenizer_type = "mecab_fixed",
token_type= "morpheme",
decomposition_type= "decomposed_pure",
space_symbol= "",
dummy_letter= "",
nfd= True,
grammatical_symbol= ["",""],
lexical_grammatical= False)



In [9]:
def show_tokenizations(string):
    # grammatical symbol F
    eojeol_composed_F_64k = string + '\t' + 'eojeol_composed_F_64k' + '\t' + get_tokenized_result(tokenizer_eojeol_composed_F_64k, string)
    eojeol_pure_F_64k = string + '\t' + 'eojeol_pure_F_64k' + '\t' + get_tokenized_result(tokenizer_eojeol_decomposed_pure_F_64k, string)
    fixed_composed_F_64k = string + '\t' + 'fixed_composed_F_64k' + '\t' + get_tokenized_result(tokenizer_fixed_composed_F_64k, string)
    fixed_lexical_F_64k = string + '\t' + 'fixed_lexical_F_64k' + '\t' + get_tokenized_result(tokenizer_fixed_decomposed_lexical_F_64k, string)
    fixed_pure_F_64k = string + '\t' + 'fixed_pure_F_64k' + '\t' + get_tokenized_result(tokenizer_fixed_decomposed_pure_F_64k, string)

    eojeol_composed_F_32k = string + '\t' + 'eojeol_composed_F_32k' + '\t' + get_tokenized_result(tokenizer_eojeol_composed_F_32k, string)
    eojeol_pure_F_32k = string + '\t' + 'eojeol_pure_F_32k' + '\t' + get_tokenized_result(tokenizer_eojeol_decomposed_pure_F_32k, string)
    fixed_composed_F_32k = string + '\t' + 'fixed_composed_F_32k' + '\t' + get_tokenized_result(tokenizer_fixed_composed_F_32k, string)
    fixed_lexical_F_32k = string + '\t' + 'fixed_lexical_F_32k' + '\t' + get_tokenized_result(tokenizer_fixed_decomposed_lexical_F_32k, string)
    fixed_pure_F_32k = string + '\t' + 'fixed_pure_F_32k' + '\t' + get_tokenized_result(tokenizer_fixed_decomposed_pure_F_32k, string)
    
    return "\n".join(['\n'+eojeol_composed_F_64k , eojeol_pure_F_64k , fixed_composed_F_64k, fixed_lexical_F_64k, fixed_pure_F_64k,
    eojeol_composed_F_32k , eojeol_pure_F_32k , fixed_composed_F_32k, fixed_lexical_F_32k, fixed_pure_F_32k])

In [10]:
show_tokenizations('갑자기 더워彭肽꿿뜛땭뜎حمق')

'\n갑자기 더워彭肽꿿뜛땭뜎حمق\teojeol_composed_F_64k\t[CLS] 갑자기 [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\teojeol_pure_F_64k\t[CLS] 갑자기 [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_composed_F_64k\t[CLS] 갑자기 덥 어 [UNK] [UNK] [UNK] [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_lexical_F_64k\t[CLS] 갑자기 덥 어 [UNK] [UNK] 꿰 ##ᆶ ##ᄄ ##ᅲ ##ᆶ ##ᄄ ##ᅣ ##ᆬ ##ᄄ ##ᅲ ##ᆩ [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_pure_F_64k\t[CLS] 갑자기 덥 어 [UNK] [UNK] 꿰 ##ᆶ ##ᄄ ##ᅲ ##ᆶ ##ᄄ ##ᅣ ##ᆬ ##ᄄ ##ᅲ ##ᆩ [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\teojeol_composed_F_32k\t[CLS] 갑자기 [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\teojeol_pure_F_32k\t[CLS] 갑자기 [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_composed_F_32k\t[CLS] 갑자기 덥 어 [UNK] [UNK] [UNK] [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_lexical_F_32k\t[CLS] 갑자기 덥 어 [UNK] [UNK] 꿰 ##ᆶ ##ᄄ ##ᅲ ##ᆶ ##ᄄ ##ᅣ ##ᆬ ##ᄄ ##ᅲ ##ᆩ [UNK] [SEP]\n갑자기 더워彭肽꿿뜛땭뜎حمق\tfixed_pure_F_32k\t[CLS] 갑자기 덥 어 [UNK] [UNK] 꿰 ##ᆶ ##ᄄ ##ᅲ ##ᆶ ##ᄄ ##ᅣ ##ᆬ ##ᄄ ##ᅲ ##ᆩ [UNK] [SEP]'

In [16]:
# 토큰화 및 저장
def analysis(task, corpus_list, corpus_name_list, sent2=True):
    for corpus, corpus_name in zip(corpus_list, corpus_name_list):
        with open(task+'_'+corpus_name+'.tsv', 'w', encoding='utf-8') as f:
            f.write('source'+'\t'+'tokenizer'+'\t'+'tokenize_result')
            if sent2:
                for sent1, sent2 in zip(corpus[0], corpus[1]):
                    f.write(show_tokenizations(string=sent1))
                    f.write(show_tokenizations(string=sent2))
            else:
                for sent1 in (corpus):
                    f.write(show_tokenizations(string=sent1))

In [17]:
# cola
def load_data(file_path: str) -> Tuple[List[str]]:
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentence
    3. label
    """
    sentences: List[str] = []

    # for test set
    if file_path == "../dataset/nlu_tasks/cola/NIKL_CoLA_in_domain_test_with_answer.tsv":
        with open(file_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f.readlines()[1:]):
                splitted = line.strip().split("\t")
                sentences.append(splitted[2])

    else:
        with open(file_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f.readlines()[1:]):
                splitted = line.strip().split("\t")
                sentences.append(splitted[3])

    return sentences

task = 'cola'
train = load_data('../dataset/nlu_tasks/cola/NIKL_CoLA_in_domain_train.tsv')
dev = load_data('../dataset/nlu_tasks/cola/NIKL_CoLA_in_domain_dev.tsv')
test = load_data('../dataset/nlu_tasks/cola/NIKL_CoLA_in_domain_test_with_answer.tsv')

corpus_list = [train, dev, test]
corpus_name_list = ['train', 'dev', 'test']

analysis(task, corpus_list, corpus_name_list, sent2=False)

In [18]:
# nsmc

def load_data(file_path: str):
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentence
    3. label
    """
    sentences: List[str] = []
    labels: List[int] = []

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f.readlines()[1:]):
            splitted = line.strip().split("\t")
            if len(splitted) != 2:
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue
            sentences.append(splitted[0])
            
#             labels.append(label_to_index[splitted[1]])

    return sentences


task = 'nsmc'
train = load_data('../dataset/nlu_tasks/nsmc/ratings_train.tsv')
dev = load_data('../dataset/nlu_tasks/nsmc/ratings_dev.tsv')
test = load_data('../dataset/nlu_tasks/nsmc/ratings_test.tsv')

corpus_list = [train, dev, test]
corpus_name_list = ['train', 'dev', 'test']

analysis(task, corpus_list, corpus_name_list, sent2=False)

In [19]:
# hsd

def load_data(file_path: str):
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentence_a
    2. sentence_b
    3. label
    """
    sentence = []
    # sentence_bs = []
    labels = []

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f.readlines()[1:]):
            splitted = line.strip().split("\t")
            if len(splitted) != 4:
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue

            sentence.append(splitted[0])
                # sentence_bs.append(splitted[1])
            # sentence_as.append(splitted[0])
            # sentence_bs.append(splitted[1])

#             labels.append(label_to_index[splitted[3]])

    return sentence

task = 'hsd'
train = load_data('../dataset/nlu_tasks/hsd/train.tsv')
dev = load_data('../dataset/nlu_tasks/hsd/dev.tsv')

corpus_list = [train, dev]
corpus_name_list = ['train', 'dev']

analysis(task, corpus_list, corpus_name_list, sent2=False)

In [96]:
# paws
def load_data(file_path: str):
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentence_a
    2. sentence_b
    3. label
    """
    sentence_as = []
    sentence_bs = []

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f.readlines()[1:]):
            splitted = line.strip().split("\t")
            if len(splitted) != 4:
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue
            if splitted[1] == "" or splitted[2] == "":
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue
            # 문장이 "NS"로만 표기된 라인 제외
            if splitted[1] == "NS" or splitted[2] == "NS":
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue
            sentence_as.append(splitted[1])
            sentence_bs.append(splitted[2])
        
    return sentence_as, sentence_bs

task = 'paws'
train = load_data('../dataset/nlu_tasks/paws/translated_train.tsv')
dev = load_data('../dataset/nlu_tasks/paws/dev_2k.tsv')
test = load_data('../dataset/nlu_tasks/paws/test_2k.tsv')

corpus_list = [train, dev, test]
corpus_name_list = ['train', 'dev', 'test']

analysis(task, corpus_list, corpus_name_list, sent2=True)

In [22]:
# KLUE-dp
dp_train = pd.read_csv('../KLUE-baseline/data/klue_benchmark/klue-dp-v1.1/klue-dp-v1.1_train.tsv',sep='delimiter', header=None)
dp_dev = pd.read_csv('../KLUE-baseline/data/klue_benchmark/klue-dp-v1.1/klue-dp-v1.1_dev.tsv', sep='delimiter', header=None)
dp_train.columns = ['text']
dp_dev.columns = ['text']
df2_train = dp_train['text'].str.contains('## klue-dp')
df2_dev = dp_dev['text'].str.contains('## klue-dp')
df2_train = dp_train[df2_train]
df2_dev = dp_dev[df2_dev]
df2_train['text'] = df2_train['text'].apply(lambda x: re.sub('## klue-dp-v1_train_.*\t', '', x))
df2_dev['text'] = df2_dev['text'].apply(lambda x: re.sub('## klue-dp-v1_dev_.*\t', '', x))
df2_train.to_csv('dp_orig_train.tsv', index=False, sep='\t')
df2_dev.to_csv('dp_orig_dev.tsv', index=False, sep='\t')

def load_data(file_path: str) -> Tuple[List[str]]:
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentenced
    3. label
    """
    sentences: List[str] = []
    #labels: List[int] = []

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f.readlines()[1:]):
            splitted = line.strip()
            # if len(splitted) != 2:
            #     #print(f"[ERROR] {repr(line)}, line {i}")
            #     continue
            sentences.append(splitted)
     #       labels.append(label_to_index[splitted[1]])

    return sentences

task = 'dp'
train = load_data('dp_orig_train.tsv')
dev = load_data('dp_orig_dev.tsv')

corpus_list = [train, dev]
corpus_name_list = ['train', 'dev']

analysis(task, corpus_list, corpus_name_list, sent2=False)

  dp_train = pd.read_csv('../KLUE-baseline/data/klue_benchmark/klue-dp-v1.1/klue-dp-v1.1_train.tsv',sep='delimiter', header=None)
  dp_dev = pd.read_csv('../KLUE-baseline/data/klue_benchmark/klue-dp-v1.1/klue-dp-v1.1_dev.tsv', sep='delimiter', header=None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_train['text'] = df2_train['text'].apply(lambda x: re.sub('## klue-dp-v1_train_.*\t', '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_dev['text'] = df2_dev['text'].apply(lambda x: re.sub('## klue-dp-v1_dev_.*\t', '', x))


In [23]:
# KLUE-nli
# json to tsv
import json
import pandas as pd
with open('../KLUE-baseline/data/klue_benchmark/klue-nli-v1.1/klue-nli-v1.1_train.json', 'r') as f:
    nli_train = json.load(f)
with open('../KLUE-baseline/data/klue_benchmark/klue-nli-v1.1/klue-nli-v1.1_dev.json') as f2:
    nli_dev = json.load(f2)


nli_train = pd.DataFrame(nli_train)
nli_train = nli_train[['premise', 'hypothesis']]
nli_train.to_csv('nli_orig_train.tsv', index=False, sep='\t')
nli_dev = pd.DataFrame(nli_dev)
nli_dev = nli_dev[['premise', 'hypothesis']]
nli_dev.to_csv('nli_orig_dev.tsv', index=False, sep='\t')

def load_data(file_path: str) -> Tuple[List[str], List[str]]:
    """
    file_path에 존재하는 tsv를 읽어서 bert_data.InputIds 형태로 변경해주는 함수입니다.
    각각의 row를 bert input으로 바꾸어주기 위한 함수입니다.
    각 row는 아래처럼 구성되어야 합니다.
    1. sentence_a
    2. sentence_b
    3. label
    """
    sentence_as = []
    sentence_bs = []

    with open(file_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f.readlines()[1:]):
            splitted = line.strip().split("\t")
            if splitted[0] == "" or splitted[1] == "":
                #print(f"[ERROR] {repr(line)}, line {i}")
                continue
            sentence_as.append(splitted[0])
            sentence_bs.append(splitted[1])
        
    return sentence_as, sentence_bs

task = 'nli'
train = load_data('nli_orig_train.tsv')
dev = load_data('nli_orig_dev.tsv')

corpus_list = [train, dev]
corpus_name_list = ['train', 'dev']

analysis(task, corpus_list, corpus_name_list, sent2=True)


# 2. OOV rate, ## rate 분석

In [30]:
# [UNK]의 개수 / 문장의 길이 * 100
from typing import List

# 문장당 oov rate (OR)
def getOOVRatePerSentence(sentence):
    sentence = removeCS(sentence)
    OOV_rate = sentence.count('[UNK]') / len(sentence.split()) * 100   
    
    return OOV_rate

# count of all OOV tokens (OC)
def getCountofAllOOV(sentence):
    cnt = 0
    cnt += len(re.findall('[UNK]', sentence))
    
    return cnt

# [CLS, SEP] 제거
def removeCS(sentence):
    sentence = sentence.replace('[CLS]', '')
    sentence = sentence.replace('[SEP]', '')
    
    return sentence

# [UNK]수 /전체 토큰 수
# def getOOVdividedbyAllTokens(corpus):
#     corpus['OC'] = corpus['sentence'].apply(lambda x: getCountofAllOOV(x))
#     corpus['sentence'].apply(lambda x: removeCS(x))
#     corpus['token_count'] = corpus['sentence'].apply(lambda x: len(x.split()))
#     OOV_count = corpus['OC'].sum()
#     token_count = corpus['token_count'].sum()
    
#     return OOV_count, token_count, OOV_count/token_count*100


# "##"" 세기
# 문장당 ## rate (SR)
def getShopRatePerSentence(sentence):
    sentence = removeCS(sentence)
    shop_rate = sentence.count('##') / len(sentence.split()) * 100   
    
    return shop_rate

# count of all ## tokens (SC)
def getCountofAllShop(sentence):
    cnt = 0
    cnt += len(re.findall('##', sentence))
    
    return cnt

# [CLS, SEP] 제거
def removeCS(sentence):
    sentence = sentence.replace('[CLS]', '')
    sentence = sentence.replace('[SEP]', '')
    
    return sentence

# ##수 /전체 토큰 수
# def getOOVdividedbyAllTokens(corpus):
#     corpus['SC'] = corpus['sentence'].apply(lambda x: getCountofAllShop(x))
#     corpus['sentence'].apply(lambda x: removeCS(x))
#     corpus['token_count'] = corpus['sentence'].apply(lambda x: len(x.split()))
#     Shop_count = corpus['SC'].sum()
#     token_count = corpus['token_count'].sum()
    
#     return Shop_count, token_count, Shop_count/token_count*100


In [31]:
import pandas as pd
import re

cola_train = pd.read_csv('cola_train.tsv', sep='\t')
cola_dev = pd.read_csv('cola_dev.tsv', sep='\t')
cola_test = pd.read_csv('cola_test.tsv', sep='\t')
cola = pd.concat([cola_train, cola_dev, cola_test])
cola['tokenize_result'] = cola['tokenize_result'].apply(lambda x: removeCS(x))
cola['source_len'] = cola['source'].apply(lambda x: len(x.split()))
cola['tokenized_len'] = cola['tokenize_result'].apply(lambda x: len(x.split()))

cola['OOV_per_tokenized_sent'] = cola['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
cola['OOV_count'] = cola['tokenize_result'].apply(lambda x: getCountofAllOOV(x))
cola['OOV_per_source_sent'] = cola['OOV_count'] / cola['source_len'] * 100
cola['##_per_tokenized_sent'] = cola['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
cola['##_count'] = cola['tokenize_result'].apply(lambda x: getCountofAllShop(x))
cola['##_per_source_sent'] = cola['##_count'] / cola['source_len'] * 100

In [91]:
cola.isna().sum()

source                    0
tokenizer                 0
tokenize_result           0
source_len                0
tokenized_len             0
OOV_per_tokenized_sent    0
OOV_count                 0
OOV_per_source_sent       0
##_per_tokenized_sent     0
##_count                  0
##_per_source_sent        0
task                      0
dtype: int64

In [32]:
cola[cola['tokenized_len'] > 128].groupby('tokenizer').count()

Unnamed: 0_level_0,source,tokenize_result,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
tokenizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [80]:
cola_group = cola.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean', 'sum'],
    'OOV_per_tokenized_sent': ['mean'],
    'OOV_count': ['sum'],
    'OOV_per_source_sent': ['mean'],
    '##_per_tokenized_sent': ['mean'],
    '##_count': ['sum'],
    '##_per_source_sent': ['mean'],  
})
cola_group

Unnamed: 0_level_0,source_len,tokenized_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
Unnamed: 0_level_1,mean,mean,sum,mean,sum,mean,mean,sum,mean
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
eojeol_composed_F_32k,5.011058,8.460269,152251,0.032876,189,0.162933,39.408974,62072,72.816467
eojeol_composed_F_64k,5.011058,7.42854,133684,0.036712,189,0.162933,31.16514,43505,51.650842
eojeol_pure_F_32k,5.011058,7.844354,141167,0.035132,189,0.162933,34.697582,50988,60.109946
eojeol_pure_F_64k,5.011058,7.228717,130088,0.037975,189,0.162933,29.298988,39909,47.571706
fixed_composed_F_32k,5.011058,12.380196,222794,0.01988,216,0.181257,4.353754,10372,12.154478
fixed_composed_F_64k,5.011058,11.972772,215462,0.020558,216,0.181257,1.323949,3040,3.629573
fixed_lexical_F_32k,5.011058,12.21377,219799,0.020423,216,0.181257,3.129198,7377,8.710197
fixed_lexical_F_64k,5.011058,11.917037,214459,0.020611,216,0.181257,0.884107,2037,2.420483
fixed_pure_F_32k,5.011058,12.207157,219680,0.020423,216,0.181257,3.083478,7258,8.578487
fixed_pure_F_64k,5.011058,11.914259,214409,0.020611,216,0.181257,0.861383,1987,2.356606


In [66]:
nsmc_train = pd.read_csv('nsmc_train.tsv', sep='\t')
nsmc_dev = pd.read_csv('nsmc_dev.tsv', sep='\t')
nsmc_test = pd.read_csv('nsmc_test.tsv', sep='\t')
nsmc = pd.concat([nsmc_train, nsmc_dev, nsmc_test])
nsmc['tokenize_result'] = nsmc['tokenize_result'].apply(lambda x: removeCS(x))
nsmc['source_len'] = nsmc['source'].apply(lambda x: len(x.split()))
nsmc['tokenized_len'] = nsmc['tokenize_result'].apply(lambda x: len(x.split()))

nsmc['OOV_per_tokenized_sent'] = nsmc['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
nsmc['OOV_count'] = nsmc['tokenize_result'].apply(lambda x: getCountofAllOOV(x))
nsmc['OOV_per_source_sent'] = nsmc['OOV_count'] / nsmc['source_len'] * 100
nsmc['##_per_tokenized_sent'] = nsmc['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
nsmc['##_count'] = nsmc['tokenize_result'].apply(lambda x: getCountofAllShop(x))
nsmc['##_per_source_sent'] = nsmc['##_count'] / nsmc['source_len'] * 100


In [92]:
nsmc.isna().sum()

source                    0
tokenizer                 0
tokenize_result           0
source_len                0
tokenized_len             0
OOV_per_tokenized_sent    0
OOV_count                 0
OOV_per_source_sent       0
##_per_tokenized_sent     0
##_count                  0
##_per_source_sent        0
task                      0
dtype: int64

In [81]:
nsmc_group = nsmc.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean', 'sum'],
    'OOV_per_tokenized_sent': ['mean'],
    'OOV_count': ['sum'],
    'OOV_per_source_sent': ['mean'],
    '##_per_tokenized_sent': ['mean'],
    '##_count': ['sum'],
    '##_per_source_sent': ['mean'],  
})
nsmc_group

Unnamed: 0_level_0,source_len,tokenized_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
Unnamed: 0_level_1,mean,mean,sum,mean,sum,mean,mean,sum,mean
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
eojeol_composed_F_32k,7.591344,17.727344,3545327,0.346157,8989,1.392632,54.90391,2027121,186.080295
eojeol_composed_F_64k,7.591344,15.669567,3133788,0.356818,8989,1.392632,48.89581,1615582,154.626038
eojeol_pure_F_32k,7.591344,16.681637,3336194,0.455792,9813,1.738694,52.051229,1817988,167.590889
eojeol_pure_F_64k,7.591344,15.262716,3052421,0.464452,9813,1.738694,47.621194,1534215,144.851687
fixed_composed_F_32k,7.591344,21.892581,4378341,0.127833,10730,1.883802,7.515631,317082,32.287948
fixed_composed_F_64k,7.591344,21.090634,4217958,0.1332,10730,1.883802,3.894379,156699,17.575249
fixed_lexical_F_32k,7.591344,21.536411,4307110,0.122378,10448,1.797754,5.956904,245851,26.269544
fixed_lexical_F_64k,7.591344,21.041962,4208224,0.126059,10448,1.797754,3.669871,146965,17.113886
fixed_pure_F_32k,7.591344,21.511515,4302131,0.122186,10430,1.795306,5.855999,240872,25.793359
fixed_pure_F_64k,7.591344,21.033336,4206499,0.125915,10430,1.795306,3.624766,145240,16.91989


In [68]:
print(len(nsmc[nsmc['tokenized_len'] > 128]))

20


In [97]:
paws_train = pd.read_csv('paws_train.tsv', sep='\t')
paws_dev = pd.read_csv('paws_dev.tsv', sep='\t')
paws_test = pd.read_csv('paws_test.tsv', sep='\t')
paws = pd.concat([paws_train, paws_dev, paws_test])

paws['tokenize_result'] = paws['tokenize_result'].apply(str)
paws['tokenize_result'] = paws['tokenize_result'].apply(lambda x: removeCS(x))
paws['source_len'] = paws['source'].apply(lambda x: len(x.split()))
paws['tokenized_len'] = paws['tokenize_result'].apply(lambda x: len(x.split()))
paws['OOV_per_tokenized_sent'] = paws['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
paws['OOV_count'] = paws['tokenize_result'].apply(lambda x: getCountofAllOOV(x))
paws['OOV_per_source_sent'] = paws['OOV_count'] / paws['source_len'] * 100
paws['##_per_tokenized_sent'] = paws['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
paws['##_count'] = paws['tokenize_result'].apply(lambda x: getCountofAllShop(x))
paws['##_per_source_sent'] = paws['##_count'] / paws['source_len'] * 100

paws_group = paws.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean', 'sum'],
    'OOV_per_tokenized_sent': ['mean'],
    'OOV_count': ['sum'],
    'OOV_per_source_sent': ['mean'],
    '##_per_tokenized_sent': ['mean'],
    '##_count': ['sum'],
    '##_per_source_sent': ['mean'],  
})
paws_group

: 

: 

In [70]:
paws[paws['tokenized_len'] > 128].groupby('tokenizer').count()

Unnamed: 0_level_0,source,tokenize_result,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
tokenizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [1]:
paws.isna().sum()
print(len(paws[paws['tokenizer'].isna()]))
print(len(paws_train), len(paws_dev), len(paws_test))

NameError: name 'paws' is not defined

In [71]:
hsd_train = pd.read_csv('hsd_train.tsv', sep='\t')
hsd_dev = pd.read_csv('hsd_dev.tsv', sep='\t')
hsd = pd.concat([hsd_train, hsd_dev])

hsd['tokenize_result'] = hsd['tokenize_result'].apply(str)
hsd['tokenize_result'] = hsd['tokenize_result'].apply(lambda x: removeCS(x))
hsd['source_len'] = hsd['source'].apply(lambda x: len(x.split()))
hsd['tokenized_len'] = hsd['tokenize_result'].apply(lambda x: len(x.split()))
hsd['OOV_per_tokenized_sent'] = hsd['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
hsd['OOV_count'] = hsd['tokenize_result'].apply(lambda x: getCountofAllOOV(x))
hsd['OOV_per_source_sent'] = hsd['OOV_count'] / hsd['source_len'] * 100
hsd['##_per_tokenized_sent'] = hsd['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
hsd['##_count'] = hsd['tokenize_result'].apply(lambda x: getCountofAllShop(x))
hsd['##_per_source_sent'] = hsd['##_count'] / hsd['source_len'] * 100

hsd_group = hsd.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean', 'sum'],
    'OOV_per_tokenized_sent': ['mean'],
    'OOV_count': ['sum'],
    'OOV_per_source_sent': ['mean'],
    '##_per_tokenized_sent': ['mean'],
    '##_count': ['sum'],
    '##_per_source_sent': ['mean'],  
})
hsd_group

Unnamed: 0_level_0,source_len,tokenized_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
Unnamed: 0_level_1,mean,mean,sum,mean,sum,mean,mean,sum,mean
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
eojeol_composed_F_32k,8.406,20.849289,174446,0.270432,803,1.595408,58.494769,104113,169.389726
eojeol_composed_F_64k,8.406,18.46289,154479,0.292663,803,1.595408,53.111395,84146,139.137183
eojeol_pure_F_32k,8.406,19.770288,165418,0.276332,794,1.589262,56.128817,95085,155.923334
eojeol_pure_F_64k,8.406,18.094419,151396,0.296624,794,1.589262,52.066382,81063,134.37966
fixed_composed_F_32k,8.406,24.718179,206817,0.210914,1031,1.95171,9.324234,18142,30.930292
fixed_composed_F_64k,8.406,23.516434,196762,0.222857,1031,1.95171,4.448516,8087,14.188006
fixed_lexical_F_32k,8.406,24.185132,202357,0.215083,1028,1.941751,7.246773,13682,23.700267
fixed_lexical_F_64k,8.406,23.412095,195889,0.223857,1028,1.941751,4.007282,7214,12.934445
fixed_pure_F_32k,8.406,24.155372,202108,0.214752,1025,1.935775,7.132793,13433,23.295903
fixed_pure_F_64k,8.406,23.398948,195779,0.223262,1025,1.935775,3.939866,7104,12.729737


In [72]:
hsd[hsd['tokenized_len'] > 128].groupby('tokenizer').count()

Unnamed: 0_level_0,source,tokenize_result,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
tokenizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [73]:
dp_train = pd.read_csv('dp_train.tsv', sep='\t')
dp_dev = pd.read_csv('dp_dev.tsv', sep='\t')
dp = pd.concat([dp_train, dp_dev])

dp['tokenize_result'] = dp['tokenize_result'].apply(str)
dp['tokenize_result'] = dp['tokenize_result'].apply(lambda x: removeCS(x))
dp['source_len'] = dp['source'].apply(lambda x: len(x.split()))
dp['tokenized_len'] = dp['tokenize_result'].apply(lambda x: len(x.split()))
dp['OOV_per_tokenized_sent'] = dp['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
dp['OOV_count'] = dp['tokenize_result'].apply(lambda x: getCountofAllOOV(x))
dp['OOV_per_source_sent'] = dp['OOV_count'] / dp['source_len'] * 100
dp['##_per_tokenized_sent'] = dp['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
dp['##_count'] = dp['tokenize_result'].apply(lambda x: getCountofAllShop(x))
dp['##_per_source_sent'] = dp['##_count'] / dp['source_len'] * 100

dp_group = dp.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean', 'sum'],
    'OOV_per_tokenized_sent': ['mean'],
    'OOV_count': ['sum'],
    'OOV_per_source_sent': ['mean'],
    '##_per_tokenized_sent': ['mean'],
    '##_count': ['sum'],
    '##_per_source_sent': ['mean'],  
})
dp_group

Unnamed: 0_level_0,source_len,tokenized_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
Unnamed: 0_level_1,mean,mean,sum,mean,sum,mean,mean,sum,mean
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
eojeol_composed_F_32k,11.415583,21.536333,258436,0.24062,2881,1.451239,45.853804,121449,91.453097
eojeol_composed_F_64k,11.415583,18.4985,221982,0.276842,2881,1.451239,36.989648,84995,64.573298
eojeol_pure_F_32k,11.415583,19.84175,238101,0.259793,2881,1.451239,41.279413,101114,76.673673
eojeol_pure_F_64k,11.415583,17.945833,215350,0.285277,2881,1.451239,35.03791,78363,59.605604
fixed_composed_F_32k,11.415583,27.81375,333765,0.185928,3346,1.665835,5.7103,21074,14.80853
fixed_composed_F_64k,11.415583,26.661667,319940,0.194508,3346,1.665835,2.004738,7249,5.042086
fixed_lexical_F_32k,11.415583,27.215167,326582,0.190371,3346,1.665835,3.849567,13891,9.798267
fixed_lexical_F_64k,11.415583,26.539333,318472,0.195732,3346,1.665835,1.630206,5781,4.088706
fixed_pure_F_32k,11.415583,27.185583,326227,0.190566,3346,1.665835,3.749132,13536,9.538848
fixed_pure_F_64k,11.415583,26.534,318408,0.195764,3346,1.665835,1.611258,5717,4.038697


In [74]:
dp[dp['tokenized_len'] > 128].groupby('tokenizer').count()

Unnamed: 0_level_0,source,tokenize_result,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
tokenizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [75]:
nli_train = pd.read_csv('nli_train.tsv', sep='\t')
nli_dev = pd.read_csv('nli_dev.tsv', sep='\t')
nli = pd.concat([nli_train, nli_dev])

nli['tokenize_result'] = nli['tokenize_result'].apply(str)
nli['tokenize_result'] = nli['tokenize_result'].apply(lambda x: removeCS(x))
nli['source_len'] = nli['source'].apply(lambda x: len(x.split()))
nli['tokenized_len'] = nli['tokenize_result'].apply(lambda x: len(x.split()))
nli['OOV_per_tokenized_sent'] = nli['tokenize_result'].apply(lambda x: getOOVRatePerSentence(x))
nli['OOV_count'] = nli['tokenize_result'].apply(lambda x: getCountofAllOOV(x))
nli['OOV_per_source_sent'] = nli['OOV_count'] / nli['source_len'] * 100
nli['##_per_tokenized_sent'] = nli['tokenize_result'].apply(lambda x: getShopRatePerSentence(x))
nli['##_count'] = nli['tokenize_result'].apply(lambda x: getCountofAllShop(x))
nli['##_per_source_sent'] = nli['##_count'] / nli['source_len'] * 100

nli_group = nli.groupby('tokenizer').agg({
    'source_len': ['mean'],
    'tokenized_len': ['mean'],
    'OOV_per_tokenized_sent': ['mean'],
    'OOV_count': ['sum'],
    'OOV_per_source_sent': ['mean'],
    '##_per_tokenized_sent': ['mean'],
    '##_count': ['sum'],
    '##_per_source_sent': ['mean'],  
})
nli_group

Unnamed: 0_level_0,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
Unnamed: 0_level_1,mean,mean,mean,sum,mean,mean,sum,mean
tokenizer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
eojeol_composed_F_32k,8.277216,15.332113,0.003902,151,0.02578,44.817167,395046,90.25258
eojeol_composed_F_64k,8.277216,13.17637,0.004501,151,0.02578,35.874211,274333,63.527354
eojeol_pure_F_32k,8.277216,14.121044,0.0041,151,0.02578,40.045834,327231,75.100799
eojeol_pure_F_64k,8.277216,12.774127,0.0046,151,0.02578,33.88747,251809,58.397642
fixed_composed_F_32k,8.277216,20.357936,0.00323,196,0.03105,5.90786,73156,16.203469
fixed_composed_F_64k,8.277216,19.523573,0.003463,196,0.03105,2.234514,26435,5.897201
fixed_lexical_F_32k,8.277216,19.931638,0.003309,196,0.03105,4.104835,49285,11.089676
fixed_lexical_F_64k,8.277216,19.424727,0.003484,196,0.03105,1.784534,20900,4.699381
fixed_pure_F_32k,8.277216,19.909172,0.003309,196,0.03105,4.00357,48027,10.805589
fixed_pure_F_64k,8.277216,19.422119,0.003484,196,0.03105,1.772856,20754,4.670615


In [76]:
nli[nli['tokenized_len'] > 128].groupby('tokenizer').count()

Unnamed: 0_level_0,source,tokenize_result,source_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent
tokenizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [77]:
# size
print(len(cola)/10,
len(nsmc)/10,
len(paws)/10,
len(hsd)/10,
len(dp)/10,
len(nli)/10
)

os.makedirs('results', exist_ok=True)
# save results
cola.to_csv('results/cola_result.tsv', sep='\t', index=False)
nsmc.to_csv('results/nsmc_result.tsv', sep='\t', index=False)
paws.to_csv('results/paws_result.tsv', sep='\t', index=False)
hsd.to_csv('results/hsd_result.tsv', sep='\t', index=False)
dp.to_csv('results/dp_result.tsv', sep='\t', index=False)
nli.to_csv('results/nli_result.tsv', sep='\t', index=False)

17996.0 199992.0 106132.0 8367.0 12000.0 55996.0


In [87]:
cola_group['task'] = 'cola'
dp_group['task'] = 'dp'
hsd_group['task'] = 'hsd'
nli_group['task'] = 'nli'
nsmc_group['task'] = 'nsmc'
paws_group['task'] = 'paws'

cola['task'] = 'cola'
dp['task'] = 'dp'
hsd['task'] = 'hsd'
nli['task'] = 'nli'
nsmc['task'] = 'nsmc'
paws['task'] = 'paws'

In [89]:
total_result = pd.concat([cola.reset_index(),
                          dp.reset_index(),
                          hsd.reset_index(),
                          nli.reset_index(),
                          nsmc.reset_index(),
                          paws.reset_index()])
total_result
total_result.to_csv('results/total_result.csv', index=False)

In [84]:
total_group_result = pd.concat([cola_group.reset_index(),
           dp_group.reset_index(),
           hsd_group.reset_index(),
           nli_group.reset_index(),
           nsmc_group.reset_index(),
           paws_group.reset_index()])

total_group_result

Unnamed: 0_level_0,tokenizer,source_len,tokenized_len,tokenized_len,OOV_per_tokenized_sent,OOV_count,OOV_per_source_sent,##_per_tokenized_sent,##_count,##_per_source_sent,task
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,sum,mean,sum,mean,mean,sum,mean,Unnamed: 11_level_1
0,eojeol_composed_F_32k,5.011058,8.460269,152251.0,0.032876,189,0.162933,39.408974,62072,72.816467,cola
1,eojeol_composed_F_64k,5.011058,7.42854,133684.0,0.036712,189,0.162933,31.16514,43505,51.650842,cola
2,eojeol_pure_F_32k,5.011058,7.844354,141167.0,0.035132,189,0.162933,34.697582,50988,60.109946,cola
3,eojeol_pure_F_64k,5.011058,7.228717,130088.0,0.037975,189,0.162933,29.298988,39909,47.571706,cola
4,fixed_composed_F_32k,5.011058,12.380196,222794.0,0.01988,216,0.181257,4.353754,10372,12.154478,cola
5,fixed_composed_F_64k,5.011058,11.972772,215462.0,0.020558,216,0.181257,1.323949,3040,3.629573,cola
6,fixed_lexical_F_32k,5.011058,12.21377,219799.0,0.020423,216,0.181257,3.129198,7377,8.710197,cola
7,fixed_lexical_F_64k,5.011058,11.917037,214459.0,0.020611,216,0.181257,0.884107,2037,2.420483,cola
8,fixed_pure_F_32k,5.011058,12.207157,219680.0,0.020423,216,0.181257,3.083478,7258,8.578487,cola
9,fixed_pure_F_64k,5.011058,11.914259,214409.0,0.020611,216,0.181257,0.861383,1987,2.356606,cola


In [85]:
total_group_result.to_csv('total_group_result.csv', index=False)