In [1]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
import collections
from sklearn.model_selection import train_test_split
from sklearn import metrics

import sys
sys.path.append("../")
from datatools.analyzer import *
from utterance.error_tools import *

from datatools.maneger import DataManager
from datatools.preproc import Preprocessor

In [50]:
class Vocabulary:
    """Bidirectional mapping between items (e.g. characters) and integer ids.

    Ids are handed out in insertion order starting at 0, so the special
    tokens given to the constructor occupy the lowest ids.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, TOKENS=("[PAD]", "FOS", "EOS", "[SEP]", "[UNK]", "[NONE]")):
        """Create a vocabulary pre-populated with the special tokens.

        Args:
            TOKENS: iterable of special tokens to register first, in order.
                The default is an immutable tuple so it can never be shared
                and mutated between instances.
        """
        # id -> item (list position is the id)
        self.index2item = []
        # item -> id
        self.item2index = {}

        for sp_token in TOKENS:
            self.add_item(sp_token)

    def __len__(self):
        """Return the number of registered items."""
        return len(self.item2index)

    def __contains__(self, item):
        """Return True when ``item`` is registered."""
        return item in self.item2index

    def __str__(self) -> str:
        """Return the string form of the item -> id mapping."""
        return str(self.item2index)

    def add_item(self, item):
        """Register ``item`` under the next free id; no-op if already present."""
        if item in self.item2index:
            return
        index = len(self.item2index)
        self.index2item.append(item)
        self.item2index[item] = index

    def add_items(self, items: list):
        """Register every element of ``items`` in iteration order."""
        for item in items:
            self.add_item(item)

    def get_item(self, index):
        """Return the item stored under ``index``.

        Any id outside ``0 <= index < len(self)`` yields ``"[UNK]"``.
        Negative ids are rejected explicitly; otherwise Python's negative
        indexing would silently return items counted from the end.
        """
        if not 0 <= index < len(self.index2item):
            return "[UNK]"
        return self.index2item[index]

    def get_index(self, item):
        """Return the id of ``item``, or the id of ``"[UNK]"`` if unknown.

        Raises:
            KeyError: if ``item`` is unknown and ``"[UNK]"`` was never
                registered (e.g. a custom ``TOKENS`` without it).
        """
        if item not in self.item2index:
            return self.item2index["[UNK]"]
        return self.item2index[item]

    # TODO: save_vocab is not implemented yet.

In [3]:
def load_utt_ntt(ntt_path="../../corpus/NTT/"):
    """Collect every utterance from the JSON files in the NTT corpus directory.

    Each file whose name contains "json" is parsed; its top-level "convs"
    entry is iterated, and for every element the value stored under the
    element's first key is appended (flattened) to the result.

    Files are visited in sorted name order. ``os.listdir`` returns names in
    arbitrary order, which would make the utterance order (and any ids
    derived from it) differ between machines and runs.

    Args:
        ntt_path: directory holding the corpus files. Defaults to the
            location this notebook originally hard-coded.

    Returns:
        A flat list of utterances in (file name, dialogue, utterance) order.
        The total count is also printed.
    """
    utt_list = []
    for file_name in sorted(os.listdir(ntt_path)):
        if "json" not in file_name:
            continue
        # os.path.join works with or without a trailing separator on ntt_path.
        with open(os.path.join(ntt_path, file_name), "r", encoding="utf-8") as f:
            convs = json.load(f)
        # The file is closed here; only the parsed data is needed below.
        for dialogue in convs["convs"]:
            # Only the first key of each entry is read, as in the original code.
            first_key = list(dialogue.keys())[0]
            utt_list.extend(dialogue[first_key])

    print(len(utt_list))
    return utt_list

In [19]:
# Load the raw utterances, then pass only the first 200 to sentence2docs
# and keep the string form of each returned document.
ntt_utt_ = load_utt_ntt()
docs = sentence2docs(ntt_utt_[:200])
ntt_utt = [str(doc) for doc in docs]

  0%|          | 0/141777 [09:07<?, ?it/s]


141777


In [20]:
def clean_text_plain(text):
    """Normalize ``text`` and strip parenthesised spans and concrete numbers.

    Steps, in order:
      1. ``neologdn.normalize`` is applied to the text.
      2. Every ``(...)`` span is removed.
      3. Every run of digits is collapsed to a single "0".

    Args:
        text: the raw utterance string.

    Returns:
        The cleaned string.
    """
    text_ = neologdn.normalize(text)
    # Non-greedy on purpose: a greedy `.*` would delete everything between the
    # first "(" and the last ")" of the text, including ordinary words that
    # sit between two separate parenthesised spans.
    text_ = re.sub(r'\(.*?\)', "", text_)
    text_ = re.sub(r'\d+', "0", text_)
    return text_

In [21]:
# Clean every utterance string; tqdm only adds a progress bar.
all_utt = list(map(clean_text_plain, tqdm(ntt_utt)))

100%|██████████| 408/408 [00:00<00:00, 117832.13it/s]


In [26]:
# Split each cleaned utterance into a list of characters and hand the batch
# to fill_SYMBOL_ONE (judging by the recorded ids further down, it appears to
# wrap each sequence in "FOS" / "EOS" markers — confirm in its definition).
filled_chars = fill_SYMBOL_ONE(
    [list("".join(utterance)) for utterance in tqdm(all_utt)]
)

100%|██████████| 408/408 [00:00<00:00, 405515.65it/s]


In [51]:
# Special tokens are registered first, so they receive the lowest ids
# (0-5) in exactly this order.
TOKENS = ["[PAD]", "FOS", "EOS", "[SEP]", "[UNK]", "[NONE]"]
vocab = Vocabulary(TOKENS=TOKENS)

In [52]:
# Register every character of every sequence, in order of first appearance.
# A plain loop is linear; flattening with sum(filled_chars, []) first would
# copy the growing list once per sequence (quadratic in total length).
for chars in filled_chars:
    vocab.add_items(chars)

In [53]:
# Sanity check: id 6 is the first item registered after the six special
# tokens (ids 0-5).
vocab.get_item(6)

'こ'

In [54]:
def sentence2ids(sentence, vocab):
    """Convert one sentence into a list of vocabulary ids, one per character.

    Args:
        sentence: a string (or an iterable of strings, which is concatenated).
        vocab: object exposing ``get_index(item)``; with ``Vocabulary`` an
            unknown character maps to the "[UNK]" id.

    Returns:
        The ids of the characters after the single-sequence batch has been
        passed through ``fill_SYMBOL_ONE``.
    """
    characters = list("".join(sentence))
    # fill_SYMBOL_ONE works on a batch, so wrap the sequence and unwrap the
    # first (only) result.
    wrapped = fill_SYMBOL_ONE([characters])[0]
    return list(map(vocab.get_index, wrapped))

In [55]:
# Encode a sample sentence; characters that are not in the vocabulary come
# back as the "[UNK]" id.
sentence = "麻生太郎は漢字が読めない"
sentence2ids(sentence, vocab)

[1, 4, 92, 4, 4, 10, 4, 4, 29, 463, 146, 53, 19, 2]

In [56]:
# Display the complete item -> id mapping (the output grows with the
# vocabulary size).
vocab.item2index

{'[PAD]': 0,
 'FOS': 1,
 'EOS': 2,
 '[SEP]': 3,
 '[UNK]': 4,
 '[NONE]': 5,
 'こ': 6,
 'ん': 7,
 'に': 8,
 'ち': 9,
 'は': 10,
 '。': 11,
 'お': 12,
 '元': 13,
 '気': 14,
 'で': 15,
 'す': 16,
 'か': 17,
 '?': 18,
 'い': 19,
 '、': 20,
 '広': 21,
 '告': 22,
 '代': 23,
 '理': 24,
 '店': 25,
 'の': 26,
 '仕': 27,
 '事': 28,
 'が': 29,
 '忙': 30,
 'し': 31,
 '疲': 32,
 'れ': 33,
 'さ': 34,
 'ま': 35,
 '私': 36,
 '介': 37,
 '護': 38,
 '福': 39,
 '祉': 40,
 '士': 41,
 'を': 42,
 'て': 43,
 'と': 44,
 '働': 45,
 'ら': 46,
 'っ': 47,
 'ゃ': 48,
 'る': 49,
 'ね': 50,
 '大': 51,
 '変': 52,
 'な': 53,
 'よ': 54,
 'え': 55,
 'も': 56,
 'み': 57,
 '北': 58,
 '海': 59,
 '道': 60,
 '一': 61,
 '軒': 62,
 '家': 63,
 '住': 64,
 'ご': 65,
 '飯': 66,
 '美': 67,
 '味': 68,
 'ろ': 69,
 '羨': 70,
 '青': 71,
 '森': 72,
 '県': 73,
 '出': 74,
 '身': 75,
 '山': 76,
 '多': 77,
 '好': 78,
 'き': 79,
 '近': 80,
 'く': 81,
 'た': 82,
 'つ': 83,
 '東': 84,
 '京': 85,
 '華': 86,
 'や': 87,
 '場': 88,
 '所': 89,
 '田': 90,
 '舎': 91,
 '生': 92,
 '魅': 93,
 '力': 94,
 '的': 95,
 '休': 96,
 '日': 97,
 '何': 98,